1580 files changed, 67059 insertions, 38750 deletions
diff --git a/.gitignore b/.gitignore
index 2e2713a..d7dcc54 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,8 @@
 *.orig
 # Byte compiled python modules.
 *.pyc
+# vim swap files
+.*.swp
 
 #==============================================================================#
 # Explicit files to ignore (only matches one).
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 471b45e..e0404cf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -154,7 +154,7 @@ else()
   option(LLVM_ENABLE_ASSERTIONS "Enable assertions" ON)
 endif()
 
-# All options refered to from HandleLLVMOptions have to be specified
+# All options referred to from HandleLLVMOptions have to be specified
 # BEFORE this include, otherwise options will not be correctly set on
 # first cmake run
 include(config-ix)
@@ -227,6 +227,13 @@ if( LLVM_INCLUDE_TOOLS )
   add_subdirectory(tools)
 endif()
 
+option(LLVM_BUILD_RUNTIME
+  "Build the LLVM runtime libraries. If OFF, just generate build targets." ON)
+option(LLVM_INCLUDE_RUNTIME "Generate build targets for the LLVM runtimes" ON)
+if( LLVM_INCLUDE_RUNTIME )
+  add_subdirectory(runtime)
+endif()
+
 option(LLVM_BUILD_EXAMPLES
   "Build the LLVM example programs. If OFF, just generate build targets." OFF)
 option(LLVM_INCLUDE_EXAMPLES "Generate build targets for the LLVM examples" ON)
@@ -235,7 +242,7 @@ if( LLVM_INCLUDE_EXAMPLES )
 endif()
 
 option(LLVM_BUILD_TESTS
-  "Build LLVM unit tests. If OFF, just generate build targes." OFF)
+  "Build LLVM unit tests. If OFF, just generate build targets." OFF)
 if( LLVM_INCLUDE_TESTS )
   add_subdirectory(test)
   add_subdirectory(utils/unittest)
diff --git a/LICENSE.TXT b/LICENSE.TXT
index b8d2c74..1b1047c 100644
--- a/LICENSE.TXT
+++ b/LICENSE.TXT
@@ -4,7 +4,7 @@ LLVM Release License
 University of Illinois/NCSA
 Open Source License
 
-Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign.
+Copyright (c) 2003-2011 University of Illinois at Urbana-Champaign.
 All rights reserved.
 
 Developed by:
diff --git a/Makefile b/Makefile
index dbb759d..88e63e9 100644
--- a/Makefile
+++ b/Makefile
@@ -69,7 +69,7 @@ endif
 ifeq ($(MAKECMDGOALS),install-clang)
   DIRS := tools/clang/tools/driver tools/clang/lib/Headers \
           tools/clang/runtime tools/clang/docs \
-          tools/lto
+          tools/lto runtime
   OPTIONAL_DIRS :=
   NO_INSTALL = 1
 endif
@@ -83,7 +83,7 @@ ifeq ($(MAKECMDGOALS),install-clang-c)
 endif
 
 ifeq ($(MAKECMDGOALS),clang-only)
-  DIRS := $(filter-out tools runtime docs unittests, $(DIRS)) \
+  DIRS := $(filter-out tools docs unittests, $(DIRS)) \
           tools/clang tools/lto
   OPTIONAL_DIRS :=
 endif
@@ -168,6 +168,15 @@ install-clang: install
 install-clang-c: install
 install-libs: install
 
+# If SHOW_DIAGNOSTICS is enabled, clear the diagnostics file first.
+ifeq ($(SHOW_DIAGNOSTICS),1)
+clean-diagnostics:
+	$(Verb) rm -f $(LLVM_OBJ_ROOT)/$(BuildMode)/diags
+.PHONY: clean-diagnostics
+
+all-local:: clean-diagnostics
+endif
+
 #------------------------------------------------------------------------
 # Make sure the generated headers are up-to-date. This must be kept in
 # sync with the AC_CONFIG_HEADER invocations in autoconf/configure.ac
@@ -198,6 +207,12 @@ ifneq ($(ENABLE_OPTIMIZED),1)
 	$(Echo) '*****' optimized build. Use 'make ENABLE_OPTIMIZED=1' to
 	$(Echo) '*****' make an optimized build. Alternatively you can
 	$(Echo) '*****' configure with --enable-optimized.
+ifeq ($(SHOW_DIAGNOSTICS),1)
+	$(Verb) if test -s $(LLVM_OBJ_ROOT)/$(BuildMode)/diags; then \
+	  $(LLVM_SRC_ROOT)/utils/show-diagnostics \
+	    $(LLVM_OBJ_ROOT)/$(BuildMode)/diags; \
+	fi
+endif
 endif
 endif
 
diff --git a/Makefile.config.in b/Makefile.config.in
index 5c73758..9bdb075 100644
--- a/Makefile.config.in
+++ b/Makefile.config.in
@@ -123,6 +123,9 @@ TARGET_TRIPLE=@target@
 # Extra options to compile LLVM with
 EXTRA_OPTIONS=@EXTRA_OPTIONS@
 
+# Extra options to link LLVM with
+EXTRA_LD_OPTIONS=@EXTRA_LD_OPTIONS@
+
 # Endian-ness of the target
 ENDIAN=@ENDIAN@
 
diff --git a/Makefile.rules b/Makefile.rules
index 5fc77a5..162fd23 100644
--- a/Makefile.rules
+++ b/Makefile.rules
@@ -444,11 +444,11 @@ endif
 # LLVM Capable Compiler
 #--------------------------------------------------------------------
 
-ifeq ($(LLVMCC_OPTION),llvm-gcc)
+ifneq ($(findstring llvm-gcc,$(LLVMCC_OPTION)),)
   LLVMCC := $(LLVMGCC)
   LLVMCXX := $(LLVMGXX)
 else
-  ifeq ($(LLVMCC_OPTION),clang)
+  ifneq ($(findstring clang,$(LLVMCC_OPTION)),)
     ifneq ($(CLANGPATH),)
       LLVMCC := $(CLANGPATH)
       LLVMCXX := $(CLANGXXPATH)
@@ -582,6 +582,10 @@ endif
 # Options To Invoke Tools
 #----------------------------------------------------------
 
+ifdef EXTRA_LD_OPTIONS
+LD.Flags += $(EXTRA_LD_OPTIONS)
+endif
+
 ifndef NO_PEDANTIC
 CompileCommonOpts += -pedantic -Wno-long-long
 endif
@@ -646,25 +650,41 @@ CPP.Flags     += $(sort -I$(PROJ_OBJ_DIR) -I$(PROJ_SRC_DIR) \
 	         $(LLVM_OBJ_ROOT) $(LLVM_SRC_ROOT))) \
 	         $(CPP.BaseFlags)
 
+# SHOW_DIAGNOSTICS support.
+ifeq ($(SHOW_DIAGNOSTICS),1)
+  Compile.Wrapper := env CC_LOG_DIAGNOSTICS=1 \
+	                  CC_LOG_DIAGNOSTICS_FILE="$(LLVM_OBJ_ROOT)/$(BuildMode)/diags"
+else
+  Compile.Wrapper :=
+endif
+
 ifeq ($(BUILD_COMPONENT), 1)
-  Compile.C     = $(BUILD_CC) $(CPP.Flags) $(C.Flags) $(CFLAGS) $(CPPFLAGS) \
+  Compile.C     = $(Compile.Wrapper) \
+	          $(BUILD_CC) $(CPP.Flags) $(C.Flags) $(CFLAGS) $(CPPFLAGS) \
                   $(TargetCommonOpts) $(CompileCommonOpts) -c
-  Compile.CXX   = $(BUILD_CXX) $(CPP.Flags) $(CXX.Flags) $(CXXFLAGS) \
+  Compile.CXX   = $(Compile.Wrapper) \
+	          $(BUILD_CXX) $(CPP.Flags) $(CXX.Flags) $(CXXFLAGS) \
 		  $(CPPFLAGS) \
                   $(TargetCommonOpts) $(CompileCommonOpts) -c
-  Preprocess.CXX= $(BUILD_CXX) $(CPP.Flags) $(CPPFLAGS) $(TargetCommonOpts) \
+  Preprocess.CXX= $(Compile.Wrapper) \
+	          $(BUILD_CXX) $(CPP.Flags) $(CPPFLAGS) $(TargetCommonOpts) \
                   $(CompileCommonOpts) $(CXX.Flags) -E
-  Link          = $(BUILD_CXX) $(CPP.Flags) $(CXX.Flags) $(CXXFLAGS) \
+  Link          = $(Compile.Wrapper) \
+	          $(BUILD_CXX) $(CPP.Flags) $(CXX.Flags) $(CXXFLAGS) \
 		  $(LD.Flags) $(LDFLAGS) \
                   $(TargetCommonOpts) $(CompileCommonOpts) $(Strip)
 else
-  Compile.C     = $(CC) $(CPP.Flags) $(C.Flags) $(CFLAGS) $(CPPFLAGS) \
+  Compile.C     = $(Compile.Wrapper) \
+	          $(CC) $(CPP.Flags) $(C.Flags) $(CFLAGS) $(CPPFLAGS) \
                   $(TargetCommonOpts) $(CompileCommonOpts) -c
-  Compile.CXX   = $(CXX) $(CPP.Flags) $(CXX.Flags) $(CXXFLAGS) $(CPPFLAGS) \
+  Compile.CXX   = $(Compile.Wrapper) \
+	          $(CXX) $(CPP.Flags) $(CXX.Flags) $(CXXFLAGS) $(CPPFLAGS) \
                   $(TargetCommonOpts) $(CompileCommonOpts) -c
-  Preprocess.CXX= $(CXX) $(CPP.Flags) $(TargetCommonOpts) $(CPPFLAGS) \
+  Preprocess.CXX= $(Compile.Wrapper) \
+	          $(CXX) $(CPP.Flags) $(TargetCommonOpts) $(CPPFLAGS) \
                   $(CompileCommonOpts) $(CXX.Flags) -E
-  Link          = $(CXX) $(CPP.Flags) $(CXX.Flags) $(CXXFLAGS) $(LD.Flags) \
+  Link          = $(Compile.Wrapper) \
+	          $(CXX) $(CPP.Flags) $(CXX.Flags) $(CXXFLAGS) $(LD.Flags) \
                   $(LDFLAGS) $(TargetCommonOpts)  $(CompileCommonOpts) $(Strip)
 endif
 
@@ -1528,31 +1548,31 @@ BC_DEPEND_MOVEFILE = then $(MV) -f "$(ObjDir)/$*.bc.d.tmp" "$(ObjDir)/$*.bc.d";
 $(ObjDir)/%.ll: %.cpp $(ObjDir)/.dir $(BUILT_SOURCES) $(LLVMCXX)
 	$(Echo) "Compiling $*.cpp for $(BuildMode) build (bytecode)"
 	$(Verb) if $(BCCompile.CXX) $(BC_DEPEND_OPTIONS) \
-			$< -o $(ObjDir)/$*.ll -S -$(LLVMCC_EMITIR_FLAG) ; \
+			$< -o $(ObjDir)/$*.ll -S $(LLVMCC_EMITIR_FLAG) ; \
 	        $(BC_DEPEND_MOVEFILE)
 
 $(ObjDir)/%.ll: %.mm $(ObjDir)/.dir $(BUILT_SOURCES) $(LLVMCXX)
 	$(Echo) "Compiling $*.mm for $(BuildMode) build (bytecode)"
 	$(Verb) if $(BCCompile.CXX) $(BC_DEPEND_OPTIONS) \
-			$< -o $(ObjDir)/$*.ll -S -$(LLVMCC_EMITIR_FLAG) ; \
+			$< -o $(ObjDir)/$*.ll -S $(LLVMCC_EMITIR_FLAG) ; \
 	        $(BC_DEPEND_MOVEFILE)
 
 $(ObjDir)/%.ll: %.cc $(ObjDir)/.dir $(BUILT_SOURCES) $(LLVMCXX)
 	$(Echo) "Compiling $*.cc for $(BuildMode) build (bytecode)"
 	$(Verb) if $(BCCompile.CXX) $(BC_DEPEND_OPTIONS) \
-			$< -o $(ObjDir)/$*.ll -S -$(LLVMCC_EMITIR_FLAG) ; \
+			$< -o $(ObjDir)/$*.ll -S $(LLVMCC_EMITIR_FLAG) ; \
 	        $(BC_DEPEND_MOVEFILE)
 
 $(ObjDir)/%.ll: %.c $(ObjDir)/.dir $(BUILT_SOURCES) $(LLVMCC)
 	$(Echo) "Compiling $*.c for $(BuildMode) build (bytecode)"
 	$(Verb) if $(BCCompile.C) $(BC_DEPEND_OPTIONS) \
-			$< -o $(ObjDir)/$*.ll -S -$(LLVMCC_EMITIR_FLAG) ; \
+			$< -o $(ObjDir)/$*.ll -S $(LLVMCC_EMITIR_FLAG) ; \
 	        $(BC_DEPEND_MOVEFILE)
 
 $(ObjDir)/%.ll: %.m $(ObjDir)/.dir $(BUILT_SOURCES) $(LLVMCC)
 	$(Echo) "Compiling $*.m for $(BuildMode) build (bytecode)"
 	$(Verb) if $(BCCompile.C) $(BC_DEPEND_OPTIONS) \
-			$< -o $(ObjDir)/$*.ll -S -$(LLVMCC_EMITIR_FLAG) ; \
+			$< -o $(ObjDir)/$*.ll -S $(LLVMCC_EMITIR_FLAG) ; \
 	        $(BC_DEPEND_MOVEFILE)
 
 # Provide alternate rule sets if dependencies are disabled
@@ -1580,23 +1600,23 @@ $(ObjDir)/%.o: %.m $(ObjDir)/.dir $(BUILT_SOURCES)
 
 $(ObjDir)/%.ll: %.cpp $(ObjDir)/.dir $(BUILT_SOURCES) $(LLVMCXX)
 	$(Echo) "Compiling $*.cpp for $(BuildMode) build (bytecode)"
-	$(BCCompile.CXX) $< -o $@ -S -$(LLVMCC_EMITIR_FLAG)
+	$(BCCompile.CXX) $< -o $@ -S $(LLVMCC_EMITIR_FLAG)
 
 $(ObjDir)/%.ll: %.mm $(ObjDir)/.dir $(BUILT_SOURCES) $(LLVMCXX)
 	$(Echo) "Compiling $*.mm for $(BuildMode) build (bytecode)"
-	$(BCCompile.CXX) $< -o $@ -S -$(LLVMCC_EMITIR_FLAG)
+	$(BCCompile.CXX) $< -o $@ -S $(LLVMCC_EMITIR_FLAG)
 
 $(ObjDir)/%.ll: %.cc $(ObjDir)/.dir $(BUILT_SOURCES) $(LLVMCXX)
 	$(Echo) "Compiling $*.cc for $(BuildMode) build (bytecode)"
-	$(BCCompile.CXX) $< -o $@ -S -$(LLVMCC_EMITIR_FLAG)
+	$(BCCompile.CXX) $< -o $@ -S $(LLVMCC_EMITIR_FLAG)
 
 $(ObjDir)/%.ll: %.c $(ObjDir)/.dir $(BUILT_SOURCES) $(LLVMCC)
 	$(Echo) "Compiling $*.c for $(BuildMode) build (bytecode)"
-	$(BCCompile.C) $< -o $@ -S -$(LLVMCC_EMITIR_FLAG)
+	$(BCCompile.C) $< -o $@ -S $(LLVMCC_EMITIR_FLAG)
 
 $(ObjDir)/%.ll: %.m $(ObjDir)/.dir $(BUILT_SOURCES) $(LLVMCC)
 	$(Echo) "Compiling $*.m for $(BuildMode) build (bytecode)"
-	$(BCCompile.C) $< -o $@ -S -$(LLVMCC_EMITIR_FLAG)
+	$(BCCompile.C) $< -o $@ -S $(LLVMCC_EMITIR_FLAG)
 
 endif
 
@@ -2008,7 +2028,7 @@ $(DistZip) : $(TopDistDir)/.makedistdir
 	$(Verb) cd $(PROJ_OBJ_ROOT) ; $(ZIP) -rq $(DistZip) $(DistName)
 
 dist :: $(DistTarGZip) $(DistTarBZ2) $(DistZip)
-	$(Echo) ===== DISTRIBUTION PACKAGING SUCESSFUL =====
+	$(Echo) ===== DISTRIBUTION PACKAGING SUCCESSFUL =====
 
 DistCheckDir := $(PROJ_OBJ_ROOT)/_distcheckdir
 
diff --git a/autoconf/config.guess b/autoconf/config.guess
index 865fe53..9807c91 100755
--- a/autoconf/config.guess
+++ b/autoconf/config.guess
@@ -789,13 +789,12 @@ EOF
 	echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE}
 	exit ;;
     *:FreeBSD:*:*)
+        UNAME_PROCESSOR=`/usr/bin/uname -p`
 	case ${UNAME_MACHINE} in
-	    pc98)
-		echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
 	    amd64)
 		echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
 	    *)
-		echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+		echo ${UNAME_PROCESSOR}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
 	esac
 	exit ;;
     i*:CYGWIN*:*)
diff --git a/autoconf/configure.ac b/autoconf/configure.ac
index 9c73916..a39a685 100644
--- a/autoconf/configure.ac
+++ b/autoconf/configure.ac
@@ -35,8 +35,8 @@ AC_INIT([[llvm]],[[3.0svn]],[llvmbugs@cs.uiuc.edu])
 
 dnl Provide a copyright substitution and ensure the copyright notice is included
 dnl in the output of --version option of the generated configure script.
-AC_SUBST(LLVM_COPYRIGHT,["Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign."])
-AC_COPYRIGHT([Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign.])
+AC_SUBST(LLVM_COPYRIGHT,["Copyright (c) 2003-2011 University of Illinois at Urbana-Champaign."])
+AC_COPYRIGHT([Copyright (c) 2003-2011 University of Illinois at Urbana-Champaign.])
 
 dnl Indicate that we require autoconf 2.59 or later. Ths is needed because we
 dnl use some autoconf macros only available in 2.59.
@@ -831,6 +831,17 @@ case "$withval" in
 esac
 AC_SUBST(EXTRA_OPTIONS,$EXTRA_OPTIONS)
 
+dnl Specify extra linker build options
+AC_ARG_WITH(extra-ld-options,
+  AS_HELP_STRING([--with-extra-ld-options],
+                 [Specify additional options to link LLVM with]),,
+                 withval=default)
+case "$withval" in
+  default) EXTRA_LD_OPTIONS= ;;
+  *) EXTRA_LD_OPTIONS=$withval ;;
+esac
+AC_SUBST(EXTRA_LD_OPTIONS,$EXTRA_LD_OPTIONS)
+
 dnl Allow specific bindings to be specified for building (or not)
 AC_ARG_ENABLE([bindings],AS_HELP_STRING([--enable-bindings],
     [Build specific language bindings: all,auto,none,{binding-name} (default=auto)]),,
@@ -1429,6 +1440,24 @@ if test "$llvm_cv_os_type" = "MingW" ; then
   AC_CHECK_LIB(gcc,__cmpdi2,AC_DEFINE([HAVE___CMPDI2],[1],[Have host's __cmpdi2]))
 fi
 
+dnl Check Win32 API EnumerateLoadedModules.
+if test "$llvm_cv_os_type" = "MingW" ; then
+  AC_MSG_CHECKING([whether EnumerateLoadedModules() accepts new decl])
+  AC_COMPILE_IFELSE([[#include <windows.h>
+#include <imagehlp.h>
+extern void foo(PENUMLOADED_MODULES_CALLBACK);
+extern void foo(BOOL(CALLBACK*)(PCSTR,ULONG_PTR,ULONG,PVOID));]],
+[
+  AC_MSG_RESULT([yes])
+  llvm_cv_win32_elmcb_pcstr="PCSTR"
+],
+[
+  AC_MSG_RESULT([no])
+  llvm_cv_win32_elmcb_pcstr="PSTR"
+])
+  AC_DEFINE_UNQUOTED([WIN32_ELMCB_PCSTR],$llvm_cv_win32_elmcb_pcstr,[Type of 1st arg on ELM Callback])
+fi
+
 dnl Check for variations in the Standard C++ library and STL. These macros are
 dnl provided by LLVM in the autoconf/m4 directory.
 AC_FUNC_ISNAN
@@ -1510,7 +1539,7 @@ dnl Set the flags needed to emit LLVM IR and to disable optimizations
 dnl in llvmgcc
 if test "$llvm_cv_llvmgcc_dragonegg" = "yes" ; then
   LLVMCC_EMITIR_FLAG="-fplugin-arg-dragonegg-emit-ir"
-  LLVMCC_DISABLEOPT_FLAGS="-fplugin-arg-dragonegg-disable-llvm-optzns"
+  LLVMCC_DISABLEOPT_FLAGS="-fplugin-arg-dragonegg-llvm-ir-optimize=0"
 else
   LLVMCC_EMITIR_FLAG="-emit-llvm"
   LLVMCC_DISABLEOPT_FLAGS="-mllvm -disable-llvm-optzns"
@@ -1705,6 +1734,12 @@ AC_CONFIG_FILES([Makefile.config])
 dnl Configure the RPM spec file for LLVM
 AC_CONFIG_FILES([llvm.spec])
 
+dnl Configure doxygen's configuration file
+AC_CONFIG_FILES([docs/doxygen.cfg])
+if test -f ${srcdir}/tools/clang/README.txt; then
+  AC_CONFIG_FILES([tools/clang/docs/doxygen.cfg])
+fi
+
 dnl Configure llvmc's Base plugin
 AC_CONFIG_FILES([tools/llvmc/src/Base.td])
 
diff --git a/autoconf/m4/libtool.m4 b/autoconf/m4/libtool.m4
index a8b5e6a..e89738c 100644
--- a/autoconf/m4/libtool.m4
+++ b/autoconf/m4/libtool.m4
@@ -1118,7 +1118,7 @@ if test -n "$_LT_AC_TAGVAR(hardcode_libdir_flag_spec, $1)" || \
    test -n "$_LT_AC_TAGVAR(runpath_var, $1)" || \
    test "X$_LT_AC_TAGVAR(hardcode_automatic, $1)" = "Xyes" ; then
 
-  # We can hardcode non-existant directories.
+  # We can hardcode non-existent directories.
   if test "$_LT_AC_TAGVAR(hardcode_direct, $1)" != no &&
      # If the only mechanism to avoid hardcoding is shlibpath_var, we
      # have to relink, otherwise we might link with an installed library
diff --git a/autoconf/m4/ltdl.m4 b/autoconf/m4/ltdl.m4
index bc9e2ad..407a16e 100644
--- a/autoconf/m4/ltdl.m4
+++ b/autoconf/m4/ltdl.m4
@@ -156,7 +156,7 @@ AC_CACHE_CHECK([whether deplibs are loaded by dlopen],
   osf[[1234]]*)
     # dlopen did load deplibs (at least at 4.x), but until the 5.x series,
     # it did *not* use an RPATH in a shared library to find objects the
-    # library depends on, so we explictly say `no'.
+    # library depends on, so we explicitly say `no'.
     libltdl_cv_sys_dlopen_deplibs=no
     ;;
   osf5.0|osf5.0a|osf5.1)
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index e446853..c1b22d4 100755
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -366,6 +366,21 @@ else( MSVC )
   set(LTDL_DLOPEN_DEPLIBS 0)  # TODO
 endif( MSVC )
 
+if( PURE_WINDOWS )
+  CHECK_CXX_SOURCE_COMPILES("
+    #include <windows.h>
+    #include <imagehlp.h>
+    extern \"C\" void foo(PENUMLOADED_MODULES_CALLBACK);
+    extern \"C\" void foo(BOOL(CALLBACK*)(PCSTR,ULONG_PTR,ULONG,PVOID));
+    int main(){return 0;}"
+    HAVE_ELMCB_PCSTR)
+  if( HAVE_ELMCB_PCSTR )
+    set(WIN32_ELMCB_PCSTR "PCSTR")
+  else()
+    set(WIN32_ELMCB_PCSTR "PSTR")
+  endif()
+endif( PURE_WINDOWS )
+
 # FIXME: Signal handler return type, currently hardcoded to 'void'
 set(RETSIGTYPE void)
 
diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake
index 6087094..c13143b 100755
--- a/cmake/modules/AddLLVM.cmake
+++ b/cmake/modules/AddLLVM.cmake
@@ -17,9 +17,13 @@ macro(add_llvm_library name)
   # list. Without this, linking the unit tests on MinGW fails.
   link_system_libs( ${name} )
 
-  install(TARGETS ${name}
-    LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
-    ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX})
+  if( EXCLUDE_FROM_ALL )
+    set_target_properties( ${name} PROPERTIES EXCLUDE_FROM_ALL ON)
+  else()
+    install(TARGETS ${name}
+      LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
+      ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX})
+  endif()
   # The LLVM Target library shall be built before its sublibraries
   # (asmprinter, etc) because those may use tablegenned files which
   # generation is triggered by the main LLVM target library. Necessary
@@ -57,9 +61,13 @@ ${name} ignored.")
         LINK_FLAGS "-Wl,-flat_namespace -Wl,-undefined -Wl,suppress")
     endif()
 
-    install(TARGETS ${name}
-      LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
-      ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX})
+    if( EXCLUDE_FROM_ALL )
+      set_target_properties( ${name} PROPERTIES EXCLUDE_FROM_ALL ON)
+    else()
+      install(TARGETS ${name}
+	LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
+	ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX})
+    endif()
   endif()
 
   set_target_properties(${name} PROPERTIES FOLDER "Loadable modules")
diff --git a/cmake/modules/CMakeLists.txt b/cmake/modules/CMakeLists.txt
index 036ee05..257deb6 100644
--- a/cmake/modules/CMakeLists.txt
+++ b/cmake/modules/CMakeLists.txt
@@ -1,14 +1,21 @@
 set(llvm_cmake_builddir "${LLVM_BINARY_DIR}/share/llvm/cmake")
+set(LLVM_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX})
 
 get_property(llvm_libs GLOBAL PROPERTY LLVM_LIBS)
 
 configure_file(
-  LLVM.cmake
-  ${llvm_cmake_builddir}/LLVM.cmake
+  LLVMConfig.cmake.in
+  ${llvm_cmake_builddir}/LLVMConfig.cmake
+  @ONLY)
+
+configure_file(
+  LLVMConfigVersion.cmake.in
+  ${llvm_cmake_builddir}/LLVMConfigVersion.cmake
   @ONLY)
 
 install(FILES
-  ${llvm_cmake_builddir}/LLVM.cmake
+  ${llvm_cmake_builddir}/LLVMConfig.cmake
+  ${llvm_cmake_builddir}/LLVMConfigVersion.cmake
   LLVM-Config.cmake
   LLVMLibDeps.cmake
   DESTINATION share/llvm/cmake)
@@ -17,16 +24,11 @@ install(DIRECTORY .
   DESTINATION share/llvm/cmake
   FILES_MATCHING PATTERN *.cmake
   PATTERN .svn EXCLUDE
-  PATTERN LLVM.cmake EXCLUDE
+  PATTERN LLVMConfig.cmake EXCLUDE
+  PATTERN LLVMConfigVersion.cmake EXCLUDE
   PATTERN LLVM-Config.cmake EXCLUDE
   PATTERN LLVMLibDeps.cmake EXCLUDE
   PATTERN FindBison.cmake EXCLUDE
   PATTERN GetTargetTriple.cmake EXCLUDE
   PATTERN VersionFromVCS.cmake EXCLUDE
   PATTERN CheckAtomic.cmake EXCLUDE)
-
-install(FILES
-  ${llvm_cmake_builddir}/LLVM.cmake
-  LLVM-Config.cmake
-  LLVMLibDeps.cmake
-  DESTINATION share/llvm/cmake)
diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake
index 2c216f3..e725684 100644
--- a/cmake/modules/HandleLLVMOptions.cmake
+++ b/cmake/modules/HandleLLVMOptions.cmake
@@ -1,5 +1,11 @@
 include(AddLLVMDefinitions)
 
+if( CMAKE_COMPILER_IS_GNUCXX )
+  set(LLVM_COMPILER_IS_GCC_COMPATIBLE ON)
+elseif( "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" )
+  set(LLVM_COMPILER_IS_GCC_COMPATIBLE ON)
+endif()
+
 # Run-time build mode; It is used for unittests.
 if(MSVC_IDE)
   # Expect "$(Configuration)", "$(OutDir)", etc.
@@ -30,8 +36,13 @@ if( LLVM_ENABLE_ASSERTIONS )
   # explicitly undefine it:
   if( uppercase_CMAKE_BUILD_TYPE STREQUAL "RELEASE" )
     add_definitions( -UNDEBUG )
+    set(LLVM_BUILD_MODE "Release")
+  else()
+    set(LLVM_BUILD_MODE "Debug")
   endif()
+  set(LLVM_BUILD_MODE "${LLVM_BUILD_MODE}+Asserts")
 else()
+  set(LLVM_BUILD_MODE "Release")
   if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "RELEASE" )
     if( NOT MSVC_IDE AND NOT XCODE )
       add_definitions( -DNDEBUG )
@@ -153,7 +164,7 @@ if( MSVC )
     -wd4715 # Suppress ''function' : not all control paths return a value'
     -wd4800 # Suppress ''type' : forcing value to bool 'true' or 'false' (performance warning)'
     -wd4065 # Suppress 'switch statement contains 'default' but no 'case' labels'
-
+    -wd4181 # Suppress 'qualifier applied to reference type; ignored'
     -w14062 # Promote "enumerator in switch of enum is not handled" to level 1 warning.
     )
 
@@ -167,7 +178,7 @@ if( MSVC )
   if (LLVM_ENABLE_WERROR)
     add_llvm_definitions( /WX )
   endif (LLVM_ENABLE_WERROR)
-elseif( CMAKE_COMPILER_IS_GNUCXX )
+elseif( LLVM_COMPILER_IS_GCC_COMPATIBLE )
   if (LLVM_ENABLE_WARNINGS)
     add_llvm_definitions( -Wall -W -Wno-unused-parameter -Wwrite-strings )
     if (LLVM_ENABLE_PEDANTIC)
diff --git a/cmake/modules/LLVM-Config.cmake b/cmake/modules/LLVM-Config.cmake
index bd6a7a2..a6286fe 100755
--- a/cmake/modules/LLVM-Config.cmake
+++ b/cmake/modules/LLVM-Config.cmake
@@ -135,7 +135,7 @@ function(explicit_map_components_to_libraries out_libs)
       string(TOUPPER "${c}" capitalized)
       list(FIND capitalized_libs LLVM${capitalized} lib_idx)
       if( lib_idx LESS 0 )
-	# The component is unkown. Maybe is an ommitted target?
+	# The component is unknown. Maybe is an omitted target?
 	is_llvm_target_library(${c} iltl_result)
 	if( NOT iltl_result )
 	  message(FATAL_ERROR "Library `${c}' not found in list of llvm libraries.")
diff --git a/cmake/modules/LLVM.cmake b/cmake/modules/LLVMConfig.cmake.in
index 04abb9d..5a048b7 100644
--- a/cmake/modules/LLVM.cmake
+++ b/cmake/modules/LLVMConfig.cmake.in
@@ -10,6 +10,8 @@ set(LLVM_ALL_TARGETS @LLVM_ALL_TARGETS@)
 
 set(LLVM_TARGETS_TO_BUILD @LLVM_TARGETS_TO_BUILD@)
 
+set(LLVM_TARGETS_WITH_JIT @LLVM_TARGETS_WITH_JIT@)
+
 set(TARGET_TRIPLE "@TARGET_TRIPLE@")
 
 set(LLVM_TOOLS_BINARY_DIR @LLVM_TOOLS_BINARY_DIR@)
@@ -25,6 +27,11 @@ set(HAVE_LIBPTHREAD @HAVE_LIBPTHREAD@)
 set(LLVM_ON_UNIX @LLVM_ON_UNIX@)
 set(LLVM_ON_WIN32 @LLVM_ON_WIN32@)
 
+set(LLVM_INSTALL_PREFIX @LLVM_INSTALL_PREFIX@)
+set(LLVM_INCLUDE_DIRS ${LLVM_INSTALL_PREFIX}/include)
+set(LLVM_LIBRARY_DIRS ${LLVM_INSTALL_PREFIX}/lib)
+set(LLVM_DEFINITIONS "-D__STDC_LIMIT_MACROS" "-D__STDC_CONSTANT_MACROS")
+
 # We try to include using the current setting of CMAKE_MODULE_PATH,
 # which suppossedly was filled by the user with the directory where
 # this file was installed:
diff --git a/cmake/modules/LLVMConfigVersion.cmake.in b/cmake/modules/LLVMConfigVersion.cmake.in
new file mode 100644
index 0000000..add5aa9
--- /dev/null
+++ b/cmake/modules/LLVMConfigVersion.cmake.in
@@ -0,0 +1 @@
+set(PACKAGE_VERSION "@PACKAGE_VERSION@")
+\ No newline at end of file
diff --git a/cmake/modules/LLVMLibDeps.cmake b/cmake/modules/LLVMLibDeps.cmake
index 1996cec..509ac52 100644
--- a/cmake/modules/LLVMLibDeps.cmake
+++ b/cmake/modules/LLVMLibDeps.cmake
@@ -30,10 +30,10 @@ set(MSVC_LIB_DEPS_LLVMLinker LLVMArchive LLVMBitReader LLVMCore LLVMSupport LLVM
 set(MSVC_LIB_DEPS_LLVMMBlazeAsmParser LLVMMBlazeCodeGen LLVMMBlazeInfo LLVMMC LLVMMCParser LLVMSupport LLVMTarget)
 set(MSVC_LIB_DEPS_LLVMMBlazeAsmPrinter LLVMMC LLVMSupport)
 set(MSVC_LIB_DEPS_LLVMMBlazeCodeGen LLVMAsmPrinter LLVMCodeGen LLVMCore LLVMMBlazeAsmPrinter LLVMMBlazeInfo LLVMMC LLVMSelectionDAG LLVMSupport LLVMTarget)
-set(MSVC_LIB_DEPS_LLVMMBlazeDisassembler LLVMMBlazeCodeGen LLVMMBlazeInfo LLVMMC LLVMSupport)
+set(MSVC_LIB_DEPS_LLVMMBlazeDisassembler LLVMMBlazeCodeGen LLVMMBlazeInfo LLVMMC)
 set(MSVC_LIB_DEPS_LLVMMBlazeInfo LLVMMC LLVMSupport)
 set(MSVC_LIB_DEPS_LLVMMC LLVMSupport)
-set(MSVC_LIB_DEPS_LLVMMCDisassembler LLVMARMAsmParser LLVMARMCodeGen LLVMARMDisassembler LLVMARMInfo LLVMAlphaCodeGen LLVMAlphaInfo LLVMBlackfinCodeGen LLVMBlackfinInfo LLVMCBackend LLVMCBackendInfo LLVMCellSPUCodeGen LLVMCellSPUInfo LLVMCppBackend LLVMCppBackendInfo LLVMMBlazeAsmParser LLVMMBlazeCodeGen LLVMMBlazeDisassembler LLVMMBlazeInfo LLVMMC LLVMMCParser LLVMMSP430CodeGen LLVMMSP430Info LLVMMipsCodeGen LLVMMipsInfo LLVMPTXCodeGen LLVMPTXInfo LLVMPowerPCCodeGen LLVMPowerPCInfo LLVMSparcCodeGen LLVMSparcInfo LLVMSupport LLVMSystemZCodeGen LLVMSystemZInfo LLVMX86AsmParser LLVMX86CodeGen LLVMX86Disassembler LLVMX86Info LLVMXCoreCodeGen LLVMXCoreInfo)
+set(MSVC_LIB_DEPS_LLVMMCDisassembler LLVMARMAsmParser LLVMARMCodeGen LLVMARMDisassembler LLVMARMInfo LLVMAlphaCodeGen LLVMAlphaInfo LLVMBlackfinCodeGen LLVMBlackfinInfo LLVMCBackend LLVMCBackendInfo LLVMCellSPUCodeGen LLVMCellSPUInfo LLVMCppBackend LLVMCppBackendInfo LLVMMBlazeAsmParser LLVMMBlazeCodeGen LLVMMBlazeDisassembler LLVMMBlazeInfo LLVMMC LLVMMCParser LLVMMSP430CodeGen LLVMMSP430Info LLVMMipsCodeGen LLVMMipsInfo LLVMPTXCodeGen LLVMPTXInfo LLVMPowerPCCodeGen LLVMPowerPCInfo LLVMSparcCodeGen LLVMSparcInfo LLVMSupport LLVMSystemZCodeGen LLVMSystemZInfo LLVMTarget LLVMX86AsmParser LLVMX86CodeGen LLVMX86Disassembler LLVMX86Info LLVMXCoreCodeGen LLVMXCoreInfo)
 set(MSVC_LIB_DEPS_LLVMMCJIT LLVMCore LLVMExecutionEngine LLVMRuntimeDyld LLVMSupport LLVMTarget)
 set(MSVC_LIB_DEPS_LLVMMCParser LLVMMC LLVMSupport)
 set(MSVC_LIB_DEPS_LLVMMSP430AsmPrinter LLVMMC LLVMSupport)
diff --git a/cmake/modules/LLVMProcessSources.cmake b/cmake/modules/LLVMProcessSources.cmake
index 270292a..641f1b3 100644
--- a/cmake/modules/LLVMProcessSources.cmake
+++ b/cmake/modules/LLVMProcessSources.cmake
@@ -56,7 +56,7 @@ function(llvm_process_sources OUT_VAR)
 
   # Set common compiler options:
   if( NOT LLVM_REQUIRES_EH )
-    if( CMAKE_COMPILER_IS_GNUCXX )
+    if( LLVM_COMPILER_IS_GCC_COMPATIBLE )
       add_definitions( -fno-exceptions )
     elseif( MSVC )
       llvm_replace_compiler_option(CMAKE_CXX_FLAGS "/EHsc" "/EHs-c-")
@@ -64,7 +64,7 @@ function(llvm_process_sources OUT_VAR)
     endif()
   endif()
   if( NOT LLVM_REQUIRES_RTTI )
-    if( CMAKE_COMPILER_IS_GNUCXX )
+    if( LLVM_COMPILER_IS_GCC_COMPATIBLE )
       llvm_replace_compiler_option(CMAKE_CXX_FLAGS "-frtti" "-fno-rtti")
     elseif( MSVC )
       llvm_replace_compiler_option(CMAKE_CXX_FLAGS "/GR" "/GR-")
diff --git a/configure b/configure
index 5277423..496854d 100755
--- a/configure
+++ b/configure
@@ -9,7 +9,7 @@
 # This configure script is free software; the Free Software Foundation
 # gives unlimited permission to copy, distribute and modify it.
 #
-# Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign.
+# Copyright (c) 2003-2011 University of Illinois at Urbana-Champaign.
 ## --------------------- ##
 ## M4sh Initialization.  ##
 ## --------------------- ##
@@ -707,6 +707,7 @@ CLANGXXPATH
 ENABLE_BUILT_CLANG
 OPTIMIZE_OPTION
 EXTRA_OPTIONS
+EXTRA_LD_OPTIONS
 BINUTILS_INCDIR
 CXX
 CXXFLAGS
@@ -1454,6 +1455,7 @@ Optional Packages:
   --with-optimize-option  Select the compiler options to use for optimized
                           builds
   --with-extra-options    Specify additional options to compile LLVM with
+  --with-extra-ld-options Specify additional options to link LLVM with
   --with-ocaml-libdir     Specify install location for ocaml bindings (default
                           is stdlib)
   --with-clang-resource-dir
@@ -1559,7 +1561,7 @@ Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
 This configure script is free software; the Free Software Foundation
 gives unlimited permission to copy, distribute and modify it.
 
-Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign.
+Copyright (c) 2003-2011 University of Illinois at Urbana-Champaign.
 _ACEOF
   exit
 fi
@@ -1921,7 +1923,7 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
 
 
-LLVM_COPYRIGHT="Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign."
+LLVM_COPYRIGHT="Copyright (c) 2003-2011 University of Illinois at Urbana-Champaign."
 
 
 
@@ -5346,6 +5348,21 @@ esac
 EXTRA_OPTIONS=$EXTRA_OPTIONS
 
 
+
+# Check whether --with-extra-ld-options was given.
+if test "${with_extra_ld_options+set}" = set; then
+  withval=$with_extra_ld_options;
+else
+  withval=default
+fi
+
+case "$withval" in
+  default) EXTRA_LD_OPTIONS= ;;
+  *) EXTRA_LD_OPTIONS=$withval ;;
+esac
+EXTRA_LD_OPTIONS=$EXTRA_LD_OPTIONS
+
+
 # Check whether --enable-bindings was given.
 if test "${enable_bindings+set}" = set; then
   enableval=$enable_bindings;
@@ -11571,7 +11588,7 @@ else
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<EOF
-#line 11574 "configure"
+#line 11591 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -11723,7 +11740,7 @@ else
   osf[1234]*)
     # dlopen did load deplibs (at least at 4.x), but until the 5.x series,
     # it did *not* use an RPATH in a shared library to find objects the
-    # library depends on, so we explictly say `no'.
+    # library depends on, so we explicitly say `no'.
     libltdl_cv_sys_dlopen_deplibs=no
     ;;
   osf5.0|osf5.0a|osf5.1)
@@ -20660,6 +20677,73 @@ fi
 
 fi
 
+if test "$llvm_cv_os_type" = "MingW" ; then
+  { echo "$as_me:$LINENO: checking whether EnumerateLoadedModules() accepts new decl" >&5
+echo $ECHO_N "checking whether EnumerateLoadedModules() accepts new decl... $ECHO_C" >&6; }
+  cat >conftest.$ac_ext <<_ACEOF
+#include <windows.h>
+#include <imagehlp.h>
+extern void foo(PENUMLOADED_MODULES_CALLBACK);
+extern void foo(BOOL(CALLBACK*)(PCSTR,ULONG_PTR,ULONG,PVOID));
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } &&
+	 { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; } &&
+	 { ac_try='test -s conftest.$ac_objext'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+
+  { echo "$as_me:$LINENO: result: yes" >&5
+echo "${ECHO_T}yes" >&6; }
+  llvm_cv_win32_elmcb_pcstr="PCSTR"
+
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+  llvm_cv_win32_elmcb_pcstr="PSTR"
+
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
+cat >>confdefs.h <<_ACEOF
+#define WIN32_ELMCB_PCSTR $llvm_cv_win32_elmcb_pcstr
+_ACEOF
+
+fi
+
 
 { echo "$as_me:$LINENO: checking for isnan in <math.h>" >&5
 echo $ECHO_N "checking for isnan in <math.h>... $ECHO_C" >&6; }
@@ -22178,7 +22262,7 @@ echo "${ECHO_T}$llvm_cv_llvmgcc_dragonegg" >&6; }
 
 if test "$llvm_cv_llvmgcc_dragonegg" = "yes" ; then
   LLVMCC_EMITIR_FLAG="-fplugin-arg-dragonegg-emit-ir"
-  LLVMCC_DISABLEOPT_FLAGS="-fplugin-arg-dragonegg-disable-llvm-optzns"
+  LLVMCC_DISABLEOPT_FLAGS="-fplugin-arg-dragonegg-llvm-ir-optimize=0"
 else
   LLVMCC_EMITIR_FLAG="-emit-llvm"
   LLVMCC_DISABLEOPT_FLAGS="-mllvm -disable-llvm-optzns"
@@ -22507,6 +22591,13 @@ ac_config_files="$ac_config_files Makefile.config"
 ac_config_files="$ac_config_files llvm.spec"
 
 
+ac_config_files="$ac_config_files docs/doxygen.cfg"
+
+if test -f ${srcdir}/tools/clang/README.txt; then
+  ac_config_files="$ac_config_files tools/clang/docs/doxygen.cfg"
+
+fi
+
 ac_config_files="$ac_config_files tools/llvmc/src/Base.td"
 
 
@@ -23127,6 +23218,8 @@ do
     "include/llvm/Support/DataTypes.h") CONFIG_HEADERS="$CONFIG_HEADERS include/llvm/Support/DataTypes.h" ;;
     "Makefile.config") CONFIG_FILES="$CONFIG_FILES Makefile.config" ;;
     "llvm.spec") CONFIG_FILES="$CONFIG_FILES llvm.spec" ;;
+    "docs/doxygen.cfg") CONFIG_FILES="$CONFIG_FILES docs/doxygen.cfg" ;;
+    "tools/clang/docs/doxygen.cfg") CONFIG_FILES="$CONFIG_FILES tools/clang/docs/doxygen.cfg" ;;
     "tools/llvmc/src/Base.td") CONFIG_FILES="$CONFIG_FILES tools/llvmc/src/Base.td" ;;
     "tools/llvm-config/llvm-config.in") CONFIG_FILES="$CONFIG_FILES tools/llvm-config/llvm-config.in" ;;
     "setup") CONFIG_COMMANDS="$CONFIG_COMMANDS setup" ;;
@@ -23351,6 +23444,7 @@ CLANGXXPATH!$CLANGXXPATH$ac_delim
 ENABLE_BUILT_CLANG!$ENABLE_BUILT_CLANG$ac_delim
 OPTIMIZE_OPTION!$OPTIMIZE_OPTION$ac_delim
 EXTRA_OPTIONS!$EXTRA_OPTIONS$ac_delim
+EXTRA_LD_OPTIONS!$EXTRA_LD_OPTIONS$ac_delim
 BINUTILS_INCDIR!$BINUTILS_INCDIR$ac_delim
 CXX!$CXX$ac_delim
 CXXFLAGS!$CXXFLAGS$ac_delim
@@ -23439,7 +23533,6 @@ LLVM_MANDIR!$LLVM_MANDIR$ac_delim
 LLVM_CONFIGTIME!$LLVM_CONFIGTIME$ac_delim
 BINDINGS_TO_BUILD!$BINDINGS_TO_BUILD$ac_delim
 ALL_BINDINGS!$ALL_BINDINGS$ac_delim
-OCAML_LIBDIR!$OCAML_LIBDIR$ac_delim
 _ACEOF
 
   if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 97; then
@@ -23481,6 +23574,7 @@ _ACEOF
 ac_delim='%!_!# '
 for ac_last_try in false false false false false :; do
   cat >conf$$subs.sed <<_ACEOF
+OCAML_LIBDIR!$OCAML_LIBDIR$ac_delim
 ENABLE_VISIBILITY_INLINES_HIDDEN!$ENABLE_VISIBILITY_INLINES_HIDDEN$ac_delim
 RPATH!$RPATH$ac_delim
 RDYNAMIC!$RDYNAMIC$ac_delim
@@ -23488,7 +23582,7 @@ LIBOBJS!$LIBOBJS$ac_delim
 LTLIBOBJS!$LTLIBOBJS$ac_delim
 _ACEOF
 
-  if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 5; then
+  if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 6; then
     break
   elif $ac_last_try; then
     { { echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
diff --git a/docs/AliasAnalysis.html b/docs/AliasAnalysis.html
index 7baa946..5e36ae1 100644
--- a/docs/AliasAnalysis.html
+++ b/docs/AliasAnalysis.html
@@ -7,9 +7,9 @@
 </head>
 <body>
 
-<div class="doc_title">
+<h1>
   LLVM Alias Analysis Infrastructure
-</div>
+</h1>
 
 <ol>
   <li><a href="#introduction">Introduction</a></li>
@@ -59,12 +59,12 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="introduction">Introduction</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Alias Analysis (aka Pointer Analysis) is a class of techniques which attempt
 to determine whether or not two pointers ever can point to the same object in
@@ -96,12 +96,12 @@ know</a>.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="overview"><tt>AliasAnalysis</tt> Class Overview</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>The <a
 href="http://llvm.org/doxygen/classllvm_1_1AliasAnalysis.html"><tt>AliasAnalysis</tt></a>
@@ -122,14 +122,12 @@ multiple values, values which are not
 <a href="LangRef.html#constants">constants</a> are all defined within the
 same function.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="pointers">Representation of Pointers</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Most importantly, the <tt>AliasAnalysis</tt> class provides several methods
 which are used to query whether or not two memory objects alias, whether
@@ -181,11 +179,11 @@ that the accesses alias.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="alias">The <tt>alias</tt> method</a>
-</div>
+</h3>
   
-<div class="doc_text">
+<div>
 <p>The <tt>alias</tt> method is the primary interface used to determine whether
 or not two memory objects alias each other.  It takes two memory objects as
 input and returns MustAlias, PartialAlias, MayAlias, or NoAlias as
@@ -194,14 +192,13 @@ appropriate.</p>
 <p>Like all <tt>AliasAnalysis</tt> interfaces, the <tt>alias</tt> method requires
 that either the two pointer values be defined within the same function, or at
 least one of the values is a <a href="LangRef.html#constants">constant</a>.</p>
-</div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="MustMayNo">Must, May, and No Alias Responses</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p>The NoAlias response may be used when there is never an immediate dependence
 between any memory reference <i>based</i> on one pointer and any memory
 reference <i>based</i> the other. The most obvious example is when the two
@@ -227,12 +224,14 @@ implies that the pointers compare equal.</p>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ModRefInfo">The <tt>getModRefInfo</tt> methods</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>getModRefInfo</tt> methods return information about whether the
 execution of an instruction can read or modify a memory location.  Mod/Ref
@@ -250,25 +249,23 @@ memory written to by CS2.  Note that this relation is not commutative.</p>
 
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="OtherItfs">Other useful <tt>AliasAnalysis</tt> methods</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 Several other tidbits of information are often collected by various alias
 analysis implementations and can be put to good use by various clients.
 </p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   The <tt>pointsToConstantMemory</tt> method
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>pointsToConstantMemory</tt> method returns true if and only if the
 analysis can prove that the pointer only points to unchanging memory locations
@@ -279,12 +276,12 @@ memory location to be modified.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="simplemodref">The <tt>doesNotAccessMemory</tt> and
   <tt>onlyReadsMemory</tt> methods</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>These methods are used to provide very simple mod/ref information for
 function calls.  The <tt>doesNotAccessMemory</tt> method returns true for a
@@ -307,13 +304,17 @@ functions that satisfy the <tt>doesNotAccessMemory</tt> method also satisfies
 
 </div>
 
+</div>
+
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="writingnew">Writing a new <tt>AliasAnalysis</tt> Implementation</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Writing a new alias analysis implementation for LLVM is quite
 straight-forward.  There are already several implementations that you can use
@@ -321,14 +322,12 @@ for examples, and the following information should help fill in any details.
 For a examples, take a look at the <a href="#impls">various alias analysis
 implementations</a> included with LLVM.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="passsubclasses">Different Pass styles</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The first step to determining what type of <a
 href="WritingAnLLVMPass.html">LLVM pass</a> you need to use for your Alias
@@ -352,11 +351,11 @@ solve:</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="requiredcalls">Required initialization calls</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Your subclass of <tt>AliasAnalysis</tt> is required to invoke two methods on
 the <tt>AliasAnalysis</tt> base class: <tt>getAnalysisUsage</tt> and
@@ -393,11 +392,11 @@ bool run(Module &amp;M) {
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="interfaces">Interfaces which may be specified</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>All of the <a
 href="/doxygen/classllvm_1_1AliasAnalysis.html"><tt>AliasAnalysis</tt></a>
@@ -412,11 +411,11 @@ implementing, you just override the interfaces you can improve.</p>
 
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="chaining"><tt>AliasAnalysis</tt> chaining behavior</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>With only two special exceptions (the <tt><a
 href="#basic-aa">basicaa</a></tt> and <a href="#no-aa"><tt>no-aa</tt></a>
@@ -451,11 +450,11 @@ updated.</p>
 
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="updating">Updating analysis results for transformations</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>
 Alias analysis information is initially computed for a static snapshot of the
 program, but clients will use this information to make transformations to the
@@ -471,12 +470,11 @@ their internal data structures are kept up-to-date as the program changes (for
 example, when an instruction is deleted), and clients of alias analysis must be
 sure to call these interfaces appropriately.
 </p>
-</div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">The <tt>deleteValue</tt> method</div>
+<h4>The <tt>deleteValue</tt> method</h4>
 
-<div class="doc_text">
+<div>
 The <tt>deleteValue</tt> method is called by transformations when they remove an
 instruction or any other value from the program (including values that do not
 use pointers).  Typically alias analyses keep data structures that have entries
@@ -485,9 +483,9 @@ any entries for the specified value, if they exist.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">The <tt>copyValue</tt> method</div>
+<h4>The <tt>copyValue</tt> method</h4>
 
-<div class="doc_text">
+<div>
 The <tt>copyValue</tt> method is used when a new value is introduced into the
 program.  There is no way to introduce a value into the program that did not
 exist before (this doesn't make sense for a safe compiler transformation), so
@@ -496,9 +494,9 @@ new value has exactly the same properties as the value being copied.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">The <tt>replaceWithNewValue</tt> method</div>
+<h4>The <tt>replaceWithNewValue</tt> method</h4>
 
-<div class="doc_text">
+<div>
 This method is a simple helper method that is provided to make clients easier to
 use.  It is implemented by copying the old analysis information to the new
 value, then deleting the old value.  This method cannot be overridden by alias
@@ -506,9 +504,9 @@ analysis implementations.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">The <tt>addEscapingUse</tt> method</div>
+<h4>The <tt>addEscapingUse</tt> method</h4>
 
-<div class="doc_text">
+<div>
 <p>The <tt>addEscapingUse</tt> method is used when the uses of a pointer
 value have changed in ways that may invalidate precomputed analysis information. 
 Implementations may either use this callback to provide conservative responses
@@ -527,12 +525,14 @@ uses below:</p>
 </ul>
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="implefficiency">Efficiency Issues</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>From the LLVM perspective, the only thing you need to do to provide an
 efficient alias analysis is to make sure that alias analysis <b>queries</b> are
@@ -544,11 +544,11 @@ method as possible (within reason).</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="limitations">Limitations</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The AliasAnalysis infrastructure has several limitations which make
 writing a new <tt>AliasAnalysis</tt> implementation difficult.</p>
@@ -567,7 +567,7 @@ which are intended to allow a pass to keep an AliasAnalysis consistent,
 however there's no way for a pass to declare in its
 <tt>getAnalysisUsage</tt> that it does so. Some passes attempt to use
 <tt>AU.addPreserved&lt;AliasAnalysis&gt;</tt>, however this doesn't
-actually have any effect.</tt>
+actually have any effect.</p>
 
 <p><tt>AliasAnalysisCounter</tt> (<tt>-count-aa</tt>) and <tt>AliasDebugger</tt>
 (<tt>-debug-aa</tt>) are implemented as <tt>ModulePass</tt> classes, so if your
@@ -616,25 +616,25 @@ from itself.</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="using">Using alias analysis results</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>There are several different ways to use alias analysis results.  In order of
 preference, these are...</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="memdep">Using the <tt>MemoryDependenceAnalysis</tt> Pass</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>memdep</tt> pass uses alias analysis to provide high-level dependence
 information about memory-using instructions.  This will tell you which store
@@ -645,11 +645,11 @@ efficient, and is used by Dead Store Elimination, GVN, and memcpy optimizations.
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ast">Using the <tt>AliasSetTracker</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Many transformations need information about alias <b>sets</b> that are active
 in some scope, rather than information about pairwise aliasing.  The <tt><a
@@ -678,14 +678,12 @@ sunk to outside of the loop, promoting the memory location to a register for the
 duration of the loop nest.  Both of these transformations only apply if the
 pointer argument is loop-invariant.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   The AliasSetTracker implementation
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The AliasSetTracker class is implemented to be as efficient as possible.  It
 uses the union-find algorithm to efficiently merge AliasSets when a pointer is
@@ -706,12 +704,14 @@ are.</p>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="direct">Using the <tt>AliasAnalysis</tt> interface directly</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>If neither of these utility class are what your pass needs, you should use
 the interfaces exposed by the <tt>AliasAnalysis</tt> class directly.  Try to use
@@ -721,13 +721,15 @@ best precision and efficiency.</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="exist">Existing alias analysis implementations and clients</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>If you're going to be working with the LLVM alias analysis infrastructure,
 you should know what clients and implementations of alias analysis are
@@ -735,28 +737,24 @@ available.  In particular, if you are implementing an alias analysis, you should
 be aware of the <a href="#aliasanalysis-debug">the clients</a> that are useful
 for monitoring and evaluating different implementations.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="impls">Available <tt>AliasAnalysis</tt> implementations</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>This section lists the various implementations of the <tt>AliasAnalysis</tt>
-interface.  With the exception of the <a href="#no-aa"><tt>-no-aa</tt></a> and
-<a href="#basic-aa"><tt>-basicaa</tt></a> implementations, all of these <a
-href="#chaining">chain</a> to other alias analysis implementations.</p>
-
-</div>
+interface.  With the exception of the <a href="#no-aa"><tt>-no-aa</tt></a>
+implementation, all of these <a href="#chaining">chain</a> to other alias
+analysis implementations.</p>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="no-aa">The <tt>-no-aa</tt> pass</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>-no-aa</tt> pass is just like what it sounds: an alias analysis that
 never returns any useful information.  This pass can be useful if you think that
@@ -766,11 +764,11 @@ problem.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="basic-aa">The <tt>-basicaa</tt> pass</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>-basicaa</tt> pass is an aggressive local analysis that "knows"
 many important facts:</p>
@@ -794,11 +792,11 @@ many important facts:</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="globalsmodref">The <tt>-globalsmodref-aa</tt> pass</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>This pass implements a simple context-sensitive mod/ref and alias analysis
 for internal global variables that don't "have their address taken".  If a
@@ -818,11 +816,11 @@ non-address taken globals), but is very quick analysis.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="steens-aa">The <tt>-steens-aa</tt> pass</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>-steens-aa</tt> pass implements a variation on the well-known
 "Steensgaard's algorithm" for interprocedural alias analysis.  Steensgaard's
@@ -841,11 +839,11 @@ module, it is not part of the LLVM core.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ds-aa">The <tt>-ds-aa</tt> pass</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>-ds-aa</tt> pass implements the full Data Structure Analysis
 algorithm.  Data Structure Analysis is a modular unification-based,
@@ -864,11 +862,11 @@ module, it is not part of the LLVM core.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="scev-aa">The <tt>-scev-aa</tt> pass</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>-scev-aa</tt> pass implements AliasAnalysis queries by
 translating them into ScalarEvolution queries. This gives it a
@@ -877,22 +875,23 @@ and loop induction variables than other alias analyses have.</p>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="aliasanalysis-xforms">Alias analysis driven transformations</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 LLVM includes several alias-analysis driven transformations which can be used
 with any of the implementations above.
-</div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="adce">The <tt>-adce</tt> pass</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>-adce</tt> pass, which implements Aggressive Dead Code Elimination
 uses the <tt>AliasAnalysis</tt> interface to delete calls to functions that do
@@ -902,11 +901,11 @@ not have side-effects and are not used.</p>
 
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="licm">The <tt>-licm</tt> pass</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>-licm</tt> pass implements various Loop Invariant Code Motion related
 transformations.  It uses the <tt>AliasAnalysis</tt> interface for several
@@ -927,11 +926,11 @@ no may aliases to the loaded/stored memory location.</li>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="argpromotion">The <tt>-argpromotion</tt> pass</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p>
 The <tt>-argpromotion</tt> pass promotes by-reference arguments to be passed in
 by-value instead.  In particular, if pointer arguments are only loaded from it
@@ -942,38 +941,38 @@ pointer.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="gvn">The <tt>-gvn</tt>, <tt>-memcpyopt</tt>, and <tt>-dse</tt>
      passes</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>These passes use AliasAnalysis information to reason about loads and stores.
 </p>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="aliasanalysis-debug">Clients for debugging and evaluation of
   implementations</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>These passes are useful for evaluating the various alias analysis
 implementations.  You can use them with commands like '<tt>opt -ds-aa
 -aa-eval foo.bc -disable-output -stats</tt>'.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="print-alias-sets">The <tt>-print-alias-sets</tt> pass</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>-print-alias-sets</tt> pass is exposed as part of the
 <tt>opt</tt> tool to print out the Alias Sets formed by the <a
@@ -990,11 +989,11 @@ the <tt>AliasSetTracker</tt> class.  To use it, use something like:</p>
 
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="count-aa">The <tt>-count-aa</tt> pass</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>-count-aa</tt> pass is useful to see how many queries a particular
 pass is making and what responses are returned by the alias analysis.  As an
@@ -1014,11 +1013,11 @@ when debugging a transformation or an alias analysis implementation.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="aa-eval">The <tt>-aa-eval</tt> pass</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>-aa-eval</tt> pass simply iterates through all pairs of pointers in a
 function and asks an alias analysis whether or not the pointers alias.  This
@@ -1028,13 +1027,17 @@ algorithm will have a lower number of may aliases).</p>
 
 </div>
 
+</div>
+
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="memdep">Memory Dependence Analysis</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>If you're just looking to be a client of alias analysis information, consider
 using the Memory Dependence Analysis interface instead.  MemDep is a lazy, 
@@ -1056,7 +1059,7 @@ analysis directly.</p>
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 
diff --git a/docs/BitCodeFormat.html b/docs/BitCodeFormat.html
index 0b8747c..d208848 100644
--- a/docs/BitCodeFormat.html
+++ b/docs/BitCodeFormat.html
@@ -7,7 +7,7 @@
   <link rel="stylesheet" href="llvm.css" type="text/css">
 </head>
 <body>
-<div class="doc_title"> LLVM Bitcode File Format </div>
+<h1> LLVM Bitcode File Format</h1>
 <ol>
   <li><a href="#abstract">Abstract</a></li>
   <li><a href="#overview">Overview</a></li>
@@ -47,10 +47,10 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"> <a name="abstract">Abstract</a></div>
+<h2><a name="abstract">Abstract</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This document describes the LLVM bitstream file format and the encoding of
 the LLVM IR into it.</p>
@@ -58,10 +58,10 @@ the LLVM IR into it.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"> <a name="overview">Overview</a></div>
+<h2><a name="overview">Overview</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 What is commonly known as the LLVM bitcode file format (also, sometimes
@@ -88,10 +88,10 @@ wrapper format, then describes the record structure used by LLVM IR files.
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"> <a name="bitstream">Bitstream Format</a></div>
+<h2><a name="bitstream">Bitstream Format</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 The bitstream format is literally a stream of bits, with a very simple
@@ -114,13 +114,12 @@ href="CommandGuide/html/llvm-bcanalyzer.html">llvm-bcanalyzer</a> tool can be
 used to dump and inspect arbitrary bitstreams, which is very useful for
 understanding the encoding.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="magic">Magic Numbers</a>
-</div>
+<h3>
+  <a name="magic">Magic Numbers</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The first two bytes of a bitcode file are 'BC' (0x42, 0x43).
 The second two bytes are an application-specific magic number.  Generic
@@ -130,10 +129,11 @@ bitcode, while application-specific programs will want to look at all four.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="primitives">Primitives</a>
-</div>
+<h3>
+  <a name="primitives">Primitives</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 A bitstream literally consists of a stream of bits, which are read in order
@@ -144,13 +144,12 @@ Width Integers</a> or as <a href="#variablewidth">Variable Width
 Integers</a>.
 </p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="fixedwidth">Fixed Width Integers</a>
-</div>
+<h4>
+  <a name="fixedwidth">Fixed Width Integers</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Fixed-width integer values have their low bits emitted directly to the file.
    For example, a 3-bit integer value encodes 1 as 001.  Fixed width integers
@@ -161,10 +160,11 @@ Integers</a>.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="variablewidth">Variable Width
-Integers</a></div>
+<h4>
+  <a name="variablewidth">Variable Width Integers</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Variable-width integer (VBR) values encode values of arbitrary size,
 optimizing for the case where the values are small.  Given a 4-bit VBR field,
@@ -182,9 +182,9 @@ value of 24 (011 << 3) with no continuation.  The sum (3+24) yields the value
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="char6">6-bit characters</a></div>
+<h4><a name="char6">6-bit characters</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p>6-bit characters encode common characters into a fixed 6-bit field.  They
 represent the following characters with the following 6-bit values:</p>
@@ -206,9 +206,9 @@ characters not in the set.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="wordalign">Word Alignment</a></div>
+<h4><a name="wordalign">Word Alignment</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p>Occasionally, it is useful to emit zero bits until the bitstream is a
 multiple of 32 bits.  This ensures that the bit position in the stream can be
@@ -216,12 +216,14 @@ represented as a multiple of 32-bit words.</p>
 
 </div>
 
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="abbrevid">Abbreviation IDs</a>
-</div>
+<h3>
+  <a name="abbrevid">Abbreviation IDs</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 A bitstream is a sequential series of <a href="#blocks">Blocks</a> and
@@ -253,10 +255,11 @@ an <a href="#abbrev_records">abbreviated record encoding</a>.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="blocks">Blocks</a>
-</div>
+<h3>
+  <a name="blocks">Blocks</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 Blocks in a bitstream denote nested regions of the stream, and are identified by
@@ -297,13 +300,10 @@ its own set of abbreviations, and its own abbrev id width.  When a sub-block is
 popped, the saved values are restored.
 </p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="ENTER_SUBBLOCK">ENTER_SUBBLOCK
-Encoding</a></div>
+<h4><a name="ENTER_SUBBLOCK">ENTER_SUBBLOCK Encoding</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[ENTER_SUBBLOCK, blockid<sub>vbr8</sub>, newabbrevlen<sub>vbr4</sub>,
      &lt;align32bits&gt;, blocklen<sub>32</sub>]</tt></p>
@@ -322,10 +322,9 @@ reader to skip over the entire block in one jump.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="END_BLOCK">END_BLOCK
-Encoding</a></div>
+<h4><a name="END_BLOCK">END_BLOCK Encoding</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[END_BLOCK, &lt;align32bits&gt;]</tt></p>
 
@@ -337,13 +336,14 @@ an even multiple of 32-bits.
 
 </div>
 
-
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="datarecord">Data Records</a>
-</div>
+<h3>
+  <a name="datarecord">Data Records</a>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>
 Data records consist of a record code and a number of (up to) 64-bit
 integer values.  The interpretation of the code and values is
@@ -355,13 +355,10 @@ which encodes the target triple of a module.  The code is
 ASCII codes for the characters in the string.
 </p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="UNABBREV_RECORD">UNABBREV_RECORD
-Encoding</a></div>
+<h4><a name="UNABBREV_RECORD">UNABBREV_RECORD Encoding</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[UNABBREV_RECORD, code<sub>vbr6</sub>, numops<sub>vbr6</sub>,
        op0<sub>vbr6</sub>, op1<sub>vbr6</sub>, ...]</tt></p>
@@ -385,10 +382,9 @@ bits.  This is not an efficient encoding, but it is fully general.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="abbrev_records">Abbreviated Record
-Encoding</a></div>
+<h4><a name="abbrev_records">Abbreviated Record Encoding</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[&lt;abbrevid&gt;, fields...]</tt></p>
 
@@ -409,11 +405,14 @@ operand value).</p>
 
 </div>
 
-<!-- ======================================================================= -->
-<div class="doc_subsection"><a name="abbreviations">Abbreviations</a>
 </div>
 
-<div class="doc_text">
+<!-- ======================================================================= -->
+<h3>
+  <a name="abbreviations">Abbreviations</a>
+</h3>
+
+<div>
 <p>
 Abbreviations are an important form of compression for bitstreams.  The idea is
 to specify a dense encoding for a class of records once, then use that encoding
@@ -431,13 +430,11 @@ As a concrete example, LLVM IR files usually emit an abbreviation
 for binary operators.  If a specific LLVM module contained no or few binary
 operators, the abbreviation does not need to be emitted.
 </p>
-</div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="DEFINE_ABBREV">DEFINE_ABBREV
- Encoding</a></div>
+<h4><a name="DEFINE_ABBREV">DEFINE_ABBREV  Encoding</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[DEFINE_ABBREV, numabbrevops<sub>vbr5</sub>, abbrevop0, abbrevop1,
  ...]</tt></p>
@@ -552,11 +549,14 @@ used for any other string value.
 
 </div>
 
-<!-- ======================================================================= -->
-<div class="doc_subsection"><a name="stdblocks">Standard Blocks</a>
 </div>
 
-<div class="doc_text">
+<!-- ======================================================================= -->
+<h3>
+  <a name="stdblocks">Standard Blocks</a>
+</h3>
+
+<div>
 
 <p>
 In addition to the basic block structure and record encodings, the bitstream
@@ -565,13 +565,10 @@ stream is to be decoded or other metadata.  In the future, new standard blocks
 may be added.  Block IDs 0-7 are reserved for standard blocks.
 </p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="BLOCKINFO">#0 - BLOCKINFO
-Block</a></div>
+<h4><a name="BLOCKINFO">#0 - BLOCKINFO Block</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 The <tt>BLOCKINFO</tt> block allows the description of metadata for other
@@ -620,11 +617,15 @@ from the corresponding blocks.  It is not safe to skip them.
 
 </div>
 
+</div>
+
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section"> <a name="wrapper">Bitcode Wrapper Format</a></div>
+<h2><a name="wrapper">Bitcode Wrapper Format</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 Bitcode files for LLVM IR may optionally be wrapped in a simple wrapper
@@ -652,10 +653,10 @@ value that can be used to encode the CPU of the target.
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"> <a name="llvmir">LLVM IR Encoding</a></div>
+<h2><a name="llvmir">LLVM IR Encoding</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 LLVM IR is encoded into a bitstream by defining blocks and records.  It uses
@@ -666,16 +667,17 @@ that the writer uses, as these are fully self-described in the file, and the
 reader is not allowed to build in any knowledge of this.
 </p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="basics">Basics</a>
-</div>
+<h3>
+  <a name="basics">Basics</a>
+</h3>
+
+<div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="ir_magic">LLVM IR Magic Number</a></div>
+<h4><a name="ir_magic">LLVM IR Magic Number</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 The magic number for LLVM IR files is:
@@ -695,9 +697,9 @@ When combined with the bitcode magic number and viewed as bytes, this is
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="ir_signed_vbr">Signed VBRs</a></div>
+<h4><a name="ir_signed_vbr">Signed VBRs</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 <a href="#variablewidth">Variable Width Integer</a> encoding is an efficient way to
@@ -728,9 +730,9 @@ within <tt>CONSTANTS_BLOCK</tt> blocks.
 
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="ir_blocks">LLVM IR Blocks</a></div>
+<h4><a name="ir_blocks">LLVM IR Blocks</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 LLVM IR is defined with the following blocks:
@@ -758,11 +760,14 @@ LLVM IR is defined with the following blocks:
 
 </div>
 
-<!-- ======================================================================= -->
-<div class="doc_subsection"><a name="MODULE_BLOCK">MODULE_BLOCK Contents</a>
 </div>
 
-<div class="doc_text">
+<!-- ======================================================================= -->
+<h3>
+  <a name="MODULE_BLOCK">MODULE_BLOCK Contents</a>
+</h3>
+
+<div>
 
 <p>The <tt>MODULE_BLOCK</tt> block (id 8) is the top-level block for LLVM
 bitcode files, and each bitcode file must contain exactly one. In
@@ -782,13 +787,10 @@ following sub-blocks:
 <li><a href="#METADATA_BLOCK"><tt>METADATA_BLOCK</tt></a></li>
 </ul>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="MODULE_CODE_VERSION">MODULE_CODE_VERSION Record</a>
-</div>
+<h4><a name="MODULE_CODE_VERSION">MODULE_CODE_VERSION Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[VERSION, version#]</tt></p>
 
@@ -798,10 +800,9 @@ time.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="MODULE_CODE_TRIPLE">MODULE_CODE_TRIPLE Record</a>
-</div>
+<h4><a name="MODULE_CODE_TRIPLE">MODULE_CODE_TRIPLE Record</a></h4>
 
-<div class="doc_text">
+<div>
 <p><tt>[TRIPLE, ...string...]</tt></p>
 
 <p>The <tt>TRIPLE</tt> record (code 2) contains a variable number of
@@ -810,10 +811,9 @@ specification string.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="MODULE_CODE_DATALAYOUT">MODULE_CODE_DATALAYOUT Record</a>
-</div>
+<h4><a name="MODULE_CODE_DATALAYOUT">MODULE_CODE_DATALAYOUT Record</a></h4>
 
-<div class="doc_text">
+<div>
 <p><tt>[DATALAYOUT, ...string...]</tt></p>
 
 <p>The <tt>DATALAYOUT</tt> record (code 3) contains a variable number of
@@ -822,10 +822,9 @@ specification string.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="MODULE_CODE_ASM">MODULE_CODE_ASM Record</a>
-</div>
+<h4><a name="MODULE_CODE_ASM">MODULE_CODE_ASM Record</a></h4>
 
-<div class="doc_text">
+<div>
 <p><tt>[ASM, ...string...]</tt></p>
 
 <p>The <tt>ASM</tt> record (code 4) contains a variable number of
@@ -834,10 +833,9 @@ individual assembly blocks separated by newline (ASCII 10) characters.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="MODULE_CODE_SECTIONNAME">MODULE_CODE_SECTIONNAME Record</a>
-</div>
+<h4><a name="MODULE_CODE_SECTIONNAME">MODULE_CODE_SECTIONNAME Record</a></h4>
 
-<div class="doc_text">
+<div>
 <p><tt>[SECTIONNAME, ...string...]</tt></p>
 
 <p>The <tt>SECTIONNAME</tt> record (code 5) contains a variable number
@@ -850,10 +848,9 @@ referenced by the 1-based index in the <i>section</i> fields of
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="MODULE_CODE_DEPLIB">MODULE_CODE_DEPLIB Record</a>
-</div>
+<h4><a name="MODULE_CODE_DEPLIB">MODULE_CODE_DEPLIB Record</a></h4>
 
-<div class="doc_text">
+<div>
 <p><tt>[DEPLIB, ...string...]</tt></p>
 
 <p>The <tt>DEPLIB</tt> record (code 6) contains a variable number of
@@ -864,10 +861,9 @@ library name referenced.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="MODULE_CODE_GLOBALVAR">MODULE_CODE_GLOBALVAR Record</a>
-</div>
+<h4><a name="MODULE_CODE_GLOBALVAR">MODULE_CODE_GLOBALVAR Record</a></h4>
 
-<div class="doc_text">
+<div>
 <p><tt>[GLOBALVAR, pointer type, isconst, initid, linkage, alignment, section, visibility, threadlocal]</tt></p>
 
 <p>The <tt>GLOBALVAR</tt> record (code 7) marks the declaration or
@@ -923,16 +919,15 @@ encoding of the visibility of this variable:
 is <tt>thread_local</tt></li>
 
 <li><i>unnamed_addr</i>: If present and non-zero, indicates that the variable
-has <tt>unnamed_addr<tt></li>
+has <tt>unnamed_addr</tt></li>
 
 </ul>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="MODULE_CODE_FUNCTION">MODULE_CODE_FUNCTION Record</a>
-</div>
+<h4><a name="MODULE_CODE_FUNCTION">MODULE_CODE_FUNCTION Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[FUNCTION, type, callingconv, isproto, linkage, paramattr, alignment, section, visibility, gc]</tt></p>
 
@@ -980,16 +975,15 @@ index in the table of
 <a href="#MODULE_CODE_GCNAME">MODULE_CODE_GCNAME</a> entries.</li>
 
 <li><i>unnamed_addr</i>: If present and non-zero, indicates that the function
-has <tt>unnamed_addr<tt></li>
+has <tt>unnamed_addr</tt></li>
 
 </ul>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="MODULE_CODE_ALIAS">MODULE_CODE_ALIAS Record</a>
-</div>
+<h4><a name="MODULE_CODE_ALIAS">MODULE_CODE_ALIAS Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[ALIAS, alias type, aliasee val#, linkage, visibility]</tt></p>
 
@@ -1011,10 +1005,9 @@ for this alias</li>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="MODULE_CODE_PURGEVALS">MODULE_CODE_PURGEVALS Record</a>
-</div>
+<h4><a name="MODULE_CODE_PURGEVALS">MODULE_CODE_PURGEVALS Record</a></h4>
 
-<div class="doc_text">
+<div>
 <p><tt>[PURGEVALS, numvals]</tt></p>
 
 <p>The <tt>PURGEVALS</tt> record (code 10) resets the module-level
@@ -1025,10 +1018,9 @@ new value indices will start from the given <i>numvals</i> value.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="MODULE_CODE_GCNAME">MODULE_CODE_GCNAME Record</a>
-</div>
+<h4><a name="MODULE_CODE_GCNAME">MODULE_CODE_GCNAME Record</a></h4>
 
-<div class="doc_text">
+<div>
 <p><tt>[GCNAME, ...string...]</tt></p>
 
 <p>The <tt>GCNAME</tt> record (code 11) contains a variable number of
@@ -1039,11 +1031,14 @@ the module. These records can be referenced by 1-based index in the <i>gc</i>
 fields of <tt>FUNCTION</tt> records.</p>
 </div>
 
-<!-- ======================================================================= -->
-<div class="doc_subsection"><a name="PARAMATTR_BLOCK">PARAMATTR_BLOCK Contents</a>
 </div>
 
-<div class="doc_text">
+<!-- ======================================================================= -->
+<h3>
+  <a name="PARAMATTR_BLOCK">PARAMATTR_BLOCK Contents</a>
+</h3>
+
+<div>
 
 <p>The <tt>PARAMATTR_BLOCK</tt> block (id 9) contains a table of
 entries describing the attributes of function parameters. These
@@ -1057,14 +1052,10 @@ href="#FUNC_CODE_INST_CALL"><tt>INST_CALL</tt></a> records.</p>
 that each is unique (i.e., no two indicies represent equivalent
 attribute lists). </p>
 
-</div>
-
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="PARAMATTR_CODE_ENTRY">PARAMATTR_CODE_ENTRY Record</a>
-</div>
+<h4><a name="PARAMATTR_CODE_ENTRY">PARAMATTR_CODE_ENTRY Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[ENTRY, paramidx0, attr0, paramidx1, attr1...]</tt></p>
 
@@ -1105,11 +1096,14 @@ the logarithm base 2 of the requested alignment, plus 1</li>
 </ul>
 </div>
 
-<!-- ======================================================================= -->
-<div class="doc_subsection"><a name="TYPE_BLOCK">TYPE_BLOCK Contents</a>
 </div>
 
-<div class="doc_text">
+<!-- ======================================================================= -->
+<h3>
+  <a name="TYPE_BLOCK">TYPE_BLOCK Contents</a>
+</h3>
+
+<div>
 
 <p>The <tt>TYPE_BLOCK</tt> block (id 10) contains records which
 constitute a table of type operator entries used to represent types
@@ -1124,13 +1118,10 @@ type operator records.
 each entry is unique (i.e., no two indicies represent structurally
 equivalent types). </p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="TYPE_CODE_NUMENTRY">TYPE_CODE_NUMENTRY Record</a>
-</div>
+<h4><a name="TYPE_CODE_NUMENTRY">TYPE_CODE_NUMENTRY Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[NUMENTRY, numentries]</tt></p>
 
@@ -1142,10 +1133,9 @@ in the block.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="TYPE_CODE_VOID">TYPE_CODE_VOID Record</a>
-</div>
+<h4><a name="TYPE_CODE_VOID">TYPE_CODE_VOID Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[VOID]</tt></p>
 
@@ -1155,10 +1145,9 @@ type table.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="TYPE_CODE_FLOAT">TYPE_CODE_FLOAT Record</a>
-</div>
+<h4><a name="TYPE_CODE_FLOAT">TYPE_CODE_FLOAT Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[FLOAT]</tt></p>
 
@@ -1168,10 +1157,9 @@ floating point) type to the type table.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="TYPE_CODE_DOUBLE">TYPE_CODE_DOUBLE Record</a>
-</div>
+<h4><a name="TYPE_CODE_DOUBLE">TYPE_CODE_DOUBLE Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[DOUBLE]</tt></p>
 
@@ -1181,10 +1169,9 @@ floating point) type to the type table.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="TYPE_CODE_LABEL">TYPE_CODE_LABEL Record</a>
-</div>
+<h4><a name="TYPE_CODE_LABEL">TYPE_CODE_LABEL Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[LABEL]</tt></p>
 
@@ -1194,10 +1181,9 @@ the type table.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="TYPE_CODE_OPAQUE">TYPE_CODE_OPAQUE Record</a>
-</div>
+<h4><a name="TYPE_CODE_OPAQUE">TYPE_CODE_OPAQUE Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[OPAQUE]</tt></p>
 
@@ -1208,10 +1194,9 @@ unified.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="TYPE_CODE_INTEGER">TYPE_CODE_INTEGER  Record</a>
-</div>
+<h4><a name="TYPE_CODE_INTEGER">TYPE_CODE_INTEGER  Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[INTEGER, width]</tt></p>
 
@@ -1222,10 +1207,9 @@ integer type.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="TYPE_CODE_POINTER">TYPE_CODE_POINTER Record</a>
-</div>
+<h4><a name="TYPE_CODE_POINTER">TYPE_CODE_POINTER Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[POINTER, pointee type, address space]</tt></p>
 
@@ -1243,10 +1227,9 @@ default address space is zero.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="TYPE_CODE_FUNCTION">TYPE_CODE_FUNCTION Record</a>
-</div>
+<h4><a name="TYPE_CODE_FUNCTION">TYPE_CODE_FUNCTION Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[FUNCTION, vararg, ignored, retty, ...paramty... ]</tt></p>
 
@@ -1268,10 +1251,9 @@ parameter types of the function</li>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="TYPE_CODE_STRUCT">TYPE_CODE_STRUCT Record</a>
-</div>
+<h4><a name="TYPE_CODE_STRUCT">TYPE_CODE_STRUCT Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[STRUCT, ispacked, ...eltty...]</tt></p>
 
@@ -1287,10 +1269,9 @@ types of the structure</li>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="TYPE_CODE_ARRAY">TYPE_CODE_ARRAY Record</a>
-</div>
+<h4><a name="TYPE_CODE_ARRAY">TYPE_CODE_ARRAY Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[ARRAY, numelts, eltty]</tt></p>
 
@@ -1305,10 +1286,9 @@ table.  The operand fields are</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="TYPE_CODE_VECTOR">TYPE_CODE_VECTOR Record</a>
-</div>
+<h4><a name="TYPE_CODE_VECTOR">TYPE_CODE_VECTOR Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[VECTOR, numelts, eltty]</tt></p>
 
@@ -1323,10 +1303,9 @@ table.  The operand fields are</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="TYPE_CODE_X86_FP80">TYPE_CODE_X86_FP80 Record</a>
-</div>
+<h4><a name="TYPE_CODE_X86_FP80">TYPE_CODE_X86_FP80 Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[X86_FP80]</tt></p>
 
@@ -1336,10 +1315,9 @@ floating point) type to the type table.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="TYPE_CODE_FP128">TYPE_CODE_FP128 Record</a>
-</div>
+<h4><a name="TYPE_CODE_FP128">TYPE_CODE_FP128 Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[FP128]</tt></p>
 
@@ -1349,10 +1327,9 @@ floating point) type to the type table.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="TYPE_CODE_PPC_FP128">TYPE_CODE_PPC_FP128 Record</a>
-</div>
+<h4><a name="TYPE_CODE_PPC_FP128">TYPE_CODE_PPC_FP128 Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[PPC_FP128]</tt></p>
 
@@ -1362,10 +1339,9 @@ floating point) type to the type table.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="TYPE_CODE_METADATA">TYPE_CODE_METADATA Record</a>
-</div>
+<h4><a name="TYPE_CODE_METADATA">TYPE_CODE_METADATA Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[METADATA]</tt></p>
 
@@ -1374,11 +1350,14 @@ type to the type table.
 </p>
 </div>
 
-<!-- ======================================================================= -->
-<div class="doc_subsection"><a name="CONSTANTS_BLOCK">CONSTANTS_BLOCK Contents</a>
 </div>
 
-<div class="doc_text">
+<!-- ======================================================================= -->
+<h3>
+  <a name="CONSTANTS_BLOCK">CONSTANTS_BLOCK Contents</a>
+</h3>
+
+<div>
 
 <p>The <tt>CONSTANTS_BLOCK</tt> block (id 11) ...
 </p>
@@ -1387,10 +1366,11 @@ type to the type table.
 
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="FUNCTION_BLOCK">FUNCTION_BLOCK Contents</a>
-</div>
+<h3>
+  <a name="FUNCTION_BLOCK">FUNCTION_BLOCK Contents</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>FUNCTION_BLOCK</tt> block (id 12) ...
 </p>
@@ -1409,23 +1389,21 @@ type to the type table.
 
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="TYPE_SYMTAB_BLOCK">TYPE_SYMTAB_BLOCK Contents</a>
-</div>
+<h3>
+  <a name="TYPE_SYMTAB_BLOCK">TYPE_SYMTAB_BLOCK Contents</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>TYPE_SYMTAB_BLOCK</tt> block (id 13) contains entries which
 map between module-level named types and their corresponding type
 indices.
 </p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="TST_CODE_ENTRY">TST_CODE_ENTRY Record</a>
-</div>
+<h4><a name="TST_CODE_ENTRY">TST_CODE_ENTRY Record</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>[ENTRY, typeid, ...string...]</tt></p>
 
@@ -1436,12 +1414,14 @@ name. Each entry corresponds to a single named type.
 </p>
 </div>
 
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="VALUE_SYMTAB_BLOCK">VALUE_SYMTAB_BLOCK Contents</a>
-</div>
+<h3>
+  <a name="VALUE_SYMTAB_BLOCK">VALUE_SYMTAB_BLOCK Contents</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>VALUE_SYMTAB_BLOCK</tt> block (id 14) ... 
 </p>
@@ -1450,10 +1430,11 @@ name. Each entry corresponds to a single named type.
 
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="METADATA_BLOCK">METADATA_BLOCK Contents</a>
-</div>
+<h3>
+  <a name="METADATA_BLOCK">METADATA_BLOCK Contents</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>METADATA_BLOCK</tt> block (id 15) ...
 </p>
@@ -1462,16 +1443,18 @@ name. Each entry corresponds to a single named type.
 
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="METADATA_ATTACHMENT">METADATA_ATTACHMENT Contents</a>
-</div>
+<h3>
+  <a name="METADATA_ATTACHMENT">METADATA_ATTACHMENT Contents</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>METADATA_ATTACHMENT</tt> block (id 16) ...
 </p>
 
 </div>
 
+</div>
 
 <!-- *********************************************************************** -->
 <hr>
@@ -1480,7 +1463,7 @@ name. Each entry corresponds to a single named type.
 <a href="http://validator.w3.org/check/referer"><img
  src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
  <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-<a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
 Last modified: $Date$
 </address>
 </body>
diff --git a/docs/Bugpoint.html b/docs/Bugpoint.html
index bf75b5b..154edfd 100644
--- a/docs/Bugpoint.html
+++ b/docs/Bugpoint.html
@@ -6,9 +6,9 @@
   <link rel="stylesheet" href="llvm.css" type="text/css">
 </head>
 
-<div class="doc_title">
+<h1>
   LLVM bugpoint tool: design and usage
-</div>
+</h1>
 
 <ul>
   <li><a href="#desc">Description</a></li>
@@ -27,12 +27,12 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
 <a name="desc">Description</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p><tt>bugpoint</tt> narrows down the source of problems in LLVM tools and
 passes.  It can be used to debug three types of failures: optimizer crashes,
@@ -50,12 +50,12 @@ href="HowToSubmitABug.html">How To Submit a Bug Report document</a>.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
 <a name="design">Design Philosophy</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p><tt>bugpoint</tt> is designed to be a useful tool without requiring any
 hooks into the LLVM infrastructure at all.  It works with any and all LLVM
@@ -68,14 +68,12 @@ is still worth it. Note that <tt>bugpoint</tt> is generally very quick unless
 debugging a miscompilation where each test of the program (which requires 
 executing it) takes a long time.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="autoselect">Automatic Debugger Selection</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p><tt>bugpoint</tt> reads each <tt>.bc</tt> or <tt>.ll</tt> file specified on
 the command line and links them together into a single module, called the test
@@ -104,11 +102,11 @@ Otherwise, there is no problem <tt>bugpoint</tt> can debug.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="crashdebug">Crash debugger</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>If an optimizer or code generator crashes, <tt>bugpoint</tt> will try as hard
 as it can to reduce the list of passes (for optimizer crashes) and the size of
@@ -129,11 +127,11 @@ reproduce the failure with <tt>opt</tt> or <tt>llc</tt>.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="codegendebug">Code generator debugger</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The code generator debugger attempts to narrow down the amount of code that
 is being miscompiled by the selected code generator.  To do this, it takes the
@@ -150,11 +148,11 @@ good code.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="miscompilationdebug">Miscompilation debugger</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The miscompilation debugger works similarly to the code generator debugger.
 It works by splitting the test program into two pieces, running the
@@ -166,13 +164,15 @@ assumes that the selected code generator is working properly.</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="advice">Advice for using bugpoint</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <tt>bugpoint</tt> can be a remarkably useful tool, but it sometimes works in
 non-obvious ways.  Here are some hints and tips:<p>
@@ -242,7 +242,7 @@ non-obvious ways.  Here are some hints and tips:<p>
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 
diff --git a/docs/CFEBuildInstrs.html b/docs/CFEBuildInstrs.html
index ed2f295..ab10844 100644
--- a/docs/CFEBuildInstrs.html
+++ b/docs/CFEBuildInstrs.html
@@ -21,7 +21,7 @@ This page has moved <a href="GCCFEBuildInstrs.html">here</A>.
   <a href="http://validator.w3.org/check/referer"><img
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
-  <a href="http://llvm.org">LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
   Last modified: $Date: 2008-02-13 17:46:10 +0100 (Wed, 13 Feb 2008) $
 </address>
 
diff --git a/docs/CMake.html b/docs/CMake.html
index c88b124..0d8cf62 100644
--- a/docs/CMake.html
+++ b/docs/CMake.html
@@ -6,9 +6,9 @@
   <link rel="stylesheet" href="llvm.css" type="text/css">
 </head>
 
-<div class="doc_title">
+<h1>
   Building LLVM with CMake
-</div>
+</h1>
 
 <ul>
   <li><a href="#intro">Introduction</a></li>
@@ -22,6 +22,9 @@
   <li><a href="#testing">Executing the test suite</a>
   <li><a href="#cross">Cross compiling</a>
   <li><a href="#embedding">Embedding LLVM in your project</a>
+    <ul>
+    <li><a href="#passdev">Developing LLVM pass out of source</a></li>
+  </ul></li>
   <li><a href="#specifics">Compiler/Platform specific topics</a>
     <ul>
     <li><a href="#msvc">Microsoft Visual C++</a></li>
@@ -33,12 +36,12 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
 <a name="intro">Introduction</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
   <p><a href="http://www.cmake.org/">CMake</a> is a cross-platform
     build-generator tool. CMake does not build the project, it generates
@@ -56,12 +59,12 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
 <a name="quickstart">Quick start</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p> We use here the command-line, non-interactive CMake interface </p>
 
@@ -109,12 +112,12 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="usage">Basic CMake usage</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
   <p>This section explains basic aspects of CMake, mostly for
     explaining those options which you may need on your day-to-day
@@ -157,12 +160,12 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="options">Options and variables</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
   <p>Variables customize how the build will be generated. Options are
     boolean variables, with possible values ON/OFF. Options and
@@ -191,14 +194,12 @@
     <p><tt>cmake -DVARIABLE:TYPE=value path/to/llvm/source</tt></p>
   </div>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="freccmake">Frequently-used CMake variables</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Here are listed some of the CMake variables that are used often,
   along with a brief explanation and LLVM-specific notes. For full
@@ -237,11 +238,11 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="llvmvars">LLVM-specific variables</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <dl>
   <dt><b>LLVM_TARGETS_TO_BUILD</b>:STRING</dt>
@@ -342,7 +343,7 @@
   <dt><b>LLVM_LIT_TOOLS_DIR</b>:STRING</dt>
   <dd>The path to GnuWin32 tools for tests. Valid on Windows host.
     Defaults to "", then Lit seeks tools according to %PATH%.
-    Lit can find tools(eg. grep, sort, &c) on LLVM_LIT_TOOLS_DIR at first,
+    Lit can find tools(eg. grep, sort, &amp;c) on LLVM_LIT_TOOLS_DIR at first,
     without specifying GnuWin32 to %PATH%.</dd>
 
   <dt><b>LLVM_ENABLE_FFI</b>:BOOL</dt>
@@ -354,13 +355,15 @@
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="testing">Executing the test suite</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Testing is performed when the <i>check</i> target is built. For
   instance, if you are using makefiles, execute this command while on
@@ -375,12 +378,12 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="cross">Cross compiling</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>See <a href="http://www.vtk.org/Wiki/CMake_Cross_Compiling">this
     wiki page</a> for generic instructions on how to cross-compile
@@ -396,12 +399,12 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="embedding">Embedding LLVM in your project</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
   <p>The most difficult part of adding LLVM to the build of a project
     is to determine the set of LLVM libraries corresponding to the set
@@ -418,7 +421,7 @@
     endif()
     <b># We incorporate the CMake features provided by LLVM:</b>
     set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${LLVM_ROOT}/share/llvm/cmake")
-    include(LLVM)
+    include(LLVMConfig)
     <b># Now set the header and library paths:</b>
     include_directories( ${LLVM_ROOT}/include )
     link_directories( ${LLVM_ROOT}/lib )
@@ -436,27 +439,100 @@
     headers on the LLVM source directory (if we are building
     out-of-source.)</p>
 
-</div>
+  <p>Alternativaly, you can utilize CMake's <i>find_package</i>
+    functionality. Here is an equivalent variant of snippet shown above:</p>
+
+  <div class="doc_code">
+    <pre>
+    find_package(LLVM)
+
+    if( NOT LLVM_FOUND )
+      message(FATAL_ERROR "LLVM package can't be found. Set CMAKE_PREFIX_PATH variable to LLVM's installation prefix.")
+    endif()
+
+    include_directories( ${LLVM_INCLUDE_DIRS} )
+    link_directories( ${LLVM_LIBRARY_DIRS} )
+
+    llvm_map_components_to_libraries(REQ_LLVM_LIBRARIES jit native)
+
+    target_link_libraries(mycompiler ${REQ_LLVM_LIBRARIES})
+    </pre>
+  </div>
+
+<!-- ======================================================================= -->
+<h3>
+  <a name="passdev">Developing LLVM pass out of source</a>
+</h3>
 
+<div>
+
+  <p>It is possible to develop LLVM passes against installed LLVM.
+     An example of project layout provided below:</p>
+
+  <div class="doc_code">
+    <pre>
+      &lt;project dir&gt;/
+          |
+          CMakeLists.txt
+          &lt;pass name&gt;/
+              |
+              CMakeLists.txt
+              Pass.cpp
+              ...
+    </pre>
+  </div>
+
+  <p>Contents of &lt;project dir&gt;/CMakeLists.txt:</p>
+
+  <div class="doc_code">
+    <pre>
+    find_package(LLVM)
+
+    <b># Define add_llvm_* macro's.</b>
+    include(AddLLVM)
+
+    add_definitions(${LLVM_DEFINITIONS})
+    include_directories(${LLVM_INCLUDE_DIRS})
+    link_directories(${LLVM_LIBRARY_DIRS})
+
+    add_subdirectory(&lt;pass name&gt;)
+    </pre>
+  </div>
+
+  <p>Contents of &lt;project dir&gt;/&lt;pass name&gt;/CMakeLists.txt:</p>
+
+  <div class="doc_code">
+    <pre>
+    add_llvm_loadable_module(LLVMPassname
+      Pass.cpp
+      )
+    </pre>
+  </div>
+
+  <p>When you are done developing your pass, you may wish to integrate it
+     into LLVM source tree. You can achieve it in two easy steps:<br>
+     1. Copying &lt;pass name&gt; folder into &lt;LLVM root&gt;/lib/Transform directory.<br>
+     2. Adding "add_subdirectory(&lt;pass name&gt;)" line into &lt;LLVM root&gt;/lib/Transform/CMakeLists.txt</p>
+</div>
 <!-- *********************************************************************** -->
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="specifics">Compiler/Platform specific topics</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Notes for specific compilers and/or platforms.</p>
 
-</div>
-
-<div class="doc_subsection">
+<h3>
   <a name="msvc">Microsoft Visual C++</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <dl>
   <dt><b>LLVM_COMPILER_JOBS</b>:STRING</dt>
@@ -468,6 +544,8 @@
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
 
 <hr>
@@ -478,7 +556,7 @@
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="mailto:ofv@wanadoo.es">Oscar Fuentes</a><br>
-  <a href="http://llvm.org">LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
   Last modified: $Date: 2010-08-09 03:59:36 +0100 (Mon, 9 Aug 2010) $
 </address>
 
diff --git a/docs/CodeGenerator.html b/docs/CodeGenerator.html
index 4a656a2..44b835d 100644
--- a/docs/CodeGenerator.html
+++ b/docs/CodeGenerator.html
@@ -19,9 +19,9 @@
 </head>
 <body>
 
-<div class="doc_title">
+<h1>
   The LLVM Target-Independent Code Generator
-</div>
+</h1>
 
 <ol>
   <li><a href="#introduction">Introduction</a>
@@ -127,12 +127,12 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="introduction">Introduction</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>The LLVM target-independent code generator is a framework that provides a
    suite of reusable components for translating the LLVM internal representation
@@ -188,14 +188,12 @@
    depend on the target-description and machine code representation classes,
    ensuring that it is portable.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
  <a name="required">Required components in the code generator</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The two pieces of the LLVM code generator are the high-level interface to the
    code generator and the set of reusable components that can be used to build
@@ -223,11 +221,11 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
  <a name="high-level-design">The high-level design of the code generator</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The LLVM target-independent code generator is designed to support efficient
    and quality code generation for standard register-based microprocessors.
@@ -297,11 +295,11 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
  <a name="tablegen">Using TableGen for target description</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The target description classes require a detailed description of the target
    architecture.  These target descriptions often have a large amount of common
@@ -324,13 +322,15 @@
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="targetdesc">Target description classes</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>The LLVM target description classes (located in the
    <tt>include/llvm/Target</tt> directory) provide an abstract description of
@@ -346,14 +346,12 @@
    <tt><a href="#targetmachine">TargetMachine</a></tt> class provides accessors
    that should be implemented by the target.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="targetmachine">The <tt>TargetMachine</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>TargetMachine</tt> class provides virtual methods that are used to
    access the target-specific implementations of the various target description
@@ -369,11 +367,11 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="targetdata">The <tt>TargetData</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>TargetData</tt> class is the only required target description class,
    and it is the only class that is not extensible (you cannot derived a new
@@ -385,11 +383,11 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="targetlowering">The <tt>TargetLowering</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>TargetLowering</tt> class is used by SelectionDAG based instruction
    selectors primarily to describe how LLVM code should be lowered to
@@ -411,11 +409,11 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="targetregisterinfo">The <tt>TargetRegisterInfo</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>TargetRegisterInfo</tt> class is used to describe the register file
    of the target and any interactions between the registers.</p>
@@ -445,11 +443,11 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="targetinstrinfo">The <tt>TargetInstrInfo</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>TargetInstrInfo</tt> class is used to describe the machine
    instructions supported by the target. It is essentially an array of
@@ -463,11 +461,11 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="targetframeinfo">The <tt>TargetFrameInfo</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>TargetFrameInfo</tt> class is used to provide information about the
    stack frame layout of the target. It holds the direction of stack growth, the
@@ -479,11 +477,11 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="targetsubtarget">The <tt>TargetSubtarget</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>TargetSubtarget</tt> class is used to provide information about the
    specific chip set being targeted.  A sub-target informs code generation of
@@ -495,11 +493,11 @@
 
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="targetjitinfo">The <tt>TargetJITInfo</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>TargetJITInfo</tt> class exposes an abstract interface used by the
    Just-In-Time code generator to perform target-specific activities, such as
@@ -509,13 +507,15 @@
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="codegendesc">Machine code description classes</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>At the high-level, LLVM code is translated to a machine specific
    representation formed out of
@@ -528,14 +528,12 @@
    SSA representation for machine code, as well as a register allocated, non-SSA
    form.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="machineinstr">The <tt>MachineInstr</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Target machine instructions are represented as instances of the
    <tt>MachineInstr</tt> class.  This class is an extremely abstract way of
@@ -576,14 +574,12 @@
 <p>Also if the first operand is a def, it is easier to <a href="#buildmi">create
    instructions</a> whose only def is the first operand.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="buildmi">Using the <tt>MachineInstrBuilder.h</tt> functions</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Machine instructions are created by using the <tt>BuildMI</tt> functions,
    located in the <tt>include/llvm/CodeGen/MachineInstrBuilder.h</tt> file.  The
@@ -630,11 +626,11 @@ MI.addReg(Reg, RegState::Define);
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="fixedregs">Fixed (preassigned) registers</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>One important issue that the code generator needs to be aware of is the
    presence of fixed registers.  In particular, there are often places in the
@@ -702,11 +698,11 @@ ret
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ssa">Machine code in SSA form</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>MachineInstr</tt>'s are initially selected in SSA-form, and are
    maintained in SSA-form until register allocation happens.  For the most part,
@@ -719,12 +715,14 @@ ret
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="machinebasicblock">The <tt>MachineBasicBlock</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>MachineBasicBlock</tt> class contains a list of machine instructions
    (<tt><a href="#machineinstr">MachineInstr</a></tt> instances).  It roughly
@@ -737,11 +735,11 @@ ret
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="machinefunction">The <tt>MachineFunction</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>MachineFunction</tt> class contains a list of machine basic blocks
    (<tt><a href="#machinebasicblock">MachineBasicBlock</a></tt> instances).  It
@@ -754,14 +752,15 @@ ret
 
 </div>
 
+</div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="mc">The "MC" Layer</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 The MC Layer is used to represent and process code at the raw machine code
@@ -770,7 +769,7 @@ level, devoid of "high level" information like "constant pools", "jump tables",
 like label names, machine instructions, and sections in the object file.  The
 code in this layer is used for a number of important purposes: the tail end of
 the code generator uses it to write a .s or .o file, and it is also used by the
-llvm-mc tool to implement standalone machine codeassemblers and disassemblers.
+llvm-mc tool to implement standalone machine code assemblers and disassemblers.
 </p>
 
 <p>
@@ -779,15 +778,12 @@ of important subsystems that interact at this layer, they are described later
 in this manual.
 </p>
 
-</div>
-
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="mcstreamer">The <tt>MCStreamer</tt> API</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 MCStreamer is best thought of as an assembler API.  It is an abstract API which
@@ -817,11 +813,11 @@ MCObjectStreamer implements a full assembler.
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="mccontext">The <tt>MCContext</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 The MCContext class is the owner of a variety of uniqued data structures at the
@@ -832,11 +828,11 @@ interact with to create symbols and sections.  This class can not be subclassed.
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="mcsymbol">The <tt>MCSymbol</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 The MCSymbol class represents a symbol (aka label) in the assembly file.  There
@@ -864,11 +860,11 @@ like this to the .s file:<p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="mcsection">The <tt>MCSection</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 The MCSection class represents an object-file specific section. It is subclassed
@@ -882,11 +878,11 @@ directive in a .s file).
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="mcinst">The <tt>MCInst</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 The MCInst class is a target-independent representation of an instruction.  It
@@ -904,27 +900,26 @@ printer, and the type generated by the assembly parser and disassembler.
 
 </div>
 
+</div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="codegenalgs">Target-independent code generation algorithms</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This section documents the phases described in the
    <a href="#high-level-design">high-level design of the code generator</a>.
    It explains how they work and some of the rationale behind their design.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="instselect">Instruction Selection</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Instruction Selection is the process of translating LLVM code presented to
    the code generator into target-specific machine instructions.  There are
@@ -936,14 +931,12 @@ printer, and the type generated by the assembly parser and disassembler.
    selector to be generated from these <tt>.td</tt> files, though currently
    there are still things that require custom C++ code.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="selectiondag_intro">Introduction to SelectionDAGs</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The SelectionDAG provides an abstraction for code representation in a way
    that is amenable to instruction selection using automatic techniques
@@ -1001,11 +994,11 @@ printer, and the type generated by the assembly parser and disassembler.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="selectiondag_process">SelectionDAG Instruction Selection Process</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>SelectionDAG-based instruction selection consists of the following steps:</p>
 
@@ -1082,11 +1075,11 @@ printer, and the type generated by the assembly parser and disassembler.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="selectiondag_build">Initial SelectionDAG Construction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The initial SelectionDAG is na&iuml;vely peephole expanded from the LLVM
    input by the <tt>SelectionDAGLowering</tt> class in the
@@ -1102,11 +1095,11 @@ printer, and the type generated by the assembly parser and disassembler.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="selectiondag_legalize_types">SelectionDAG LegalizeTypes Phase</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The Legalize phase is in charge of converting a DAG to only use the types
    that are natively supported by the target.</p>
@@ -1135,11 +1128,11 @@ printer, and the type generated by the assembly parser and disassembler.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="selectiondag_legalize">SelectionDAG Legalize Phase</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The Legalize phase is in charge of converting a DAG to only use the
    operations that are natively supported by the target.</p>
@@ -1167,12 +1160,13 @@ printer, and the type generated by the assembly parser and disassembler.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="selectiondag_optimize">SelectionDAG Optimization Phase: the DAG
-  Combiner</a>
-</div>
+<h4>
+  <a name="selectiondag_optimize">
+    SelectionDAG Optimization Phase: the DAG Combiner
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The SelectionDAG optimization phase is run multiple times for code
    generation, immediately after the DAG is built and once after each
@@ -1202,11 +1196,11 @@ printer, and the type generated by the assembly parser and disassembler.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="selectiondag_select">SelectionDAG Select Phase</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The Select phase is the bulk of the target-specific code for instruction
    selection.  This phase takes a legal SelectionDAG as input, pattern matches
@@ -1363,11 +1357,11 @@ def : Pat&lt;(i32 imm:$imm),
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="selectiondag_sched">SelectionDAG Scheduling and Formation Phase</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The scheduling phase takes the DAG of target instructions from the selection
    phase and assigns an order.  The scheduler can pick an order depending on
@@ -1384,11 +1378,11 @@ def : Pat&lt;(i32 imm:$imm),
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="selectiondag_future">Future directions for the SelectionDAG</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <ol>
   <li>Optional function-at-a-time selection.</li>
@@ -1398,18 +1392,20 @@ def : Pat&lt;(i32 imm:$imm),
 
 </div>
  
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ssamco">SSA-based Machine Code Optimizations</a>
-</div>
-<div class="doc_text"><p>To Be Written</p></div>
+</h3>
+<div><p>To Be Written</p></div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="liveintervals">Live Intervals</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Live Intervals are the ranges (intervals) where a variable is <i>live</i>.
    They are used by some <a href="#regalloc">register allocator</a> passes to
@@ -1417,14 +1413,12 @@ def : Pat&lt;(i32 imm:$imm),
    register are live at the same point in the program (i.e., they conflict).
    When this situation occurs, one virtual register must be <i>spilled</i>.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="livevariable_analysis">Live Variable Analysis</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The first step in determining the live intervals of variables is to calculate
    the set of registers that are immediately dead after the instruction (i.e.,
@@ -1466,11 +1460,11 @@ def : Pat&lt;(i32 imm:$imm),
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="liveintervals_analysis">Live Intervals Analysis</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>We now have the information available to perform the live intervals analysis
    and build the live intervals themselves.  We start off by numbering the basic
@@ -1485,12 +1479,14 @@ def : Pat&lt;(i32 imm:$imm),
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="regalloc">Register Allocation</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The <i>Register Allocation problem</i> consists in mapping a program
    <i>P<sub>v</sub></i>, that can use an unbounded number of virtual registers,
@@ -1500,15 +1496,13 @@ def : Pat&lt;(i32 imm:$imm),
    accommodate all the virtual registers, some of them will have to be mapped
    into memory. These virtuals are called <i>spilled virtuals</i>.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
 
-<div class="doc_subsubsection">
+<h4>
   <a name="regAlloc_represent">How registers are represented in LLVM</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>In LLVM, physical registers are denoted by integer numbers that normally
    range from 1 to 1023. To see how this numbering is defined for a particular
@@ -1617,11 +1611,11 @@ bool RegMapping_Fer::compatible_class(MachineFunction &amp;mf,
 
 <!-- _______________________________________________________________________ -->
 
-<div class="doc_subsubsection">
+<h4>
   <a name="regAlloc_howTo">Mapping virtual registers to physical registers</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>There are two ways to map virtual registers to physical registers (or to
    memory slots). The first way, that we will call <i>direct mapping</i>, is
@@ -1667,11 +1661,11 @@ bool RegMapping_Fer::compatible_class(MachineFunction &amp;mf,
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="regAlloc_twoAddr">Handling two address instructions</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>With very rare exceptions (e.g., function calls), the LLVM machine code
    instructions are three address instructions. That is, each instruction is
@@ -1703,11 +1697,11 @@ bool RegMapping_Fer::compatible_class(MachineFunction &amp;mf,
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="regAlloc_ssaDecon">The SSA deconstruction phase</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>An important transformation that happens during register allocation is called
    the <i>SSA Deconstruction Phase</i>. The SSA form simplifies many analyses
@@ -1727,11 +1721,11 @@ bool RegMapping_Fer::compatible_class(MachineFunction &amp;mf,
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="regAlloc_fold">Instruction folding</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p><i>Instruction folding</i> is an optimization performed during register
    allocation that removes unnecessary copy instructions. For instance, a
@@ -1764,11 +1758,11 @@ bool RegMapping_Fer::compatible_class(MachineFunction &amp;mf,
 
 <!-- _______________________________________________________________________ -->
 
-<div class="doc_subsubsection">
+<h4>
   <a name="regAlloc_builtIn">Built in register allocators</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The LLVM infrastructure provides the application developer with three
    different register allocators:</p>
@@ -1805,23 +1799,25 @@ $ llc -regalloc=pbqp file.bc -o pbqp.s;
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="proepicode">Prolog/Epilog Code Insertion</a>
-</div>
-<div class="doc_text"><p>To Be Written</p></div>
+</h3>
+<div><p>To Be Written</p></div>
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="latemco">Late Machine Code Optimizations</a>
-</div>
-<div class="doc_text"><p>To Be Written</p></div>
+</h3>
+<div><p>To Be Written</p></div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="codeemit">Code Emission</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The code emission step of code generation is responsible for lowering from
 the code generator abstractions (like <a 
@@ -1880,14 +1876,15 @@ to implement an assembler for your target.</p>
 
 </div>
 
+</div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="nativeassembler">Implementing a Native Assembler</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Though you're probably reading this because you want to write or maintain a
 compiler backend, LLVM also fully supports building a native assemblers too.
@@ -1896,20 +1893,18 @@ We've tried hard to automate the generation of the assembler from the .td files
 part of the manual and repetitive data entry can be factored and shared with the
 compiler.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection" id="na_instparsing">Instruction Parsing</div>
+<h3 id="na_instparsing">Instruction Parsing</h3>
 
-<div class="doc_text"><p>To Be Written</p></div>
+<div><p>To Be Written</p></div>
 
 
 <!-- ======================================================================= -->
-<div class="doc_subsection" id="na_instaliases">
+<h3 id="na_instaliases">
   Instruction Alias Processing
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>Once the instruction is parsed, it enters the MatchInstructionImpl function.
 The MatchInstructionImpl function performs alias processing and then does
 actual matching.</p>
@@ -1922,12 +1917,10 @@ complex/powerful).  Generally you want to use the first alias mechanism that
 meets the needs of your instruction, because it will allow a more concise
 description.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">Mnemonic Aliases</div>
+<h4>Mnemonic Aliases</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The first phase of alias processing is simple instruction mnemonic
 remapping for classes of instructions which are allowed with two different
@@ -1965,9 +1958,9 @@ on the current instruction set.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">Instruction Aliases</div>
+<h4>Instruction Aliases</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The most general phase of alias processing occurs while matching is
 happening: it provides new forms for the matcher to match along with a specific
@@ -2026,38 +2019,40 @@ def : InstAlias&lt;"fcomi $reg", (COM_FIr RST:$reg)&gt;;
 <p>Instruction aliases can also have a Requires clause to make them
 subtarget specific.</p>
 
-</div>
+<p>If the back-end supports it, the instruction printer can automatically emit
+   the alias rather than what's being aliased. It typically leads to better,
+   more readable code. If it's better to print out what's being aliased, then
+   pass a '0' as the third parameter to the InstAlias definition.</p>
 
+</div>
 
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection" id="na_matching">Instruction Matching</div>
-
-<div class="doc_text"><p>To Be Written</p></div>
-
+<h3 id="na_matching">Instruction Matching</h3>
 
+<div><p>To Be Written</p></div>
 
+</div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="targetimpls">Target-specific Implementation Notes</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This section of the document explains features or design decisions that are
    specific to the code generator for a particular target.  First we start
    with a table that summarizes what features are supported by each target.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="targetfeatures">Target Feature Matrix</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Note that this table does not include the C backend or Cpp backends, since
 they do not use the target independent code generator infrastructure.  It also
@@ -2228,12 +2223,10 @@ is the key:</p>
 
 </table>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection" id="feat_reliable">Is Generally Reliable</div>
+<h4 id="feat_reliable">Is Generally Reliable</h4>
 
-<div class="doc_text">
+<div>
 <p>This box indicates whether the target is considered to be production quality.
 This indicates that the target has been used as a static compiler to
 compile large amounts of code by a variety of different people and is in
@@ -2241,9 +2234,9 @@ continuous use.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection" id="feat_asmparser">Assembly Parser</div>
+<h4 id="feat_asmparser">Assembly Parser</h4>
 
-<div class="doc_text">
+<div>
 <p>This box indicates whether the target supports parsing target specific .s
 files by implementing the MCAsmParser interface.  This is required for llvm-mc
 to be able to act as a native assembler and is required for inline assembly
@@ -2253,18 +2246,18 @@ support in the native .o file writer.</p>
 
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection" id="feat_disassembler">Disassembler</div>
+<h4 id="feat_disassembler">Disassembler</h4>
 
-<div class="doc_text">
+<div>
 <p>This box indicates whether the target supports the MCDisassembler API for
 disassembling machine opcode bytes into MCInst's.</p>
 
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection" id="feat_inlineasm">Inline Asm</div>
+<h4 id="feat_inlineasm">Inline Asm</h4>
 
-<div class="doc_text">
+<div>
 <p>This box indicates whether the target supports most popular inline assembly
 constraints and modifiers.</p>
 
@@ -2274,9 +2267,9 @@ constraints relating to the X86 floating point stack.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection" id="feat_jit">JIT Support</div>
+<h4 id="feat_jit">JIT Support</h4>
 
-<div class="doc_text">
+<div>
 <p>This box indicates whether the target supports the JIT compiler through
 the ExecutionEngine interface.</p>
 
@@ -2286,9 +2279,9 @@ in ARM codegen mode, but lacks NEON and full Thumb support.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection" id="feat_objectwrite">.o File Writing</div>
+<h4 id="feat_objectwrite">.o File Writing</h4>
 
-<div class="doc_text">
+<div>
 
 <p>This box indicates whether the target supports writing .o files (e.g. MachO,
 ELF, and/or COFF) files directly from the target.  Note that the target also
@@ -2302,9 +2295,9 @@ file to a .o file (as is the case for many C compilers).</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection" id="feat_tailcall">Tail Calls</div>
+<h4 id="feat_tailcall">Tail Calls</h4>
 
-<div class="doc_text">
+<div>
 
 <p>This box indicates whether the target supports guaranteed tail calls.  These
 are calls marked "<a href="LangRef.html#i_call">tail</a>" and use the fastcc
@@ -2313,15 +2306,14 @@ more more details</a>.</p>
 
 </div>
 
-
-
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="tailcallopt">Tail call optimization</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Tail call optimization, callee reusing the stack of the caller, is currently
    supported on x86/x86-64 and PowerPC. It is performed if:</p>
@@ -2383,11 +2375,11 @@ define fastcc i32 @tailcaller(i32 %in1, i32 %in2) {
 
 </div>
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="sibcallopt">Sibling call optimization</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Sibling call optimization is a restricted form of tail call optimization.
    Unlike tail call optimization described in the previous section, it can be
@@ -2427,24 +2419,22 @@ entry:
 
 </div>
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="x86">The X86 backend</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The X86 code generator lives in the <tt>lib/Target/X86</tt> directory.  This
    code generator is capable of targeting a variety of x86-32 and x86-64
    processors, and includes support for ISA extensions such as MMX and SSE.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="x86_tt">X86 Target Triples supported</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The following are the known target triples that are supported by the X86
    backend.  This is not an exhaustive list, and it would be useful to add those
@@ -2469,31 +2459,34 @@ entry:
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="x86_cc">X86 Calling Conventions supported</a>
-</div>
+</h4>
 
 
-<div class="doc_text">
+<div>
 
 <p>The following target-specific calling conventions are known to backend:</p>
 
 <ul>
-  <li><b>x86_StdCall</b> &mdash; stdcall calling convention seen on Microsoft
-      Windows platform (CC ID = 64).</li>
-
-  <li><b>x86_FastCall</b> &mdash; fastcall calling convention seen on Microsoft
-      Windows platform (CC ID = 65).</li>
+<li><b>x86_StdCall</b> &mdash; stdcall calling convention seen on Microsoft
+    Windows platform (CC ID = 64).</li>
+<li><b>x86_FastCall</b> &mdash; fastcall calling convention seen on Microsoft
+    Windows platform (CC ID = 65).</li>
+<li><b>x86_ThisCall</b> &mdash; Similar to X86_StdCall. Passes first argument
+    in ECX,  others via stack. Callee is responsible for stack cleaning. This
+    convention is used by MSVC by default for methods in its ABI
+    (CC ID = 70).</li>
 </ul>
 
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="x86_memory">Representing X86 addressing modes in MachineInstrs</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The x86 has a very flexible way of accessing memory.  It is capable of
    forming memory addresses of the following expression directly in integer
@@ -2526,11 +2519,11 @@ OperandTy: VirtReg, | VirtReg, UnsImm, VirtReg,   SignExtImm  PhysReg
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="x86_memory">X86 address spaces supported</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>x86 has a feature which provides
    the ability to perform loads and stores to different address spaces
@@ -2571,11 +2564,11 @@ OperandTy: VirtReg, | VirtReg, UnsImm, VirtReg,   SignExtImm  PhysReg
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="x86_names">Instruction naming</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>An instruction name consists of the base name, a default operand size, and a
    a character per operand with an optional special size. For example:</p>
@@ -2591,25 +2584,25 @@ MOVSX32rm16 -&gt; movsx, 32-bit register, 16-bit memory
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ppc">The PowerPC backend</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The PowerPC code generator lives in the lib/Target/PowerPC directory.  The
    code generation is retargetable to several variations or <i>subtargets</i> of
    the PowerPC ISA; including ppc32, ppc64 and altivec.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ppc_abi">LLVM PowerPC ABI</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM follows the AIX PowerPC ABI, with two deviations. LLVM uses a PC
    relative (PIC) or static addressing for accessing global values, so no TOC
@@ -2625,11 +2618,11 @@ MOVSX32rm16 -&gt; movsx, 32-bit register, 16-bit memory
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ppc_frame">Frame Layout</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The size of a PowerPC frame is usually fixed for the duration of a
    function's invocation.  Since the frame is fixed size, all references
@@ -2772,11 +2765,11 @@ MOVSX32rm16 -&gt; movsx, 32-bit register, 16-bit memory
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ppc_prolog">Prolog/Epilog</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The llvm prolog and epilog are the same as described in the PowerPC ABI, with
    the following exceptions.  Callee saved registers are spilled after the frame
@@ -2789,16 +2782,19 @@ MOVSX32rm16 -&gt; movsx, 32-bit register, 16-bit memory
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ppc_dynamic">Dynamic Allocation</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p><i>TODO - More to come.</i></p>
 
 </div>
 
+</div>
+
+</div>
 
 <!-- *********************************************************************** -->
 <hr>
@@ -2809,7 +2805,7 @@ MOVSX32rm16 -&gt; movsx, 32-bit register, 16-bit memory
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 
diff --git a/docs/CodingStandards.html b/docs/CodingStandards.html
index f290712..7f92b9a 100644
--- a/docs/CodingStandards.html
+++ b/docs/CodingStandards.html
@@ -7,9 +7,9 @@
 </head>
 <body>
 
-<div class="doc_title">
+<h1>
   LLVM Coding Standards
-</div>
+</h1>
 
 <ol>
   <li><a href="#introduction">Introduction</a></li>
@@ -83,12 +83,12 @@
 
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="introduction">Introduction</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This document attempts to describe a few coding standards that are being used
 in the LLVM source tree.  Although no coding standards should be regarded as
@@ -117,22 +117,26 @@ href="mailto:sabre@nondot.org">Chris</a>.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="mechanicalissues">Mechanical Source Issues</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
+<div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="sourceformating">Source Code Formatting</a>
-</div>
+</h3>
+
+<div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="scf_commenting">Commenting</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Comments are one critical part of readability and maintainability.  Everyone
 knows they should comment, so should you.  When writing comments, write them as
@@ -141,7 +145,9 @@ etc.  Although we all should probably
 comment our code more than we do, there are a few very critical places that
 documentation is very useful:</p>
 
-<b>File Headers</b>
+<h5>File Headers</h5>
+
+<div>
 
 <p>Every source file should have a header on it that describes the basic 
 purpose of the file.  If a file does not have a header, it should not be 
@@ -184,7 +190,9 @@ Here it's only two lines.  If an algorithm is being implemented or something
 tricky is going on, a reference to the paper where it is published should be
 included, as well as any notes or "gotchas" in the code to watch out for.</p>
 
-<b>Class overviews</b>
+</div>
+
+<h5>Class overviews</h5>
 
 <p>Classes are one fundamental part of a good object oriented design.  As such,
 a class definition should have a comment block that explains what the class is
@@ -193,7 +201,9 @@ could figure it out, it's probably safe to leave it out.  Naming classes
 something sane goes a long ways towards avoiding writing documentation.</p>
 
 
-<b>Method information</b>
+<h5>Method information</h5>
+
+<div>
 
 <p>Methods defined in a class (as well as any global functions) should also be
 documented properly.  A quick note about what it does and a description of the
@@ -207,12 +217,14 @@ happens: does the method return null?  Abort?  Format your hard disk?</p>
 
 </div>
 
+</div>
+
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="scf_commentformat">Comment Formatting</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>In general, prefer C++ style (<tt>//</tt>) comments.  They take less space,
 require less typing, don't have nesting problems, etc.  There are a few cases
@@ -233,11 +245,11 @@ These nest properly and are better behaved in general than C style comments.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="scf_includes"><tt>#include</tt> Style</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Immediately after the <a href="#scf_commenting">header file comment</a> (and
 include guards if working on a header file), the <a
@@ -273,11 +285,11 @@ implements are defined.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="scf_codewidth">Source Code Width</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Write your code to fit within 80 columns of text.  This helps those of us who
 like to print out code and look at your code in an xterm without resizing
@@ -298,11 +310,11 @@ for debate.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="scf_spacestabs">Use Spaces Instead of Tabs</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>In all cases, prefer spaces to tabs in source files.  People have different
 preferred indentation levels, and different styles of indentation that they
@@ -319,11 +331,11 @@ makes for incredible diffs that are absolutely worthless.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="scf_indentation">Indent Code Consistently</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Okay, in your first year of programming you were told that indentation is
 important.  If you didn't believe and internalize this then, now is the time.
@@ -331,19 +343,21 @@ Just do it.</p>
 
 </div>
 
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="compilerissues">Compiler Issues</a>
-</div>
+</h3>
 
+<div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ci_warningerrors">Treat Compiler Warnings Like Errors</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>If your code has compiler warnings in it, something is wrong &mdash; you
 aren't casting values correctly, your have "questionable" constructs in your
@@ -393,11 +407,11 @@ be fixed by massaging the code appropriately.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ci_portable_code">Write Portable Code</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>In almost all cases, it is possible and within reason to write completely
 portable code.  If there are cases where it isn't possible to write portable
@@ -412,10 +426,10 @@ libSystem.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
 <a name="ci_rtti_exceptions">Do not use RTTI or Exceptions</a>
-</div>
-<div class="doc_text">
+</h4>
+<div>
 
 <p>In an effort to reduce code and executable size, LLVM does not use RTTI
 (e.g. <tt>dynamic_cast&lt;&gt;</tt>) or exceptions.  These two language features
@@ -433,10 +447,10 @@ than <tt>dynamic_cast&lt;&gt;</tt>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
 <a name="ci_class_struct">Use of <tt>class</tt> and <tt>struct</tt> Keywords</a>
-</div>
-<div class="doc_text">
+</h4>
+<div>
 
 <p>In C++, the <tt>class</tt> and <tt>struct</tt> keywords can be used almost
 interchangeably. The only difference is when they are used to declare a class:
@@ -454,26 +468,32 @@ which case <tt>struct</tt> is allowed.</p>
 
 </div>
 
+</div>
+
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="styleissues">Style Issues</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
+<div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="macro">The High-Level Issues</a>
-</div>
+</h3>
 <!-- ======================================================================= -->
 
+<div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="hl_module">A Public Header File <b>is</b> a Module</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>C++ doesn't do too well in the modularity department.  There is no real
 encapsulation or data hiding (unless you use expensive protocol classes), but it
@@ -499,11 +519,11 @@ translation unit.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="hl_dontinclude"><tt>#include</tt> as Little as Possible</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>#include</tt> hurts compile time performance.  Don't do it unless you
 have to, especially in header files.</p>
@@ -528,11 +548,11 @@ dependencies that you'll find out about later.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="hl_privateheaders">Keep "Internal" Headers Private</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Many modules have a complex implementation that causes them to use more than
 one implementation (<tt>.cpp</tt>) file.  It is often tempting to put the
@@ -549,11 +569,11 @@ class itself. Just make them private (or protected) and all is well.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="hl_earlyexit">Use Early Exits and <tt>continue</tt> to Simplify Code</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>When reading code, keep in mind how much state and how many previous
 decisions have to be remembered by the reader to understand a block of code.
@@ -658,11 +678,11 @@ can be a big understandability win.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="hl_else_after_return">Don't use <tt>else</tt> after a <tt>return</tt></a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>For similar reasons above (reduction of indentation and easier reading),
 please do not use '<tt>else</tt>' or '<tt>else if</tt>' after something that
@@ -741,11 +761,11 @@ track of when reading the code.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="hl_predicateloops">Turn Predicate Loops into Predicate Functions</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>It is very common to write small loops that just compute a boolean value.
 There are a number of ways that people commonly write these, but an example of
@@ -802,20 +822,24 @@ locality.</p>
 
 </div>
 
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="micro">The Low-Level Issues</a>
-</div>
+</h3>
 <!-- ======================================================================= -->
 
+<div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="ll_naming">Name Types, Functions, Variables, and Enumerators Properly</a>
-</div>
+<h4>
+  <a name="ll_naming">
+    Name Types, Functions, Variables, and Enumerators Properly
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Poorly-chosen names can mislead the reader and cause bugs. We cannot stress
 enough how important it is to use <em>descriptive</em> names.  Pick names that
@@ -894,11 +918,11 @@ Vehicle MakeVehicle(VehicleType Type) {
 
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ll_assert">Assert Liberally</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Use the "<tt>assert</tt>" macro to its fullest.  Check all of your
 preconditions and assumptions, you never know when a bug (not necessarily even
@@ -997,11 +1021,11 @@ assert(NewToSet &amp;&amp; "The value shouldn't be in the set yet");
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ll_ns_std">Do Not Use '<tt>using namespace std</tt>'</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>In LLVM, we prefer to explicitly prefix all identifiers from the standard
 namespace with an "<tt>std::</tt>" prefix, rather than rely on
@@ -1035,12 +1059,13 @@ use any others.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="ll_virtual_anch">Provide a Virtual Method Anchor for Classes
-  in Headers</a>
-</div>
+<h4>
+  <a name="ll_virtual_anch">
+    Provide a Virtual Method Anchor for Classes in Headers
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>If a class is defined in a header file and has a v-table (either it has 
 virtual methods or it derives from classes with virtual methods), it must 
@@ -1052,11 +1077,11 @@ increasing link times.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ll_end">Don't evaluate <tt>end()</tt> every time through a loop</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Because C++ doesn't have a standard "<tt>foreach</tt>" loop (though it can be
 emulated with macros and may be coming in C++'0x) we end up writing a lot of
@@ -1114,11 +1139,11 @@ prefer it.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ll_iostream"><tt>#include &lt;iostream&gt;</tt> is Forbidden</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The use of <tt>#include &lt;iostream&gt;</tt> in library files is
 hereby <b><em>forbidden</em></b>. The primary reason for doing this is to
@@ -1149,11 +1174,11 @@ the <tt>llvm::MemoryBuffer</tt> API for reading files.</b></p>
 
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ll_raw_ostream">Use <tt>raw_ostream</tt></a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM includes a lightweight, simple, and efficient stream implementation
 in <tt>llvm/Support/raw_ostream.h</tt>, which provides all of the common
@@ -1169,11 +1194,11 @@ declarations and constant references to <tt>raw_ostream</tt> instances.</p>
 
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ll_avoidendl">Avoid <tt>std::endl</tt></a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>std::endl</tt> modifier, when used with <tt>iostreams</tt> outputs a
 newline to the output stream specified.  In addition to doing this, however, it
@@ -1191,22 +1216,25 @@ it's better to use a literal <tt>'\n'</tt>.</p>
 
 </div>
 
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="nano">Microscopic Details</a>
-</div>
+</h3>
 <!-- ======================================================================= -->
 
+<div>
+
 <p>This section describes preferred low-level formatting guidelines along with
 reasoning on why we prefer them.</p>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="micro_spaceparen">Spaces Before Parentheses</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>We prefer to put a space before an open parenthesis only in control flow
 statements, but not in normal function call expressions and function-like
@@ -1260,11 +1288,11 @@ this misinterpretation.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="micro_preincrement">Prefer Preincrement</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Hard fast rule: Preincrement (<tt>++X</tt>) may be no slower than
 postincrement (<tt>X++</tt>) and could very well be a lot faster than it.  Use
@@ -1280,11 +1308,11 @@ get in the habit of always using preincrement, and you won't have a problem.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="micro_namespaceindent">Namespace Indentation</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 In general, we strive to reduce indentation wherever possible.  This is useful
@@ -1368,11 +1396,11 @@ the contents of the namespace.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="micro_anonns">Anonymous Namespaces</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>After talking about namespaces in general, you may be wondering about
 anonymous namespaces in particular.
@@ -1452,15 +1480,17 @@ namespace just because it was declared there.
 
 </div>
 
+</div>
 
+</div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="seealso">See Also</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>A lot of these comments and recommendations have been culled for other
 sources.  Two particularly important books for our work are:</p>
@@ -1491,7 +1521,7 @@ something.</p>
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 
diff --git a/docs/CommandGuide/FileCheck.pod b/docs/CommandGuide/FileCheck.pod
index 3ccaa63..dbd626c 100644
--- a/docs/CommandGuide/FileCheck.pod
+++ b/docs/CommandGuide/FileCheck.pod
@@ -240,6 +240,6 @@ define two separate CHECK lines that match on the same line.
 
 =head1 AUTHORS
 
-Maintained by The LLVM Team (L<http://llvm.org>).
+Maintained by The LLVM Team (L<http://llvm.org/>).
 
 =cut
diff --git a/docs/CommandGuide/bugpoint.pod b/docs/CommandGuide/bugpoint.pod
index 1870a0d..31db62f 100644
--- a/docs/CommandGuide/bugpoint.pod
+++ b/docs/CommandGuide/bugpoint.pod
@@ -21,7 +21,7 @@ distribution.
 
 =head1 OPTIONS
 
-=over 
+=over
 
 =item B<--additional-so> F<library>
 
@@ -87,7 +87,7 @@ mis-management.
 =item B<-find-bugs>
 
 Continually randomize the specified passes and run them on the test program
-until a bug is found or the user kills B<bugpoint>. 
+until a bug is found or the user kills B<bugpoint>.
 
 =item B<-help>
 
@@ -147,6 +147,21 @@ This option defines the command to use with the B<--run-custom> and
 B<--safe-custom> options to execute the bitcode testcase. This can
 be useful for cross-compilation.
 
+=item B<--compile-command> I<command>
+
+This option defines the command to use with the B<--compile-custom>
+option to compile the bitcode testcase. This can be useful for
+testing compiler output without running any link or execute stages. To
+generate a reduced unit test, you may add CHECK directives to the
+testcase and pass the name of an executable compile-command script in this form:
+
+    #!/bin/sh
+    llc "$@"
+    not FileCheck [bugpoint input file].ll < bugpoint-test-program.s
+
+This script will "fail" as long as FileCheck passes. So the result
+will be the minimum bitcode that passes FileCheck.
+
 =item B<--safe-path> I<path>
 
 This option defines the path to the command to execute with the
@@ -166,6 +181,6 @@ L<opt|opt>
 
 =head1 AUTHOR
 
-Maintained by the LLVM Team (L<http://llvm.org>).
+Maintained by the LLVM Team (L<http://llvm.org/>).
 
 =cut
diff --git a/docs/CommandGuide/index.html b/docs/CommandGuide/index.html
index 8854fbb..b839d3c 100644
--- a/docs/CommandGuide/index.html
+++ b/docs/CommandGuide/index.html
@@ -3,15 +3,15 @@
 <html>
 <head>
   <title>LLVM Command Guide</title>
-  <link rel="stylesheet" href="/docs/llvm.css" type="text/css">
+  <link rel="stylesheet" href="../llvm.css" type="text/css">
 </head>
 <body>
 
-<div class="doc_title">
+<h1>
   LLVM Command Guide
-</div>
+</h1>
 
-<div class="doc_text">
+<div>
 
 <p>These documents are HTML versions of the <a href="man/man1/">man pages</a>
 for all of the LLVM tools.  These pages describe how to use the LLVM commands
@@ -23,12 +23,12 @@ options) arguments to the tool you are interested in.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="basic">Basic Commands</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <ul>
 
@@ -80,12 +80,12 @@ options) arguments to the tool you are interested in.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="frontend">C and C++ Front-end Commands</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 <ul>
 
 <li><a href="/cmds/llvmgcc.html"><b>llvm-gcc</b></a> -
@@ -99,13 +99,13 @@ options) arguments to the tool you are interested in.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="debug">Debugging Tools</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
 
-<div class="doc_text">
+<div>
 
 <ul>
 
@@ -123,12 +123,12 @@ options) arguments to the tool you are interested in.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="internal">Internal Tools</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 <ul>
 
 <li><a href="/cmds/FileCheck.html"><b>FileCheck</b></a> -
@@ -150,7 +150,7 @@ options) arguments to the tool you are interested in.</p>
   <a href="http://validator.w3.org/check/referer"><img
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
-  <a href="http://llvm.org">LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 
diff --git a/docs/CommandGuide/lit.pod b/docs/CommandGuide/lit.pod
index 989a5d7..faf4811 100644
--- a/docs/CommandGuide/lit.pod
+++ b/docs/CommandGuide/lit.pod
@@ -349,6 +349,6 @@ L<valgrind(1)>
 
 =head1 AUTHOR
 
-Written by Daniel Dunbar and maintained by the LLVM Team (L<http://llvm.org>).
+Written by Daniel Dunbar and maintained by the LLVM Team (L<http://llvm.org/>).
 
 =cut
diff --git a/docs/CommandGuide/llc.pod b/docs/CommandGuide/llc.pod
index eb26ec0..50b45c8 100644
--- a/docs/CommandGuide/llc.pod
+++ b/docs/CommandGuide/llc.pod
@@ -196,6 +196,6 @@ L<lli|lli>
 
 =head1 AUTHORS
 
-Maintained by the LLVM Team (L<http://llvm.org>).
+Maintained by the LLVM Team (L<http://llvm.org/>).
 
 =cut
diff --git a/docs/CommandGuide/lli.pod b/docs/CommandGuide/lli.pod
index 52a2721..a313a31 100644
--- a/docs/CommandGuide/lli.pod
+++ b/docs/CommandGuide/lli.pod
@@ -214,6 +214,6 @@ L<llc|llc>
 
 =head1 AUTHOR
 
-Maintained by the LLVM Team (L<http://llvm.org>).
+Maintained by the LLVM Team (L<http://llvm.org/>).
 
 =cut
diff --git a/docs/CommandGuide/llvm-ar.pod b/docs/CommandGuide/llvm-ar.pod
index 63ba43f..a8f01b0 100644
--- a/docs/CommandGuide/llvm-ar.pod
+++ b/docs/CommandGuide/llvm-ar.pod
@@ -401,6 +401,6 @@ L<llvm-ranlib|llvm-ranlib>, ar(1)
 
 =head1 AUTHORS
 
-Maintained by the LLVM Team (L<http://llvm.org>).
+Maintained by the LLVM Team (L<http://llvm.org/>).
 
 =cut
diff --git a/docs/CommandGuide/llvm-as.pod b/docs/CommandGuide/llvm-as.pod
index 185c009..cc81887 100644
--- a/docs/CommandGuide/llvm-as.pod
+++ b/docs/CommandGuide/llvm-as.pod
@@ -72,6 +72,6 @@ L<llvm-dis|llvm-dis>, L<gccas|gccas>
 
 =head1 AUTHORS
 
-Maintained by the LLVM Team (L<http://llvm.org>).
+Maintained by the LLVM Team (L<http://llvm.org/>).
 
 =cut
diff --git a/docs/CommandGuide/llvm-bcanalyzer.pod b/docs/CommandGuide/llvm-bcanalyzer.pod
index b0bc0cd..9c5021b 100644
--- a/docs/CommandGuide/llvm-bcanalyzer.pod
+++ b/docs/CommandGuide/llvm-bcanalyzer.pod
@@ -268,7 +268,7 @@ The number of bytes consumed by instructions in the function.
 
 =item B<Average Instruction Size>
 
-The average number of bytes consumed by the instructions in the funtion. This
+The average number of bytes consumed by the instructions in the function. This
 value is computed by dividing Instruction Size by Instructions.
 
 =item B<Bytes Per Instruction>
@@ -310,6 +310,6 @@ L<llvm-dis|llvm-dis>, L<http://llvm.org/docs/BitCodeFormat.html>
 
 =head1 AUTHORS
 
-Maintained by the LLVM Team (L<http://llvm.org>).
+Maintained by the LLVM Team (L<http://llvm.org/>).
 
 =cut
diff --git a/docs/CommandGuide/llvm-config.pod b/docs/CommandGuide/llvm-config.pod
index 4e38dae..7d68564 100644
--- a/docs/CommandGuide/llvm-config.pod
+++ b/docs/CommandGuide/llvm-config.pod
@@ -126,6 +126,6 @@ occurs, it will exit with a non-zero value.
 
 =head1 AUTHORS
 
-Maintained by the LLVM Team (L<http://llvm.org>).
+Maintained by the LLVM Team (L<http://llvm.org/>).
 
 =cut
diff --git a/docs/CommandGuide/llvm-diff.pod b/docs/CommandGuide/llvm-diff.pod
index c8cfdb3..ffe0b48 100644
--- a/docs/CommandGuide/llvm-diff.pod
+++ b/docs/CommandGuide/llvm-diff.pod
@@ -48,6 +48,6 @@ massive detected differences in blocks.
 
 =head1 AUTHORS
 
-Maintained by the LLVM Team (L<http://llvm.org>).
+Maintained by the LLVM Team (L<http://llvm.org/>).
 
 =cut
diff --git a/docs/CommandGuide/llvm-dis.pod b/docs/CommandGuide/llvm-dis.pod
index 5b2f4ef..9f4026c 100644
--- a/docs/CommandGuide/llvm-dis.pod
+++ b/docs/CommandGuide/llvm-dis.pod
@@ -55,6 +55,6 @@ L<llvm-as|llvm-as>
 
 =head1 AUTHORS
 
-Maintained by the LLVM Team (L<http://llvm.org>).
+Maintained by the LLVM Team (L<http://llvm.org/>).
 
 =cut
diff --git a/docs/CommandGuide/llvm-extract.pod b/docs/CommandGuide/llvm-extract.pod
index d4baab7..797e79d 100644
--- a/docs/CommandGuide/llvm-extract.pod
+++ b/docs/CommandGuide/llvm-extract.pod
@@ -68,6 +68,6 @@ L<bugpoint|bugpoint>
 
 =head1 AUTHORS
 
-Maintained by the LLVM Team (L<http://llvm.org>).
+Maintained by the LLVM Team (L<http://llvm.org/>).
 
 =cut
diff --git a/docs/CommandGuide/llvm-ld.pod b/docs/CommandGuide/llvm-ld.pod
index 536ab0f..efa9ebd 100644
--- a/docs/CommandGuide/llvm-ld.pod
+++ b/docs/CommandGuide/llvm-ld.pod
@@ -229,6 +229,6 @@ L<llvm-link|llvm-link>
 
 =head1 AUTHORS
 
-Maintained by the LLVM Team (L<http://llvm.org>).
+Maintained by the LLVM Team (L<http://llvm.org/>).
 
 =cut
diff --git a/docs/CommandGuide/llvm-link.pod b/docs/CommandGuide/llvm-link.pod
index 8d06cc9..1e466a5 100644
--- a/docs/CommandGuide/llvm-link.pod
+++ b/docs/CommandGuide/llvm-link.pod
@@ -74,6 +74,6 @@ L<gccld|gccld>
 
 =head1 AUTHORS
 
-Maintained by the LLVM Team (L<http://llvm.org>).
+Maintained by the LLVM Team (L<http://llvm.org/>).
 
 =cut
diff --git a/docs/CommandGuide/llvm-nm.pod b/docs/CommandGuide/llvm-nm.pod
index a580d3f..a6dc490 100644
--- a/docs/CommandGuide/llvm-nm.pod
+++ b/docs/CommandGuide/llvm-nm.pod
@@ -117,6 +117,6 @@ L<llvm-dis|llvm-dis>, ar(1), nm(1)
 
 =head1 AUTHOR
 
-Maintained by the LLVM Team (L<http://llvm.org>).
+Maintained by the LLVM Team (L<http://llvm.org/>).
 
 =cut
diff --git a/docs/CommandGuide/llvm-prof.pod b/docs/CommandGuide/llvm-prof.pod
index 9541b05..4b2e09d 100644
--- a/docs/CommandGuide/llvm-prof.pod
+++ b/docs/CommandGuide/llvm-prof.pod
@@ -52,6 +52,6 @@ information. Otherwise, it exits with zero.
 
 =head1 AUTHOR
 
-B<llvm-prof> is maintained by the LLVM Team (L<http://llvm.org>).
+B<llvm-prof> is maintained by the LLVM Team (L<http://llvm.org/>).
 
 =cut
diff --git a/docs/CommandGuide/llvm-ranlib.pod b/docs/CommandGuide/llvm-ranlib.pod
index 53cd34b..431bc55 100644
--- a/docs/CommandGuide/llvm-ranlib.pod
+++ b/docs/CommandGuide/llvm-ranlib.pod
@@ -47,6 +47,6 @@ L<llvm-ar|llvm-ar>, ranlib(1)
 
 =head1 AUTHORS
 
-Maintained by the LLVM Team (L<http://llvm.org>).
+Maintained by the LLVM Team (L<http://llvm.org/>).
 
 =cut
diff --git a/docs/CommandGuide/llvmc.pod b/docs/CommandGuide/llvmc.pod
index d237ca4..95a9e5e 100644
--- a/docs/CommandGuide/llvmc.pod
+++ b/docs/CommandGuide/llvmc.pod
@@ -185,6 +185,6 @@ L<llvm-dis|llvm-dis>, L<llc|llc>, L<llvm-link|llvm-link>
 
 =head1 AUTHORS
 
-Maintained by the LLVM Team (L<http://llvm.org>).
+Maintained by the LLVM Team (L<http://llvm.org/>).
 
 =cut
diff --git a/docs/CommandGuide/llvmgcc.pod b/docs/CommandGuide/llvmgcc.pod
index 9892ca7..30af0a0 100644
--- a/docs/CommandGuide/llvmgcc.pod
+++ b/docs/CommandGuide/llvmgcc.pod
@@ -70,7 +70,7 @@ L<llvm-g++|llvmgxx>
 
 =head1 AUTHORS
 
-Maintained by the LLVM Team (L<http://llvm.org>).
+Maintained by the LLVM Team (L<http://llvm.org/>).
 
 =cut
 
diff --git a/docs/CommandGuide/llvmgxx.pod b/docs/CommandGuide/llvmgxx.pod
index 64b670e..1ea3d49 100644
--- a/docs/CommandGuide/llvmgxx.pod
+++ b/docs/CommandGuide/llvmgxx.pod
@@ -79,7 +79,7 @@ L<llvm-gcc|llvmgcc>
 
 =head1 AUTHORS
 
-Maintained by the LLVM Team (L<http://llvm.org>).
+Maintained by the LLVM Team (L<http://llvm.org/>).
 
 =cut
 
diff --git a/docs/CommandGuide/opt.pod b/docs/CommandGuide/opt.pod
index d1d1db5..f5f4968 100644
--- a/docs/CommandGuide/opt.pod
+++ b/docs/CommandGuide/opt.pod
@@ -138,6 +138,6 @@ occurs, it will exit with a non-zero value.
 
 =head1 AUTHORS
 
-Maintained by the LLVM Team (L<http://llvm.org>).
+Maintained by the LLVM Team (L<http://llvm.org/>).
 
 =cut
diff --git a/docs/CommandGuide/tblgen.pod b/docs/CommandGuide/tblgen.pod
index d127492..fe1be5e 100644
--- a/docs/CommandGuide/tblgen.pod
+++ b/docs/CommandGuide/tblgen.pod
@@ -110,6 +110,6 @@ occurs, it will exit with a non-zero value.
 
 =head1 AUTHORS
 
-Maintained by The LLVM Team (L<http://llvm.org>).
+Maintained by The LLVM Team (L<http://llvm.org/>).
 
 =cut
diff --git a/docs/CommandLine.html b/docs/CommandLine.html
index 47ab2cc..179e8e6 100644
--- a/docs/CommandLine.html
+++ b/docs/CommandLine.html
@@ -8,9 +8,9 @@
 </head>
 <body>
 
-<div class="doc_title">
+<h1>
   CommandLine 2.0 Library Manual
-</div>
+</h1>
 
 <ol>
   <li><a href="#introduction">Introduction</a></li>
@@ -100,12 +100,12 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="introduction">Introduction</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This document describes the CommandLine argument processing library.  It will
 show you how to use it, and what it can do.  The CommandLine library uses a
@@ -184,12 +184,12 @@ href="mailto:sabre@nondot.org">Chris Lattner</a>.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="quickstart">Quick Start Guide</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This section of the manual runs through a simple CommandLine'ification of a
 basic compiler tool.  This is intended to show you how to jump into using the
@@ -231,11 +231,11 @@ represented like this:</p>
 <a href="#cl::opt">cl::opt</a>&lt;string&gt; OutputFilename("<i>o</i>", <a href="#cl::desc">cl::desc</a>("<i>Specify output filename</i>"), <a href="#cl::value_desc">cl::value_desc</a>("<i>filename</i>"));
 </pre></div>
 
-<p>This declares a global variable "<tt>OutputFilename</tt>" that is used to
-capture the result of the "<tt>o</tt>" argument (first parameter).  We specify
-that this is a simple scalar option by using the "<tt><a
-href="#cl::opt">cl::opt</a></tt>" template (as opposed to the <a
-href="#list">"<tt>cl::list</tt> template</a>), and tell the CommandLine library
+<p>This declares a global variable &quot;<tt>OutputFilename</tt>&quot; that is used to
+capture the result of the &quot;<tt>o</tt>&quot; argument (first parameter).  We specify
+that this is a simple scalar option by using the &quot;<tt><a
+href="#cl::opt">cl::opt</a></tt>&quot; template (as opposed to the <a
+href="#list">&quot;<tt>cl::list</tt> template</a>), and tell the CommandLine library
 that the data type that we are parsing is a string.</p>
 
 <p>The second and third parameters (which are optional) are used to specify what
@@ -321,14 +321,12 @@ OPTIONS:
 
 <p>... indicating that an input filename is expected.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="bool">Boolean Arguments</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>In addition to input and output filenames, we would like the compiler example
 to support three boolean flags: "<tt>-f</tt>" to force writing binary output to
@@ -406,11 +404,11 @@ and <a href="#list">lists</a> of options.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="alias">Argument Aliases</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>So far, the example works well, except for the fact that we need to check the
 quiet condition like this now:</p>
@@ -456,12 +454,12 @@ uses.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="onealternative">Selecting an alternative from a set of
   possibilities</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>So far we have seen how the CommandLine library handles builtin types like
 <tt>std::string</tt>, <tt>bool</tt> and <tt>int</tt>, but how does it handle
@@ -567,11 +565,11 @@ which is when you would use it.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="namedalternatives">Named Alternatives</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Another useful argument form is a named alternative style.  We shall use this
 style in our compiler to specify different debug levels that can be used.
@@ -629,11 +627,11 @@ that you can choose the form most appropriate for your application.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="list">Parsing a list of options</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Now that we have the standard run-of-the-mill argument types out of the way,
 lets get a little wild and crazy.  Lets say that we want our optimizer to accept
@@ -699,11 +697,11 @@ checking we have to do.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="bits">Collecting options as a set of flags</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Instead of collecting sets of options in a list, it is also possible to
 gather information for enum values in a <b>bit vector</b>.  The representation used by
@@ -758,11 +756,11 @@ href="#list"> <tt>cl::list</tt></a> option.</p>
 
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="description">Adding freeform text to help output</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>As our program grows and becomes more mature, we may decide to put summary
 information about what it does into the help output.  The help output is styled
@@ -800,28 +798,27 @@ OPTIONS:
 
 </div>
 
+</div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="referenceguide">Reference Guide</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Now that you know the basics of how to use the CommandLine library, this
 section will give you the detailed information you need to tune how command line
 options work, as well as information on more "advanced" command line option
 processing capabilities.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="positional">Positional Arguments</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Positional arguments are those arguments that are not named, and are not
 specified with a hyphen.  Positional arguments should be used when an option is
@@ -854,15 +851,12 @@ that command line options will be ordered according to how they are listed in a
 are defined in multiple .cpp files.  The fix for this problem is simply to
 define all of your positional arguments in one .cpp file.</p>
 
-</div>
-
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="--">Specifying positional options with hyphens</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Sometimes you may want to specify a value to your positional argument that
 starts with a hyphen (for example, searching for '<tt>-foo</tt>' in a file).  At
@@ -895,10 +889,10 @@ can use it like this:</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="getPosition">Determining absolute position with getPosition()</a>
-</div>
-<div class="doc_text">
+</h4>
+<div>
   <p>Sometimes an option can affect or modify the meaning of another option. For
   example, consider <tt>gcc</tt>'s <tt>-x LANG</tt> option. This tells
   <tt>gcc</tt> to ignore the suffix of subsequent positional arguments and force
@@ -954,11 +948,11 @@ can use it like this:</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="cl::ConsumeAfter">The <tt>cl::ConsumeAfter</tt> modifier</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>cl::ConsumeAfter</tt> <a href="#formatting">formatting option</a> is
 used to construct programs that use "interpreter style" option processing.  With
@@ -1006,12 +1000,14 @@ href="#cl::list">cl::list</a> option.</p>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="storage">Internal vs External Storage</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>By default, all command line options automatically hold the value that they
 parse from the command line.  This is very convenient in the common case,
@@ -1076,11 +1072,11 @@ that <tt>DebugFlag</tt> is automatically set.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="attributes">Option Attributes</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>This section describes the basic attributes that you can specify on
 options.</p>
@@ -1166,11 +1162,11 @@ obviously).</li>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="modifiers">Option Modifiers</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Option modifiers are the flags and expressions that you pass into the
 constructors for <tt><a href="#cl::opt">cl::opt</a></tt> and <tt><a
@@ -1196,14 +1192,12 @@ category.  The CommandLine library specifies defaults for all of these settings
 that are the most useful in practice and the most common, which mean that you
 usually shouldn't have to worry about these.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="hiding">Hiding an option from <tt>-help</tt> output</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>cl::NotHidden</tt>, <tt>cl::Hidden</tt>, and
 <tt>cl::ReallyHidden</tt> modifiers are used to control whether or not an option
@@ -1230,12 +1224,12 @@ indicates that the option should not appear in any help output.</li>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="numoccurrences">Controlling the number of occurrences required and
   allowed</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>This group of options is used to control how many time an option is allowed
 (or required) to be specified on the command line of your program.  Specifying a
@@ -1279,11 +1273,11 @@ retained.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="valrequired">Controlling whether or not a value must be specified</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>This group of options is used to control whether or not the option allows a
 value to be present.  In the case of the CommandLine library, a value is either
@@ -1328,11 +1322,11 @@ when <a href="#extensionguide">extending the library</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="formatting">Controlling other formatting options</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The formatting option group is used to specify that the command line option
 has special abilities and is otherwise different from other command line
@@ -1409,11 +1403,11 @@ strategy basically looks like this:</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="misc">Miscellaneous option modifiers</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The miscellaneous option modifiers are the only flags where you can specify
 more than one flag from the set: they are not mutually exclusive.  These flags
@@ -1453,11 +1447,11 @@ only makes sense with a <a href="#cl::list">cl::list</a> option.</li>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="response">Response files</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Some systems, such as certain variants of Microsoft Windows and
 some older Unices have a relatively low limit on command-line
@@ -1474,13 +1468,14 @@ and
 
 </div>
 
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="toplevel">Top-Level Classes and Functions</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Despite all of the built-in flexibility, the CommandLine option library
 really only consists of one function (<a
@@ -1490,15 +1485,13 @@ href="#cl::list"><tt>cl::list</tt></a>, and <a
 href="#cl::alias"><tt>cl::alias</tt></a>.  This section describes these three
 classes in detail.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="cl::ParseCommandLineOptions">The <tt>cl::ParseCommandLineOptions</tt>
   function</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>cl::ParseCommandLineOptions</tt> function is designed to be called
 directly from <tt>main</tt>, and is used to fill in the values of all of the
@@ -1514,12 +1507,12 @@ which holds <a href="#description">additional extra text</a> to emit when the
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="cl::ParseEnvironmentOptions">The <tt>cl::ParseEnvironmentOptions</tt>
   function</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>cl::ParseEnvironmentOptions</tt> function has mostly the same effects
 as <a
@@ -1551,12 +1544,12 @@ input.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="cl::SetVersionPrinter">The <tt>cl::SetVersionPrinter</tt>
   function</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>cl::SetVersionPrinter</tt> function is designed to be called
 directly from <tt>main</tt> and <i>before</i>
@@ -1572,11 +1565,11 @@ called when the <tt>--version</tt> option is given by the user.</p>
 
 </div>
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="cl::opt">The <tt>cl::opt</tt> class</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>cl::opt</tt> class is the class used to represent scalar command line
 options, and is the one used most of the time.  It is a templated class which
@@ -1607,11 +1600,11 @@ href="#customparser">custom parser</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="cl::list">The <tt>cl::list</tt> class</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>cl::list</tt> class is the class used to represent a list of command
 line options.  It too is a templated class which can take up to three
@@ -1634,11 +1627,11 @@ be used.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="cl::bits">The <tt>cl::bits</tt> class</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>cl::bits</tt> class is the class used to represent a list of command
 line options in the form of a bit vector.  It is also a templated class which
@@ -1659,11 +1652,11 @@ must be of <b>type</b> <tt>unsigned</tt> if external storage is used.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="cl::alias">The <tt>cl::alias</tt> class</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>cl::alias</tt> class is a nontemplated class that is used to form
 aliases for other arguments.</p>
@@ -1682,11 +1675,11 @@ the conversion from string to data.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="cl::extrahelp">The <tt>cl::extrahelp</tt> class</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>cl::extrahelp</tt> class is a nontemplated class that allows extra
 help text to be printed out for the <tt>-help</tt> option.</p>
@@ -1709,12 +1702,14 @@ single <tt>cl::extrahelp</tt> instance.</p>
 </pre></div>
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="builtinparsers">Builtin parsers</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Parsers control how the string value taken from the command line is
 translated into a typed value, suitable for use in a C++ program.  By default,
@@ -1773,27 +1768,27 @@ exponential notation (ex: <tt>1.7e15</tt>) and properly supports locales.
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="extensionguide">Extension Guide</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Although the CommandLine library has a lot of functionality built into it
 already (as discussed previously), one of its true strengths lie in its
 extensibility.  This section discusses how the CommandLine library works under
 the covers and illustrates how to do some simple, common, extensions.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="customparser">Writing a custom parser</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>One of the simplest and most common extensions is the use of a custom parser.
 As <a href="#builtinparsers">discussed previously</a>, parsers are the portion
@@ -1932,11 +1927,11 @@ tutorial.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="explotingexternal">Exploiting external storage</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
   <p>Several of the LLVM libraries define static <tt>cl::opt</tt> instances that
   will automatically be included in any program that links with that library.
   This is a feature. However, sometimes it is necessary to know the value of the
@@ -1951,16 +1946,18 @@ tutorial.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="dynamicopts">Dynamically adding command line options</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>TODO: fill in this section</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
 
 <hr>
@@ -1971,7 +1968,7 @@ tutorial.</p>
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 
diff --git a/docs/CompilerDriver.html b/docs/CompilerDriver.html
index 3c82e2b..0f5d359 100644
--- a/docs/CompilerDriver.html
+++ b/docs/CompilerDriver.html
@@ -3,7 +3,7 @@
 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
-<meta name="generator" content="Docutils 0.5: http://docutils.sourceforge.net/" />
+<meta name="generator" content="Docutils 0.6: http://docutils.sourceforge.net/" />
 <title>Customizing LLVMC: Reference Manual</title>
 <link rel="stylesheet" href="llvm.css" type="text/css" />
 </head>
@@ -17,28 +17,23 @@ The ReST source lives in the directory 'tools/llvmc/doc'. -->
 <div class="contents topic" id="contents">
 <p class="topic-title first">Contents</p>
 <ul class="simple">
-<li><a class="reference internal" href="#introduction" id="id8">Introduction</a></li>
-<li><a class="reference internal" href="#compiling-with-llvmc" id="id9">Compiling with LLVMC</a></li>
-<li><a class="reference internal" href="#predefined-options" id="id10">Predefined options</a></li>
-<li><a class="reference internal" href="#compiling-llvmc-plugins" id="id11">Compiling LLVMC plugins</a></li>
-<li><a class="reference internal" href="#compiling-standalone-llvmc-based-drivers" id="id12">Compiling standalone LLVMC-based drivers</a></li>
-<li><a class="reference internal" href="#customizing-llvmc-the-compilation-graph" id="id13">Customizing LLVMC: the compilation graph</a></li>
-<li><a class="reference internal" href="#describing-options" id="id14">Describing options</a><ul>
-<li><a class="reference internal" href="#external-options" id="id15">External options</a></li>
+<li><a class="reference internal" href="#introduction" id="id7">Introduction</a></li>
+<li><a class="reference internal" href="#compiling-with-llvmc" id="id8">Compiling with <tt class="docutils literal">llvmc</tt></a></li>
+<li><a class="reference internal" href="#predefined-options" id="id9">Predefined options</a></li>
+<li><a class="reference internal" href="#compiling-llvmc-based-drivers" id="id10">Compiling LLVMC-based drivers</a></li>
+<li><a class="reference internal" href="#customizing-llvmc-the-compilation-graph" id="id11">Customizing LLVMC: the compilation graph</a></li>
+<li><a class="reference internal" href="#describing-options" id="id12">Describing options</a></li>
+<li><a class="reference internal" href="#conditional-evaluation" id="id13">Conditional evaluation</a></li>
+<li><a class="reference internal" href="#writing-a-tool-description" id="id14">Writing a tool description</a><ul>
+<li><a class="reference internal" href="#id4" id="id15">Actions</a></li>
 </ul>
 </li>
-<li><a class="reference internal" href="#conditional-evaluation" id="id16">Conditional evaluation</a></li>
-<li><a class="reference internal" href="#writing-a-tool-description" id="id17">Writing a tool description</a><ul>
-<li><a class="reference internal" href="#id5" id="id18">Actions</a></li>
-</ul>
-</li>
-<li><a class="reference internal" href="#language-map" id="id19">Language map</a></li>
-<li><a class="reference internal" href="#option-preprocessor" id="id20">Option preprocessor</a></li>
-<li><a class="reference internal" href="#more-advanced-topics" id="id21">More advanced topics</a><ul>
-<li><a class="reference internal" href="#hooks-and-environment-variables" id="id22">Hooks and environment variables</a></li>
-<li><a class="reference internal" href="#how-plugins-are-loaded" id="id23">How plugins are loaded</a></li>
-<li><a class="reference internal" href="#debugging" id="id24">Debugging</a></li>
-<li><a class="reference internal" href="#conditioning-on-the-executable-name" id="id25">Conditioning on the executable name</a></li>
+<li><a class="reference internal" href="#language-map" id="id16">Language map</a></li>
+<li><a class="reference internal" href="#option-preprocessor" id="id17">Option preprocessor</a></li>
+<li><a class="reference internal" href="#more-advanced-topics" id="id18">More advanced topics</a><ul>
+<li><a class="reference internal" href="#hooks-and-environment-variables" id="id19">Hooks and environment variables</a></li>
+<li><a class="reference internal" href="#debugging" id="id20">Debugging</a></li>
+<li><a class="reference internal" href="#conditioning-on-the-executable-name" id="id21">Conditioning on the executable name</a></li>
 </ul>
 </li>
 </ul>
@@ -46,25 +41,24 @@ The ReST source lives in the directory 'tools/llvmc/doc'. -->
 <div class="doc_author">
 <p>Written by <a href="mailto:foldr@codedgers.com">Mikhail Glushenkov</a></p>
 </div><div class="section" id="introduction">
-<h1><a class="toc-backref" href="#id8">Introduction</a></h1>
+<h1><a class="toc-backref" href="#id7">Introduction</a></h1>
 <p>LLVMC is a generic compiler driver, designed to be customizable and
-extensible. It plays the same role for LLVM as the <tt class="docutils literal"><span class="pre">gcc</span></tt> program
-does for GCC - LLVMC's job is essentially to transform a set of input
-files into a set of targets depending on configuration rules and user
-options. What makes LLVMC different is that these transformation rules
-are completely customizable - in fact, LLVMC knows nothing about the
-specifics of transformation (even the command-line options are mostly
-not hard-coded) and regards the transformation structure as an
-abstract graph. The structure of this graph is completely determined
-by plugins, which can be either statically or dynamically linked. This
-makes it possible to easily adapt LLVMC for other purposes - for
-example, as a build tool for game resources.</p>
+extensible. It plays the same role for LLVM as the <tt class="docutils literal">gcc</tt> program does for
+GCC - LLVMC's job is essentially to transform a set of input files into a set of
+targets depending on configuration rules and user options. What makes LLVMC
+different is that these transformation rules are completely customizable - in
+fact, LLVMC knows nothing about the specifics of transformation (even the
+command-line options are mostly not hard-coded) and regards the transformation
+structure as an abstract graph. The structure of this graph is described in
+high-level TableGen code, from which an efficient C++ representation is
+automatically derived. This makes it possible to adapt LLVMC for other
+purposes - for example, as a build tool for game resources.</p>
 <p>Because LLVMC employs <a class="reference external" href="http://llvm.org/docs/TableGenFundamentals.html">TableGen</a> as its configuration language, you
 need to be familiar with it to customize LLVMC.</p>
 </div>
 <div class="section" id="compiling-with-llvmc">
-<h1><a class="toc-backref" href="#id9">Compiling with LLVMC</a></h1>
-<p>LLVMC tries hard to be as compatible with <tt class="docutils literal"><span class="pre">gcc</span></tt> as possible,
+<h1><a class="toc-backref" href="#id8">Compiling with <tt class="docutils literal">llvmc</tt></a></h1>
+<p>LLVMC tries hard to be as compatible with <tt class="docutils literal">gcc</tt> as possible,
 although there are some small differences. Most of the time, however,
 you shouldn't be able to notice them:</p>
 <pre class="literal-block">
@@ -74,11 +68,11 @@ $ ./a.out
 hello
 </pre>
 <p>One nice feature of LLVMC is that one doesn't have to distinguish between
-different compilers for different languages (think <tt class="docutils literal"><span class="pre">g++</span></tt> vs.  <tt class="docutils literal"><span class="pre">gcc</span></tt>) - the
+different compilers for different languages (think <tt class="docutils literal">g++</tt> vs.  <tt class="docutils literal">gcc</tt>) - the
 right toolchain is chosen automatically based on input language names (which
 are, in turn, determined from file extensions). If you want to force files
 ending with &quot;.c&quot; to compile as C++, use the <tt class="docutils literal"><span class="pre">-x</span></tt> option, just like you would
-do it with <tt class="docutils literal"><span class="pre">gcc</span></tt>:</p>
+do it with <tt class="docutils literal">gcc</tt>:</p>
 <pre class="literal-block">
 $ # hello.c is really a C++ file
 $ llvmc -x c++ hello.c
@@ -97,138 +91,100 @@ $ ./a.out
 hello
 </pre>
 <p>By default, LLVMC uses <tt class="docutils literal"><span class="pre">llvm-gcc</span></tt> to compile the source code. It is also
-possible to choose the <tt class="docutils literal"><span class="pre">clang</span></tt> compiler with the <tt class="docutils literal"><span class="pre">-clang</span></tt> option.</p>
+possible to choose the <tt class="docutils literal">clang</tt> compiler with the <tt class="docutils literal"><span class="pre">-clang</span></tt> option.</p>
 </div>
 <div class="section" id="predefined-options">
-<h1><a class="toc-backref" href="#id10">Predefined options</a></h1>
-<p>LLVMC has some built-in options that can't be overridden in the
-configuration libraries:</p>
+<h1><a class="toc-backref" href="#id9">Predefined options</a></h1>
+<p>LLVMC has some built-in options that can't be overridden in the TableGen code:</p>
 <ul class="simple">
-<li><tt class="docutils literal"><span class="pre">-o</span> <span class="pre">FILE</span></tt> - Output file name.</li>
-<li><tt class="docutils literal"><span class="pre">-x</span> <span class="pre">LANGUAGE</span></tt> - Specify the language of the following input files
+<li><tt class="docutils literal"><span class="pre">-o</span> FILE</tt> - Output file name.</li>
+<li><tt class="docutils literal"><span class="pre">-x</span> LANGUAGE</tt> - Specify the language of the following input files
 until the next -x option.</li>
-<li><tt class="docutils literal"><span class="pre">-load</span> <span class="pre">PLUGIN_NAME</span></tt> - Load the specified plugin DLL. Example:
-<tt class="docutils literal"><span class="pre">-load</span> <span class="pre">$LLVM_DIR/Release/lib/LLVMCSimple.so</span></tt>.</li>
 <li><tt class="docutils literal"><span class="pre">-v</span></tt> - Enable verbose mode, i.e. print out all executed commands.</li>
 <li><tt class="docutils literal"><span class="pre">--save-temps</span></tt> - Write temporary files to the current directory and do not
 delete them on exit. This option can also take an argument: the
 <tt class="docutils literal"><span class="pre">--save-temps=obj</span></tt> switch will write files into the directory specified with
 the <tt class="docutils literal"><span class="pre">-o</span></tt> option. The <tt class="docutils literal"><span class="pre">--save-temps=cwd</span></tt> and <tt class="docutils literal"><span class="pre">--save-temps</span></tt> switches are
 both synonyms for the default behaviour.</li>
-<li><tt class="docutils literal"><span class="pre">--temp-dir</span> <span class="pre">DIRECTORY</span></tt> - Store temporary files in the given directory. This
+<li><tt class="docutils literal"><span class="pre">--temp-dir</span> DIRECTORY</tt> - Store temporary files in the given directory. This
 directory is deleted on exit unless <tt class="docutils literal"><span class="pre">--save-temps</span></tt> is specified. If
 <tt class="docutils literal"><span class="pre">--save-temps=obj</span></tt> is also specified, <tt class="docutils literal"><span class="pre">--temp-dir</span></tt> is given the
 precedence.</li>
 <li><tt class="docutils literal"><span class="pre">--check-graph</span></tt> - Check the compilation for common errors like mismatched
-output/input language names, multiple default edges and cycles. Because of
-plugins, these checks can't be performed at compile-time. Exit with code zero
-if no errors were found, and return the number of found errors
-otherwise. Hidden option, useful for debugging LLVMC plugins.</li>
+output/input language names, multiple default edges and cycles. Exit with code
+zero if no errors were found, and return the number of found errors
+otherwise. Hidden option, useful for debugging.</li>
 <li><tt class="docutils literal"><span class="pre">--view-graph</span></tt> - Show a graphical representation of the compilation graph
-and exit. Requires that you have <tt class="docutils literal"><span class="pre">dot</span></tt> and <tt class="docutils literal"><span class="pre">gv</span></tt> programs installed. Hidden
-option, useful for debugging LLVMC plugins.</li>
+and exit. Requires that you have <tt class="docutils literal">dot</tt> and <tt class="docutils literal">gv</tt> programs installed. Hidden
+option, useful for debugging.</li>
 <li><tt class="docutils literal"><span class="pre">--write-graph</span></tt> - Write a <tt class="docutils literal"><span class="pre">compilation-graph.dot</span></tt> file in the current
 directory with the compilation graph description in Graphviz format (identical
 to the file used by the <tt class="docutils literal"><span class="pre">--view-graph</span></tt> option). The <tt class="docutils literal"><span class="pre">-o</span></tt> option can be
-used to set the output file name. Hidden option, useful for debugging LLVMC
-plugins.</li>
-<li><tt class="docutils literal"><span class="pre">-help</span></tt>, <tt class="docutils literal"><span class="pre">-help-hidden</span></tt>, <tt class="docutils literal"><span class="pre">--version</span></tt> - These options have
+used to set the output file name. Hidden option, useful for debugging.</li>
+<li><tt class="docutils literal"><span class="pre">--help</span></tt>, <tt class="docutils literal"><span class="pre">--help-hidden</span></tt>, <tt class="docutils literal"><span class="pre">--version</span></tt> - These options have
 their standard meaning.</li>
 </ul>
 </div>
-<div class="section" id="compiling-llvmc-plugins">
-<h1><a class="toc-backref" href="#id11">Compiling LLVMC plugins</a></h1>
-<p>It's easiest to start working on your own LLVMC plugin by copying the
-skeleton project which lives under <tt class="docutils literal"><span class="pre">$LLVMC_DIR/plugins/Simple</span></tt>:</p>
+<div class="section" id="compiling-llvmc-based-drivers">
+<h1><a class="toc-backref" href="#id10">Compiling LLVMC-based drivers</a></h1>
+<p>It's easiest to start working on your own LLVMC driver by copying the skeleton
+project which lives under <tt class="docutils literal">$LLVMC_DIR/examples/Skeleton</tt>:</p>
 <pre class="literal-block">
-$ cd $LLVMC_DIR/plugins
-$ cp -r Simple MyPlugin
-$ cd MyPlugin
+$ cd $LLVMC_DIR/examples
+$ cp -r Skeleton MyDriver
+$ cd MyDriver
 $ ls
-Makefile PluginMain.cpp Simple.td
-</pre>
-<p>As you can see, our basic plugin consists of only two files (not
-counting the build script). <tt class="docutils literal"><span class="pre">Simple.td</span></tt> contains TableGen
-description of the compilation graph; its format is documented in the
-following sections. <tt class="docutils literal"><span class="pre">PluginMain.cpp</span></tt> is just a helper file used to
-compile the auto-generated C++ code produced from TableGen source. It
-can also contain hook definitions (see <a class="reference internal" href="#hooks">below</a>).</p>
-<p>The first thing that you should do is to change the <tt class="docutils literal"><span class="pre">LLVMC_PLUGIN</span></tt>
-variable in the <tt class="docutils literal"><span class="pre">Makefile</span></tt> to avoid conflicts (since this variable
-is used to name the resulting library):</p>
-<pre class="literal-block">
-LLVMC_PLUGIN=MyPlugin
+AutoGenerated.td  Hooks.cpp  Main.cpp  Makefile
 </pre>
-<p>It is also a good idea to rename <tt class="docutils literal"><span class="pre">Simple.td</span></tt> to something less
-generic:</p>
+<p>As you can see, our basic driver consists of only three files (not counting the
+build script). <tt class="docutils literal">AutoGenerated.td</tt> contains TableGen description of the
+compilation graph; its format is documented in the following
+sections. <tt class="docutils literal">Hooks.cpp</tt> is an empty file that should be used for hook
+definitions (see <a class="reference internal" href="#hooks">below</a>). <tt class="docutils literal">Main.cpp</tt> is just a helper used to compile the
+auto-generated C++ code produced from TableGen source.</p>
+<p>The first thing that you should do is to change the <tt class="docutils literal">LLVMC_BASED_DRIVER</tt>
+variable in the <tt class="docutils literal">Makefile</tt>:</p>
 <pre class="literal-block">
-$ mv Simple.td MyPlugin.td
+LLVMC_BASED_DRIVER=MyDriver
 </pre>
-<p>To build your plugin as a dynamic library, just <tt class="docutils literal"><span class="pre">cd</span></tt> to its source
-directory and run <tt class="docutils literal"><span class="pre">make</span></tt>. The resulting file will be called
-<tt class="docutils literal"><span class="pre">plugin_llvmc_$(LLVMC_PLUGIN).$(DLL_EXTENSION)</span></tt> (in our case,
-<tt class="docutils literal"><span class="pre">plugin_llvmc_MyPlugin.so</span></tt>). This library can be then loaded in with the
-<tt class="docutils literal"><span class="pre">-load</span></tt> option. Example:</p>
-<pre class="literal-block">
-$ cd $LLVMC_DIR/plugins/Simple
-$ make
-$ llvmc -load $LLVM_DIR/Release/lib/plugin_llvmc_Simple.so
-</pre>
-</div>
-<div class="section" id="compiling-standalone-llvmc-based-drivers">
-<h1><a class="toc-backref" href="#id12">Compiling standalone LLVMC-based drivers</a></h1>
-<p>By default, the <tt class="docutils literal"><span class="pre">llvmc</span></tt> executable consists of a driver core plus several
-statically linked plugins (<tt class="docutils literal"><span class="pre">Base</span></tt> and <tt class="docutils literal"><span class="pre">Clang</span></tt> at the moment). You can
-produce a standalone LLVMC-based driver executable by linking the core with your
-own plugins. The recommended way to do this is by starting with the provided
-<tt class="docutils literal"><span class="pre">Skeleton</span></tt> example (<tt class="docutils literal"><span class="pre">$LLVMC_DIR/example/Skeleton</span></tt>):</p>
+<p>It can also be a good idea to put your TableGen code into a file with a less
+generic name:</p>
 <pre class="literal-block">
-$ cd $LLVMC_DIR/example/
-$ cp -r Skeleton mydriver
-$ cd mydriver
-$ vim Makefile
+$ touch MyDriver.td
+$ vim AutoGenerated.td
 [...]
-$ make
+include &quot;MyDriver.td&quot;
 </pre>
+<p>If you have more than one TableGen source file, they all should be included from
+<tt class="docutils literal">AutoGenerated.td</tt>, since this file is used by the build system to generate
+C++ code.</p>
+<p>To build your driver, just <tt class="docutils literal">cd</tt> to its source directory and run <tt class="docutils literal">make</tt>. The
+resulting executable will be put into <tt class="docutils literal"><span class="pre">$LLVM_OBJ_DIR/$(BuildMode)/bin</span></tt>.</p>
 <p>If you're compiling LLVM with different source and object directories, then you
-must perform the following additional steps before running <tt class="docutils literal"><span class="pre">make</span></tt>:</p>
+must perform the following additional steps before running <tt class="docutils literal">make</tt>:</p>
 <pre class="literal-block">
 # LLVMC_SRC_DIR = $LLVM_SRC_DIR/tools/llvmc/
 # LLVMC_OBJ_DIR = $LLVM_OBJ_DIR/tools/llvmc/
-$ cp $LLVMC_SRC_DIR/example/mydriver/Makefile \
-  $LLVMC_OBJ_DIR/example/mydriver/
-$ cd $LLVMC_OBJ_DIR/example/mydriver
+$ mkdir $LLVMC_OBJ_DIR/examples/MyDriver/
+$ cp $LLVMC_SRC_DIR/examples/MyDriver/Makefile \
+  $LLVMC_OBJ_DIR/examples/MyDriver/
+$ cd $LLVMC_OBJ_DIR/examples/MyDriver
 $ make
 </pre>
-<p>Another way to do the same thing is by using the following command:</p>
-<pre class="literal-block">
-$ cd $LLVMC_DIR
-$ make LLVMC_BUILTIN_PLUGINS=MyPlugin LLVMC_BASED_DRIVER_NAME=mydriver
-</pre>
-<p>This works with both srcdir == objdir and srcdir != objdir, but assumes that the
-plugin source directory was placed under <tt class="docutils literal"><span class="pre">$LLVMC_DIR/plugins</span></tt>.</p>
-<p>Sometimes, you will want a 'bare-bones' version of LLVMC that has no
-built-in plugins. It can be compiled with the following command:</p>
-<pre class="literal-block">
-$ cd $LLVMC_DIR
-$ make LLVMC_BUILTIN_PLUGINS=&quot;&quot;
-</pre>
 </div>
 <div class="section" id="customizing-llvmc-the-compilation-graph">
-<h1><a class="toc-backref" href="#id13">Customizing LLVMC: the compilation graph</a></h1>
-<p>Each TableGen configuration file should include the common
-definitions:</p>
+<h1><a class="toc-backref" href="#id11">Customizing LLVMC: the compilation graph</a></h1>
+<p>Each TableGen configuration file should include the common definitions:</p>
 <pre class="literal-block">
 include &quot;llvm/CompilerDriver/Common.td&quot;
 </pre>
-<p>Internally, LLVMC stores information about possible source
-transformations in form of a graph. Nodes in this graph represent
-tools, and edges between two nodes represent a transformation path. A
-special &quot;root&quot; node is used to mark entry points for the
-transformations. LLVMC also assigns a weight to each edge (more on
-this later) to choose between several alternative edges.</p>
-<p>The definition of the compilation graph (see file
-<tt class="docutils literal"><span class="pre">plugins/Base/Base.td</span></tt> for an example) is just a list of edges:</p>
+<p>Internally, LLVMC stores information about possible source transformations in
+form of a graph. Nodes in this graph represent tools, and edges between two
+nodes represent a transformation path. A special &quot;root&quot; node is used to mark
+entry points for the transformations. LLVMC also assigns a weight to each edge
+(more on this later) to choose between several alternative edges.</p>
+<p>The definition of the compilation graph (see file <tt class="docutils literal">llvmc/src/Base.td</tt> for an
+example) is just a list of edges:</p>
 <pre class="literal-block">
 def CompilationGraph : CompilationGraph&lt;[
     Edge&lt;&quot;root&quot;, &quot;llvm_gcc_c&quot;&gt;,
@@ -253,39 +209,33 @@ def CompilationGraph : CompilationGraph&lt;[
 
     ]&gt;;
 </pre>
-<p>As you can see, the edges can be either default or optional, where
-optional edges are differentiated by an additional <tt class="docutils literal"><span class="pre">case</span></tt> expression
-used to calculate the weight of this edge. Notice also that we refer
-to tools via their names (as strings). This makes it possible to add
-edges to an existing compilation graph in plugins without having to
-know about all tool definitions used in the graph.</p>
-<p>The default edges are assigned a weight of 1, and optional edges get a
-weight of 0 + 2*N where N is the number of tests that evaluated to
-true in the <tt class="docutils literal"><span class="pre">case</span></tt> expression. It is also possible to provide an
-integer parameter to <tt class="docutils literal"><span class="pre">inc_weight</span></tt> and <tt class="docutils literal"><span class="pre">dec_weight</span></tt> - in this case,
-the weight is increased (or decreased) by the provided value instead
-of the default 2. It is also possible to change the default weight of
-an optional edge by using the <tt class="docutils literal"><span class="pre">default</span></tt> clause of the <tt class="docutils literal"><span class="pre">case</span></tt>
+<p>As you can see, the edges can be either default or optional, where optional
+edges are differentiated by an additional <tt class="docutils literal">case</tt> expression used to calculate
+the weight of this edge. Notice also that we refer to tools via their names (as
+strings). This makes it possible to add edges to an existing compilation graph
+without having to know about all tool definitions used in the graph.</p>
+<p>The default edges are assigned a weight of 1, and optional edges get a weight of
+0 + 2*N where N is the number of tests that evaluated to true in the <tt class="docutils literal">case</tt>
+expression. It is also possible to provide an integer parameter to
+<tt class="docutils literal">inc_weight</tt> and <tt class="docutils literal">dec_weight</tt> - in this case, the weight is increased (or
+decreased) by the provided value instead of the default 2. Default weight of an
+optional edge can be changed by using the <tt class="docutils literal">default</tt> clause of the <tt class="docutils literal">case</tt>
 construct.</p>
-<p>When passing an input file through the graph, LLVMC picks the edge
-with the maximum weight. To avoid ambiguity, there should be only one
-default edge between two nodes (with the exception of the root node,
-which gets a special treatment - there you are allowed to specify one
-default edge <em>per language</em>).</p>
-<p>When multiple plugins are loaded, their compilation graphs are merged
-together. Since multiple edges that have the same end nodes are not
-allowed (i.e. the graph is not a multigraph), an edge defined in
-several plugins will be replaced by the definition from the plugin
-that was loaded last. Plugin load order can be controlled by using the
-plugin priority feature described above.</p>
-<p>To get a visual representation of the compilation graph (useful for
-debugging), run <tt class="docutils literal"><span class="pre">llvmc</span> <span class="pre">--view-graph</span></tt>. You will need <tt class="docutils literal"><span class="pre">dot</span></tt> and
-<tt class="docutils literal"><span class="pre">gsview</span></tt> installed for this to work properly.</p>
+<p>When passing an input file through the graph, LLVMC picks the edge with the
+maximum weight. To avoid ambiguity, there should be only one default edge
+between two nodes (with the exception of the root node, which gets a special
+treatment - there you are allowed to specify one default edge <em>per language</em>).</p>
+<p>When multiple compilation graphs are defined, they are merged together. Multiple
+edges with the same end nodes are not allowed (i.e. the graph is not a
+multigraph), and will lead to a compile-time error.</p>
+<p>To get a visual representation of the compilation graph (useful for debugging),
+run <tt class="docutils literal">llvmc <span class="pre">--view-graph</span></tt>. You will need <tt class="docutils literal">dot</tt> and <tt class="docutils literal">gsview</tt> installed for
+this to work properly.</p>
 </div>
 <div class="section" id="describing-options">
-<h1><a class="toc-backref" href="#id14">Describing options</a></h1>
-<p>Command-line options that the plugin supports are defined by using an
-<tt class="docutils literal"><span class="pre">OptionList</span></tt>:</p>
+<h1><a class="toc-backref" href="#id12">Describing options</a></h1>
+<p>Command-line options supported by the driver are defined by using an
+<tt class="docutils literal">OptionList</tt>:</p>
 <pre class="literal-block">
 def Options : OptionList&lt;[
 (switch_option &quot;E&quot;, (help &quot;Help string&quot;)),
@@ -293,101 +243,95 @@ def Options : OptionList&lt;[
 ...
 ]&gt;;
 </pre>
-<p>As you can see, the option list is just a list of DAGs, where each DAG
-is an option description consisting of the option name and some
-properties. A plugin can define more than one option list (they are
-all merged together in the end), which can be handy if one wants to
-separate option groups syntactically.</p>
+<p>As you can see, the option list is just a list of DAGs, where each DAG is an
+option description consisting of the option name and some properties. More than
+one option list can be defined (they are all merged together in the end), which
+can be handy if one wants to separate option groups syntactically.</p>
 <ul>
 <li><p class="first">Possible option types:</p>
 <blockquote>
 <ul class="simple">
-<li><tt class="docutils literal"><span class="pre">switch_option</span></tt> - a simple boolean switch without arguments, for example
-<tt class="docutils literal"><span class="pre">-O2</span></tt> or <tt class="docutils literal"><span class="pre">-time</span></tt>. At most one occurrence is allowed.</li>
-<li><tt class="docutils literal"><span class="pre">parameter_option</span></tt> - option that takes one argument, for example
+<li><tt class="docutils literal">switch_option</tt> - a simple boolean switch without arguments, for example
+<tt class="docutils literal"><span class="pre">-O2</span></tt> or <tt class="docutils literal"><span class="pre">-time</span></tt>. At most one occurrence is allowed by default.</li>
+<li><tt class="docutils literal">parameter_option</tt> - option that takes one argument, for example
 <tt class="docutils literal"><span class="pre">-std=c99</span></tt>. It is also allowed to use spaces instead of the equality
-sign: <tt class="docutils literal"><span class="pre">-std</span> <span class="pre">c99</span></tt>. At most one occurrence is allowed.</li>
-<li><tt class="docutils literal"><span class="pre">parameter_list_option</span></tt> - same as the above, but more than one option
-occurence is allowed.</li>
-<li><tt class="docutils literal"><span class="pre">prefix_option</span></tt> - same as the parameter_option, but the option name and
+sign: <tt class="docutils literal"><span class="pre">-std</span> c99</tt>. At most one occurrence is allowed.</li>
+<li><tt class="docutils literal">parameter_list_option</tt> - same as the above, but more than one option
+occurrence is allowed.</li>
+<li><tt class="docutils literal">prefix_option</tt> - same as the parameter_option, but the option name and
 argument do not have to be separated. Example: <tt class="docutils literal"><span class="pre">-ofile</span></tt>. This can be also
-specified as <tt class="docutils literal"><span class="pre">-o</span> <span class="pre">file</span></tt>; however, <tt class="docutils literal"><span class="pre">-o=file</span></tt> will be parsed incorrectly
-(<tt class="docutils literal"><span class="pre">=file</span></tt> will be interpreted as option value). At most one occurrence is
+specified as <tt class="docutils literal"><span class="pre">-o</span> file</tt>; however, <tt class="docutils literal"><span class="pre">-o=file</span></tt> will be parsed incorrectly
+(<tt class="docutils literal">=file</tt> will be interpreted as option value). At most one occurrence is
 allowed.</li>
-<li><tt class="docutils literal"><span class="pre">prefix_list_option</span></tt> - same as the above, but more than one occurence of
+<li><tt class="docutils literal">prefix_list_option</tt> - same as the above, but more than one occurrence of
 the option is allowed; example: <tt class="docutils literal"><span class="pre">-lm</span> <span class="pre">-lpthread</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">alias_option</span></tt> - a special option type for creating aliases. Unlike other
+<li><tt class="docutils literal">alias_option</tt> - a special option type for creating aliases. Unlike other
 option types, aliases are not allowed to have any properties besides the
-aliased option name. Usage example: <tt class="docutils literal"><span class="pre">(alias_option</span> <span class="pre">&quot;preprocess&quot;,</span> <span class="pre">&quot;E&quot;)</span></tt></li>
+aliased option name.
+Usage example: <tt class="docutils literal">(alias_option &quot;preprocess&quot;, &quot;E&quot;)</tt></li>
+<li><tt class="docutils literal">switch_list_option</tt> - like <tt class="docutils literal">switch_option</tt> with the <tt class="docutils literal">zero_or_more</tt>
+property, but remembers how many times the switch was turned on. Useful
+mostly for forwarding. Example: when <tt class="docutils literal"><span class="pre">-foo</span></tt> is a switch option (with the
+<tt class="docutils literal">zero_or_more</tt> property), the command <tt class="docutils literal">driver <span class="pre">-foo</span> <span class="pre">-foo</span></tt> is forwarded
+as <tt class="docutils literal"><span class="pre">some-tool</span> <span class="pre">-foo</span></tt>, but when <tt class="docutils literal"><span class="pre">-foo</span></tt> is a switch list, the same command
+is forwarded as <tt class="docutils literal"><span class="pre">some-tool</span> <span class="pre">-foo</span> <span class="pre">-foo</span></tt>.</li>
 </ul>
 </blockquote>
 </li>
 <li><p class="first">Possible option properties:</p>
 <blockquote>
 <ul class="simple">
-<li><tt class="docutils literal"><span class="pre">help</span></tt> - help string associated with this option. Used for <tt class="docutils literal"><span class="pre">-help</span></tt>
+<li><tt class="docutils literal">help</tt> - help string associated with this option. Used for <tt class="docutils literal"><span class="pre">--help</span></tt>
 output.</li>
-<li><tt class="docutils literal"><span class="pre">required</span></tt> - this option must be specified exactly once (or, in case of
-the list options without the <tt class="docutils literal"><span class="pre">multi_val</span></tt> property, at least
-once). Incompatible with <tt class="docutils literal"><span class="pre">zero_or_one</span></tt> and <tt class="docutils literal"><span class="pre">one_or_more</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">one_or_more</span></tt> - the option must be specified at least one time. Useful
-only for list options in conjunction with <tt class="docutils literal"><span class="pre">multi_val</span></tt>; for ordinary lists
-it is synonymous with <tt class="docutils literal"><span class="pre">required</span></tt>. Incompatible with <tt class="docutils literal"><span class="pre">required</span></tt> and
-<tt class="docutils literal"><span class="pre">zero_or_one</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">optional</span></tt> - the option can be specified zero or one times. Useful only
-for list options in conjunction with <tt class="docutils literal"><span class="pre">multi_val</span></tt>. Incompatible with
-<tt class="docutils literal"><span class="pre">required</span></tt> and <tt class="docutils literal"><span class="pre">one_or_more</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">hidden</span></tt> - the description of this option will not appear in
-the <tt class="docutils literal"><span class="pre">-help</span></tt> output (but will appear in the <tt class="docutils literal"><span class="pre">-help-hidden</span></tt>
+<li><tt class="docutils literal">required</tt> - this option must be specified exactly once (or, in case of
+the list options without the <tt class="docutils literal">multi_val</tt> property, at least
+once). Incompatible with <tt class="docutils literal">optional</tt> and <tt class="docutils literal">one_or_more</tt>.</li>
+<li><tt class="docutils literal">optional</tt> - the option can be specified either zero times or exactly
+once. The default for switch options. Useful only for list options in
+conjunction with <tt class="docutils literal">multi_val</tt>. Incompatible with <tt class="docutils literal">required</tt>,
+<tt class="docutils literal">zero_or_more</tt> and <tt class="docutils literal">one_or_more</tt>.</li>
+<li><tt class="docutils literal">one_or_more</tt> - the option must be specified at least once. Can be useful
+to allow switch options be both obligatory and be specified multiple
+times. For list options is useful only in conjunction with <tt class="docutils literal">multi_val</tt>;
+for ordinary it is synonymous with <tt class="docutils literal">required</tt>. Incompatible with
+<tt class="docutils literal">required</tt>, <tt class="docutils literal">optional</tt> and <tt class="docutils literal">zero_or_more</tt>.</li>
+<li><tt class="docutils literal">zero_or_more</tt> - the option can be specified zero or more times. Useful
+to allow a single switch option to be specified more than
+once. Incompatible with <tt class="docutils literal">required</tt>, <tt class="docutils literal">optional</tt> and <tt class="docutils literal">one_or_more</tt>.</li>
+<li><tt class="docutils literal">hidden</tt> - the description of this option will not appear in
+the <tt class="docutils literal"><span class="pre">--help</span></tt> output (but will appear in the <tt class="docutils literal"><span class="pre">--help-hidden</span></tt>
 output).</li>
-<li><tt class="docutils literal"><span class="pre">really_hidden</span></tt> - the option will not be mentioned in any help
+<li><tt class="docutils literal">really_hidden</tt> - the option will not be mentioned in any help
 output.</li>
-<li><tt class="docutils literal"><span class="pre">comma_separated</span></tt> - Indicates that any commas specified for an option's
+<li><tt class="docutils literal">comma_separated</tt> - Indicates that any commas specified for an option's
 value should be used to split the value up into multiple values for the
 option. This property is valid only for list options. In conjunction with
-<tt class="docutils literal"><span class="pre">forward_value</span></tt> can be used to implement option forwarding in style of
+<tt class="docutils literal">forward_value</tt> can be used to implement option forwarding in style of
 gcc's <tt class="docutils literal"><span class="pre">-Wa,</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">multi_val</span> <span class="pre">n</span></tt> - this option takes <em>n</em> arguments (can be useful in some
-special cases). Usage example: <tt class="docutils literal"><span class="pre">(parameter_list_option</span> <span class="pre">&quot;foo&quot;,</span> <span class="pre">(multi_val</span>
-<span class="pre">3))</span></tt>; the command-line syntax is '-foo a b c'. Only list options can have
-this attribute; you can, however, use the <tt class="docutils literal"><span class="pre">one_or_more</span></tt>, <tt class="docutils literal"><span class="pre">optional</span></tt>
-and <tt class="docutils literal"><span class="pre">required</span></tt> properties.</li>
-<li><tt class="docutils literal"><span class="pre">init</span></tt> - this option has a default value, either a string (if it is a
+<li><tt class="docutils literal">multi_val n</tt> - this option takes <em>n</em> arguments (can be useful in some
+special cases). Usage example: <tt class="docutils literal">(parameter_list_option &quot;foo&quot;, (multi_val
+3))</tt>; the command-line syntax is '-foo a b c'. Only list options can have
+this attribute; you can, however, use the <tt class="docutils literal">one_or_more</tt>, <tt class="docutils literal">optional</tt>
+and <tt class="docutils literal">required</tt> properties.</li>
+<li><tt class="docutils literal">init</tt> - this option has a default value, either a string (if it is a
 parameter), or a boolean (if it is a switch; as in C++, boolean constants
-are called <tt class="docutils literal"><span class="pre">true</span></tt> and <tt class="docutils literal"><span class="pre">false</span></tt>). List options can't have <tt class="docutils literal"><span class="pre">init</span></tt>
+are called <tt class="docutils literal">true</tt> and <tt class="docutils literal">false</tt>). List options can't have <tt class="docutils literal">init</tt>
 attribute.
-Usage examples: <tt class="docutils literal"><span class="pre">(switch_option</span> <span class="pre">&quot;foo&quot;,</span> <span class="pre">(init</span> <span class="pre">true))</span></tt>; <tt class="docutils literal"><span class="pre">(prefix_option</span>
-<span class="pre">&quot;bar&quot;,</span> <span class="pre">(init</span> <span class="pre">&quot;baz&quot;))</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">extern</span></tt> - this option is defined in some other plugin, see <a class="reference internal" href="#extern">below</a>.</li>
+Usage examples: <tt class="docutils literal">(switch_option &quot;foo&quot;, (init true))</tt>; <tt class="docutils literal">(prefix_option
+&quot;bar&quot;, (init <span class="pre">&quot;baz&quot;))</span></tt>.</li>
 </ul>
 </blockquote>
 </li>
 </ul>
-<div class="section" id="external-options">
-<span id="extern"></span><h2><a class="toc-backref" href="#id15">External options</a></h2>
-<p>Sometimes, when linking several plugins together, one plugin needs to
-access options defined in some other plugin. Because of the way
-options are implemented, such options must be marked as
-<tt class="docutils literal"><span class="pre">extern</span></tt>. This is what the <tt class="docutils literal"><span class="pre">extern</span></tt> option property is
-for. Example:</p>
-<pre class="literal-block">
-...
-(switch_option &quot;E&quot;, (extern))
-...
-</pre>
-<p>If an external option has additional attributes besides 'extern', they are
-ignored. See also the section on plugin <a class="reference internal" href="#priorities">priorities</a>.</p>
-</div>
 </div>
 <div class="section" id="conditional-evaluation">
-<span id="case"></span><h1><a class="toc-backref" href="#id16">Conditional evaluation</a></h1>
-<p>The 'case' construct is the main means by which programmability is
-achieved in LLVMC. It can be used to calculate edge weights, program
-actions and modify the shell commands to be executed. The 'case'
-expression is designed after the similarly-named construct in
-functional languages and takes the form <tt class="docutils literal"><span class="pre">(case</span> <span class="pre">(test_1),</span> <span class="pre">statement_1,</span>
-<span class="pre">(test_2),</span> <span class="pre">statement_2,</span> <span class="pre">...</span> <span class="pre">(test_N),</span> <span class="pre">statement_N)</span></tt>. The statements
-are evaluated only if the corresponding tests evaluate to true.</p>
+<span id="case"></span><h1><a class="toc-backref" href="#id13">Conditional evaluation</a></h1>
+<p>The 'case' construct is the main means by which programmability is achieved in
+LLVMC. It can be used to calculate edge weights, program actions and modify the
+shell commands to be executed. The 'case' expression is designed after the
+similarly-named construct in functional languages and takes the form <tt class="docutils literal">(case
+(test_1), statement_1, (test_2), statement_2, ... (test_N), statement_N)</tt>. The
+statements are evaluated only if the corresponding tests evaluate to true.</p>
 <p>Examples:</p>
 <pre class="literal-block">
 // Edge weight calculation
@@ -410,129 +354,139 @@ are evaluated only if the corresponding tests evaluate to true.</p>
     (switch_on &quot;B&quot;), &quot;cmdline2&quot;,
     (default), &quot;cmdline3&quot;)
 </pre>
-<p>Note the slight difference in 'case' expression handling in contexts
-of edge weights and command line specification - in the second example
-the value of the <tt class="docutils literal"><span class="pre">&quot;B&quot;</span></tt> switch is never checked when switch <tt class="docutils literal"><span class="pre">&quot;A&quot;</span></tt> is
-enabled, and the whole expression always evaluates to <tt class="docutils literal"><span class="pre">&quot;cmdline1&quot;</span></tt> in
-that case.</p>
+<p>Note the slight difference in 'case' expression handling in contexts of edge
+weights and command line specification - in the second example the value of the
+<tt class="docutils literal">&quot;B&quot;</tt> switch is never checked when switch <tt class="docutils literal">&quot;A&quot;</tt> is enabled, and the whole
+expression always evaluates to <tt class="docutils literal">&quot;cmdline1&quot;</tt> in that case.</p>
 <p>Case expressions can also be nested, i.e. the following is legal:</p>
 <pre class="literal-block">
 (case (switch_on &quot;E&quot;), (case (switch_on &quot;o&quot;), ..., (default), ...)
       (default), ...)
 </pre>
-<p>You should, however, try to avoid doing that because it hurts
-readability. It is usually better to split tool descriptions and/or
-use TableGen inheritance instead.</p>
+<p>You should, however, try to avoid doing that because it hurts readability. It is
+usually better to split tool descriptions and/or use TableGen inheritance
+instead.</p>
 <ul class="simple">
 <li>Possible tests are:<ul>
-<li><tt class="docutils literal"><span class="pre">switch_on</span></tt> - Returns true if a given command-line switch is provided by
-the user. Can be given a list as argument, in that case <tt class="docutils literal"><span class="pre">(switch_on</span> <span class="pre">[&quot;foo&quot;,</span>
-<span class="pre">&quot;bar&quot;,</span> <span class="pre">&quot;baz&quot;])</span></tt> is equivalent to <tt class="docutils literal"><span class="pre">(and</span> <span class="pre">(switch_on</span> <span class="pre">&quot;foo&quot;),</span> <span class="pre">(switch_on</span>
-<span class="pre">&quot;bar&quot;),</span> <span class="pre">(switch_on</span> <span class="pre">&quot;baz&quot;))</span></tt>.
-Example: <tt class="docutils literal"><span class="pre">(switch_on</span> <span class="pre">&quot;opt&quot;)</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">any_switch_on</span></tt> - Given a list of switch options, returns true if any of
+<li><tt class="docutils literal">switch_on</tt> - Returns true if a given command-line switch is provided by
+the user. Can be given multiple arguments, in that case <tt class="docutils literal">(switch_on &quot;foo&quot;,
+&quot;bar&quot;, &quot;baz&quot;)</tt> is equivalent to <tt class="docutils literal">(and (switch_on <span class="pre">&quot;foo&quot;),</span> (switch_on
+<span class="pre">&quot;bar&quot;),</span> (switch_on <span class="pre">&quot;baz&quot;))</span></tt>.
+Example: <tt class="docutils literal">(switch_on &quot;opt&quot;)</tt>.</li>
+<li><tt class="docutils literal">any_switch_on</tt> - Given a number of switch options, returns true if any of
 the switches is turned on.
-Example: <tt class="docutils literal"><span class="pre">(any_switch_on</span> <span class="pre">[&quot;foo&quot;,</span> <span class="pre">&quot;bar&quot;,</span> <span class="pre">&quot;baz&quot;])</span></tt> is equivalent to <tt class="docutils literal"><span class="pre">(or</span>
-<span class="pre">(switch_on</span> <span class="pre">&quot;foo&quot;),</span> <span class="pre">(switch_on</span> <span class="pre">&quot;bar&quot;),</span> <span class="pre">(switch_on</span> <span class="pre">&quot;baz&quot;))</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">parameter_equals</span></tt> - Returns true if a command-line parameter equals
-a given value.
-Example: <tt class="docutils literal"><span class="pre">(parameter_equals</span> <span class="pre">&quot;W&quot;,</span> <span class="pre">&quot;all&quot;)</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">element_in_list</span></tt> - Returns true if a command-line parameter
-list contains a given value.
-Example: <tt class="docutils literal"><span class="pre">(element_in_list</span> <span class="pre">&quot;l&quot;,</span> <span class="pre">&quot;pthread&quot;)</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">input_languages_contain</span></tt> - Returns true if a given language
+Example: <tt class="docutils literal">(any_switch_on &quot;foo&quot;, &quot;bar&quot;, &quot;baz&quot;)</tt> is equivalent to <tt class="docutils literal">(or
+(switch_on <span class="pre">&quot;foo&quot;),</span> (switch_on <span class="pre">&quot;bar&quot;),</span> (switch_on <span class="pre">&quot;baz&quot;))</span></tt>.</li>
+<li><tt class="docutils literal">parameter_equals</tt> - Returns true if a command-line parameter (first
+argument) equals a given value (second argument).
+Example: <tt class="docutils literal">(parameter_equals &quot;W&quot;, &quot;all&quot;)</tt>.</li>
+<li><tt class="docutils literal">element_in_list</tt> - Returns true if a command-line parameter list (first
+argument) contains a given value (second argument).
+Example: <tt class="docutils literal">(element_in_list &quot;l&quot;, &quot;pthread&quot;)</tt>.</li>
+<li><tt class="docutils literal">input_languages_contain</tt> - Returns true if a given language
 belongs to the current input language set.
-Example: <tt class="docutils literal"><span class="pre">(input_languages_contain</span> <span class="pre">&quot;c++&quot;)</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">in_language</span></tt> - Evaluates to true if the input file language is equal to
-the argument. At the moment works only with <tt class="docutils literal"><span class="pre">cmd_line</span></tt> and <tt class="docutils literal"><span class="pre">actions</span></tt> (on
+Example: <tt class="docutils literal">(input_languages_contain <span class="pre">&quot;c++&quot;)</span></tt>.</li>
+<li><tt class="docutils literal">in_language</tt> - Evaluates to true if the input file language is equal to
+the argument. At the moment works only with <tt class="docutils literal">command</tt> and <tt class="docutils literal">actions</tt> (on
 non-join nodes).
-Example: <tt class="docutils literal"><span class="pre">(in_language</span> <span class="pre">&quot;c++&quot;)</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">not_empty</span></tt> - Returns true if a given option (which should be either a
-parameter or a parameter list) is set by the user. Like <tt class="docutils literal"><span class="pre">switch_on</span></tt>, can
-be also given a list as argument.
-Example: <tt class="docutils literal"><span class="pre">(not_empty</span> <span class="pre">&quot;o&quot;)</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">any_not_empty</span></tt> - Returns true if <tt class="docutils literal"><span class="pre">not_empty</span></tt> returns true for any of
-the options in the list.
-Example: <tt class="docutils literal"><span class="pre">(any_not_empty</span> <span class="pre">[&quot;foo&quot;,</span> <span class="pre">&quot;bar&quot;,</span> <span class="pre">&quot;baz&quot;])</span></tt> is equivalent to <tt class="docutils literal"><span class="pre">(or</span>
-<span class="pre">(not_empty</span> <span class="pre">&quot;foo&quot;),</span> <span class="pre">(not_empty</span> <span class="pre">&quot;bar&quot;),</span> <span class="pre">(not_empty</span> <span class="pre">&quot;baz&quot;))</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">empty</span></tt> - The opposite of <tt class="docutils literal"><span class="pre">not_empty</span></tt>. Equivalent to <tt class="docutils literal"><span class="pre">(not</span> <span class="pre">(not_empty</span>
-<span class="pre">X))</span></tt>. Provided for convenience. Can be given a list as argument.</li>
-<li><tt class="docutils literal"><span class="pre">any_not_empty</span></tt> - Returns true if <tt class="docutils literal"><span class="pre">not_empty</span></tt> returns true for any of
-the options in the list.
-Example: <tt class="docutils literal"><span class="pre">(any_empty</span> <span class="pre">[&quot;foo&quot;,</span> <span class="pre">&quot;bar&quot;,</span> <span class="pre">&quot;baz&quot;])</span></tt> is equivalent to <tt class="docutils literal"><span class="pre">(not</span> <span class="pre">(and</span>
-<span class="pre">(not_empty</span> <span class="pre">&quot;foo&quot;),</span> <span class="pre">(not_empty</span> <span class="pre">&quot;bar&quot;),</span> <span class="pre">(not_empty</span> <span class="pre">&quot;baz&quot;)))</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">single_input_file</span></tt> - Returns true if there was only one input file
+Example: <tt class="docutils literal">(in_language <span class="pre">&quot;c++&quot;)</span></tt>.</li>
+<li><tt class="docutils literal">not_empty</tt> - Returns true if a given option (which should be either a
+parameter or a parameter list) is set by the user. Like <tt class="docutils literal">switch_on</tt>, can
+be also given multiple arguments.
+Examples: <tt class="docutils literal">(not_empty &quot;o&quot;)</tt>, <tt class="docutils literal">(not_empty &quot;o&quot;, &quot;l&quot;)</tt>.</li>
+<li><tt class="docutils literal">any_not_empty</tt> - Returns true if <tt class="docutils literal">not_empty</tt> returns true for any of
+the provided options.
+Example: <tt class="docutils literal">(any_not_empty &quot;foo&quot;, &quot;bar&quot;, &quot;baz&quot;)</tt> is equivalent to <tt class="docutils literal">(or
+(not_empty <span class="pre">&quot;foo&quot;),</span> (not_empty <span class="pre">&quot;bar&quot;),</span> (not_empty <span class="pre">&quot;baz&quot;))</span></tt>.</li>
+<li><tt class="docutils literal">empty</tt> - The opposite of <tt class="docutils literal">not_empty</tt>. Equivalent to <tt class="docutils literal">(not (not_empty
+X))</tt>. Can be given multiple arguments.</li>
+<li><tt class="docutils literal">any_not_empty</tt> - Returns true if <tt class="docutils literal">not_empty</tt> returns true for any of
+the provided options.
+Example: <tt class="docutils literal">(any_empty &quot;foo&quot;, &quot;bar&quot;, &quot;baz&quot;)</tt> is equivalent to <tt class="docutils literal">(or
+(not_empty <span class="pre">&quot;foo&quot;),</span> (not_empty <span class="pre">&quot;bar&quot;),</span> (not_empty <span class="pre">&quot;baz&quot;))</span></tt>.</li>
+<li><tt class="docutils literal">single_input_file</tt> - Returns true if there was only one input file
 provided on the command-line. Used without arguments:
-<tt class="docutils literal"><span class="pre">(single_input_file)</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">multiple_input_files</span></tt> - Equivalent to <tt class="docutils literal"><span class="pre">(not</span> <span class="pre">(single_input_file))</span></tt> (the
+<tt class="docutils literal">(single_input_file)</tt>.</li>
+<li><tt class="docutils literal">multiple_input_files</tt> - Equivalent to <tt class="docutils literal">(not (single_input_file))</tt> (the
 case of zero input files is considered an error).</li>
-<li><tt class="docutils literal"><span class="pre">default</span></tt> - Always evaluates to true. Should always be the last
-test in the <tt class="docutils literal"><span class="pre">case</span></tt> expression.</li>
-<li><tt class="docutils literal"><span class="pre">and</span></tt> - A standard binary logical combinator that returns true iff all of
-its arguments return true. Used like this: <tt class="docutils literal"><span class="pre">(and</span> <span class="pre">(test1),</span> <span class="pre">(test2),</span>
-<span class="pre">...</span> <span class="pre">(testN))</span></tt>. Nesting of <tt class="docutils literal"><span class="pre">and</span></tt> and <tt class="docutils literal"><span class="pre">or</span></tt> is allowed, but not
+<li><tt class="docutils literal">default</tt> - Always evaluates to true. Should always be the last
+test in the <tt class="docutils literal">case</tt> expression.</li>
+<li><tt class="docutils literal">and</tt> - A standard logical combinator that returns true iff all of
+its arguments return true. Used like this: <tt class="docutils literal">(and (test1), (test2),
+... (testN))</tt>. Nesting of <tt class="docutils literal">and</tt> and <tt class="docutils literal">or</tt> is allowed, but not
 encouraged.</li>
-<li><tt class="docutils literal"><span class="pre">or</span></tt> - A binary logical combinator that returns true iff any of its
-arguments returns true. Example: <tt class="docutils literal"><span class="pre">(or</span> <span class="pre">(test1),</span> <span class="pre">(test2),</span> <span class="pre">...</span> <span class="pre">(testN))</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">not</span></tt> - Standard unary logical combinator that negates its
-argument. Example: <tt class="docutils literal"><span class="pre">(not</span> <span class="pre">(or</span> <span class="pre">(test1),</span> <span class="pre">(test2),</span> <span class="pre">...</span> <span class="pre">(testN)))</span></tt>.</li>
+<li><tt class="docutils literal">or</tt> - A logical combinator that returns true iff any of its arguments
+return true.
+Example: <tt class="docutils literal">(or (test1), (test2), ... (testN))</tt>.</li>
+<li><tt class="docutils literal">not</tt> - Standard unary logical combinator that negates its
+argument.
+Example: <tt class="docutils literal">(not (or (test1), (test2), ... <span class="pre">(testN)))</span></tt>.</li>
 </ul>
 </li>
 </ul>
 </div>
 <div class="section" id="writing-a-tool-description">
-<h1><a class="toc-backref" href="#id17">Writing a tool description</a></h1>
-<p>As was said earlier, nodes in the compilation graph represent tools,
-which are described separately. A tool definition looks like this
-(taken from the <tt class="docutils literal"><span class="pre">include/llvm/CompilerDriver/Tools.td</span></tt> file):</p>
+<h1><a class="toc-backref" href="#id14">Writing a tool description</a></h1>
+<p>As was said earlier, nodes in the compilation graph represent tools, which are
+described separately. A tool definition looks like this (taken from the
+<tt class="docutils literal">llvmc/src/Base.td</tt> file):</p>
 <pre class="literal-block">
 def llvm_gcc_cpp : Tool&lt;[
     (in_language &quot;c++&quot;),
     (out_language &quot;llvm-assembler&quot;),
     (output_suffix &quot;bc&quot;),
-    (cmd_line &quot;llvm-g++ -c $INFILE -o $OUTFILE -emit-llvm&quot;),
+    (command &quot;llvm-g++ -c -emit-llvm&quot;),
     (sink)
     ]&gt;;
 </pre>
-<p>This defines a new tool called <tt class="docutils literal"><span class="pre">llvm_gcc_cpp</span></tt>, which is an alias for
-<tt class="docutils literal"><span class="pre">llvm-g++</span></tt>. As you can see, a tool definition is just a list of
-properties; most of them should be self-explanatory. The <tt class="docutils literal"><span class="pre">sink</span></tt>
-property means that this tool should be passed all command-line
-options that aren't mentioned in the option list.</p>
+<p>This defines a new tool called <tt class="docutils literal">llvm_gcc_cpp</tt>, which is an alias for
+<tt class="docutils literal"><span class="pre">llvm-g++</span></tt>. As you can see, a tool definition is just a list of properties;
+most of them should be self-explanatory. The <tt class="docutils literal">sink</tt> property means that this
+tool should be passed all command-line options that aren't mentioned in the
+option list.</p>
 <p>The complete list of all currently implemented tool properties follows.</p>
 <ul class="simple">
 <li>Possible tool properties:<ul>
-<li><tt class="docutils literal"><span class="pre">in_language</span></tt> - input language name. Can be either a string or a
-list, in case the tool supports multiple input languages.</li>
-<li><tt class="docutils literal"><span class="pre">out_language</span></tt> - output language name. Multiple output languages are not
-allowed.</li>
-<li><tt class="docutils literal"><span class="pre">output_suffix</span></tt> - output file suffix. Can also be changed
-dynamically, see documentation on actions.</li>
-<li><tt class="docutils literal"><span class="pre">cmd_line</span></tt> - the actual command used to run the tool. You can
-use <tt class="docutils literal"><span class="pre">$INFILE</span></tt> and <tt class="docutils literal"><span class="pre">$OUTFILE</span></tt> variables, output redirection
-with <tt class="docutils literal"><span class="pre">&gt;</span></tt>, hook invocations (<tt class="docutils literal"><span class="pre">$CALL</span></tt>), environment variables
-(via <tt class="docutils literal"><span class="pre">$ENV</span></tt>) and the <tt class="docutils literal"><span class="pre">case</span></tt> construct.</li>
-<li><tt class="docutils literal"><span class="pre">join</span></tt> - this tool is a &quot;join node&quot; in the graph, i.e. it gets a
-list of input files and joins them together. Used for linkers.</li>
-<li><tt class="docutils literal"><span class="pre">sink</span></tt> - all command-line options that are not handled by other
-tools are passed to this tool.</li>
-<li><tt class="docutils literal"><span class="pre">actions</span></tt> - A single big <tt class="docutils literal"><span class="pre">case</span></tt> expression that specifies how
-this tool reacts on command-line options (described in more detail
-<a class="reference internal" href="#actions">below</a>).</li>
+<li><tt class="docutils literal">in_language</tt> - input language name. Can be given multiple arguments, in
+case the tool supports multiple input languages. Used for typechecking and
+mapping file extensions to tools.</li>
+<li><tt class="docutils literal">out_language</tt> - output language name. Multiple output languages are
+allowed. Used for typechecking the compilation graph.</li>
+<li><tt class="docutils literal">output_suffix</tt> - output file suffix. Can also be changed dynamically, see
+documentation on <a class="reference internal" href="#actions">actions</a>.</li>
 </ul>
 </li>
 </ul>
-<div class="section" id="id5">
-<span id="actions"></span><h2><a class="toc-backref" href="#id18">Actions</a></h2>
-<p>A tool often needs to react to command-line options, and this is
-precisely what the <tt class="docutils literal"><span class="pre">actions</span></tt> property is for. The next example
-illustrates this feature:</p>
+<blockquote>
+<ul class="simple">
+<li><tt class="docutils literal">command</tt> - the actual command used to run the tool. You can use output
+redirection with <tt class="docutils literal">&gt;</tt>, hook invocations (<tt class="docutils literal">$CALL</tt>), environment variables
+(via <tt class="docutils literal">$ENV</tt>) and the <tt class="docutils literal">case</tt> construct.</li>
+<li><tt class="docutils literal">join</tt> - this tool is a &quot;join node&quot; in the graph, i.e. it gets a list of
+input files and joins them together. Used for linkers.</li>
+<li><tt class="docutils literal">sink</tt> - all command-line options that are not handled by other tools are
+passed to this tool.</li>
+<li><tt class="docutils literal">actions</tt> - A single big <tt class="docutils literal">case</tt> expression that specifies how this tool
+reacts on command-line options (described in more detail <a class="reference internal" href="#actions">below</a>).</li>
+</ul>
+</blockquote>
+<blockquote>
+<ul class="simple">
+<li><tt class="docutils literal">out_file_option</tt>, <tt class="docutils literal">in_file_option</tt> - Options appended to the
+<tt class="docutils literal">command</tt> string to designate output and input files. Default values are
+<tt class="docutils literal"><span class="pre">&quot;-o&quot;</span></tt> and <tt class="docutils literal">&quot;&quot;</tt>, respectively.</li>
+</ul>
+</blockquote>
+<div class="section" id="id4">
+<span id="actions"></span><h2><a class="toc-backref" href="#id15">Actions</a></h2>
+<p>A tool often needs to react to command-line options, and this is precisely what
+the <tt class="docutils literal">actions</tt> property is for. The next example illustrates this feature:</p>
 <pre class="literal-block">
 def llvm_gcc_linker : Tool&lt;[
     (in_language &quot;object-code&quot;),
     (out_language &quot;executable&quot;),
     (output_suffix &quot;out&quot;),
-    (cmd_line &quot;llvm-gcc $INFILE -o $OUTFILE&quot;),
+    (command &quot;llvm-gcc&quot;),
     (join),
     (actions (case (not_empty &quot;L&quot;), (forward &quot;L&quot;),
                    (not_empty &quot;l&quot;), (forward &quot;l&quot;),
@@ -540,47 +494,46 @@ def llvm_gcc_linker : Tool&lt;[
                              [(append_cmd &quot;-dummy1&quot;), (append_cmd &quot;-dummy2&quot;)])
     ]&gt;;
 </pre>
-<p>The <tt class="docutils literal"><span class="pre">actions</span></tt> tool property is implemented on top of the omnipresent
-<tt class="docutils literal"><span class="pre">case</span></tt> expression. It associates one or more different <em>actions</em>
-with given conditions - in the example, the actions are <tt class="docutils literal"><span class="pre">forward</span></tt>,
-which forwards a given option unchanged, and <tt class="docutils literal"><span class="pre">append_cmd</span></tt>, which
-appends a given string to the tool execution command. Multiple actions
-can be associated with a single condition by using a list of actions
-(used in the example to append some dummy options). The same <tt class="docutils literal"><span class="pre">case</span></tt>
-construct can also be used in the <tt class="docutils literal"><span class="pre">cmd_line</span></tt> property to modify the
-tool command line.</p>
-<p>The &quot;join&quot; property used in the example means that this tool behaves
-like a linker.</p>
+<p>The <tt class="docutils literal">actions</tt> tool property is implemented on top of the omnipresent <tt class="docutils literal">case</tt>
+expression. It associates one or more different <em>actions</em> with given
+conditions - in the example, the actions are <tt class="docutils literal">forward</tt>, which forwards a given
+option unchanged, and <tt class="docutils literal">append_cmd</tt>, which appends a given string to the tool
+execution command. Multiple actions can be associated with a single condition by
+using a list of actions (used in the example to append some dummy options). The
+same <tt class="docutils literal">case</tt> construct can also be used in the <tt class="docutils literal">cmd_line</tt> property to modify
+the tool command line.</p>
+<p>The &quot;join&quot; property used in the example means that this tool behaves like a
+linker.</p>
 <p>The list of all possible actions follows.</p>
 <ul>
 <li><p class="first">Possible actions:</p>
 <blockquote>
 <ul class="simple">
-<li><tt class="docutils literal"><span class="pre">append_cmd</span></tt> - Append a string to the tool invocation command.
-Example: <tt class="docutils literal"><span class="pre">(case</span> <span class="pre">(switch_on</span> <span class="pre">&quot;pthread&quot;),</span> <span class="pre">(append_cmd</span> <span class="pre">&quot;-lpthread&quot;))</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">error</span></tt> - Exit with error.
-Example: <tt class="docutils literal"><span class="pre">(error</span> <span class="pre">&quot;Mixing</span> <span class="pre">-c</span> <span class="pre">and</span> <span class="pre">-S</span> <span class="pre">is</span> <span class="pre">not</span> <span class="pre">allowed!&quot;)</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">warning</span></tt> - Print a warning.
-Example: <tt class="docutils literal"><span class="pre">(warning</span> <span class="pre">&quot;Specifying</span> <span class="pre">both</span> <span class="pre">-O1</span> <span class="pre">and</span> <span class="pre">-O2</span> <span class="pre">is</span> <span class="pre">meaningless!&quot;)</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">forward</span></tt> - Forward the option unchanged.
-Example: <tt class="docutils literal"><span class="pre">(forward</span> <span class="pre">&quot;Wall&quot;)</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">forward_as</span></tt> - Change the option's name, but forward the argument
+<li><tt class="docutils literal">append_cmd</tt> - Append a string to the tool invocation command.
+Example: <tt class="docutils literal">(case (switch_on <span class="pre">&quot;pthread&quot;),</span> (append_cmd <span class="pre">&quot;-lpthread&quot;))</span></tt>.</li>
+<li><tt class="docutils literal">error</tt> - Exit with error.
+Example: <tt class="docutils literal">(error &quot;Mixing <span class="pre">-c</span> and <span class="pre">-S</span> is not <span class="pre">allowed!&quot;)</span></tt>.</li>
+<li><tt class="docutils literal">warning</tt> - Print a warning.
+Example: <tt class="docutils literal">(warning &quot;Specifying both <span class="pre">-O1</span> and <span class="pre">-O2</span> is <span class="pre">meaningless!&quot;)</span></tt>.</li>
+<li><tt class="docutils literal">forward</tt> - Forward the option unchanged.
+Example: <tt class="docutils literal">(forward &quot;Wall&quot;)</tt>.</li>
+<li><tt class="docutils literal">forward_as</tt> - Change the option's name, but forward the argument
 unchanged.
-Example: <tt class="docutils literal"><span class="pre">(forward_as</span> <span class="pre">&quot;O0&quot;,</span> <span class="pre">&quot;--disable-optimization&quot;)</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">forward_value</span></tt> - Forward only option's value. Cannot be used with switch
+Example: <tt class="docutils literal">(forward_as &quot;O0&quot;, <span class="pre">&quot;--disable-optimization&quot;)</span></tt>.</li>
+<li><tt class="docutils literal">forward_value</tt> - Forward only option's value. Cannot be used with switch
 options (since they don't have values), but works fine with lists.
-Example: <tt class="docutils literal"><span class="pre">(forward_value</span> <span class="pre">&quot;Wa,&quot;)</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">forward_transformed_value</span></tt> - As above, but applies a hook to the
+Example: <tt class="docutils literal">(forward_value <span class="pre">&quot;Wa,&quot;)</span></tt>.</li>
+<li><tt class="docutils literal">forward_transformed_value</tt> - As above, but applies a hook to the
 option's value before forwarding (see <a class="reference internal" href="#hooks">below</a>). When
-<tt class="docutils literal"><span class="pre">forward_transformed_value</span></tt> is applied to a list
+<tt class="docutils literal">forward_transformed_value</tt> is applied to a list
 option, the hook must have signature
-<tt class="docutils literal"><span class="pre">std::string</span> <span class="pre">hooks::HookName</span> <span class="pre">(const</span> <span class="pre">std::vector&lt;std::string&gt;&amp;)</span></tt>.
-Example: <tt class="docutils literal"><span class="pre">(forward_transformed_value</span> <span class="pre">&quot;m&quot;,</span> <span class="pre">&quot;ConvertToMAttr&quot;)</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">output_suffix</span></tt> - Modify the output suffix of this tool.
-Example: <tt class="docutils literal"><span class="pre">(output_suffix</span> <span class="pre">&quot;i&quot;)</span></tt>.</li>
-<li><tt class="docutils literal"><span class="pre">stop_compilation</span></tt> - Stop compilation after this tool processes its
+<tt class="docutils literal"><span class="pre">std::string</span> <span class="pre">hooks::HookName</span> (const <span class="pre">std::vector&lt;std::string&gt;&amp;)</span></tt>.
+Example: <tt class="docutils literal">(forward_transformed_value &quot;m&quot;, &quot;ConvertToMAttr&quot;)</tt>.</li>
+<li><tt class="docutils literal">output_suffix</tt> - Modify the output suffix of this tool.
+Example: <tt class="docutils literal">(output_suffix &quot;i&quot;)</tt>.</li>
+<li><tt class="docutils literal">stop_compilation</tt> - Stop compilation after this tool processes its
 input. Used without arguments.
-Example: <tt class="docutils literal"><span class="pre">(stop_compilation)</span></tt>.</li>
+Example: <tt class="docutils literal">(stop_compilation)</tt>.</li>
 </ul>
 </blockquote>
 </li>
@@ -588,11 +541,11 @@ Example: <tt class="docutils literal"><span class="pre">(stop_compilation)</span
 </div>
 </div>
 <div class="section" id="language-map">
-<h1><a class="toc-backref" href="#id19">Language map</a></h1>
-<p>If you are adding support for a new language to LLVMC, you'll need to
-modify the language map, which defines mappings from file extensions
-to language names. It is used to choose the proper toolchain(s) for a
-given input file set. Language map definition looks like this:</p>
+<h1><a class="toc-backref" href="#id16">Language map</a></h1>
+<p>If you are adding support for a new language to LLVMC, you'll need to modify the
+language map, which defines mappings from file extensions to language names. It
+is used to choose the proper toolchain(s) for a given input file set. Language
+map definition looks like this:</p>
 <pre class="literal-block">
 def LanguageMap : LanguageMap&lt;
     [LangToSuffixes&lt;&quot;c++&quot;, [&quot;cc&quot;, &quot;cp&quot;, &quot;cxx&quot;, &quot;cpp&quot;, &quot;CPP&quot;, &quot;c++&quot;, &quot;C&quot;]&gt;,
@@ -606,73 +559,69 @@ $ llvmc hello.cpp
 llvmc: Unknown suffix: cpp
 </pre>
 <p>The language map entries are needed only for the tools that are linked from the
-root node. Since a tool can't have multiple output languages, for inner nodes of
-the graph the input and output languages should match. This is enforced at
-compile-time.</p>
+root node. A tool can have multiple output languages.</p>
 </div>
 <div class="section" id="option-preprocessor">
-<h1><a class="toc-backref" href="#id20">Option preprocessor</a></h1>
+<h1><a class="toc-backref" href="#id17">Option preprocessor</a></h1>
 <p>It is sometimes useful to run error-checking code before processing the
 compilation graph. For example, if optimization options &quot;-O1&quot; and &quot;-O2&quot; are
 implemented as switches, we might want to output a warning if the user invokes
 the driver with both of these options enabled.</p>
-<p>The <tt class="docutils literal"><span class="pre">OptionPreprocessor</span></tt> feature is reserved specially for these
-occasions. Example (adapted from the built-in Base plugin):</p>
+<p>The <tt class="docutils literal">OptionPreprocessor</tt> feature is reserved specially for these
+occasions. Example (adapted from <tt class="docutils literal">llvm/src/Base.td.in</tt>):</p>
 <pre class="literal-block">
 def Preprocess : OptionPreprocessor&lt;
-(case (not (any_switch_on [&quot;O0&quot;, &quot;O1&quot;, &quot;O2&quot;, &quot;O3&quot;])),
+(case (not (any_switch_on &quot;O0&quot;, &quot;O1&quot;, &quot;O2&quot;, &quot;O3&quot;)),
            (set_option &quot;O2&quot;),
-      (and (switch_on &quot;O3&quot;), (any_switch_on [&quot;O0&quot;, &quot;O1&quot;, &quot;O2&quot;])),
-           (unset_option [&quot;O0&quot;, &quot;O1&quot;, &quot;O2&quot;]),
-      (and (switch_on &quot;O2&quot;), (any_switch_on [&quot;O0&quot;, &quot;O1&quot;])),
-           (unset_option [&quot;O0&quot;, &quot;O1&quot;]),
+      (and (switch_on &quot;O3&quot;), (any_switch_on &quot;O0&quot;, &quot;O1&quot;, &quot;O2&quot;)),
+           (unset_option &quot;O0&quot;, &quot;O1&quot;, &quot;O2&quot;),
+      (and (switch_on &quot;O2&quot;), (any_switch_on &quot;O0&quot;, &quot;O1&quot;)),
+           (unset_option &quot;O0&quot;, &quot;O1&quot;),
       (and (switch_on &quot;O1&quot;), (switch_on &quot;O0&quot;)),
            (unset_option &quot;O0&quot;))
 &gt;;
 </pre>
-<p>Here, <tt class="docutils literal"><span class="pre">OptionPreprocessor</span></tt> is used to unset all spurious <tt class="docutils literal"><span class="pre">-O</span></tt> options so
+<p>Here, <tt class="docutils literal">OptionPreprocessor</tt> is used to unset all spurious <tt class="docutils literal"><span class="pre">-O</span></tt> options so
 that they are not forwarded to the compiler. If no optimization options are
 specified, <tt class="docutils literal"><span class="pre">-O2</span></tt> is enabled.</p>
-<p><tt class="docutils literal"><span class="pre">OptionPreprocessor</span></tt> is basically a single big <tt class="docutils literal"><span class="pre">case</span></tt> expression, which is
-evaluated only once right after the plugin is loaded. The only allowed actions
-in <tt class="docutils literal"><span class="pre">OptionPreprocessor</span></tt> are <tt class="docutils literal"><span class="pre">error</span></tt>, <tt class="docutils literal"><span class="pre">warning</span></tt>, and two special actions:
-<tt class="docutils literal"><span class="pre">unset_option</span></tt> and <tt class="docutils literal"><span class="pre">set_option</span></tt>. As their names suggest, they can be used to
-set or unset a given option. To set an option with <tt class="docutils literal"><span class="pre">set_option</span></tt>, use the
-two-argument form: <tt class="docutils literal"><span class="pre">(set_option</span> <span class="pre">&quot;parameter&quot;,</span> <span class="pre">VALUE)</span></tt>. Here, <tt class="docutils literal"><span class="pre">VALUE</span></tt> can be
+<p><tt class="docutils literal">OptionPreprocessor</tt> is basically a single big <tt class="docutils literal">case</tt> expression, which is
+evaluated only once right after the driver is started. The only allowed actions
+in <tt class="docutils literal">OptionPreprocessor</tt> are <tt class="docutils literal">error</tt>, <tt class="docutils literal">warning</tt>, and two special actions:
+<tt class="docutils literal">unset_option</tt> and <tt class="docutils literal">set_option</tt>. As their names suggest, they can be used to
+set or unset a given option. To set an option with <tt class="docutils literal">set_option</tt>, use the
+two-argument form: <tt class="docutils literal">(set_option &quot;parameter&quot;, VALUE)</tt>. Here, <tt class="docutils literal">VALUE</tt> can be
 either a string, a string list, or a boolean constant.</p>
-<p>For convenience, <tt class="docutils literal"><span class="pre">set_option</span></tt> and <tt class="docutils literal"><span class="pre">unset_option</span></tt> also work on lists. That
-is, instead of <tt class="docutils literal"><span class="pre">[(unset_option</span> <span class="pre">&quot;A&quot;),</span> <span class="pre">(unset_option</span> <span class="pre">&quot;B&quot;)]</span></tt> you can use
-<tt class="docutils literal"><span class="pre">(unset_option</span> <span class="pre">[&quot;A&quot;,</span> <span class="pre">&quot;B&quot;])</span></tt>. Obviously, <tt class="docutils literal"><span class="pre">(set_option</span> <span class="pre">[&quot;A&quot;,</span> <span class="pre">&quot;B&quot;])</span></tt> is valid
-only if both <tt class="docutils literal"><span class="pre">A</span></tt> and <tt class="docutils literal"><span class="pre">B</span></tt> are switches.</p>
+<p>For convenience, <tt class="docutils literal">set_option</tt> and <tt class="docutils literal">unset_option</tt> also work with multiple
+arguments. That is, instead of <tt class="docutils literal">[(unset_option <span class="pre">&quot;A&quot;),</span> (unset_option <span class="pre">&quot;B&quot;)]</span></tt> you
+can use <tt class="docutils literal">(unset_option &quot;A&quot;, &quot;B&quot;)</tt>. Obviously, <tt class="docutils literal">(set_option &quot;A&quot;, &quot;B&quot;)</tt> is
+only valid if both <tt class="docutils literal">A</tt> and <tt class="docutils literal">B</tt> are switches.</p>
 </div>
 <div class="section" id="more-advanced-topics">
-<h1><a class="toc-backref" href="#id21">More advanced topics</a></h1>
+<h1><a class="toc-backref" href="#id18">More advanced topics</a></h1>
 <div class="section" id="hooks-and-environment-variables">
-<span id="hooks"></span><h2><a class="toc-backref" href="#id22">Hooks and environment variables</a></h2>
-<p>Normally, LLVMC executes programs from the system <tt class="docutils literal"><span class="pre">PATH</span></tt>. Sometimes,
-this is not sufficient: for example, we may want to specify tool paths
-or names in the configuration file. This can be easily achieved via
-the hooks mechanism. To write your own hooks, just add their
-definitions to the <tt class="docutils literal"><span class="pre">PluginMain.cpp</span></tt> or drop a <tt class="docutils literal"><span class="pre">.cpp</span></tt> file into the
-your plugin directory. Hooks should live in the <tt class="docutils literal"><span class="pre">hooks</span></tt> namespace
-and have the signature <tt class="docutils literal"><span class="pre">std::string</span> <span class="pre">hooks::MyHookName</span> <span class="pre">([const</span> <span class="pre">char*</span>
-<span class="pre">Arg0</span> <span class="pre">[</span> <span class="pre">const</span> <span class="pre">char*</span> <span class="pre">Arg2</span> <span class="pre">[,</span> <span class="pre">...]]])</span></tt>. They can be used from the
-<tt class="docutils literal"><span class="pre">cmd_line</span></tt> tool property:</p>
+<span id="hooks"></span><h2><a class="toc-backref" href="#id19">Hooks and environment variables</a></h2>
+<p>Normally, LLVMC searches for programs in the system <tt class="docutils literal">PATH</tt>. Sometimes, this is
+not sufficient: for example, we may want to specify tool paths or names in the
+configuration file. This can be achieved via the hooks mechanism. To write your
+own hooks, add their definitions to the <tt class="docutils literal">Hooks.cpp</tt> or drop a <tt class="docutils literal">.cpp</tt> file
+into your driver directory. Hooks should live in the <tt class="docutils literal">hooks</tt> namespace and
+have the signature <tt class="docutils literal"><span class="pre">std::string</span> <span class="pre">hooks::MyHookName</span> ([const char* Arg0 [ const
+char* Arg2 [, <span class="pre">...]]])</span></tt>. They can be used from the <tt class="docutils literal">command</tt> tool property:</p>
 <pre class="literal-block">
-(cmd_line &quot;$CALL(MyHook)/path/to/file -o $CALL(AnotherHook)&quot;)
+(command &quot;$CALL(MyHook)/path/to/file -o $CALL(AnotherHook)&quot;)
 </pre>
 <p>To pass arguments to hooks, use the following syntax:</p>
 <pre class="literal-block">
-(cmd_line &quot;$CALL(MyHook, 'Arg1', 'Arg2', 'Arg # 3')/path/to/file -o1 -o2&quot;)
+(command &quot;$CALL(MyHook, 'Arg1', 'Arg2', 'Arg # 3')/path/to/file -o1 -o2&quot;)
 </pre>
 <p>It is also possible to use environment variables in the same manner:</p>
 <pre class="literal-block">
-(cmd_line &quot;$ENV(VAR1)/path/to/file -o $ENV(VAR2)&quot;)
+(command &quot;$ENV(VAR1)/path/to/file -o $ENV(VAR2)&quot;)
 </pre>
 <p>To change the command line string based on user-provided options use
-the <tt class="docutils literal"><span class="pre">case</span></tt> expression (documented <a class="reference internal" href="#case">above</a>):</p>
+the <tt class="docutils literal">case</tt> expression (documented <a class="reference internal" href="#case">above</a>):</p>
 <pre class="literal-block">
-(cmd_line
+(command
   (case
     (switch_on &quot;E&quot;),
        &quot;llvm-g++ -E -x c $INFILE -o $OUTFILE&quot;,
@@ -680,41 +629,23 @@ the <tt class="docutils literal"><span class="pre">case</span></tt> expression (
        &quot;llvm-g++ -c -x c $INFILE -o $OUTFILE -emit-llvm&quot;))
 </pre>
 </div>
-<div class="section" id="how-plugins-are-loaded">
-<span id="priorities"></span><h2><a class="toc-backref" href="#id23">How plugins are loaded</a></h2>
-<p>It is possible for LLVMC plugins to depend on each other. For example,
-one can create edges between nodes defined in some other plugin. To
-make this work, however, that plugin should be loaded first. To
-achieve this, the concept of plugin priority was introduced. By
-default, every plugin has priority zero; to specify the priority
-explicitly, put the following line in your plugin's TableGen file:</p>
-<pre class="literal-block">
-def Priority : PluginPriority&lt;$PRIORITY_VALUE&gt;;
-# Where PRIORITY_VALUE is some integer &gt; 0
-</pre>
-<p>Plugins are loaded in order of their (increasing) priority, starting
-with 0. Therefore, the plugin with the highest priority value will be
-loaded last.</p>
-</div>
 <div class="section" id="debugging">
-<h2><a class="toc-backref" href="#id24">Debugging</a></h2>
-<p>When writing LLVMC plugins, it can be useful to get a visual view of
-the resulting compilation graph. This can be achieved via the command
-line option <tt class="docutils literal"><span class="pre">--view-graph</span></tt>. This command assumes that <a class="reference external" href="http://www.graphviz.org/">Graphviz</a> and
-<a class="reference external" href="http://pages.cs.wisc.edu/~ghost/">Ghostview</a> are installed. There is also a <tt class="docutils literal"><span class="pre">--write-graph</span></tt> option that
-creates a Graphviz source file (<tt class="docutils literal"><span class="pre">compilation-graph.dot</span></tt>) in the
-current directory.</p>
-<p>Another useful <tt class="docutils literal"><span class="pre">llvmc</span></tt> option is <tt class="docutils literal"><span class="pre">--check-graph</span></tt>. It checks the
-compilation graph for common errors like mismatched output/input
-language names, multiple default edges and cycles. These checks can't
-be performed at compile-time because the plugins can load code
-dynamically. When invoked with <tt class="docutils literal"><span class="pre">--check-graph</span></tt>, <tt class="docutils literal"><span class="pre">llvmc</span></tt> doesn't
-perform any compilation tasks and returns the number of encountered
-errors as its status code.</p>
+<h2><a class="toc-backref" href="#id20">Debugging</a></h2>
+<p>When writing LLVMC-based drivers, it can be useful to get a visual view of the
+resulting compilation graph. This can be achieved via the command line option
+<tt class="docutils literal"><span class="pre">--view-graph</span></tt> (which assumes that <a class="reference external" href="http://www.graphviz.org/">Graphviz</a> and <a class="reference external" href="http://pages.cs.wisc.edu/~ghost/">Ghostview</a> are
+installed). There is also a <tt class="docutils literal"><span class="pre">--write-graph</span></tt> option that creates a Graphviz
+source file (<tt class="docutils literal"><span class="pre">compilation-graph.dot</span></tt>) in the current directory.</p>
+<p>Another useful <tt class="docutils literal">llvmc</tt> option is <tt class="docutils literal"><span class="pre">--check-graph</span></tt>. It checks the compilation
+graph for common errors like mismatched output/input language names, multiple
+default edges and cycles. When invoked with <tt class="docutils literal"><span class="pre">--check-graph</span></tt>, <tt class="docutils literal">llvmc</tt> doesn't
+perform any compilation tasks and returns the number of encountered errors as
+its status code. In the future, these checks will be performed at compile-time
+and this option will disappear.</p>
 </div>
 <div class="section" id="conditioning-on-the-executable-name">
-<h2><a class="toc-backref" href="#id25">Conditioning on the executable name</a></h2>
-<p>For now, the executable name (the value passed to the driver in <tt class="docutils literal"><span class="pre">argv[0]</span></tt>) is
+<h2><a class="toc-backref" href="#id21">Conditioning on the executable name</a></h2>
+<p>For now, the executable name (the value passed to the driver in <tt class="docutils literal">argv[0]</tt>) is
 accessible only in the C++ code (i.e. hooks). Use the following code:</p>
 <pre class="literal-block">
 namespace llvmc {
@@ -734,8 +665,8 @@ if (strcmp(ProgramName, &quot;mydriver&quot;) == 0) {
 </pre>
 <p>In general, you're encouraged not to make the behaviour dependent on the
 executable file name, and use command-line switches instead. See for example how
-the <tt class="docutils literal"><span class="pre">Base</span></tt> plugin behaves when it needs to choose the correct linker options
-(think <tt class="docutils literal"><span class="pre">g++</span></tt> vs. <tt class="docutils literal"><span class="pre">gcc</span></tt>).</p>
+the <tt class="docutils literal">llvmc</tt> program behaves when it needs to choose the correct linker options
+(think <tt class="docutils literal">g++</tt> vs. <tt class="docutils literal">gcc</tt>).</p>
 <hr />
 <address>
 <a href="http://jigsaw.w3.org/css-validator/check/referer">
diff --git a/docs/CompilerDriverTutorial.html b/docs/CompilerDriverTutorial.html
index 317b1d1..4ed373a 100644
--- a/docs/CompilerDriverTutorial.html
+++ b/docs/CompilerDriverTutorial.html
@@ -3,7 +3,7 @@
 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
-<meta name="generator" content="Docutils 0.5: http://docutils.sourceforge.net/" />
+<meta name="generator" content="Docutils 0.6: http://docutils.sourceforge.net/" />
 <title>Tutorial - Using LLVMC</title>
 <link rel="stylesheet" href="llvm.css" type="text/css" />
 </head>
@@ -18,7 +18,7 @@ The ReST source lives in the directory 'tools/llvmc/doc'. -->
 <p class="topic-title first">Contents</p>
 <ul class="simple">
 <li><a class="reference internal" href="#introduction" id="id1">Introduction</a></li>
-<li><a class="reference internal" href="#compiling-with-llvmc" id="id2">Compiling with LLVMC</a></li>
+<li><a class="reference internal" href="#using-the-llvmc-program" id="id2">Using the <tt class="docutils literal">llvmc</tt> program</a></li>
 <li><a class="reference internal" href="#using-llvmc-to-generate-toolchain-drivers" id="id3">Using LLVMC to generate toolchain drivers</a></li>
 </ul>
 </div>
@@ -26,51 +26,47 @@ The ReST source lives in the directory 'tools/llvmc/doc'. -->
 <p>Written by <a href="mailto:foldr@codedgers.com">Mikhail Glushenkov</a></p>
 </div><div class="section" id="introduction">
 <h1><a class="toc-backref" href="#id1">Introduction</a></h1>
-<p>LLVMC is a generic compiler driver, which plays the same role for LLVM
-as the <tt class="docutils literal"><span class="pre">gcc</span></tt> program does for GCC - the difference being that LLVMC
-is designed to be more adaptable and easier to customize. Most of
-LLVMC functionality is implemented via plugins, which can be loaded
-dynamically or compiled in. This tutorial describes the basic usage
-and configuration of LLVMC.</p>
+<p>LLVMC is a generic compiler driver, which plays the same role for LLVM as the
+<tt class="docutils literal">gcc</tt> program does for GCC - the difference being that LLVMC is designed to be
+more adaptable and easier to customize. Most of LLVMC functionality is
+implemented via high-level TableGen code, from which a corresponding C++ source
+file is automatically generated. This tutorial describes the basic usage and
+configuration of LLVMC.</p>
 </div>
-<div class="section" id="compiling-with-llvmc">
-<h1><a class="toc-backref" href="#id2">Compiling with LLVMC</a></h1>
-<p>In general, LLVMC tries to be command-line compatible with <tt class="docutils literal"><span class="pre">gcc</span></tt> as
-much as possible, so most of the familiar options work:</p>
+<div class="section" id="using-the-llvmc-program">
+<h1><a class="toc-backref" href="#id2">Using the <tt class="docutils literal">llvmc</tt> program</a></h1>
+<p>In general, <tt class="docutils literal">llvmc</tt> tries to be command-line compatible with <tt class="docutils literal">gcc</tt> as much
+as possible, so most of the familiar options work:</p>
 <pre class="literal-block">
 $ llvmc -O3 -Wall hello.cpp
 $ ./a.out
 hello
 </pre>
-<p>This will invoke <tt class="docutils literal"><span class="pre">llvm-g++</span></tt> under the hood (you can see which
-commands are executed by using the <tt class="docutils literal"><span class="pre">-v</span></tt> option). For further help on
-command-line LLVMC usage, refer to the <tt class="docutils literal"><span class="pre">llvmc</span> <span class="pre">--help</span></tt> output.</p>
+<p>This will invoke <tt class="docutils literal"><span class="pre">llvm-g++</span></tt> under the hood (you can see which commands are
+executed by using the <tt class="docutils literal"><span class="pre">-v</span></tt> option). For further help on command-line LLVMC
+usage, refer to the <tt class="docutils literal">llvmc <span class="pre">--help</span></tt> output.</p>
 </div>
 <div class="section" id="using-llvmc-to-generate-toolchain-drivers">
 <h1><a class="toc-backref" href="#id3">Using LLVMC to generate toolchain drivers</a></h1>
-<p>LLVMC plugins are written mostly using <a class="reference external" href="http://llvm.org/docs/TableGenFundamentals.html">TableGen</a>, so you need to
-be familiar with it to get anything done.</p>
-<p>Start by compiling <tt class="docutils literal"><span class="pre">example/Simple</span></tt>, which is a primitive wrapper for
-<tt class="docutils literal"><span class="pre">gcc</span></tt>:</p>
+<p>LLVMC-based drivers are written mostly using <a class="reference external" href="http://llvm.org/docs/TableGenFundamentals.html">TableGen</a>, so you need to be
+familiar with it to get anything done.</p>
+<p>Start by compiling <tt class="docutils literal">example/Simple</tt>, which is a primitive wrapper for
+<tt class="docutils literal">gcc</tt>:</p>
 <pre class="literal-block">
-$ cd $LLVM_DIR/tools/llvmc
-$ cp -r example/Simple plugins/Simple
-
-  # NB: A less verbose way to compile standalone LLVMC-based drivers is
-  # described in the reference manual.
-
-$ make LLVMC_BASED_DRIVER_NAME=mygcc LLVMC_BUILTIN_PLUGINS=Simple
+$ cd $LLVM_OBJ_DIR/tools/examples/Simple
+$ make
 $ cat &gt; hello.c
-[...]
-$ mygcc hello.c
+#include &lt;stdio.h&gt;
+int main() { printf(&quot;Hello\n&quot;); }
+$ $LLVM_BIN_DIR/Simple -v hello.c
+gcc hello.c -o hello.out
 $ ./hello.out
 Hello
 </pre>
-<p>Here we link our plugin with the LLVMC core statically to form an executable
-file called <tt class="docutils literal"><span class="pre">mygcc</span></tt>. It is also possible to build our plugin as a dynamic
-library to be loaded by the <tt class="docutils literal"><span class="pre">llvmc</span></tt> executable (or any other LLVMC-based
-standalone driver); this is described in the reference manual.</p>
-<p>Contents of the file <tt class="docutils literal"><span class="pre">Simple.td</span></tt> look like this:</p>
+<p>We have thus produced a simple driver called, appropriately, <tt class="docutils literal">Simple</tt>, from
+the input TableGen file <tt class="docutils literal">Simple.td</tt>. The <tt class="docutils literal">llvmc</tt> program itself is generated
+using a similar process (see <tt class="docutils literal">llvmc/src</tt>). Contents of the file <tt class="docutils literal">Simple.td</tt>
+look like this:</p>
 <pre class="literal-block">
 // Include common definitions
 include &quot;llvm/CompilerDriver/Common.td&quot;
@@ -80,33 +76,36 @@ def gcc : Tool&lt;
 [(in_language &quot;c&quot;),
  (out_language &quot;executable&quot;),
  (output_suffix &quot;out&quot;),
- (cmd_line &quot;gcc $INFILE -o $OUTFILE&quot;),
- (sink)
+ (command &quot;gcc&quot;),
+ (sink),
+
+ // -o is what is used by default, out_file_option here is included for
+ // instructive purposes.
+ (out_file_option &quot;-o&quot;)
 ]&gt;;
 
 // Language map
-def LanguageMap : LanguageMap&lt;[LangToSuffixes&lt;&quot;c&quot;, [&quot;c&quot;]&gt;]&gt;;
+def LanguageMap : LanguageMap&lt;[(lang_to_suffixes &quot;c&quot;, &quot;c&quot;)]&gt;;
 
 // Compilation graph
-def CompilationGraph : CompilationGraph&lt;[Edge&lt;&quot;root&quot;, &quot;gcc&quot;&gt;]&gt;;
+def CompilationGraph : CompilationGraph&lt;[(edge &quot;root&quot;, &quot;gcc&quot;)]&gt;;
 </pre>
-<p>As you can see, this file consists of three parts: tool descriptions,
-language map, and the compilation graph definition.</p>
-<p>At the heart of LLVMC is the idea of a compilation graph: vertices in
-this graph are tools, and edges represent a transformation path
-between two tools (for example, assembly source produced by the
-compiler can be transformed into executable code by an assembler). The
-compilation graph is basically a list of edges; a special node named
-<tt class="docutils literal"><span class="pre">root</span></tt> is used to mark graph entry points.</p>
-<p>Tool descriptions are represented as property lists: most properties
-in the example above should be self-explanatory; the <tt class="docutils literal"><span class="pre">sink</span></tt> property
-means that all options lacking an explicit description should be
-forwarded to this tool.</p>
-<p>The <tt class="docutils literal"><span class="pre">LanguageMap</span></tt> associates a language name with a list of suffixes
-and is used for deciding which toolchain corresponds to a given input
-file.</p>
-<p>To learn more about LLVMC customization, refer to the reference
-manual and plugin source code in the <tt class="docutils literal"><span class="pre">plugins</span></tt> directory.</p>
+<p>As you can see, this file consists of three parts: tool descriptions, language
+map, and the compilation graph definition.</p>
+<p>At the heart of LLVMC is the idea of a compilation graph: vertices in this graph
+are tools, and edges represent a transformation path between two tools (for
+example, assembly source produced by the compiler can be transformed into
+executable code by an assembler). The compilation graph is basically a list of
+edges; a special node named <tt class="docutils literal">root</tt> is used to mark graph entry points.</p>
+<p>Tool descriptions are represented as property lists: most properties in the
+example above should be self-explanatory; the <tt class="docutils literal">sink</tt> property means that all
+options lacking an explicit description should be forwarded to this tool.</p>
+<p>The <tt class="docutils literal">LanguageMap</tt> associates a language name with a list of suffixes and is
+used for deciding which toolchain corresponds to a given input file.</p>
+<p>To learn more about writing your own drivers with LLVMC, refer to the reference
+manual and examples in the <tt class="docutils literal">examples</tt> directory. Of a particular interest is
+the <tt class="docutils literal">Skeleton</tt> example, which can serve as a template for your LLVMC-based
+drivers.</p>
 <hr />
 <address>
 <a href="http://jigsaw.w3.org/css-validator/check/referer">
diff --git a/docs/CompilerWriterInfo.html b/docs/CompilerWriterInfo.html
index 5d071f7..d9201fc 100644
--- a/docs/CompilerWriterInfo.html
+++ b/docs/CompilerWriterInfo.html
@@ -9,9 +9,9 @@
 
 <body>
 
-<div class="doc_title">
+<h1>
   Architecture/platform information for compiler writers
-</div>
+</h1>
 
 <div class="doc_warning">
   <p>Note: This document is a work-in-progress.  Additions and clarifications
@@ -43,13 +43,15 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="hw">Hardware</a></div>
+<h2><a name="hw">Hardware</a></h2>
 <!-- *********************************************************************** -->
 
+<div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="alpha">Alpha</a></div>
+<h3><a name="alpha">Alpha</a></h3>
 
-<div class="doc_text">
+<div>
 <ul>
 <li><a
 href="http://ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html">Alpha manuals</a> 
@@ -58,9 +60,9 @@ href="http://ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-libra
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="arm">ARM</a></div>
+<h3><a name="arm">ARM</a></h3>
 
-<div class="doc_text">
+<div>
 <ul>
 <li><a href="http://www.arm.com/documentation/">ARM documentation</a> 
 (<a href="http://www.arm.com/documentation/ARMProcessor_Cores/">Processor
@@ -70,9 +72,9 @@ Cores</a>)</li>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="ia64">Itanium (ia64)</a></div>
+<h3><a name="ia64">Itanium (ia64)</a></h3>
 
-<div class="doc_text">
+<div>
 <ul>
 <li><a
 href="http://developer.intel.com/design/itanium2/documentation.htm">Itanium documentation</a> 
@@ -81,9 +83,9 @@ href="http://developer.intel.com/design/itanium2/documentation.htm">Itanium docu
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="mips">MIPS</a></div>
+<h3><a name="mips">MIPS</a></h3>
 
-<div class="doc_text">
+<div>
 <ul>
 <li><a
 href="http://mips.com/content/Documentation/MIPSDocumentation/ProcessorArchitecture/doclibrary">MIPS
@@ -92,12 +94,14 @@ Processor Architecture</a></li>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="ppc">PowerPC</a></div>
+<h3><a name="ppc">PowerPC</a></h3>
+
+<div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">IBM - Official manuals and docs</div>
+<h4>IBM - Official manuals and docs</h4>
 
-<div class="doc_text">
+<div>
 
 <ul>
 <li><a
@@ -129,9 +133,9 @@ PowerPC architecture</a></li>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">Other documents, collections, notes</div>
+<h4>Other documents, collections, notes</h4>
 
-<div class="doc_text">
+<div>
 
 <ul>
 <li><a href="http://penguinppc.org/dev/#library">PowerPC ABI documents</a></li>
@@ -143,10 +147,12 @@ branch stubs for powerpc64-linux (from binutils)</a></li>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="sparc">SPARC</a></div>
+<h3><a name="sparc">SPARC</a></h3>
 
-<div class="doc_text">
+<div>
 
 <ul>
 <li><a href="http://www.sparc.org/resource.htm">SPARC resources</a></li>
@@ -156,12 +162,14 @@ branch stubs for powerpc64-linux (from binutils)</a></li>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="x86">X86</a></div>
+<h3><a name="x86">X86</a></h3>
+
+<div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">AMD - Official manuals and docs</div>
+<h4>AMD - Official manuals and docs</h4>
 
-<div class="doc_text">
+<div>
 <ul>
 <li><a
 href="http://www.amd.com/us-en/Processors/TechnicalResources/0,,30_182_739,00.html">AMD processor manuals</a></li>
@@ -170,9 +178,9 @@ href="http://www.amd.com/us-en/Processors/TechnicalResources/0,,30_182_739,00.ht
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">Intel - Official manuals and docs</div>
+<h4>Intel - Official manuals and docs</h4>
 
-<div class="doc_text">
+<div>
 <ul>
 <li><a
 href="http://developer.intel.com/design/pentium4/manuals/index_new.htm">IA-32
@@ -184,19 +192,21 @@ Itanium documentation</a></li>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">Other x86-specific information</div>
+<h4>Other x86-specific information</h4>
 
-<div class="doc_text">
+<div>
 <ul>
 <li><a href="http://www.agner.org/assem/calling_conventions.pdf">Calling
 conventions for different C++ compilers and operating systems</a></li>
 </ul>
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="other">Other relevant lists</a></div>
+<h3><a name="other">Other relevant lists</a></h3>
 
-<div class="doc_text">
+<div>
 
 <ul>
 <li><a href="http://gcc.gnu.org/readings.html">GCC reading list</a></li>
@@ -204,14 +214,18 @@ conventions for different C++ compilers and operating systems</a></li>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="abi">ABI</a></div>
+<h2><a name="abi">ABI</a></h2>
 <!-- *********************************************************************** -->
 
+<div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="linux">Linux</a></div>
+<h3><a name="linux">Linux</a></h3>
 
-<div class="doc_text">
+<div>
 <ol>
 <li><a href="http://www.linuxbase.org/spec/ELF/ppc64/">PowerPC 64-bit ELF ABI
 Supplement</a></li>
@@ -219,9 +233,9 @@ Supplement</a></li>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="osx">OS X</a></div>
+<h3><a name="osx">OS X</a></h3>
 
-<div class="doc_text">
+<div>
 <ol>
 <li><a
 href="http://developer.apple.com/documentation/Darwin/RuntimeArchitecture-date.html">Mach-O
@@ -232,8 +246,10 @@ ABI</a></li>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="misc">Miscellaneous resources</a></div>
+<h2><a name="misc">Miscellaneous resources</a></h2>
 <!-- *********************************************************************** -->
 
 <ul>
@@ -255,7 +271,7 @@ processors.</li>
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="http://misha.brukman.net">Misha Brukman</a><br>
-  <a href="http://llvm.org">LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 
diff --git a/docs/DebuggingJITedCode.html b/docs/DebuggingJITedCode.html
index 7c998bb..3911ea7 100644
--- a/docs/DebuggingJITedCode.html
+++ b/docs/DebuggingJITedCode.html
@@ -7,7 +7,7 @@
 </head>
 <body>
 
-<div class="doc_title">Debugging JITed Code With GDB</div>
+<h1>Debugging JITed Code With GDB</h1>
 <ol>
   <li><a href="#example">Example usage</a></li>
   <li><a href="#background">Background</a></li>
@@ -15,9 +15,9 @@
 <div class="doc_author">Written by Reid Kleckner</div>
 
 <!--=========================================================================-->
-<div class="doc_section"><a name="example">Example usage</a></div>
+<h2><a name="example">Example usage</a></h2>
 <!--=========================================================================-->
-<div class="doc_text">
+<div>
 
 <p>In order to debug code JITed by LLVM, you need GDB 7.0 or newer, which is
 available on most modern distributions of Linux.  The version of GDB that Apple
@@ -96,9 +96,9 @@ function names.
 </div>
 
 <!--=========================================================================-->
-<div class="doc_section"><a name="background">Background</a></div>
+<h2><a name="background">Background</a></h2>
 <!--=========================================================================-->
-<div class="doc_text">
+<div>
 
 <p>Without special runtime support, debugging dynamically generated code with
 GDB (as well as most debuggers) can be quite painful.  Debuggers generally read
@@ -145,7 +145,7 @@ coordinate with GDB to get better debug information.
   <a href="http://validator.w3.org/check/referer"><img
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
   <a href="mailto:reid.kleckner@gmail.com">Reid Kleckner</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/DeveloperPolicy.html b/docs/DeveloperPolicy.html
index ef7ba39..fee2838 100644
--- a/docs/DeveloperPolicy.html
+++ b/docs/DeveloperPolicy.html
@@ -8,7 +8,7 @@
 </head>
 <body>
       
-<div class="doc_title">LLVM Developer Policy</div>
+<h1>LLVM Developer Policy</h1>
 <ol>
   <li><a href="#introduction">Introduction</a></li>
   <li><a href="#policies">Developer Policies</a>
@@ -34,9 +34,9 @@
 <div class="doc_author">Written by the LLVM Oversight Team</div>
 
 <!--=========================================================================-->
-<div class="doc_section"><a name="introduction">Introduction</a></div>
+<h2><a name="introduction">Introduction</a></h2>
 <!--=========================================================================-->
-<div class="doc_text">
+<div>
 <p>This document contains the LLVM Developer Policy which defines the project's
    policy towards developers and their contributions. The intent of this policy
    is to eliminate miscommunication, rework, and confusion that might arise from
@@ -63,20 +63,19 @@
 </div>
 
 <!--=========================================================================-->
-<div class="doc_section"><a name="policies">Developer Policies</a></div>
+<h2><a name="policies">Developer Policies</a></h2>
 <!--=========================================================================-->
-<div class="doc_text">
+<div>
 <p>This section contains policies that pertain to frequent LLVM developers.  We
    always welcome <a href="#patches">one-off patches</a> from people who do not
    routinely contribute to LLVM, but we expect more from frequent contributors
    to keep the system as efficient as possible for everyone.  Frequent LLVM
    contributors are expected to meet the following requirements in order for
    LLVM to maintain a high standard of quality.<p>
-</div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"> <a name="informed">Stay Informed</a> </div>
-<div class="doc_text">
+<h3><a name="informed">Stay Informed</a></h3>
+<div>
 <p>Developers should stay informed by reading at least the "dev" mailing list
    for the projects you are interested in, such as 
    <a href="http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev">llvmdev</a> for
@@ -102,9 +101,9 @@
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"> <a name="patches">Making a Patch</a></div>
+<h3><a name="patches">Making a Patch</a></h3>
 
-<div class="doc_text">
+<div>
 <p>When making a patch for review, the goal is to make it as easy for the
    reviewer to read it as possible.  As such, we recommend that you:</p>
 
@@ -142,8 +141,8 @@
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"> <a name="reviews">Code Reviews</a></div>
-<div class="doc_text">
+<h3><a name="reviews">Code Reviews</a></h3>
+<div>
 <p>LLVM has a code review policy. Code review is one way to increase the quality
    of software. We generally follow these policies:</p>
 
@@ -174,8 +173,8 @@
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"> <a name="owners">Code Owners</a></div>
-<div class="doc_text">
+<h3><a name="owners">Code Owners</a></h3>
+<div>
 
 <p>The LLVM Project relies on two features of its process to maintain rapid
    development in addition to the high quality of its source base: the
@@ -225,8 +224,8 @@
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"> <a name="testcases">Test Cases</a></div>
-<div class="doc_text">
+<h3><a name="testcases">Test Cases</a></h3>
+<div>
 <p>Developers are required to create test cases for any bugs fixed and any new
    features added.  Some tips for getting your testcase approved:</p>
 
@@ -258,8 +257,8 @@
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"> <a name="quality">Quality</a></div>
-<div class="doc_text">
+<h3><a name="quality">Quality</a></h3>
+<div>
 <p>The minimum quality standards that any change must satisfy before being
    committed to the main development branch are:</p>
 
@@ -318,9 +317,8 @@
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection">
-  <a name="commitaccess">Obtaining Commit Access</a></div>
-<div class="doc_text">
+<h3><a name="commitaccess">Obtaining Commit Access</a></h3>
+<div>
 
 <p>We grant commit access to contributors with a track record of submitting high
    quality patches.  If you would like commit access, please send an email to
@@ -381,8 +379,8 @@
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"> <a name="newwork">Making a Major Change</a></div>
-<div class="doc_text">
+<h3><a name="newwork">Making a Major Change</a></h3>
+<div>
 <p>When a developer begins a major new project with the aim of contributing it
    back to LLVM, s/he should inform the community with an email to
    the <a href="http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev">llvmdev</a>
@@ -410,9 +408,8 @@
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"> <a name="incremental">Incremental Development</a>
-</div>
-<div class="doc_text">
+<h3><a name="incremental">Incremental Development</a></h3>
+<div>
 <p>In the LLVM project, we do all significant changes as a series of incremental
    patches.  We have a strong dislike for huge changes or long-term development
    branches.  Long-term development branches have a number of drawbacks:</p>
@@ -472,9 +469,8 @@
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="attribution">Attribution of 
-Changes</a></div>
-<div class="doc_text">
+<h3><a name="attribution">Attribution of Changes</a></h3>
+<div>
 <p>We believe in correct attribution of contributions to their contributors.
    However, we do not want the source code to be littered with random
    attributions "this code written by J. Random Hacker" (this is noisy and
@@ -486,13 +482,15 @@ Changes</a></div>
 <p>Overall, please do not add contributor names to the source code.</p>
 </div>
 
+</div>
+
 <!--=========================================================================-->
-<div class="doc_section">
+<h2>
   <a name="clp">Copyright, License, and Patents</a>
-</div>
+</h2>
 <!--=========================================================================-->
 
-<div class="doc_text">
+<div>
 <p>This section addresses the issues of copyright, license and patents for the
    LLVM project.  Currently, the University of Illinois is the LLVM copyright
    holder and the terms of its license to LLVM users and developers is the
@@ -504,11 +502,10 @@ Changes</a></div>
    legal matters but does not provide legal advice.  We are not lawyers, please
    seek legal counsel from an attorney.</p>
 </div>
-</div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="copyright">Copyright</a></div>
-<div class="doc_text">
+<h3><a name="copyright">Copyright</a></h3>
+<div>
 
 <p>The LLVM project does not require copyright assignments, which means that the
    copyright for the code in the project is held by its respective contributors
@@ -530,8 +527,8 @@ Changes</a></div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="license">License</a></div>
-<div class="doc_text">
+<h3><a name="license">License</a></h3>
+<div>
 <p>We intend to keep LLVM perpetually open source and to use a liberal open
    source license. All of the code in LLVM is available under the
    <a href="http://www.opensource.org/licenses/UoI-NCSA.php">University of
@@ -585,8 +582,8 @@ Changes</a></div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="patents">Patents</a></div>
-<div class="doc_text">
+<h3><a name="patents">Patents</a></h3>
+<div>
 <p>To the best of our knowledge, LLVM does not infringe on any patents (we have
    actually removed code from LLVM in the past that was found to infringe).
    Having code in LLVM that infringes on patents would violate an important goal
@@ -602,6 +599,8 @@ Changes</a></div>
    details.</p>
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
 <hr>
 <address>
@@ -611,7 +610,7 @@ Changes</a></div>
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
   Written by the 
   <a href="mailto:llvm-oversight@cs.uiuc.edu">LLVM Oversight Group</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/ExceptionHandling.html b/docs/ExceptionHandling.html
index d597ffb..247448d 100644
--- a/docs/ExceptionHandling.html
+++ b/docs/ExceptionHandling.html
@@ -11,7 +11,7 @@
 
 <body>
 
-<div class="doc_title">Exception Handling in LLVM</div>
+<h1>Exception Handling in LLVM</h1>
 
 <table class="layout" style="width:100%">
   <tr class="layout">
@@ -35,6 +35,7 @@
   <ol>
   	<li><a href="#llvm_eh_exception"><tt>llvm.eh.exception</tt></a></li>
   	<li><a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a></li>
+  	<li><a href="#llvm_eh_resume"><tt>llvm.eh.resume</tt></a></li>
   	<li><a href="#llvm_eh_typeid_for"><tt>llvm.eh.typeid.for</tt></a></li>
   	<li><a href="#llvm_eh_sjlj_setjmp"><tt>llvm.eh.sjlj.setjmp</tt></a></li>
   	<li><a href="#llvm_eh_sjlj_longjmp"><tt>llvm.eh.sjlj.longjmp</tt></a></li>
@@ -58,10 +59,10 @@
 
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="introduction">Introduction</a></div>
+<h2><a name="introduction">Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This document is the central repository for all information pertaining to
    exception handling in LLVM.  It describes the format that LLVM exception
@@ -70,14 +71,12 @@
    provides specific examples of what exception handling information is used for
    in C/C++.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="itanium">Itanium ABI Zero-cost Exception Handling</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Exception handling for most programming languages is designed to recover from
    conditions that rarely occur during general use of an application.  To that
@@ -106,11 +105,11 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="sjlj">Setjmp/Longjmp Exception Handling</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Setjmp/Longjmp (SJLJ) based exception handling uses LLVM intrinsics
    <a href="#llvm_eh_sjlj_setjmp"><tt>llvm.eh.sjlj.setjmp</tt></a> and
@@ -138,11 +137,11 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="overview">Overview</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>When an exception is thrown in LLVM code, the runtime does its best to find a
    handler suited to processing the circumstance.</p>
@@ -185,12 +184,14 @@
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_section">
+<h2>
   <a name="codegen">LLVM Code Generation</a>
-</div>
+</h2>
 
-<div class="doc_text">
+<div>
 
 <p>At the time of this writing, only C++ exception handling support is available
    in LLVM.  So the remainder of this document will be somewhat C++-centric.</p>
@@ -200,14 +201,12 @@
    we will describe the implementation of LLVM exception handling in terms of
    C++ examples.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="throw">Throw</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Languages that support exception handling typically provide a <tt>throw</tt>
    operation to initiate the exception process.  Internally, a throw operation
@@ -225,11 +224,11 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="try_catch">Try/Catch</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>A call within the scope of a <i>try</i> statement can potentially raise an
    exception.  In those circumstances, the LLVM C++ front-end replaces the call
@@ -313,30 +312,43 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="cleanups">Cleanups</a>
-</div>
+</h3>
+
+<div>
 
-<div class="doc_text">
+<p>A cleanup is extra code which needs to be run as part of unwinding
+   a scope.  C++ destructors are a prominent example, but other
+   languages and language extensions provide a variety of different
+   kinds of cleanup.  In general, a landing pad may need to run
+   arbitrary amounts of cleanup code before actually entering a catch
+   block.  To indicate the presence of cleanups, a landing pad's call
+   to <a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a> should
+   end with the argument <tt>i32 0</tt>; otherwise, the unwinder will
+   not stop at the landing pad if there are no catches or filters that
+   require it to.</p>
 
-<p>To handle destructors and cleanups in <tt>try</tt> code, control may not run
-   directly from a landing pad to the first catch.  Control may actually flow
-   from the landing pad to clean up code and then to the first catch.  Since the
-   required clean up for each <tt>invoke</tt> in a <tt>try</tt> may be different
-   (e.g. intervening constructor), there may be several landing pads for a given
-   try.  If cleanups need to be run, an <tt>i32 0</tt> should be passed as the
-   last <a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a> argument.
-   However, when using DWARF exception handling with C++, a <tt>i8* null</tt>
-   <a href="#restrictions">must</a> be passed instead.</p>
+<p>Do not allow a new exception to propagate out of the execution of a
+   cleanup.  This can corrupt the internal state of the unwinder.
+   Different languages describe different high-level semantics for
+   these situations: for example, C++ requires that the process be
+   terminated, whereas Ada cancels both exceptions and throws a third.</p>
+
+<p>When all cleanups have completed, if the exception is not handled
+   by the current function, resume unwinding by calling the
+   <a href="#llvm_eh_resume"><tt>llvm.eh.resume</tt></a> intrinsic,
+   passing in the results of <tt>llvm.eh.exception</tt> and
+   <tt>llvm.eh.selector</tt> for the original landing pad.</p>
 
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="throw_filters">Throw Filters</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>C++ allows the specification of which exception types can be thrown from a
    function.  To represent this a top level landing pad may exist to filter out
@@ -359,50 +371,57 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="restrictions">Restrictions</a>
-</div>
+</h3>
+
+<div>
 
-<div class="doc_text">
-
-<p>The semantics of the invoke instruction require that any exception that
-   unwinds through an invoke call should result in a branch to the invoke's
-   unwind label.  However such a branch will only happen if the
-   <a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a> matches. Thus in
-   order to ensure correct operation, the front-end must only generate
-   <a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a> calls that are
-   guaranteed to always match whatever exception unwinds through the invoke.
-   For most languages it is enough to pass zero, indicating the presence of
-   a <a href="#cleanups">cleanup</a>, as the
-   last <a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a> argument.
-   However for C++ this is not sufficient, because the C++ personality function
-   will terminate the program if it detects that unwinding the exception only
-   results in matches with cleanups.  For C++ a <tt>null i8*</tt> should be
-   passed as the last <a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a>
-   argument instead.  This is interpreted as a catch-all by the C++ personality
-   function, and will always match.</p>
+<p>The unwinder delegates the decision of whether to stop in a call
+   frame to that call frame's language-specific personality function.
+   Not all personalities functions guarantee that they will stop to
+   perform cleanups: for example, the GNU C++ personality doesn't do
+   so unless the exception is actually caught somewhere further up the
+   stack.  When using this personality to implement EH for a language
+   that guarantees that cleanups will always be run, be sure to
+   indicate a catch-all in the
+   <a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a> call
+   rather than just cleanups.</p>
+
+<p>In order for inlining to behave correctly, landing pads must be
+   prepared to handle selector results that they did not originally
+   advertise.  Suppose that a function catches exceptions of
+   type <tt>A</tt>, and it's inlined into a function that catches
+   exceptions of type <tt>B</tt>.  The inliner will update the
+   selector for the inlined landing pad to include the fact
+   that <tt>B</tt> is caught.  If that landing pad assumes that it
+   will only be entered to catch an <tt>A</tt>, it's in for a rude
+   surprise.  Consequently, landing pads must test for the selector
+   results they understand and then resume exception propagation
+   with the <a href="#llvm_eh_resume"><tt>llvm.eh.resume</tt></a>
+   intrinsic if none of the conditions match.</p>
+
+</div>
 
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_section">
+<h2>
   <a name="format_common_intrinsics">Exception Handling Intrinsics</a>
-</div>
+</h2>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM uses several intrinsic functions (name prefixed with "llvm.eh") to
    provide exception handling information at various points in generated
    code.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="llvm_eh_exception">llvm.eh.exception</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <pre>
   i8* %<a href="#llvm_eh_exception">llvm.eh.exception</a>()
@@ -413,11 +432,11 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="llvm_eh_selector">llvm.eh.selector</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <pre>
   i32 %<a href="#llvm_eh_selector">llvm.eh.selector</a>(i8*, i8*, ...)
@@ -426,30 +445,40 @@
 <p>This intrinsic is used to compare the exception with the given type infos,
    filters and cleanups.</p>
 
-<p><a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a> takes a minimum of
-   three arguments.  The first argument is the reference to the exception
-   structure. The second argument is a reference to the personality function to
-   be used for this try catch sequence. Each of the remaining arguments is
-   either a reference to the type info for a catch statement,
-   a <a href="#throw_filters">filter</a> expression, or the number zero
-   representing a <a href="#cleanups">cleanup</a>.  The exception is tested
-   against the arguments sequentially from first to last.  The result of
-   the <a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a> is a positive
-   number if the exception matched a type info, a negative number if it matched
-   a filter, and zero if it matched a cleanup.  If nothing is matched, the
-   behaviour of the program is <a href="#restrictions">undefined</a>.  If a type
-   info matched then the selector value is the index of the type info in the
-   exception table, which can be obtained using the
+<p><a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a> takes a
+   minimum of three arguments.  The first argument is the reference to
+   the exception structure. The second argument is a reference to the
+   personality function to be used for this try catch sequence. Each
+   of the remaining arguments is either a reference to the type info
+   for a catch statement, a <a href="#throw_filters">filter</a>
+   expression, or the number zero representing
+   a <a href="#cleanups">cleanup</a>.  The exception is tested against
+   the arguments sequentially from first to last.  The result of
+   the <a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a> is a
+   positive number if the exception matched a type info, a negative
+   number if it matched a filter, and zero if it matched a cleanup.
+   If nothing is matched, or if only a cleanup is matched, different
+   personality functions may or may not cause control to stop at the
+   landing pad; see <a href="#restrictions">the restrictions</a> for
+   more information.  If a type info matched then the selector value
+   is the index of the type info in the exception table, which can be
+   obtained using the
    <a href="#llvm_eh_typeid_for"><tt>llvm.eh.typeid.for</tt></a> intrinsic.</p>
 
+<p>If a landing pad containing a call to <tt>llvm.eh.selector</tt> is
+   inlined into an <tt>invoke</tt> instruction, the selector arguments
+   for the outer landing pad are appended to those of the inlined
+   landing pad.  Consequently, landing pads must be written to ignore
+   selector values that they did not originally advertise.</p>
+
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="llvm_eh_typeid_for">llvm.eh.typeid.for</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <pre>
   i32 %<a href="#llvm_eh_typeid_for">llvm.eh.typeid.for</a>(i8*)
@@ -463,11 +492,38 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
-  <a name="llvm_eh_sjlj_setjmp">llvm.eh.sjlj.setjmp</a>
+<h4>
+  <a name="llvm_eh_resume">llvm.eh.resume</a>
+</h4>
+
+<div>
+
+<pre>
+  void %<a href="#llvm_eh_resume">llvm.eh.resume</a>(i8*, i32) noreturn
+</pre>
+
+<p>This intrinsic is used to resume propagation of an exception after
+   landing at a landing pad.  The first argument should be the result
+   of <a href="#llvm_eh_exception">llvm.eh.exception</a> for that
+   landing pad, and the second argument should be the result of
+   <a href="#llvm_eh_selector">llvm.eh.selector</a>.  When a call to
+   this intrinsic is inlined into an invoke, the call is transformed
+   into a branch to the invoke's unwind destination, using its
+   arguments in place of the calls
+   to <a href="#llvm_eh_exception">llvm.eh.exception</a> and
+   <a href="#llvm_eh_selector">llvm.eh.selector</a> there.</p>
+
+<p>This intrinsic is not implicitly <tt>nounwind</tt>; calls to it
+   will always throw.  It may not be invoked.</p>
+
 </div>
 
-<div class="doc_text">
+<!-- ======================================================================= -->
+<h4>
+  <a name="llvm_eh_sjlj_setjmp">llvm.eh.sjlj.setjmp</a>
+</h4>
+
+<div>
 
 <pre>
   i32 %<a href="#llvm_eh_sjlj_setjmp">llvm.eh.sjlj.setjmp</a>(i8*)
@@ -492,11 +548,11 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="llvm_eh_sjlj_longjmp">llvm.eh.sjlj.longjmp</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <pre>
   void %<a href="#llvm_eh_sjlj_longjmp">llvm.eh.sjlj.setjmp</a>(i8*)
@@ -507,16 +563,16 @@
    style exception handling. The single parameter is a pointer to a
    buffer populated by <a href="#llvm_eh_sjlj_setjmp">
      <tt>llvm.eh.sjlj.setjmp</tt></a>. The frame pointer and stack pointer
-   are restored from the buffer, then control is transfered to the
+   are restored from the buffer, then control is transferred to the
    destination address.</p>
 
 </div>
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="llvm_eh_sjlj_lsda">llvm.eh.sjlj.lsda</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <pre>
   i8* %<a href="#llvm_eh_sjlj_lsda">llvm.eh.sjlj.lsda</a>()
@@ -531,11 +587,11 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="llvm_eh_sjlj_callsite">llvm.eh.sjlj.callsite</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <pre>
   void %<a href="#llvm_eh_sjlj_callsite">llvm.eh.sjlj.callsite</a>(i32)
@@ -549,11 +605,11 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="llvm_eh_sjlj_dispatchsetup">llvm.eh.sjlj.dispatchsetup</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <pre>
   void %<a href="#llvm_eh_sjlj_dispatchsetup">llvm.eh.sjlj.dispatchsetup</a>(i32)
@@ -565,24 +621,24 @@
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_section">
+<h2>
   <a name="asm">Asm Table Formats</a>
-</div>
+</h2>
 
-<div class="doc_text">
+<div>
 
 <p>There are two tables that are used by the exception handling runtime to
    determine which actions should take place when an exception is thrown.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="unwind_tables">Exception Handling Frame</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>An exception handling frame <tt>eh_frame</tt> is very similar to the unwind
    frame used by dwarf debug info.  The frame contains all the information
@@ -596,11 +652,11 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="exception_tables">Exception Tables</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>An exception table contains information about what actions to take when an
    exception is thrown in a particular part of a function's code.  There is one
@@ -611,12 +667,14 @@
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_section">
+<h2>
   <a name="todo">ToDo</a>
-</div>
+</h2>
 
-<div class="doc_text">
+<div>
 
 <ol>
 
@@ -636,7 +694,7 @@
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 
diff --git a/docs/ExtendingLLVM.html b/docs/ExtendingLLVM.html
index 647fa01..b720911 100644
--- a/docs/ExtendingLLVM.html
+++ b/docs/ExtendingLLVM.html
@@ -8,9 +8,9 @@
 
 <body>
 
-<div class="doc_title">
+<h1>
   Extending LLVM: Adding instructions, intrinsics, types, etc.
-</div>
+</h1>
 
 <ol>
   <li><a href="#introduction">Introduction and Warning</a></li>
@@ -31,12 +31,12 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="introduction">Introduction and Warning</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>During the course of using LLVM, you may wish to customize it for your
 research project or for experimentation. At this point, you may realize that
@@ -68,12 +68,12 @@ effort by doing so.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="intrinsic">Adding a new intrinsic function</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Adding a new intrinsic function to LLVM is much easier than adding a new
 instruction.  Almost all extensions to LLVM should start as an intrinsic
@@ -130,12 +130,12 @@ support for it.  Generally you must do the following steps:</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="sdnode">Adding a new SelectionDAG node</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>As with intrinsics, adding a new SelectionDAG node to LLVM is much easier
 than adding a new instruction.  New nodes are often added to help represent
@@ -220,12 +220,12 @@ complicated behavior in a single node (rotate).</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="instruction">Adding a new instruction</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p><span class="doc_warning">WARNING: adding instructions changes the bitcode
 format, and it will take some effort to maintain compatibility with
@@ -277,25 +277,23 @@ to understand this new instruction.</p>
 
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="type">Adding a new type</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p><span class="doc_warning">WARNING: adding new types changes the bitcode
 format, and will break compatibility with currently-existing LLVM
 installations.</span> Only add new types if it is absolutely necessary.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="fund_type">Adding a fundamental type</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <ol>
 
@@ -317,11 +315,11 @@ installations.</span> Only add new types if it is absolutely necessary.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="derived_type">Adding a derived type</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <ol>
 <li><tt>llvm/include/llvm/Type.h</tt>:
@@ -373,6 +371,8 @@ void calcTypeName(const Type *Ty,
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
 
 <hr>
@@ -382,7 +382,7 @@ void calcTypeName(const Type *Ty,
   <a href="http://validator.w3.org/check/referer"><img
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a>
   <br>
   Last modified: $Date$
 </address>
diff --git a/docs/FAQ.html b/docs/FAQ.html
index 88c7676..620cf25 100644
--- a/docs/FAQ.html
+++ b/docs/FAQ.html
@@ -12,9 +12,9 @@
 </head>
 <body>
 
-<div class="doc_title">
+<h1>
   LLVM: Frequently Asked Questions
-</div>
+</h1>
 
 <ol>
   <li><a href="#license">License</a>
@@ -133,14 +133,14 @@
 </ol>
 
 <div class="doc_author">
-  <p>Written by <a href="http://llvm.org">The LLVM Team</a></p>
+  <p>Written by <a href="http://llvm.org/">The LLVM Team</a></p>
 </div>
 
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="license">License</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
 <div class="question">
@@ -189,9 +189,9 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="source">Source Code</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
 <div class="question">
@@ -227,9 +227,9 @@ LLVM have been ported to a plethora of platforms.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="build">Build Problems</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
 <div class="question">
@@ -449,7 +449,9 @@ Stop.
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="felangs">Source Languages</a></div>
+<h2>
+  <a name="felangs">Source Languages</a>
+</h2>
 
 <div class="question">
 <p><a name="langs">What source languages are supported?</a></p>
@@ -555,9 +557,9 @@ Stop.
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="cfe">Using the GCC Front End</a>
-</div>
+</h2>
 
 <div class="question">
 <p>When I compile software that uses a configure script, the configure script
@@ -712,9 +714,9 @@ Stop.
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="cfe_code">Questions about code generated by the GCC front-end</a>
-</div>
+</h2>
 
 <div class="question">
 <p><a name="iosinit">What is this <tt>llvm.global_ctors</tt> and
@@ -930,7 +932,7 @@ F.i:
   <a href="http://validator.w3.org/check/referer"><img
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
-  <a href="http://llvm.org">LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 
diff --git a/docs/GCCFEBuildInstrs.html b/docs/GCCFEBuildInstrs.html
index 8fe0c31..6eb409b 100644
--- a/docs/GCCFEBuildInstrs.html
+++ b/docs/GCCFEBuildInstrs.html
@@ -8,9 +8,9 @@
 </head>
 <body>
 
-<div class="doc_title">
+<h1>
   Building the LLVM GCC Front-End
-</div>
+</h1>
 
 <ol>
   <li><a href="#instructions">Building llvm-gcc from Source</a></li>
@@ -24,10 +24,10 @@
 </div>
 
 <!-- *********************************************************************** -->
-<h1><a name="instructions">Building llvm-gcc from Source</a></h1>
+<h2><a name="instructions">Building llvm-gcc from Source</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This section describes how to acquire and build llvm-gcc 4.2, which is based
 on the GCC 4.2.1 front-end.  Supported languages are Ada, C, C++, Fortran,
@@ -67,10 +67,10 @@ svn co http://llvm.org/svn/llvm-project/llvm-gcc-4.2/trunk <i>dst-directory</i>
 </div>
 
 <!-- *********************************************************************** -->
-<h1><a name="ada">Building the Ada front-end</a></h1>
+<h2><a name="ada">Building the Ada front-end</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 <p>Building with support for Ada amounts to following the directions in the
 top-level <tt>README.LLVM</tt> file, adding ",ada" to EXTRALANGS, for example:
 <tt>EXTRALANGS=,ada</tt></p>
@@ -100,7 +100,7 @@ top-level <tt>README.LLVM</tt> file, adding ",ada" to EXTRALANGS, for example:
   <li><p>Because the Ada front-end is experimental, it is wise to build the
       compiler with checking enabled.  This causes it to run much slower, but
       helps catch mistakes in the compiler (please report any problems using
-      <a href="http://llvm.org/bugs">LLVM bugzilla</a>).</p></li>
+      <a href="http://llvm.org/bugs/">LLVM bugzilla</a>).</p></li>
   <li><p>The Ada front-end <a href="http://llvm.org/PR2007">fails to
       bootstrap</a>, due to lack of LLVM support for
       <tt>setjmp</tt>/<tt>longjmp</tt> style exception handling (used
@@ -233,10 +233,10 @@ make install
 </div>
 
 <!-- *********************************************************************** -->
-<h1><a name="fortran">Building the Fortran front-end</a></h1>
+<h2><a name="fortran">Building the Fortran front-end</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 <p>To build with support for Fortran, follow the directions in the top-level
 <tt>README.LLVM</tt> file, adding ",fortran" to EXTRALANGS, for example:</p>
 
@@ -247,10 +247,10 @@ EXTRALANGS=,fortran
 </div>
 
 <!-- *********************************************************************** -->
-<h1><a name="license">License Information</a></h1>
+<h2><a name="license">License Information</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 <p>
 The LLVM GCC frontend is licensed to you under the GNU General Public License
 and the GNU Lesser General Public License.  Please see the files COPYING and
@@ -271,7 +271,7 @@ More information is <a href="FAQ.html#license">available in the FAQ</a>.
   <a href="http://validator.w3.org/check/referer"><img
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
-  <a href="http://llvm.org">LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 
diff --git a/docs/GarbageCollection.html b/docs/GarbageCollection.html
index 56085ca..761e1d0 100644
--- a/docs/GarbageCollection.html
+++ b/docs/GarbageCollection.html
@@ -13,9 +13,9 @@
 </head>
 <body>
 
-<div class="doc_title">
+<h1>
   Accurate Garbage Collection with LLVM
-</div>
+</h1>
 
 <ol>
   <li><a href="#introduction">Introduction</a>
@@ -79,12 +79,12 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="introduction">Introduction</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Garbage collection is a widely used technique that frees the programmer from
 having to know the lifetimes of heap objects, making software easier to produce
@@ -124,14 +124,12 @@ techniques dominates any low-level losses.</p>
 <p>This document describes the mechanisms and interfaces provided by LLVM to
 support accurate garbage collection.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="feature">Goals and non-goals</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM's intermediate representation provides <a href="#intrinsics">garbage
 collection intrinsics</a> that offer support for a broad class of
@@ -151,14 +149,14 @@ collector models. For instance, the intrinsics permit:</p>
 support a broad class of garbage collected languages including Scheme, ML, Java,
 C#, Perl, Python, Lua, Ruby, other scripting languages, and more.</p>
 
-<p>However, LLVM does not itself provide a garbage collector&#151;this should
+<p>However, LLVM does not itself provide a garbage collector&mdash;this should
 be part of your language's runtime library. LLVM provides a framework for
 compile time <a href="#plugin">code generation plugins</a>. The role of these
 plugins is to generate code and data structures which conforms to the <em>binary
 interface</em> specified by the <em>runtime library</em>. This is similar to the
 relationship between LLVM and DWARF debugging info, for example. The
 difference primarily lies in the lack of an established standard in the domain
-of garbage collection&#151;thus the plugins.</p>
+of garbage collection&mdash;thus the plugins.</p>
 
 <p>The aspects of the binary interface with which LLVM's GC support is
 concerned are:</p>
@@ -198,13 +196,15 @@ compiler matures.</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="quickstart">Getting started</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Using a GC with LLVM implies many things, for example:</p>
 
@@ -246,14 +246,12 @@ compiler matures.</p>
 includes a highly portable, built-in ShadowStack code generator. It is compiled
 into <tt>llc</tt> and works even with the interpreter and C backends.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="quickstart-compiler">In your compiler</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>To turn the shadow stack on for your functions, first call:</p>
 
@@ -276,11 +274,11 @@ switching to a more advanced GC.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="quickstart-runtime">In your runtime</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The shadow stack doesn't imply a memory allocation algorithm. A semispace
 collector or building atop <tt>malloc</tt> are great places to start, and can
@@ -343,11 +341,11 @@ void visitGCRoots(void (*Visitor)(void **Root, const void *Meta)) {
 }</pre></div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="shadow-stack">About the shadow stack</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Unlike many GC algorithms which rely on a cooperative code generator to
 compile stack maps, this algorithm carefully maintains a linked list of stack
@@ -372,13 +370,15 @@ in order to improve performance.</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="core">IR features</a><a name="intrinsics"></a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This section describes the garbage collection facilities provided by the
 <a href="LangRef.html">LLVM intermediate representation</a>. The exact behavior
@@ -390,18 +390,16 @@ intended to be a complete interface to any garbage collector. A program will
 need to interface with the GC library using the facilities provided by that
 program.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="gcattr">Specifying GC code generation: <tt>gc "..."</tt></a>
-</div>
+</h3>
 
 <div class="doc_code"><tt>
   define <i>ty</i> @<i>name</i>(...) <span style="text-decoration: underline">gc "<i>name</i>"</span> { ...
 </tt></div>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>gc</tt> function attribute is used to specify the desired GC style
 to the compiler. Its programmatic equivalent is the <tt>setGC</tt> method of
@@ -418,15 +416,15 @@ programs that use different garbage collection algorithms (or none at all).</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="gcroot">Identifying GC roots on the stack: <tt>llvm.gcroot</tt></a>
-</div>
+</h3>
 
 <div class="doc_code"><tt>
   void @llvm.gcroot(i8** %ptrloc, i8* %metadata)
 </tt></div>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>llvm.gcroot</tt> intrinsic is used to inform LLVM that a stack
 variable references an object on the heap and is to be tracked for garbage
@@ -494,11 +492,11 @@ CodeBlock:
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="barriers">Reading and writing references in the heap</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Some collectors need to be informed when the mutator (the program that needs
 garbage collection) either reads a pointer from or writes a pointer to a field
@@ -534,18 +532,16 @@ require the corresponding barrier. Such a GC plugin will replace the intrinsic
 calls with the corresponding <tt>load</tt> or <tt>store</tt> instruction if they
 are used.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="gcwrite">Write barrier: <tt>llvm.gcwrite</tt></a>
-</div>
+</h4>
 
 <div class="doc_code"><tt>
 void @llvm.gcwrite(i8* %value, i8* %object, i8** %derived)
 </tt></div>
 
-<div class="doc_text">
+<div>
 
 <p>For write barriers, LLVM provides the <tt>llvm.gcwrite</tt> intrinsic
 function. It has exactly the same semantics as a non-volatile <tt>store</tt> to
@@ -559,15 +555,15 @@ implement reference counting.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="gcread">Read barrier: <tt>llvm.gcread</tt></a>
-</div>
+</h4>
 
 <div class="doc_code"><tt>
 i8* @llvm.gcread(i8* %object, i8** %derived)<br>
 </tt></div>
 
-<div class="doc_text">
+<div>
 
 <p>For read barriers, LLVM provides the <tt>llvm.gcread</tt> intrinsic function.
 It has exactly the same semantics as a non-volatile <tt>load</tt> from the
@@ -580,13 +576,17 @@ writes.</p>
 
 </div>
 
+</div>
+
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="plugin">Implementing a collector plugin</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>User code specifies which GC code generation to use with the <tt>gc</tt>
 function attribute or, equivalently, with the <tt>setGC</tt> method of
@@ -666,14 +666,12 @@ $ llvm-as &lt; sample.ll | llc -load=MyGC.so</pre></blockquote>
 <p>It is also possible to statically link the collector plugin into tools, such
 as a language-specific compiler front-end.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="collector-algos">Overview of available features</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p><tt>GCStrategy</tt> provides a range of features through which a plugin
 may do useful work. Some of these are callbacks, some are algorithms that can
@@ -958,11 +956,11 @@ interest.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="stack-map">Computing stack maps</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM automatically computes a stack map. One of the most important features
 of a <tt>GCStrategy</tt> is to compile this information into the executable in
@@ -1014,11 +1012,11 @@ for collector plugins which implement reference counting or a shadow stack.</p>
 
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="init-roots">Initializing roots to null: <tt>InitRoots</tt></a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <blockquote><pre
 >MyGC::MyGC() {
@@ -1039,12 +1037,12 @@ this feature should be used by all GC plugins. It is enabled by default.</p>
 
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="custom">Custom lowering of intrinsics: <tt>CustomRoots</tt>, 
     <tt>CustomReadBarriers</tt>, and <tt>CustomWriteBarriers</tt></a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>For GCs which use barriers or unusual treatment of stack roots, these
 flags allow the collector to perform arbitrary transformations of the LLVM
@@ -1129,11 +1127,11 @@ bool MyGC::performCustomLowering(Function &amp;F) {
 
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="safe-points">Generating safe points: <tt>NeededSafePoints</tt></a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM can compute four kinds of safe points:</p>
 
@@ -1193,11 +1191,11 @@ safe point (because only the topmost function has been patched).</p>
 
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="assembly">Emitting assembly code: <tt>GCMetadataPrinter</tt></a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM allows a plugin to print arbitrary assembly code before and after the
 rest of a module's assembly code. At the end of the module, the GC can compile
@@ -1341,14 +1339,15 @@ void MyGCPrinter::finishAssembly(std::ostream &amp;OS, AsmPrinter &amp;AP,
 
 </div>
 
+</div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="references">References</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p><a name="appel89">[Appel89]</a> Runtime Tags Aren't Necessary. Andrew
 W. Appel. Lisp and Symbolic Computation 19(7):703-705, July 1989.</p>
@@ -1379,7 +1378,7 @@ Fergus Henderson. International Symposium on Memory Management 2002.</p>
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 
diff --git a/docs/GetElementPtr.html b/docs/GetElementPtr.html
index 41c45ca..2c32a9e 100644
--- a/docs/GetElementPtr.html
+++ b/docs/GetElementPtr.html
@@ -11,9 +11,9 @@
 </head>
 <body>
 
-<div class="doc_title">
+<h1>
   The Often Misunderstood GEP Instruction
-</div>
+</h1>
 
 <ol>
   <li><a href="#intro">Introduction</a></li>
@@ -58,10 +58,10 @@
 
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="intro"><b>Introduction</b></a></div>
+<h2><a name="intro">Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text"> 
+<div>
   <p>This document seeks to dispel the mystery and confusion surrounding LLVM's
   <a href="LangRef.html#i_getelementptr">GetElementPtr</a> (GEP) instruction.
   Questions about the wily GEP instruction are
@@ -72,21 +72,20 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="addresses"><b>Address Computation</b></a></div>
+<h2><a name="addresses">Address Computation</a></h2>
 <!-- *********************************************************************** -->
-<div class="doc_text">
+<div>
   <p>When people are first confronted with the GEP instruction, they tend to
   relate it to known concepts from other programming paradigms, most notably C
   array indexing and field selection. GEP closely resembles C array indexing
   and field selection, however it's is a little different and this leads to
   the following questions.</p>
-</div>
 
 <!-- *********************************************************************** -->
-<div class="doc_subsection">
-  <a name="firstptr"><b>What is the first index of the GEP instruction?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="firstptr">What is the first index of the GEP instruction?</a>
+</h3>
+<div>
   <p>Quick answer: The index stepping through the first operand.</p> 
   <p>The confusion with the first index usually arises from thinking about 
   the GetElementPtr instruction as if it was a C index operator. They aren't the
@@ -205,11 +204,11 @@ idx3 = (char*) &amp;MyVar + 8
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_subsection">
-  <a name="extra_index"><b>Why is the extra 0 index required?</b></a>
-</div>
+<h3>
+  <a name="extra_index">Why is the extra 0 index required?</a>
+</h3>
 <!-- *********************************************************************** -->
-<div class="doc_text">
+<div>
   <p>Quick answer: there are no superfluous indices.</p>
   <p>This question arises most often when the GEP instruction is applied to a
   global variable which is always a pointer type. For example, consider
@@ -247,10 +246,10 @@ idx3 = (char*) &amp;MyVar + 8
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_subsection">
-  <a name="deref"><b>What is dereferenced by GEP?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="deref">What is dereferenced by GEP?</a>
+</h3>
+<div>
   <p>Quick answer: nothing.</p> 
   <p>The GetElementPtr instruction dereferences nothing. That is, it doesn't
   access memory in any way. That's what the Load and Store instructions are for.
@@ -302,10 +301,10 @@ idx3 = (char*) &amp;MyVar + 8
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_subsection">
-  <a name="lead0"><b>Why don't GEP x,0,0,1 and GEP x,1 alias?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="lead0">Why don't GEP x,0,0,1 and GEP x,1 alias?</a>
+</h3>
+<div>
   <p>Quick Answer: They compute different address locations.</p>
   <p>If you look at the first indices in these GEP
   instructions you find that they are different (0 and 1), therefore the address
@@ -331,10 +330,10 @@ idx3 = (char*) &amp;MyVar + 8
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_subsection">
-  <a name="trail0"><b>Why do GEP x,1,0,0 and GEP x,1 alias?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="trail0">Why do GEP x,1,0,0 and GEP x,1 alias?</a>
+</h3>
+<div>
   <p>Quick Answer: They compute the same address location.</p>
   <p>These two GEP instructions will compute the same address because indexing
   through the 0th element does not change the address. However, it does change
@@ -355,10 +354,10 @@ idx3 = (char*) &amp;MyVar + 8
 
 <!-- *********************************************************************** -->
 
-<div class="doc_subsection">
-  <a name="vectors"><b>Can GEP index into vector elements?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="vectors">Can GEP index into vector elements?</a>
+</h3>
+<div>
   <p>This hasn't always been forcefully disallowed, though it's not recommended.
      It leads to awkward special cases in the optimizers, and fundamental
      inconsistency in the IR. In the future, it will probably be outright
@@ -368,10 +367,10 @@ idx3 = (char*) &amp;MyVar + 8
 
 <!-- *********************************************************************** -->
 
-<div class="doc_subsection">
-  <a name="addrspace"><b>What effect do address spaces have on GEPs?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="addrspace">What effect do address spaces have on GEPs?</a>
+</h3>
+<div>
    <p>None, except that the address space qualifier on the first operand pointer
       type always matches the address space qualifier on the result type.</p>
 
@@ -379,11 +378,12 @@ idx3 = (char*) &amp;MyVar + 8
 
 <!-- *********************************************************************** -->
 
-<div class="doc_subsection">
-  <a name="int"><b>How is GEP different from ptrtoint, arithmetic,
-                   and inttoptr?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="int">
+    How is GEP different from ptrtoint, arithmetic, and inttoptr?
+  </a>
+</h3>
+<div>
   <p>It's very similar; there are only subtle differences.</p>
 
   <p>With ptrtoint, you have to pick an integer type. One approach is to pick i64;
@@ -409,11 +409,13 @@ idx3 = (char*) &amp;MyVar + 8
 
 <!-- *********************************************************************** -->
 
-<div class="doc_subsection">
-  <a name="be"><b>I'm writing a backend for a target which needs custom
-                  lowering for GEP. How do I do this?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="be">
+    I'm writing a backend for a target which needs custom lowering for GEP.
+    How do I do this?
+  </a>
+</h3>
+<div>
   <p>You don't. The integer computation implied by a GEP is target-independent.
      Typically what you'll need to do is make your backend pattern-match
      expressions trees involving ADD, MUL, etc., which are what GEP is lowered
@@ -431,10 +433,10 @@ idx3 = (char*) &amp;MyVar + 8
 
 <!-- *********************************************************************** -->
 
-<div class="doc_subsection">
-  <a name="vla"><b>How does VLA addressing work with GEPs?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="vla">How does VLA addressing work with GEPs?</a>
+</h3>
+<div>
   <p>GEPs don't natively support VLAs. LLVM's type system is entirely static,
      and GEP address computations are guided by an LLVM type.</p>
 
@@ -450,16 +452,18 @@ idx3 = (char*) &amp;MyVar + 8
      VLA and non-VLA indexing in the same manner.</p>
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="rules"><b>Rules</b></a></div>
+<h2><a name="rules">Rules</a></h2>
 <!-- *********************************************************************** -->
-
+<div>
 <!-- *********************************************************************** -->
 
-<div class="doc_subsection">
-  <a name="bounds"><b>What happens if an array index is out of bounds?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="bounds">What happens if an array index is out of bounds?</a>
+</h3>
+<div>
   <p>There are two senses in which an array index can be out of bounds.</p>
 
   <p>First, there's the array type which comes from the (static) type of
@@ -498,20 +502,20 @@ idx3 = (char*) &amp;MyVar + 8
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_subsection">
-  <a name="negative"><b>Can array indices be negative?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="negative">Can array indices be negative?</a>
+</h3>
+<div>
   <p>Yes. This is basically a special case of array indices being out
      of bounds.</p>
 
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_subsection">
-  <a name="compare"><b>Can I compare two values computed with GEPs?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="compare">Can I compare two values computed with GEPs?</a>
+</h3>
+<div>
   <p>Yes. If both addresses are within the same allocated object, or 
      one-past-the-end, you'll get the comparison result you expect. If either
      is outside of it, integer arithmetic wrapping may occur, so the
@@ -520,11 +524,13 @@ idx3 = (char*) &amp;MyVar + 8
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_subsection">
-  <a name="types"><b>Can I do GEP with a different pointer type than the type of
-                     the underlying object?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="types">
+    Can I do GEP with a different pointer type than the type of
+    the underlying object?
+  </a>
+</h3>
+<div>
   <p>Yes. There are no restrictions on bitcasting a pointer value to an arbitrary
      pointer type. The types in a GEP serve only to define the parameters for the
      underlying integer computation. They need not correspond with the actual
@@ -538,11 +544,12 @@ idx3 = (char*) &amp;MyVar + 8
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_subsection">
-  <a name="null"><b>Can I cast an object's address to integer and add it
-                    to null?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="null">
+    Can I cast an object's address to integer and add it to null?
+  </a>
+</h3>
+<div>
   <p>You can compute an address that way, but if you use GEP to do the add,
      you can't use that pointer to actually access the object, unless the
      object is managed outside of LLVM.</p>
@@ -562,11 +569,13 @@ idx3 = (char*) &amp;MyVar + 8
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_subsection">
-  <a name="ptrdiff"><b>Can I compute the distance between two objects, and add
-                       that value to one address to compute the other address?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="ptrdiff">
+    Can I compute the distance between two objects, and add
+    that value to one address to compute the other address?
+  </a>
+</h3>
+<div>
   <p>As with arithmetic on null, You can use GEP to compute an address that
      way, but you can't use that pointer to actually access the object if you
      do, unless the object is managed outside of LLVM.</p>
@@ -577,10 +586,10 @@ idx3 = (char*) &amp;MyVar + 8
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_subsection">
-  <a name="tbaa"><b>Can I do type-based alias analysis on LLVM IR?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="tbaa">Can I do type-based alias analysis on LLVM IR?</a>
+</h3>
+<div>
   <p>You can't do type-based alias analysis using LLVM's built-in type system,
      because LLVM has no restrictions on mixing types in addressing, loads or
      stores.</p>
@@ -594,10 +603,10 @@ idx3 = (char*) &amp;MyVar + 8
 
 <!-- *********************************************************************** -->
 
-<div class="doc_subsection">
-  <a name="overflow"><b>What happens if a GEP computation overflows?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="overflow">What happens if a GEP computation overflows?</a>
+</h3>
+<div>
    <p>If the GEP lacks the <tt>inbounds</tt> keyword, the value is the result
       from evaluating the implied two's complement integer computation. However,
       since there's no guarantee of where an object will be allocated in the
@@ -624,11 +633,12 @@ idx3 = (char*) &amp;MyVar + 8
 
 <!-- *********************************************************************** -->
 
-<div class="doc_subsection">
-  <a name="check"><b>How can I tell if my front-end is following the
-                     rules?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="check">
+    How can I tell if my front-end is following the rules?
+  </a>
+</h3>
+<div>
    <p>There is currently no checker for the getelementptr rules. Currently,
       the only way to do this is to manually check each place in your front-end
       where GetElementPtr operators are created.</p>
@@ -641,16 +651,18 @@ idx3 = (char*) &amp;MyVar + 8
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="rationale"><b>Rationale</b></a></div>
+<h2><a name="rationale">Rationale</a></h2>
 <!-- *********************************************************************** -->
-
+<div>
 <!-- *********************************************************************** -->
 
-<div class="doc_subsection">
-  <a name="goals"><b>Why is GEP designed this way?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="goals">Why is GEP designed this way?</a>
+</h3>
+<div>
    <p>The design of GEP has the following goals, in rough unofficial
       order of priority:</p>
    <ul>
@@ -669,10 +681,10 @@ idx3 = (char*) &amp;MyVar + 8
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_subsection">
-  <a name="i32"><b>Why do struct member indices always use i32?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="i32">Why do struct member indices always use i32?</a>
+</h3>
+<div>
   <p>The specific type i32 is probably just a historical artifact, however it's
      wide enough for all practical purposes, so there's been no need to change it.
      It doesn't necessarily imply i32 address arithmetic; it's just an identifier
@@ -684,10 +696,10 @@ idx3 = (char*) &amp;MyVar + 8
 
 <!-- *********************************************************************** -->
 
-<div class="doc_subsection">
-  <a name="uglygep"><b>What's an uglygep?</b></a>
-</div>
-<div class="doc_text">
+<h3>
+  <a name="uglygep">What's an uglygep?</a>
+</h3>
+<div>
   <p>Some LLVM optimizers operate on GEPs by internally lowering them into
      more primitive integer expressions, which allows them to be combined
      with other integer expressions and/or split into multiple separate
@@ -704,11 +716,13 @@ idx3 = (char*) &amp;MyVar + 8
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="summary"><b>Summary</b></a></div>
+<h2><a name="summary">Summary</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
   <p>In summary, here's some things to always remember about the GetElementPtr
   instruction:</p>
   <ol>
@@ -732,7 +746,7 @@ idx3 = (char*) &amp;MyVar + 8
   src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
   <a href="http://validator.w3.org/check/referer"><img
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br/>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br/>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/GettingStarted.html b/docs/GettingStarted.html
index 45a8ec0..7360893 100644
--- a/docs/GettingStarted.html
+++ b/docs/GettingStarted.html
@@ -8,9 +8,9 @@
 </head>
 <body>
 
-<div class="doc_title">
+<h1>
   Getting Started with the LLVM System  
-</div>
+</h1>
 
 <ul>
   <li><a href="#overview">Overview</a>
@@ -62,7 +62,7 @@
   <p>Written by: 
     <a href="mailto:criswell@uiuc.edu">John Criswell</a>, 
     <a href="mailto:sabre@nondot.org">Chris Lattner</a>,
-    <a href="http://misha.brukman.net">Misha Brukman</a>, 
+    <a href="http://misha.brukman.net/">Misha Brukman</a>, 
     <a href="http://www.cs.uiuc.edu/~vadve">Vikram Adve</a>, and
     <a href="mailto:gshi1@uiuc.edu">Guochun Shi</a>.
   </p>
@@ -70,12 +70,12 @@
 
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
-  <a name="overview"><b>Overview</b></a>
-</div>
+<h2>
+  <a name="overview">Overview</a>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Welcome to LLVM! In order to get started, you first need to know some
 basic information.</p>
@@ -102,12 +102,12 @@ and performance.
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
-  <a name="quickstart"><b>Getting Started Quickly (A Summary)</b></a>
-</div>
+<h2>
+  <a name="quickstart">Getting Started Quickly (A Summary)</a>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Here's the short story for getting up and running quickly with LLVM:</p>
 
@@ -116,13 +116,13 @@ and performance.
   <li>Read the documentation.</li>
   <li>Remember that you were warned twice about reading the documentation.</li>
   <li>Install the llvm-gcc-4.2 front end if you intend to compile C or C++
-      (see <a href="#installcf">Install the GCC Front End</a> for details):</li>
+      (see <a href="#installcf">Install the GCC Front End</a> for details):
     <ol>
       <li><tt>cd <i>where-you-want-the-C-front-end-to-live</i></tt></li>
       <li><tt>gunzip --stdout llvm-gcc-4.2-<i>version</i>-<i>platform</i>.tar.gz | tar -xvf -</tt></li>
 	  <li><tt><i>install-binutils-binary-from-MinGW</i></tt> (Windows only)</li>
 	  <li>Note: If the binary extension is "<tt>.bz</tt>" use <tt>bunzip2</tt> instead of <tt>gunzip</tt>.</li>
-	  <li>Note: On Windows, use <a href="http://www.7-zip.org">7-Zip</a> or a similar archiving tool.</li>
+	  <li>Note: On Windows, use <a href="http://www.7-zip.org/">7-Zip</a> or a similar archiving tool.</li>
 	  <li>Add <tt>llvm-gcc</tt>'s "<tt>bin</tt>" directory to your <tt>PATH</tt> environment variable.</li>
     </ol></li>
 
@@ -191,25 +191,23 @@ Layout</a> to learn about the layout of the source code tree.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
-  <a name="requirements"><b>Requirements</b></a>
-</div>
+<h2>
+  <a name="requirements">Requirements</a>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Before you begin to use the LLVM system, review the requirements given below.
 This may save you some trouble by knowing ahead of time what hardware and
 software you will need.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
-  <a name="hardware"><b>Hardware</b></a>
-</div>
+<h3>
+  <a name="hardware">Hardware</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM is known to work on the following platforms:</p>
 
@@ -344,9 +342,9 @@ up</a></li>
     ActivePerl, as these have Windows-specifics that will cause the
     build to fail.</a></li>
 <li><a name="pf_11">To use LLVM modules on Win32-based system,
-    you may configure LLVM with <i>&quot;--enable-shared&quot;</i>.</li>
+    you may configure LLVM with <i>&quot;--enable-shared&quot;</i>.</a></li>
 <li><a name="pf_12">To compile SPU backend, you need to add
-    <tt>&quot;LDFLAGS=-Wl,--stack,16777216&quot;</tt> to configure.</li>
+    <tt>&quot;LDFLAGS=-Wl,--stack,16777216&quot;</tt> to configure.</a></li>
 </ol>
 </div>
 
@@ -370,8 +368,10 @@ href="GCCFEBuildInstrs.html">try to compile it</a> on your platform.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="software"><b>Software</b></a></div>
-<div class="doc_text">
+<h3>
+  <a name="software">Software</a>
+</h3>
+<div>
   <p>Compiling LLVM requires that you have several software packages 
   installed. The table below lists those required packages. The Package column
   is the usual name for the software package that LLVM depends on. The Version
@@ -387,13 +387,13 @@ href="GCCFEBuildInstrs.html">try to compile it</a> on your platform.</p>
     </tr>
 
     <tr>
-      <td><a href="http://gcc.gnu.org">GCC</a></td>
+      <td><a href="http://gcc.gnu.org/">GCC</a></td>
       <td>3.4.2</td>
       <td>C/C++ compiler<sup><a href="#sf1">1</a></sup></td>
     </tr>
 
     <tr>
-      <td><a href="http://www.gnu.org/software/texinfo">TeXinfo</a></td>
+      <td><a href="http://www.gnu.org/software/texinfo/">TeXinfo</a></td>
       <td>4.5</td>
       <td>For building the CFE</td>
     </tr>
@@ -440,13 +440,13 @@ href="GCCFEBuildInstrs.html">try to compile it</a> on your platform.</p>
     </tr>
 
     <tr>
-      <td><a href="http://www.gnu.org/software/autoconf">GNU Autoconf</a></td>
+      <td><a href="http://www.gnu.org/software/autoconf/">GNU Autoconf</a></td>
       <td>2.60</td>
       <td>Configuration script builder<sup><a href="#sf4">4</a></sup></td>
     </tr>
 
     <tr>
-      <td><a href="http://www.gnu.org/software/automake">GNU Automake</a></td>
+      <td><a href="http://www.gnu.org/software/automake/">GNU Automake</a></td>
       <td>1.9.6</td>
       <td>aclocal macro generator<sup><a href="#sf4">4</a></sup></td>
     </tr>
@@ -508,11 +508,11 @@ href="GCCFEBuildInstrs.html">try to compile it</a> on your platform.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="brokengcc">Broken versions of GCC and other tools</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM is very demanding of the host C++ compiler, and as such tends to expose
 bugs in the compiler.  In particular, several versions of GCC crash when trying
@@ -605,15 +605,15 @@ upgrading to a newer version of Gold.</p>
 
 </div>
 
-
+</div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
-  <a name="starting"><b>Getting Started with LLVM</b></a>
-</div>
+<h2>
+  <a name="starting">Getting Started with LLVM</a>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>The remainder of this guide is meant to get you up and running with
 LLVM and to give you some basic information about the LLVM environment.</p>
@@ -623,14 +623,13 @@ href="#layout">general layout</a> of the the LLVM source tree, a <a
 href="#tutorial">simple example</a> using the LLVM tool chain, and <a
 href="#links">links</a> to find more information about LLVM or to get
 help via e-mail.</p>
-</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="terminology">Terminology and Notation</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Throughout this manual, the following names are used to denote paths
 specific to the local system and working environment.  <i>These are not
@@ -663,11 +662,11 @@ All these paths are absolute:</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="environment">Setting Up Your Environment</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 In order to compile and use LLVM, you may need to set some environment
@@ -686,11 +685,11 @@ variables.
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="unpack">Unpacking the LLVM Archives</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 If you have the LLVM distribution, you will need to unpack it before you
@@ -720,11 +719,11 @@ compressed with the gzip program.
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="checkout">Checkout LLVM from Subversion</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>If you have access to our Subversion repository, you can get a fresh copy of
 the entire source code.  All you need to do is check it out from Subversion as
@@ -791,30 +790,30 @@ instructions</a> to successfully get and build the LLVM GCC front-end.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="git_mirror">GIT mirror</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>GIT mirrors are available for a number of LLVM subprojects. These mirrors
   sync automatically with each Subversion commit and contain all necessary
   git-svn marks (so, you can recreate git-svn metadata locally). Note that right
   now mirrors reflect only <tt>trunk</tt> for each project. You can do the
-  read-only GIT clone of LLVM via: 
+  read-only GIT clone of LLVM via:</p>
+
 <pre>
 % git clone http://llvm.org/git/llvm.git
 </pre>
-</p>
 
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="installcf">Install the GCC Front End</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Before configuring and compiling the LLVM suite (or if you want to use just the LLVM
 GCC front end) you can optionally extract the front end from the binary distribution.
@@ -823,7 +822,7 @@ you can optionally <a href="GCCFEBuildInstrs.html">build llvm-gcc yourself</a> a
 main LLVM repository.</p>
 
 <p>To install the GCC front end, do the following (on Windows, use an archival tool
-like <a href="http://www.7-zip.org">7-zip</a> that understands gzipped tars):</p>
+like <a href="http://www.7-zip.org/">7-zip</a> that understands gzipped tars):</p>
 
 <ol>
   <li><tt>cd <i>where-you-want-the-front-end-to-live</i></tt></li>
@@ -880,11 +879,11 @@ please let us know how you would like to see things improved by dropping us a no
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="config">Local LLVM Configuration</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
   <p>Once checked out from the Subversion repository, the LLVM suite source 
   code must be
@@ -1002,11 +1001,11 @@ script to configure the build system:</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="compile">Compiling the LLVM Suite Source Code</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Once you have configured LLVM, you can build it.  There are three types of
 builds:</p>
@@ -1136,11 +1135,11 @@ that directory that is out of date.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="cross-compile">Cross-Compiling LLVM</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
   <p>It is possible to cross-compile LLVM itself. That is, you can create LLVM
   executables and libraries to be hosted on a platform different from the
   platform where they are build (a Canadian Cross build). To configure a
@@ -1154,11 +1153,11 @@ that directory that is out of date.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="objfiles">The Location of LLVM Object Files</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The LLVM build system is capable of sharing a single LLVM source tree among
 several LLVM builds.  Hence, it is possible to build LLVM for several different
@@ -1214,11 +1213,11 @@ named after the build type:</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="optionalconfig">Optional Configuration Items</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 If you're running on a Linux system that supports the "<a
@@ -1238,7 +1237,7 @@ $ ./hello.bc
 
 <p>
 This allows you to execute LLVM bitcode files directly.  On Debian, you 
-can also use this command instead of the 'echo' command above:</p>
+can also use this command instead of the 'echo' command above:
 </p>
 
 <div class="doc_code">
@@ -1249,31 +1248,37 @@ $ sudo update-binfmts --install llvm /path/to/lli --magic 'BC'
 
 </div>
 
-<!-- *********************************************************************** -->
-<div class="doc_section">
-  <a name="layout"><b>Program Layout</b></a>
 </div>
+
+<!-- *********************************************************************** -->
+<h2>
+  <a name="layout">Program Layout</a>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>One useful source of information about the LLVM source base is the LLVM <a
-href="http://www.doxygen.org">doxygen</a> documentation available at <tt><a
+href="http://www.doxygen.org/">doxygen</a> documentation available at <tt><a
 href="http://llvm.org/doxygen/">http://llvm.org/doxygen/</a></tt>.
 The following is a brief introduction to code layout:</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="examples"><tt>llvm/examples</tt></a></div>
-<div class="doc_text">
+<h3>
+  <a name="examples"><tt>llvm/examples</tt></a>
+</h3>
+
+<div>
   <p>This directory contains some simple examples of how to use the LLVM IR and
   JIT.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="include"><tt>llvm/include</tt></a></div>
-<div class="doc_text">
+<h3>
+  <a name="include"><tt>llvm/include</tt></a>
+</h3>
+
+<div>
 
 <p>This directory contains public header files exported from the LLVM
 library. The three main subdirectories of this directory are:</p>
@@ -1300,8 +1305,11 @@ library. The three main subdirectories of this directory are:</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="lib"><tt>llvm/lib</tt></a></div>
-<div class="doc_text">
+<h3>
+  <a name="lib"><tt>llvm/lib</tt></a>
+</h3>
+
+<div>
 
 <p>This directory contains most of the source files of the LLVM system. In LLVM,
 almost all code exists in libraries, making it very easy to share code among the
@@ -1366,8 +1374,11 @@ different <a href="#tools">tools</a>.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="projects"><tt>llvm/projects</tt></a></div>
-<div class="doc_text">
+<h3>
+  <a name="projects"><tt>llvm/projects</tt></a>
+</h3>
+
+<div>
   <p>This directory contains projects that are not strictly part of LLVM but are
   shipped with LLVM. This is also the directory where you should create your own
   LLVM-based projects. See <tt>llvm/projects/sample</tt> for an example of how
@@ -1375,8 +1386,11 @@ different <a href="#tools">tools</a>.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="runtime"><tt>llvm/runtime</tt></a></div>
-<div class="doc_text">
+<h3>
+  <a name="runtime"><tt>llvm/runtime</tt></a>
+</h3>
+
+<div>
 
 <p>This directory contains libraries which are compiled into LLVM bitcode and
 used when linking programs with the GCC front end.  Most of these libraries are
@@ -1389,16 +1403,22 @@ end to compile.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="test"><tt>llvm/test</tt></a></div>
-<div class="doc_text">
+<h3>
+  <a name="test"><tt>llvm/test</tt></a>
+</h3>
+
+<div>
   <p>This directory contains feature and regression tests and other basic sanity
   checks on the LLVM infrastructure. These are intended to run quickly and cover
   a lot of territory without being exhaustive.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="test-suite"><tt>test-suite</tt></a></div>
-<div class="doc_text">
+<h3>
+  <a name="test-suite"><tt>test-suite</tt></a>
+</h3>
+
+<div>
   <p>This is not a directory in the normal llvm module; it is a separate
   Subversion
   module that must be checked out (usually to <tt>projects/test-suite</tt>). 
@@ -1413,8 +1433,11 @@ end to compile.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="tools"><tt>llvm/tools</tt></a></div>
-<div class="doc_text">
+<h3>
+  <a name="tools"><tt>llvm/tools</tt></a>
+</h3>
+
+<div>
 
 <p>The <b>tools</b> directory contains the executables built out of the
 libraries above, which form the main part of the user interface.  You can
@@ -1498,8 +1521,11 @@ information is in the <a href="CommandGuide/index.html">Command Guide</a>.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="utils"><tt>llvm/utils</tt></a></div>
-<div class="doc_text">
+<h3>
+  <a name="utils"><tt>llvm/utils</tt></a>
+</h3>
+
+<div>
 
 <p>This directory contains utilities for working with LLVM source code, and some
 of the utilities are actually required as part of the build process because they
@@ -1560,13 +1586,15 @@ are code generators for parts of LLVM infrastructure.</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="tutorial">An Example Using the LLVM Tool Chain</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 <p>This section gives an example of using LLVM.  llvm-gcc3 is now obsolete,
 so we only include instructions for llvm-gcc4.
 </p>
@@ -1577,12 +1605,13 @@ create bitcode by default: <i>gcc4</i> produces native code. As the example belo
 the '--emit-llvm' flag is needed to produce LLVM bitcode output. For <i>makefiles</i> and
 <i>configure</i> scripts, the CFLAGS variable needs '--emit-llvm' to produce bitcode
 output.</p>
-</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="tutorial4">Example with llvm-gcc4</a></div>
+<h3>
+  <a name="tutorial4">Example with llvm-gcc4</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <ol>
   <li><p>First, create a simple C file, name it 'hello.c':</p>
@@ -1663,14 +1692,15 @@ int main() {
 
 </div>
 
+</div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="problems">Common Problems</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>If you are having problems building or using LLVM, or if you have any other
 general questions about LLVM, please consult the <a href="FAQ.html">Frequently
@@ -1679,12 +1709,12 @@ Asked Questions</a> page.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="links">Links</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This document is just an <b>introduction</b> on how to use LLVM to do
 some simple things... there are many more interesting and complicated things
@@ -1712,7 +1742,7 @@ out:</p>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
   <a href="http://llvm.x10sys.com/rspencer/">Reid Spencer</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/GettingStartedVS.html b/docs/GettingStartedVS.html
index 7475904..d6bf1b6 100644
--- a/docs/GettingStartedVS.html
+++ b/docs/GettingStartedVS.html
@@ -8,9 +8,9 @@
 </head>
 <body>
 
-<div class="doc_title">
+<h1>
   Getting Started with the LLVM System using Microsoft Visual Studio
-</div>
+</h1>
 
 <ul>
   <li><a href="#overview">Overview</a>
@@ -26,17 +26,17 @@
 </ul>
 
 <div class="doc_author">
-  <p>Written by: <a href="http://llvm.org">The LLVM Team</a></p>
+  <p>Written by: <a href="http://llvm.org/">The LLVM Team</a></p>
 </div>
 
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="overview"><b>Overview</b></a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
   <p>Welcome to LLVM on Windows! This document only covers LLVM on Windows using
   Visual Studio, not mingw or cygwin. In order to get started, you first need to
@@ -70,25 +70,23 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="requirements"><b>Requirements</b></a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
   <p>Before you begin to use the LLVM system, review the requirements given
   below.  This may save you some trouble by knowing ahead of time what hardware
   and software you will need.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="hardware"><b>Hardware</b></a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
   <p>Any system that can adequately run Visual Studio .NET 2005 SP1 is fine.
   The LLVM source tree and object files, libraries and executables will consume
@@ -97,8 +95,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="software"><b>Software</b></a></div>
-<div class="doc_text">
+<h3><a name="software"><b>Software</b></a></h3>
+<div>
 
   <p>You will need Visual Studio .NET 2005 SP1 or higher.  The VS2005 SP1
   beta and the normal VS2005 still have bugs that are not completely
@@ -118,13 +116,15 @@
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="quickstart"><b>Getting Started</b></a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Here's the short story for getting up and running quickly with LLVM:</p>
 
@@ -194,7 +194,9 @@
   <ul>
     <li>If %PATH% does not contain GnuWin32, you may specify LLVM_LIT_TOOLS_DIR
     on CMake for the path to GnuWin32.</li>
-    <li>You can run LLVM tests to build the project "check".</li>
+    <li>You can run LLVM tests by merely building the project
+      "check". The test results will be shown in the VS output
+      window.</li>
   </ul>
   </li>
 
@@ -213,25 +215,26 @@
     <p>Note that quite a few of these test will fail.</p>
     </li>
 
-    <li>A specific test or test directory can be run with:</li>
+    <li>A specific test or test directory can be run with:
 
 <div class="doc_code">
 <pre>
 % llvm-lit test/path/to/test
 </pre>
 </div>
-
+    </li>
+  </ul>
 </ol>
 
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="tutorial">An Example Using the LLVM Tool Chain</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <ol>
   <li><p>First, create a simple C file, name it 'hello.c':</p>
@@ -316,12 +319,12 @@ int main() {
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="problems">Common Problems</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>If you are having problems building or using LLVM, or if you have any other
 general questions about LLVM, please consult the <a href="FAQ.html">Frequently
@@ -330,12 +333,12 @@ Asked Questions</a> page.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="links">Links</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This document is just an <b>introduction</b> to how to use LLVM to do
 some simple things... there are many more interesting and complicated things
@@ -359,7 +362,7 @@ out:</p>
   <a href="http://validator.w3.org/check/referer"><img
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/GoldPlugin.html b/docs/GoldPlugin.html
index 68c5cf1..e25c457 100644
--- a/docs/GoldPlugin.html
+++ b/docs/GoldPlugin.html
@@ -7,7 +7,7 @@
 </head>
 <body>
       
-<div class="doc_title">LLVM gold plugin</div>
+<h1>LLVM gold plugin</h1>
 <ol>
   <li><a href="#introduction">Introduction</a></li>
   <li><a href="#build">How to build it</a></li>
@@ -21,9 +21,9 @@
 <div class="doc_author">Written by Nick Lewycky</div>
 
 <!--=========================================================================-->
-<div class="doc_section"><a name="introduction">Introduction</a></div>
+<h2><a name="introduction">Introduction</a></h2>
 <!--=========================================================================-->
-<div class="doc_text">
+<div>
   <p>Building with link time optimization requires cooperation from the
 system linker. LTO support on Linux systems requires that you use
 the <a href="http://sourceware.org/binutils">gold linker</a> which supports
@@ -33,14 +33,14 @@ project.</p>
   <p>The LLVM gold plugin implements the
 <a href="http://gcc.gnu.org/wiki/whopr/driver">gold plugin interface</a>
 on top of
-<a href="http://llvm.org/docs/LinkTimeOptimization.html#lto">libLTO</a>.
+<a href="LinkTimeOptimization.html#lto">libLTO</a>.
 The same plugin can also be used by other tools such as <tt>ar</tt> and
 <tt>nm</tt>.
 </div>
 <!--=========================================================================-->
-<div class="doc_section"><a name="build">How to build it</a></div>
+<h2><a name="build">How to build it</a></h2>
 <!--=========================================================================-->
-<div class="doc_text">
+<div>
   <p>You need to have gold with plugin support and build the LLVMgold
 plugin. Check whether you have gold running <tt>/usr/bin/ld -v</tt>. It will
 report &#8220;GNU gold&#8221; or else &#8220GNU ld&#8221; if not. If you have
@@ -72,9 +72,9 @@ placed.
 </ul>
 </div>
 <!--=========================================================================-->
-<div class="doc_section"><a name="usage">Usage</a></div>
+<h2><a name="usage">Usage</a></h2>
 <!--=========================================================================-->
-<div class="doc_text">
+<div>
   <p>The linker takes a <tt>-plugin</tt> option that points to the path of
   the plugin <tt>.so</tt> file. To find out what link command <tt>gcc</tt>
   would run in a given situation, run <tt>gcc -v <em>[...]</em></tt> and look
@@ -95,14 +95,13 @@ placed.
   own gold, be sure to install the <tt>ar</tt> and <tt>nm-new</tt> you built to
   <tt>/usr/bin</tt>.
   <p>
-</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="example1">Example of link time optimization</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
   <p>The following example shows a worked example of the gold plugin mixing
   LLVM bitcode and native code.
 <pre class="doc_code">
@@ -145,14 +144,20 @@ $ llvm-gcc -use-gold-plugin a.a b.o -o main # &lt;-- link with LLVMgold plugin
 </pre>
   <p>Gold informs the plugin that foo3 is never referenced outside the IR,
   leading LLVM to delete that function. However, unlike in the
-  <a href="http://llvm.org/docs/LinkTimeOptimization.html#example1">libLTO
+  <a href="LinkTimeOptimization.html#example1">libLTO
   example</a> gold does not currently eliminate foo4.</p>
 </div>
 
+</div>
+
 <!--=========================================================================-->
-<div class="doc_section"><a name="lto_autotools">Quickstart for using LTO with autotooled projects</a></div>
+<h2>
+  <a name="lto_autotools">
+    Quickstart for using LTO with autotooled projects
+  </a>
+</h2>
 <!--=========================================================================-->
-<div class="doc_text">
+<div>
   <p>Once your system <tt>ld</tt>, <tt>ar</tt> and <tt>nm</tt> all support LLVM
   bitcode, everything is in place for an easy to use LTO build of autotooled
   projects:</p>
@@ -189,9 +194,9 @@ export CFLAGS="-O4"
 </div>
 
 <!--=========================================================================-->
-<div class="doc_section"><a name="licensing">Licensing</a></div>
+<h2><a name="licensing">Licensing</a></h2>
 <!--=========================================================================-->
-<div class="doc_text">
+<div>
   <p>Gold is licensed under the GPLv3. LLVMgold uses the interface file
 <tt>plugin-api.h</tt> from gold which means that the resulting LLVMgold.so
 binary is also GPLv3. This can still be used to link non-GPLv3 programs just
@@ -206,7 +211,7 @@ as much as gold could without the plugin.</p>
   <a href="http://validator.w3.org/check/referer"><img
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
   <a href="mailto:nicholas@metrix.on.ca">Nick Lewycky</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date: 2010-04-16 23:58:21 -0800 (Fri, 16 Apr 2010) $
 </address>
 </body>
diff --git a/docs/HistoricalNotes/2000-11-18-EarlyDesignIdeasResp.txt b/docs/HistoricalNotes/2000-11-18-EarlyDesignIdeasResp.txt
index 1c725f5..81ca539 100644
--- a/docs/HistoricalNotes/2000-11-18-EarlyDesignIdeasResp.txt
+++ b/docs/HistoricalNotes/2000-11-18-EarlyDesignIdeasResp.txt
@@ -60,11 +60,11 @@ Understood.  :)
 
 Yup, I think that this makes a lot of sense.  I am still intrigued,
 however, by the prospect of a minimally allocated VM representation... I
-think that it could have definate advantages for certain applications
+think that it could have definite advantages for certain applications
 (think very small machines, like PDAs).  I don't, however, think that our
 initial implementations should focus on this.  :)
 
-Here are some other auxilliary goals that I think we should consider:
+Here are some other auxiliary goals that I think we should consider:
 
 1. Primary goal: Support a high performance dynamic compilation
    system.  This means that we have an "ideal" division of labor between
diff --git a/docs/HistoricalNotes/2000-12-06-MeetingSummary.txt b/docs/HistoricalNotes/2000-12-06-MeetingSummary.txt
index b66e185..01b644b 100644
--- a/docs/HistoricalNotes/2000-12-06-MeetingSummary.txt
+++ b/docs/HistoricalNotes/2000-12-06-MeetingSummary.txt
@@ -40,7 +40,7 @@ IDEAS TO CONSIDER
    packaged with the bytecodes themselves.  As a conceptual implementation 
    idea, we could include an immediate dominator number for each basic block
    in the LLVM bytecode program.  Basic blocks could be numbered according
-   to the order of occurance in the bytecode representation.
+   to the order of occurrence in the bytecode representation.
 
 2. Including loop header and body information.  This would facilitate
    detection of intervals and natural loops.
diff --git a/docs/HistoricalNotes/2001-02-06-TypeNotationDebateResp4.txt b/docs/HistoricalNotes/2001-02-06-TypeNotationDebateResp4.txt
index 7b90327..8397324 100644
--- a/docs/HistoricalNotes/2001-02-06-TypeNotationDebateResp4.txt
+++ b/docs/HistoricalNotes/2001-02-06-TypeNotationDebateResp4.txt
@@ -39,7 +39,7 @@ declaration and calling syntax.
 
 Very true.  If you're implementing an object oriented language, however,
 remember that you have to do all the pointer to member function stuff
-yourself.... so everytime you invoke a virtual method one is involved
+yourself.... so every time you invoke a virtual method one is involved
 (instead of having C++ hide it for you behind "syntactic sugar").
 
 > And the old array syntax:
diff --git a/docs/HistoricalNotes/2001-02-09-AdveCommentsResponse.txt b/docs/HistoricalNotes/2001-02-09-AdveCommentsResponse.txt
index 5c87330..da50263 100644
--- a/docs/HistoricalNotes/2001-02-09-AdveCommentsResponse.txt
+++ b/docs/HistoricalNotes/2001-02-09-AdveCommentsResponse.txt
@@ -18,7 +18,7 @@ suggested, as specified below:
 
 Very true.  We should discuss this more, but my reasoning is more of a
 consistency argument.  There are VERY few instructions that can have all
-of the types eliminated, and doing so when available unnecesarily makes
+of the types eliminated, and doing so when available unnecessarily makes
 the language more difficult to handle.  Especially when you see 'int
 %this' and 'bool %that' all over the place, I think it would be
 disorienting to see:
@@ -44,7 +44,7 @@ branches).
 
 No.  This was something I was debating for a while, and didn't really feel
 strongly about either way.  It is common to switch on other types in HLL's
-(for example signed int's are particually common), but in this case, all
+(for example signed int's are particularly common), but in this case, all
 that will be added is an additional 'cast' instruction.  I removed that
 from the spec.
 
@@ -160,7 +160,7 @@ that can be trivally translated into a conditional move...
 > I agree that we need a static data space.  Otherwise, emulating global
 > data gets unnecessarily complex.
 
-Definately.  Also a later item though.  :)
+Definitely.  Also a later item though.  :)
 
 > We once talked about adding a symbolic thread-id field to each
 > ..
diff --git a/docs/HistoricalNotes/2001-06-01-GCCOptimizations2.txt b/docs/HistoricalNotes/2001-06-01-GCCOptimizations2.txt
index 6c9e097..e61042f 100644
--- a/docs/HistoricalNotes/2001-06-01-GCCOptimizations2.txt
+++ b/docs/HistoricalNotes/2001-06-01-GCCOptimizations2.txt
@@ -42,7 +42,7 @@ Does using GCC's backend buy us anything?
 > optimization (step 16 in your list).  Do you have a breakdown of that?
 
 Not really.  The irritating part of GCC is that it mixes it all up and
-doesn't have a clean seperation of concerns.  A lot of the "back end
+doesn't have a clean separation of concerns.  A lot of the "back end
 optimization" happens right along with other data optimizations (ie, CSE
 of machine specific things).
 
diff --git a/docs/HistoricalNotes/2002-05-12-InstListChange.txt b/docs/HistoricalNotes/2002-05-12-InstListChange.txt
index 004edb0..638682b 100644
--- a/docs/HistoricalNotes/2002-05-12-InstListChange.txt
+++ b/docs/HistoricalNotes/2002-05-12-InstListChange.txt
@@ -17,7 +17,7 @@ iterator to an instruction, which, given just an Instruction*, requires a
 linear search of the basic block the instruction is contained in... just 
 to insert an instruction before another instruction, or to delete an 
 instruction!  This complicates algorithms that should be very simple (like 
-simple constant propogation), because they aren't actually sparse anymore, 
+simple constant propagation), because they aren't actually sparse anymore,
 they have to traverse basic blocks to remove constant propogated 
 instructions.
 
diff --git a/docs/HowToReleaseLLVM.html b/docs/HowToReleaseLLVM.html
index 2ea8b43..8a7d7f4 100644
--- a/docs/HowToReleaseLLVM.html
+++ b/docs/HowToReleaseLLVM.html
@@ -7,7 +7,7 @@
 </head>
 <body>
 
-<div class="doc_title">How To Release LLVM To The Public</div>
+<h1>How To Release LLVM To The Public</h1>
 <ol>
   <li><a href="#introduction">Introduction</a></li>
   <li><a href="#criteria">Qualification Criteria</a></li>
@@ -23,10 +23,10 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="introduction">Introduction</a></div>
+<h2><a name="introduction">Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This document contains information about successfully releasing LLVM &mdash;
    including subprojects: e.g., <tt>llvm-gcc</tt> and <tt>clang</tt> &mdash; to
@@ -36,9 +36,9 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="process">Release Timeline</a></div>
+<h2><a name="process">Release Timeline</a></h2>
 <!-- *********************************************************************** -->
-<div class="doc_text">
+<div>
 
 <p>LLVM is released on a time based schedule &mdash; roughly every 6 months. We
    do not normally have dot releases because of the nature of LLVM's incremental
@@ -76,18 +76,19 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="process">Release Process</a></div>
+<h2><a name="process">Release Process</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <ol>
-  <li><a href="#release-admin">Release Administrative Tasks</a></li>
+  <li><a href="#release-admin">Release Administrative Tasks</a>
   <ol>
     <li><a href="#branch">Create Release Branch</a></li>
     <li><a href="#verchanges">Update Version Numbers</a></li>
   </ol>
-  <li><a href="#release-build">Building the Release</a></li>
+  </li>
+  <li><a href="#release-build">Building the Release</a>
   <ol>
     <li><a href="#dist">Build the LLVM Source Distributions</a></li>
     <li><a href="#build">Build LLVM</a></li>
@@ -95,18 +96,19 @@
     <li><a href="#clangbin">Build the Clang Binary Distribution</a></li>
     <li><a href="#target-build">Target Specific Build Details</a></li>
   </ol>
-  <li><a href="#release-qualify">Release Qualification Criteria</a></li>
+  </li>
+  <li><a href="#release-qualify">Release Qualification Criteria</a>
   <ol>
     <li><a href="#llvm-qualify">Qualify LLVM</a></li>
     <li><a href="#llvmgcc-qualify">Qualify LLVM-GCC</a></li>
     <li><a href="#clang-qualify">Qualify Clang</a></li>
     <li><a href="#targets">Specific Target Qualification Details</a></li>
   </ol>
+  </li>
 
   <li><a href="#commTest">Community Testing</a></li>    
   <li><a href="#release-patch">Release Patch Rules</a></li>
-  <li><a href="#release-final">Release final tasks</a></li>
-
+  <li><a href="#release-final">Release final tasks</a>
   <ol>
     <li><a href="#updocs">Update Documentation</a></li>
     <li><a href="#tag">Tag the LLVM Final Release</a></li>
@@ -114,14 +116,13 @@
     <li><a href="#webupdates">Update the LLVM Website</a></li>
     <li><a href="#announce">Announce the Release</a></li>
   </ol>
+  </li>
 </ol>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="release-admin">Release Administrative Tasks</a></div>
+<h3><a name="release-admin">Release Administrative Tasks</a></h3>
 
-<div class="doc_text">
+<div>
 
 <p>This section describes a few administrative tasks that need to be done for
    the release process to begin. Specifically, it involves:</p>
@@ -132,12 +133,10 @@
   <li>Tagging release candidates for the release team to begin testing</li>
 </ul>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="branch">Create Release Branch</a></div>
+<h4><a name="branch">Create Release Branch</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p>Branch the Subversion trunk using the following procedure:</p>
 
@@ -195,9 +194,9 @@ $ svn co https://llvm.org/svn/llvm-project/cfe/branches/release_<i>XY</i> clang-
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="verchanges">Update LLVM Version</a></div>
+<h4><a name="verchanges">Update LLVM Version</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p>After creating the LLVM release branch, update the release branches'
    <tt>autoconf</tt> and <tt>configure.ac</tt> versions from '<tt>X.Ysvn</tt>'
@@ -211,9 +210,9 @@ $ svn co https://llvm.org/svn/llvm-project/cfe/branches/release_<i>XY</i> clang-
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="dist">Build the LLVM Release Candidates</a></div>
+<h4><a name="dist">Build the LLVM Release Candidates</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p>Create release candidates for <tt>llvm</tt>, <tt>llvm-gcc</tt>,
    <tt>clang</tt>, and the LLVM <tt>test-suite</tt> by tagging the branch with
@@ -265,10 +264,12 @@ $ tar -cvf - clang-<i>X.Y</i>rc1       | gzip &gt; clang-<i>X.Y</i>rc1.src.tar.g
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="release-build">Building the Release</a></div>
+<h3><a name="release-build">Building the Release</a></h3>
 
-<div class="doc_text">
+<div>
 
 <p>The builds of <tt>llvm</tt>, <tt>llvm-gcc</tt>, and <tt>clang</tt>
    <em>must</em> be free of errors and warnings in Debug, Release+Asserts, and
@@ -284,24 +285,22 @@ $ tar -cvf - clang-<i>X.Y</i>rc1       | gzip &gt; clang-<i>X.Y</i>rc1.src.tar.g
   <tr align="left"><td>Release</td><td><tt>ENABLE_OPTIMIZED=1 DISABLE_ASSERTIONS=1</tt></td></tr>
 </table>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="build">Build LLVM</a></div>
+<h4><a name="build">Build LLVM</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p>Build <tt>Debug</tt>, <tt>Release+Asserts</tt>, and <tt>Release</tt> versions
    of <tt>llvm</tt> on all supported platforms. Directions to build
    <tt>llvm</tt> are
-   <a href="http://llvm.org/docs/GettingStarted.html#quickstart">here</a>.</p>
+   <a href="GettingStarted.html#quickstart">here</a>.</p>
 
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="llvmgccbin">Build the LLVM GCC Binary Distribution</a></div>
+<h4><a name="llvmgccbin">Build the LLVM GCC Binary Distribution</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p>Creating the <tt>llvm-gcc</tt> binary distribution (Release/Optimized)
    requires performing the following steps for each supported platform:</p>
@@ -326,9 +325,9 @@ $ tar -cvf - clang-<i>X.Y</i>rc1       | gzip &gt; clang-<i>X.Y</i>rc1.src.tar.g
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="clangbin">Build Clang Binary Distribution</a></div>
+<h4><a name="clangbin">Build Clang Binary Distribution</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p>Creating the <tt>clang</tt> binary distribution
    (Debug/Release+Asserts/Release) requires performing the following steps for
@@ -347,9 +346,9 @@ $ tar -cvf - clang-<i>X.Y</i>rc1       | gzip &gt; clang-<i>X.Y</i>rc1.src.tar.g
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="target-build">Target Specific Build Details</a></div>
+<h4><a name="target-build">Target Specific Build Details</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p>The table below specifies which compilers are used for each Arch/OS
    combination when qualifying the build of <tt>llvm</tt>, <tt>llvm-gcc</tt>,
@@ -368,11 +367,12 @@ $ tar -cvf - clang-<i>X.Y</i>rc1       | gzip &gt; clang-<i>X.Y</i>rc1.src.tar.g
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="release-qualify">
-Building the Release</a></div>
+<h3><a name="release-qualify">Building the Release</a></h3>
 
-<div class="doc_text">
+<div>
 
 <p>A release is qualified when it has no regressions from the previous release
    (or baseline). Regressions are related to correctness first and performance
@@ -388,12 +388,10 @@ Building the Release</a></div>
    criteria, but these are the criteria which we found to be most important and
    which must be satisfied before a release can go out</b></p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="llvm-qualify">Qualify LLVM</a></div>
+<h4><a name="llvm-qualify">Qualify LLVM</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM is qualified when it has a clean test run without a front-end. And it
    has no regressions when using either <tt>llvm-gcc</tt> or <tt>clang</tt> with
@@ -402,9 +400,9 @@ Building the Release</a></div>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="llvmgcc-qualify">Qualify LLVM-GCC</a></div>
+<h4><a name="llvmgcc-qualify">Qualify LLVM-GCC</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>LLVM-GCC</tt> is qualified when front-end specific tests in the
    <tt>llvm</tt> regression test suite all pass and there are no regressions in
@@ -415,9 +413,9 @@ Building the Release</a></div>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="clang-qualify">Qualify Clang</a></div>
+<h4><a name="clang-qualify">Qualify Clang</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>Clang</tt> is qualified when front-end specific tests in the 
    <tt>llvm</tt> dejagnu test suite all pass, clang's own test suite passes
@@ -426,10 +424,9 @@ Building the Release</a></div>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="targets">Specific Target 
-Qualification Details</a></div>
+<h4><a name="targets">Specific Target Qualification Details</a></h4>
 
-<div class="doc_text">
+<div>
 
 <table>
   <tr><th>Architecture</th><th>OS</th><th>llvm-gcc baseline</th><th>clang baseline</th><th>tests</th></tr>
@@ -443,9 +440,11 @@ Qualification Details</a></div>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="commTest">Community Testing</a></div>
-<div class="doc_text">
+<h3><a name="commTest">Community Testing</a></h3>
+<div>
 
 <p>Once all testing has been completed and appropriate bugs filed, the release
    candidate tarballs are put on the website and the LLVM community is
@@ -481,9 +480,9 @@ Qualification Details</a></div>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="release-patch">Release Patch Rules</a></div>
+<h3><a name="release-patch">Release Patch Rules</a></h3>
 
-<div class="doc_text">
+<div>
 
 <p>Below are the rules regarding patching the release branch:</p>
 
@@ -505,22 +504,18 @@ Qualification Details</a></div>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="release-final">Release Final Tasks 
-</a></div>
+<h3><a name="release-final">Release Final Tasks</a></h3>
 
-<div class="doc_text">
+<div>
 
 <p>The final stages of the release process involves tagging the "final" release
    branch, updating documentation that refers to the release, and updating the
    demo page.</p>
 
-</div>
-
-
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="updocs">Update Documentation</a></div>
+<h4><a name="updocs">Update Documentation</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p>Review the documentation and ensure that it is up to date. The "Release
    Notes" must be updated to reflect new features, bug fixes, new known issues,
@@ -532,9 +527,9 @@ Qualification Details</a></div>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="tag">Tag the LLVM Final Release</a></div>
+<h4><a name="tag">Tag the LLVM Final Release</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p>Tag the final release sources using the following procedure:</p>
 
@@ -556,20 +551,20 @@ $ svn copy https://llvm.org/svn/llvm-project/cfe/branches/release_XY \
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="updemo">Update the LLVM Demo Page</a></div>
+<h3><a name="updemo">Update the LLVM Demo Page</a></h3>
 
-<div class="doc_text">
+<div>
 
 <p>The LLVM demo page must be updated to use the new release. This consists of
    using the new <tt>llvm-gcc</tt> binary and building LLVM.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="webupdates">Update the LLVM Website</a></div>
+<h4><a name="webupdates">Update the LLVM Website</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p>The website must be updated before the release announcement is sent out. Here
    is what to do:</p>
@@ -603,14 +598,18 @@ $ svn copy https://llvm.org/svn/llvm-project/cfe/branches/release_XY \
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="announce">Announce the Release</a></div>
+<h4><a name="announce">Announce the Release</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p>Have Chris send out the release announcement when everything is finished.</p>
 
 </div>
 
+</div>
+
+</div>
+
 <!-- *********************************************************************** -->
 <hr>
 <address>
@@ -618,7 +617,7 @@ $ svn copy https://llvm.org/svn/llvm-project/cfe/branches/release_XY \
   src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
   <a href="http://validator.w3.org/check/referer"><img
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-  <a href="http://llvm.cs.uiuc.edu">The LLVM Compiler Infrastructure</a>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a>
   <br>
   Last modified: $Date$
 </address>
diff --git a/docs/HowToSubmitABug.html b/docs/HowToSubmitABug.html
index 90efbe3..10ff9de 100644
--- a/docs/HowToSubmitABug.html
+++ b/docs/HowToSubmitABug.html
@@ -7,9 +7,9 @@
 </head>
 <body>
 
-<div class="doc_title">
+<h1>
   How to submit an LLVM bug report
-</div>
+</h1>
 
 <table class="layout" style="width: 90%" >
 <tr class="layout">
@@ -37,12 +37,12 @@
 </table>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="introduction">Introduction - Got bugs?</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>If you're working with LLVM and run into a bug, we definitely want to know
 about it.  This document describes what you can do to increase the odds of
@@ -76,12 +76,12 @@ information:</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="crashers">Crashing Bugs</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>More often than not, bugs in the compiler cause it to crash&mdash;often due
 to an assertion failure of some sort. The most important
@@ -109,14 +109,12 @@ with the following extra command line options:</p>
 
 </ul>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="front-end">Front-end bugs</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>If the problem is in the front-end, you should re-run the same
 <tt>llvm-gcc</tt> command that resulted in the crash, but add the
@@ -137,11 +135,11 @@ has instructions on the best way to use delta.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ct_optimizer">Compile-time optimization bugs</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>If you find that a bug crashes in the optimizer, compile your test-case to a
 <tt>.bc</tt> file by passing "<tt><b>-emit-llvm -O0 -c -o foo.bc</b></tt>".
@@ -153,7 +151,7 @@ Then run:</p>
 </div>
 
 <p>This command should do two things: it should print out a list of passes, and
-then it should crash in the same was as llvm-gcc.  If it doesn't crash, please
+then it should crash in the same way as llvm-gcc.  If it doesn't crash, please
 follow the instructions for a <a href="#front-end">front-end bug</a>.</p>
 
 <p>If this does crash, then you should be able to debug this with the following
@@ -171,11 +169,11 @@ that bugpoint emits.  If something goes wrong with bugpoint, please submit the
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ct_codegen">Code generator bugs</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>If you find a bug that crashes llvm-gcc in the code generator, compile your
 source file to a .bc file by passing "<tt><b>-emit-llvm -c -o foo.bc</b></tt>"
@@ -207,13 +205,15 @@ that bugpoint emits.  If something goes wrong with bugpoint, please submit the
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="miscompilations">Miscompilations</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>If llvm-gcc successfully produces an executable, but that executable doesn't
 run right, this is either a bug in the code or a bug in the
@@ -241,12 +241,12 @@ error.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="codegen">Incorrect code generation</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Similarly to debugging incorrect compilation by mis-behaving passes, you can
 debug incorrect code generation by either LLC or the JIT, using
@@ -338,7 +338,7 @@ the following:</p>
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a>
   <br>
   Last modified: $Date$
 </address>
diff --git a/docs/LangRef.html b/docs/LangRef.html
index eb36f09..7823f12 100644
--- a/docs/LangRef.html
+++ b/docs/LangRef.html
@@ -12,7 +12,7 @@
 
 <body>
 
-<div class="doc_title"> LLVM Language Reference Manual </div>
+<h1>LLVM Language Reference Manual</h1>
 <ol>
   <li><a href="#abstract">Abstract</a></li>
   <li><a href="#introduction">Introduction</a></li>
@@ -239,6 +239,8 @@
           <li><a href="#int_sin">'<tt>llvm.sin.*</tt>' Intrinsic</a></li>
           <li><a href="#int_cos">'<tt>llvm.cos.*</tt>' Intrinsic</a></li>
           <li><a href="#int_pow">'<tt>llvm.pow.*</tt>' Intrinsic</a></li>
+          <li><a href="#int_exp">'<tt>llvm.exp.*</tt>' Intrinsic</a></li>
+          <li><a href="#int_log">'<tt>llvm.log.*</tt>' Intrinsic</a></li>
         </ol>
       </li>
       <li><a href="#int_manip">Bit Manipulation Intrinsics</a>
@@ -321,10 +323,10 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"> <a name="abstract">Abstract </a></div>
+<h2><a name="abstract">Abstract</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This document is a reference manual for the LLVM assembly language. LLVM is
    a Static Single Assignment (SSA) based representation that provides type
@@ -335,10 +337,10 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"> <a name="introduction">Introduction</a> </div>
+<h2><a name="introduction">Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>The LLVM code representation is designed to be used in three different forms:
    as an in-memory compiler IR, as an on-disk bitcode representation (suitable
@@ -359,12 +361,12 @@
    variable is never accessed outside of the current function, allowing it to
    be promoted to a simple SSA value instead of a memory location.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="wellformed">Well-Formedness</a> </div>
+<h4>
+  <a name="wellformed">Well-Formedness</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>It is important to note that this document describes 'well formed' LLVM
    assembly language.  There is a difference between what the parser accepts and
@@ -384,13 +386,15 @@
 
 </div>
 
+</div>
+
 <!-- Describe the typesetting conventions here. -->
 
 <!-- *********************************************************************** -->
-<div class="doc_section"> <a name="identifiers">Identifiers</a> </div>
+<h2><a name="identifiers">Identifiers</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>LLVM identifiers come in two basic types: global and local. Global
    identifiers (functions, global variables) begin with the <tt>'@'</tt>
@@ -475,14 +479,15 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"> <a name="highlevel">High Level Structure</a> </div>
+<h2><a name="highlevel">High Level Structure</a></h2>
 <!-- *********************************************************************** -->
-
+<div>
 <!-- ======================================================================= -->
-<div class="doc_subsection"> <a name="modulestructure">Module Structure</a>
-</div>
+<h3>
+  <a name="modulestructure">Module Structure</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM programs are composed of "Module"s, each of which is a translation unit
    of the input programs.  Each module consists of functions, global variables,
@@ -528,11 +533,11 @@ define i32 @main() {   <i>; i32()* </i>&nbsp;
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="linkage">Linkage Types</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>All Global Variables and Functions have one of the following types of
    linkage:</p>
@@ -677,11 +682,11 @@ define i32 @main() {   <i>; i32()* </i>&nbsp;
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="callingconv">Calling Conventions</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM <a href="#functionstructure">functions</a>, <a href="#i_call">calls</a>
    and <a href="#i_invoke">invokes</a> can all have an optional calling
@@ -750,11 +755,11 @@ define i32 @main() {   <i>; i32()* </i>&nbsp;
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="visibility">Visibility Styles</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>All Global Variables and Functions have one of the following visibility
    styles:</p>
@@ -784,11 +789,11 @@ define i32 @main() {   <i>; i32()* </i>&nbsp;
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="namedtypes">Named Types</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM IR allows you to specify name aliases for certain types.  This can make
    it easier to read the IR and make the IR more condensed (particularly when
@@ -815,11 +820,11 @@ define i32 @main() {   <i>; i32()* </i>&nbsp;
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="globalvars">Global Variables</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Global variables define regions of memory allocated at compilation time
    instead of run-time.  Global variables may optionally be initialized, may
@@ -883,11 +888,11 @@ define i32 @main() {   <i>; i32()* </i>&nbsp;
 
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="functionstructure">Functions</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM function definitions consist of the "<tt>define</tt>" keyword, an
    optional <a href="#linkage">linkage type</a>, an optional
@@ -946,11 +951,11 @@ define [<a href="#linkage">linkage</a>] [<a href="#visibility">visibility</a>]
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="aliasstructure">Aliases</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Aliases act as "second name" for the aliasee value (which can be either
    function, global variable, another alias or bitcast of global value). Aliases
@@ -965,11 +970,11 @@ define [<a href="#linkage">linkage</a>] [<a href="#visibility">visibility</a>]
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="namedmetadatastructure">Named Metadata</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Named metadata is a collection of metadata. <a href="#metadata">Metadata
    nodes</a> (but not metadata strings) are the only valid operands for
@@ -988,9 +993,11 @@ define [<a href="#linkage">linkage</a>] [<a href="#visibility">visibility</a>]
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="paramattrs">Parameter Attributes</a></div>
+<h3>
+  <a name="paramattrs">Parameter Attributes</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The return type and each parameter of a function type may have a set of
    <i>parameter attributes</i> associated with them. Parameter attributes are
@@ -1097,11 +1104,11 @@ declare signext i8 @returns_signed_char()
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="gc">Garbage Collector Names</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Each function may specify a garbage collector name, which is simply a
    string:</p>
@@ -1117,11 +1124,11 @@ define void @f() gc "name" { ... }
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="fnattrs">Function Attributes</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Function attributes are set to communicate additional information about a
    function. Function attributes are considered to be part of the function, not
@@ -1157,6 +1164,11 @@ define void @f() optsize { ... }
       Most of the functions in the Windows system DLLs in Windows XP SP2 or
       higher were compiled in this fashion.</dd>
 
+  <dt><tt><b>nonlazybind</b></tt></dt>
+  <dd>This attribute suppresses lazy symbol binding for the function. This
+      may make calls to the function faster, at the cost of extra program
+      startup time if the function is not called during program startup.</dd>
+
   <dt><tt><b>inlinehint</b></tt></dt>
   <dd>This attribute indicates that the source code contained a hint that inlining
       this function is desirable (such as the "inline" keyword in C/C++).  It
@@ -1240,11 +1252,11 @@ define void @f() optsize { ... }
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="moduleasm">Module-Level Inline Assembly</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Modules may contain "module-level inline asm" blocks, which corresponds to
    the GCC "file scope inline asm" blocks.  These blocks are internally
@@ -1266,11 +1278,11 @@ module asm "more can go here"
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="datalayout">Data Layout</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>A module may specify a target specific data layout string that specifies how
    data is to be laid out in memory. The syntax for the data layout is
@@ -1378,11 +1390,11 @@ target datalayout = "<i>layout specification</i>"
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="pointeraliasing">Pointer Aliasing Rules</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Any memory access must be done through a pointer value associated
 with an address range of the memory access, otherwise the behavior
@@ -1442,11 +1454,11 @@ to implement type-based alias analysis.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="volatile">Volatile Memory Accesses</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Certain memory accesses, such as <a href="#i_load"><tt>load</tt></a>s, <a
 href="#i_store"><tt>store</tt></a>s, and <a
@@ -1459,11 +1471,13 @@ synchronization behavior.</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section"> <a name="typesystem">Type System</a> </div>
+<h2><a name="typesystem">Type System</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>The LLVM type system is one of the most important features of the
    intermediate representation.  Being typed enables a number of optimizations
@@ -1473,13 +1487,12 @@ synchronization behavior.</p>
    and transformations that are not feasible to perform on normal three address
    code representations.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection"> <a name="t_classifications">Type
-Classifications</a> </div>
+<h3>
+  <a name="t_classifications">Type Classifications</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The types fall into a few useful classifications:</p>
 
@@ -1536,19 +1549,21 @@ Classifications</a> </div>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"> <a name="t_primitive">Primitive Types</a> </div>
+<h3>
+  <a name="t_primitive">Primitive Types</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The primitive types are the fundamental building blocks of the LLVM
    system.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="t_integer">Integer Type</a> </div>
+<h4>
+  <a name="t_integer">Integer Type</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Overview:</h5>
 <p>The integer type is a very simple type that simply specifies an arbitrary
@@ -1582,9 +1597,11 @@ Classifications</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="t_floating">Floating Point Types</a> </div>
+<h4>
+  <a name="t_floating">Floating Point Types</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <table>
   <tbody>
@@ -1600,9 +1617,11 @@ Classifications</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="t_x86mmx">X86mmx Type</a> </div>
+<h4>
+  <a name="t_x86mmx">X86mmx Type</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Overview:</h5>
 <p>The x86mmx type represents a value held in an MMX register on an x86 machine.  The operations allowed on it are quite limited:  parameters and return values, load and store, and bitcast.  User-specified MMX instructions are represented as intrinsic or asm calls with arguments and/or results of this type.  There are no arrays, vectors or constants of this type.</p>
@@ -1615,9 +1634,11 @@ Classifications</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="t_void">Void Type</a> </div>
+<h4>
+  <a name="t_void">Void Type</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Overview:</h5>
 <p>The void type does not represent any value and has no size.</p>
@@ -1630,9 +1651,11 @@ Classifications</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="t_label">Label Type</a> </div>
+<h4>
+  <a name="t_label">Label Type</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Overview:</h5>
 <p>The label type represents code labels.</p>
@@ -1645,9 +1668,11 @@ Classifications</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="t_metadata">Metadata Type</a> </div>
+<h4>
+  <a name="t_metadata">Metadata Type</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Overview:</h5>
 <p>The metadata type represents embedded metadata. No derived types may be
@@ -1661,11 +1686,14 @@ Classifications</a> </div>
 
 </div>
 
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"> <a name="t_derived">Derived Types</a> </div>
+<h3>
+  <a name="t_derived">Derived Types</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The real power in LLVM comes from the derived types in the system.  This is
    what allows a programmer to represent arrays, functions, pointers, and other
@@ -1675,12 +1703,12 @@ Classifications</a> </div>
    of another array.</p>
 
    
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="t_aggregate">Aggregate Types</a> </div>
+<h4>
+  <a name="t_aggregate">Aggregate Types</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Aggregate Types are a subset of derived types that can contain multiple
   member types. <a href="#t_array">Arrays</a>,
@@ -1690,9 +1718,11 @@ Classifications</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="t_array">Array Type</a> </div>
+<h4>
+  <a name="t_array">Array Type</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Overview:</h5>
 <p>The array type is a very simple derived type that arranges elements
@@ -1748,9 +1778,11 @@ Classifications</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="t_function">Function Type</a> </div>
+<h4>
+  <a name="t_function">Function Type</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Overview:</h5>
 <p>The function type can be thought of as a function signature.  It consists of
@@ -1801,9 +1833,11 @@ Classifications</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="t_struct">Structure Type</a> </div>
+<h4>
+  <a name="t_struct">Structure Type</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Overview:</h5>
 <p>The structure type is used to represent a collection of data members together
@@ -1839,10 +1873,11 @@ Classifications</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="t_pstruct">Packed Structure Type</a>
-</div>
+<h4>
+  <a name="t_pstruct">Packed Structure Type</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Overview:</h5>
 <p>The packed structure type is used to represent a collection of data members
@@ -1877,9 +1912,11 @@ Classifications</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="t_pointer">Pointer Type</a> </div>
+<h4>
+  <a name="t_pointer">Pointer Type</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Overview:</h5>
 <p>The pointer type is used to specify memory locations.
@@ -1921,9 +1958,11 @@ Classifications</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="t_vector">Vector Type</a> </div>
+<h4>
+  <a name="t_vector">Vector Type</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Overview:</h5>
 <p>A vector type is a simple derived type that represents a vector of elements.
@@ -1960,8 +1999,11 @@ Classifications</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="t_opaque">Opaque Type</a> </div>
-<div class="doc_text">
+<h4>
+  <a name="t_opaque">Opaque Type</a>
+</h4>
+
+<div>
 
 <h5>Overview:</h5>
 <p>Opaque types are used to represent unknown types in the system.  This
@@ -1984,12 +2026,14 @@ Classifications</a> </div>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="t_uprefs">Type Up-references</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <h5>Overview:</h5>
 <p>An "up reference" allows you to refer to a lexically enclosing type without
@@ -2032,21 +2076,23 @@ Classifications</a> </div>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section"> <a name="constants">Constants</a> </div>
+<h2><a name="constants">Constants</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>LLVM has several different basic types of constants.  This section describes
    them all and their syntax.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="simpleconstants">Simple Constants</a></div>
+<h3>
+  <a name="simpleconstants">Simple Constants</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <dl>
   <dt><b>Boolean constants</b></dt>
@@ -2099,12 +2145,12 @@ Classifications</a> </div>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
 <a name="aggregateconstants"></a> <!-- old anchor -->
 <a name="complexconstants">Complex Constants</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Complex constants are a (potentially recursive) combination of simple
    constants and smaller complex constants.</p>
@@ -2154,11 +2200,11 @@ Classifications</a> </div>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="globalconstants">Global Variable and Function Addresses</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The addresses of <a href="#globalvars">global variables</a>
    and <a href="#functionstructure">functions</a> are always implicitly valid
@@ -2176,8 +2222,11 @@ Classifications</a> </div>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="undefvalues">Undefined Values</a></div>
-<div class="doc_text">
+<h3>
+  <a name="undefvalues">Undefined Values</a>
+</h3>
+
+<div>
 
 <p>The string '<tt>undef</tt>' can be used anywhere a constant is expected, and
    indicates that the user of the value may receive an unspecified bit-pattern.
@@ -2316,8 +2365,11 @@ b: unreachable
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="trapvalues">Trap Values</a></div>
-<div class="doc_text">
+<h3>
+  <a name="trapvalues">Trap Values</a>
+</h3>
+
+<div>
 
 <p>Trap values are similar to <a href="#undefvalues">undef values</a>, however
    instead of representing an unspecified bit pattern, they represent the
@@ -2369,7 +2421,12 @@ b: unreachable
     <a href="#terminators">terminator instruction</a>
     if the terminator instruction has multiple successors and the instruction
     is always executed when control transfers to one of the successors, and
-    may not be executed when control is transfered to another.</li>
+    may not be executed when control is transferred to another.</li>
+
+<li>Additionally, an instruction also <i>control-depends</i> on a terminator
+    instruction if the set of instructions it otherwise depends on would be
+    different if the terminator had transferred control to a different
+    successor.</li>
 
 <li>Dependence is transitive.</li>
 
@@ -2396,11 +2453,11 @@ entry:
 
   %narrowaddr = bitcast i32* @g to i16*
   %wideaddr = bitcast i32* @g to i64*
-  %trap3 = load 16* %narrowaddr      ; Returns a trap value.
-  %trap4 = load i64* %widaddr        ; Returns a trap value.
+  %trap3 = load i16* %narrowaddr     ; Returns a trap value.
+  %trap4 = load i64* %wideaddr       ; Returns a trap value.
 
-  %cmp = icmp i32 slt %trap, 0       ; Returns a trap value.
-  %br i1 %cmp, %true, %end           ; Branch to either destination.
+  %cmp = icmp slt i32 %trap, 0       ; Returns a trap value.
+  br i1 %cmp, label %true, label %end ; Branch to either destination.
 
 true:
   volatile store i32 0, i32* @g      ; This is control-dependent on %cmp, so
@@ -2413,17 +2470,34 @@ end:
                                      ; control-dependent on %cmp, so this
                                      ; always results in a trap value.
 
-  volatile store i32 0, i32* @g      ; %end is control-equivalent to %entry
-                                     ; so this is defined (ignoring earlier
+  volatile store i32 0, i32* @g      ; This would depend on the store in %true
+                                     ; if %cmp is true, or the store in %entry
+                                     ; otherwise, so this is undefined behavior.
+
+  br i1 %cmp, label %second_true, label %second_end
+                                     ; The same branch again, but this time the
+                                     ; true block doesn't have side effects.
+
+second_true:
+  ; No side effects!
+  ret void
+
+second_end:
+  volatile store i32 0, i32* @g      ; This time, the instruction always depends
+                                     ; on the store in %end. Also, it is
+                                     ; control-equivalent to %end, so this is
+                                     ; well-defined (again, ignoring earlier
                                      ; undefined behavior in this example).
 </pre>
 
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="blockaddress">Addresses of Basic
-    Blocks</a></div>
-<div class="doc_text">
+<h3>
+  <a name="blockaddress">Addresses of Basic Blocks</a>
+</h3>
+
+<div>
 
 <p><b><tt>blockaddress(@function, %block)</tt></b></p>
 
@@ -2448,10 +2522,11 @@ end:
 
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="constantexprs">Constant Expressions</a>
-</div>
+<h3>
+  <a name="constantexprs">Constant Expressions</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Constant expressions are used to allow expressions involving other constants
    to be used as constants.  Constant expressions may be of
@@ -2577,16 +2652,18 @@ end:
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section"> <a name="othervalues">Other Values</a> </div>
+<h2><a name="othervalues">Other Values</a></h2>
 <!-- *********************************************************************** -->
-
+<div>
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
 <a name="inlineasm">Inline Assembler Expressions</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM supports inline assembler expressions (as opposed
    to <a href="#moduleasm"> Module-Level Inline Assembly</a>) through the use of
@@ -2635,13 +2712,12 @@ call void asm alignstack "eieio", ""()
    documented here.  Constraints on what can be done (e.g. duplication, moving,
    etc need to be documented).  This is probably best done by reference to
    another document that covers inline asm from a holistic perspective.</p>
-</div>
 
-<div class="doc_subsubsection">
+<h4>
 <a name="inlineasm_md">Inline Asm Metadata</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The call instructions that wrap inline asm nodes may have a "!srcloc" MDNode
    attached to it that contains a list of constant integers.  If present, the
@@ -2662,12 +2738,14 @@ call void asm sideeffect "something bad", ""()<b>, !srcloc !42</b>
 
 </div>
 
-<!-- ======================================================================= -->
-<div class="doc_subsection"><a name="metadata">Metadata Nodes and Metadata
-  Strings</a>
 </div>
 
-<div class="doc_text">
+<!-- ======================================================================= -->
+<h3>
+  <a name="metadata">Metadata Nodes and Metadata Strings</a>
+</h3>
+
+<div>
 
 <p>LLVM IR allows metadata to be attached to instructions in the program that
    can convey extra information about the code to the optimizers and code
@@ -2709,13 +2787,14 @@ call void @llvm.dbg.value(metadata !24, i64 0, metadata !25)
 
 </div>
 
+</div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="intrinsic_globals">Intrinsic Global Variables</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
-
+<div>
 <p>LLVM has a number of "magic" global variables that contain data that affect
 code generation or other IR semantics.  These are documented here.  All globals
 of this sort should have a section specified as "<tt>llvm.metadata</tt>".  This
@@ -2723,11 +2802,11 @@ section and all globals that start with "<tt>llvm.</tt>" are reserved for use
 by LLVM.</p>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
 <a name="intg_used">The '<tt>llvm.used</tt>' Global Variable</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>@llvm.used</tt> global is an array with i8* element type which has <a
 href="#linkage_appending">appending linkage</a>.  This array contains a list of
@@ -2758,11 +2837,13 @@ object file to prevent the assembler and linker from molesting the symbol.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
-<a name="intg_compiler_used">The '<tt>llvm.compiler.used</tt>' Global Variable</a>
-</div>
+<h3>
+  <a name="intg_compiler_used">
+    The '<tt>llvm.compiler.used</tt>' Global Variable
+  </a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>@llvm.compiler.used</tt> directive is the same as the
 <tt>@llvm.used</tt> directive, except that it only prevents the compiler from
@@ -2776,11 +2857,11 @@ should not be exposed to source languages.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
 <a name="intg_global_ctors">The '<tt>llvm.global_ctors</tt>' Global Variable</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 <pre>
 %0 = type { i32, void ()* }
 @llvm.global_ctors = appending global [1 x %0] [%0 { i32 65535, void ()* @ctor }]
@@ -2791,11 +2872,11 @@ should not be exposed to source languages.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
 <a name="intg_global_dtors">The '<tt>llvm.global_dtors</tt>' Global Variable</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 <pre>
 %0 = type { i32, void ()* }
 @llvm.global_dtors = appending global [1 x %0] [%0 { i32 65535, void ()* @dtor }]
@@ -2806,12 +2887,13 @@ should not be exposed to source languages.</p>
 
 </div>
 
+</div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"> <a name="instref">Instruction Reference</a> </div>
+<h2><a name="instref">Instruction Reference</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>The LLVM instruction set consists of several different classifications of
    instructions: <a href="#terminators">terminator
@@ -2820,13 +2902,12 @@ should not be exposed to source languages.</p>
    <a href="#memoryops">memory instructions</a>, and
    <a href="#otherops">other instructions</a>.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection"> <a name="terminators">Terminator
-Instructions</a> </div>
+<h3>
+  <a name="terminators">Terminator Instructions</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>As mentioned <a href="#functionstructure">previously</a>, every basic block
    in a program ends with a "Terminator" instruction, which indicates which
@@ -2844,13 +2925,12 @@ Instructions</a> </div>
    '<a href="#i_unwind"><tt>unwind</tt></a>' instruction, and the
    '<a href="#i_unreachable"><tt>unreachable</tt></a>' instruction.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="i_ret">'<tt>ret</tt>'
-Instruction</a> </div>
+<h4>
+  <a name="i_ret">'<tt>ret</tt>' Instruction</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -2896,9 +2976,11 @@ Instruction</a> </div>
 
 </div>
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="i_br">'<tt>br</tt>' Instruction</a> </div>
+<h4>
+  <a name="i_br">'<tt>br</tt>' Instruction</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -2937,11 +3019,11 @@ IfUnequal:
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_switch">'<tt>switch</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -2992,11 +3074,11 @@ IfUnequal:
 
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_indirectbr">'<tt>indirectbr</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3040,11 +3122,11 @@ IfUnequal:
 
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="i_invoke">'<tt>invoke</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3130,10 +3212,11 @@ that the invoke/unwind semantics are likely to change in future versions.</p>
 
 <!-- _______________________________________________________________________ -->
 
-<div class="doc_subsubsection"> <a name="i_unwind">'<tt>unwind</tt>'
-Instruction</a> </div>
+<h4>
+  <a name="i_unwind">'<tt>unwind</tt>' Instruction</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3161,10 +3244,11 @@ that the invoke/unwind semantics are likely to change in future versions.</p>
 
 <!-- _______________________________________________________________________ -->
 
-<div class="doc_subsubsection"> <a name="i_unreachable">'<tt>unreachable</tt>'
-Instruction</a> </div>
+<h4>
+  <a name="i_unreachable">'<tt>unreachable</tt>' Instruction</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3182,10 +3266,14 @@ Instruction</a> </div>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection"> <a name="binaryops">Binary Operations</a> </div>
+<h3>
+  <a name="binaryops">Binary Operations</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Binary operators are used to do most of the computation in a program.  They
    require two operands of the same type, execute an operation on them, and
@@ -3195,14 +3283,12 @@ Instruction</a> </div>
 
 <p>There are several different binary operators:</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="i_add">'<tt>add</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3243,11 +3329,11 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="i_fadd">'<tt>fadd</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3273,11 +3359,11 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_sub">'<tt>sub</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3325,11 +3411,11 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_fsub">'<tt>fsub</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3361,11 +3447,11 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="i_mul">'<tt>mul</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3411,11 +3497,11 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="i_fmul">'<tt>fmul</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3441,10 +3527,11 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="i_udiv">'<tt>udiv</tt>' Instruction
-</a></div>
+<h4>
+  <a name="i_udiv">'<tt>udiv</tt>' Instruction</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3481,10 +3568,11 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="i_sdiv">'<tt>sdiv</tt>' Instruction
-</a> </div>
+<h4>
+  <a name="i_sdiv">'<tt>sdiv</tt>' Instruction</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3523,10 +3611,11 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="i_fdiv">'<tt>fdiv</tt>'
-Instruction</a> </div>
+<h4>
+  <a name="i_fdiv">'<tt>fdiv</tt>' Instruction</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3552,10 +3641,11 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="i_urem">'<tt>urem</tt>' Instruction</a>
-</div>
+<h4>
+  <a name="i_urem">'<tt>urem</tt>' Instruction</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3589,11 +3679,11 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="i_srem">'<tt>srem</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3640,10 +3730,11 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="i_frem">'<tt>frem</tt>' Instruction</a> </div>
+<h4>
+  <a name="i_frem">'<tt>frem</tt>' Instruction</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3670,11 +3761,14 @@ Instruction</a> </div>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection"> <a name="bitwiseops">Bitwise Binary
-Operations</a> </div>
+<h3>
+  <a name="bitwiseops">Bitwise Binary Operations</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Bitwise binary operators are used to do various forms of bit-twiddling in a
    program.  They are generally very efficient instructions and can commonly be
@@ -3682,13 +3776,12 @@ Operations</a> </div>
    same type, execute an operation on them, and produce a single value.  The
    resulting value is the same type as its operands.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="i_shl">'<tt>shl</tt>'
-Instruction</a> </div>
+<h4>
+  <a name="i_shl">'<tt>shl</tt>' Instruction</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3735,10 +3828,11 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="i_lshr">'<tt>lshr</tt>'
-Instruction</a> </div>
+<h4>
+  <a name="i_lshr">'<tt>lshr</tt>' Instruction</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3781,9 +3875,11 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="i_ashr">'<tt>ashr</tt>'
-Instruction</a> </div>
-<div class="doc_text">
+<h4>
+  <a name="i_ashr">'<tt>ashr</tt>' Instruction</a>
+</h4>
+
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3826,10 +3922,11 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="i_and">'<tt>and</tt>'
-Instruction</a> </div>
+<h4>
+  <a name="i_and">'<tt>and</tt>' Instruction</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3886,9 +3983,11 @@ Instruction</a> </div>
 </pre>
 </div>
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="i_or">'<tt>or</tt>' Instruction</a> </div>
+<h4>
+  <a name="i_or">'<tt>or</tt>' Instruction</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -3947,10 +4046,11 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="i_xor">'<tt>xor</tt>'
-Instruction</a> </div>
+<h4>
+  <a name="i_xor">'<tt>xor</tt>' Instruction</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -4010,12 +4110,14 @@ Instruction</a> </div>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="vectorops">Vector Operations</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM supports several instructions to represent vector operations in a
    target-independent manner.  These instructions cover the element-access and
@@ -4024,14 +4126,12 @@ Instruction</a> </div>
    will want to use target-specific intrinsics to take full advantage of a
    specific target.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_extractelement">'<tt>extractelement</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -4063,11 +4163,11 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_insertelement">'<tt>insertelement</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -4099,11 +4199,11 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_shufflevector">'<tt>shufflevector</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -4146,24 +4246,24 @@ Instruction</a> </div>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="aggregateops">Aggregate Operations</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM supports several instructions for working with
   <a href="#t_aggregate">aggregate</a> values.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_extractvalue">'<tt>extractvalue</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -4201,15 +4301,15 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_insertvalue">'<tt>insertvalue</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
-  &lt;result&gt; = insertvalue &lt;aggregate type&gt; &lt;val&gt;, &lt;ty&gt; &lt;elt&gt;, &lt;idx&gt;    <i>; yields &lt;aggregate type&gt;</i>
+  &lt;result&gt; = insertvalue &lt;aggregate type&gt; &lt;val&gt;, &lt;ty&gt; &lt;elt&gt;, &lt;idx&gt;{, <idx>}*    <i>; yields &lt;aggregate type&gt;</i>
 </pre>
 
 <h5>Overview:</h5>
@@ -4233,33 +4333,33 @@ Instruction</a> </div>
 
 <h5>Example:</h5>
 <pre>
-  %agg1 = insertvalue {i32, float} undef, i32 1, 0         <i>; yields {i32 1, float undef}</i>
-  %agg2 = insertvalue {i32, float} %agg1, float %val, 1    <i>; yields {i32 1, float %val}</i>
+  %agg1 = insertvalue {i32, float} undef, i32 1, 0              <i>; yields {i32 1, float undef}</i>
+  %agg2 = insertvalue {i32, float} %agg1, float %val, 1         <i>; yields {i32 1, float %val}</i>
+  %agg3 = insertvalue {i32, {float}} %agg1, float %val, 1, 0    <i>; yields {i32 1, float %val}</i>
 </pre>
 
 </div>
 
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="memoryops">Memory Access and Addressing Operations</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>A key design point of an SSA-based representation is how it represents
    memory.  In LLVM, no memory locations are in SSA form, which makes things
    very simple.  This section describes how to read, write, and allocate
    memory in LLVM.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="i_alloca">'<tt>alloca</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -4306,10 +4406,11 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="i_load">'<tt>load</tt>'
-Instruction</a> </div>
+<h4>
+  <a name="i_load">'<tt>load</tt>' Instruction</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -4364,10 +4465,11 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"> <a name="i_store">'<tt>store</tt>'
-Instruction</a> </div>
+<h4>
+  <a name="i_store">'<tt>store</tt>' Instruction</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -4425,11 +4527,11 @@ Instruction</a> </div>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_getelementptr">'<tt>getelementptr</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -4555,23 +4657,25 @@ entry:
 
 </div>
 
-<!-- ======================================================================= -->
-<div class="doc_subsection"> <a name="convertops">Conversion Operations</a>
 </div>
 
-<div class="doc_text">
+<!-- ======================================================================= -->
+<h3>
+  <a name="convertops">Conversion Operations</a>
+</h3>
+
+<div>
 
 <p>The instructions in this category are the conversion instructions (casting)
    which all take a single operand and a type. They perform various bit
    conversions on the operand.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_trunc">'<tt>trunc .. to</tt>' Instruction</a>
-</div>
-<div class="doc_text">
+</h4>
+
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -4607,10 +4711,11 @@ entry:
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_zext">'<tt>zext .. to</tt>' Instruction</a>
-</div>
-<div class="doc_text">
+</h4>
+
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -4646,10 +4751,11 @@ entry:
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_sext">'<tt>sext .. to</tt>' Instruction</a>
-</div>
-<div class="doc_text">
+</h4>
+
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -4684,11 +4790,11 @@ entry:
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_fptrunc">'<tt>fptrunc .. to</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -4722,10 +4828,11 @@ entry:
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_fpext">'<tt>fpext .. to</tt>' Instruction</a>
-</div>
-<div class="doc_text">
+</h4>
+
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -4758,10 +4865,11 @@ entry:
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_fptoui">'<tt>fptoui .. to</tt>' Instruction</a>
-</div>
-<div class="doc_text">
+</h4>
+
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -4795,10 +4903,11 @@ entry:
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_fptosi">'<tt>fptosi .. to</tt>' Instruction</a>
-</div>
-<div class="doc_text">
+</h4>
+
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -4833,10 +4942,11 @@ entry:
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_uitofp">'<tt>uitofp .. to</tt>' Instruction</a>
-</div>
-<div class="doc_text">
+</h4>
+
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -4869,10 +4979,11 @@ entry:
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_sitofp">'<tt>sitofp .. to</tt>' Instruction</a>
-</div>
-<div class="doc_text">
+</h4>
+
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -4904,10 +5015,11 @@ entry:
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_ptrtoint">'<tt>ptrtoint .. to</tt>' Instruction</a>
-</div>
-<div class="doc_text">
+</h4>
+
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -4941,10 +5053,11 @@ entry:
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_inttoptr">'<tt>inttoptr .. to</tt>' Instruction</a>
-</div>
-<div class="doc_text">
+</h4>
+
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -4978,10 +5091,11 @@ entry:
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_bitcast">'<tt>bitcast .. to</tt>' Instruction</a>
-</div>
-<div class="doc_text">
+</h4>
+
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -5020,21 +5134,24 @@ entry:
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection"> <a name="otherops">Other Operations</a> </div>
+<h3>
+  <a name="otherops">Other Operations</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The instructions in this category are the "miscellaneous" instructions, which
    defy better classification.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="i_icmp">'<tt>icmp</tt>' Instruction</a>
-</div>
+<h4>
+  <a name="i_icmp">'<tt>icmp</tt>' Instruction</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -5133,10 +5250,11 @@ entry:
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a name="i_fcmp">'<tt>fcmp</tt>' Instruction</a>
-</div>
+<h4>
+  <a name="i_fcmp">'<tt>fcmp</tt>' Instruction</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -5253,11 +5371,11 @@ entry:
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="i_phi">'<tt>phi</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -5301,11 +5419,11 @@ Loop:       ; Infinite loop that counts from 0 on up...
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
    <a name="i_select">'<tt>select</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -5344,11 +5462,11 @@ Loop:       ; Infinite loop that counts from 0 on up...
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="i_call">'<tt>call</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -5453,11 +5571,11 @@ freestanding environments and non-C-based languages.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="i_va_arg">'<tt>va_arg</tt>' Instruction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -5498,11 +5616,15 @@ freestanding environments and non-C-based languages.</p>
 
 </div>
 
+</div>
+
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section"> <a name="intrinsics">Intrinsic Functions</a> </div>
+<h2><a name="intrinsics">Intrinsic Functions</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>LLVM supports the notion of an "intrinsic function".  These functions have
    well known names and semantics and are required to follow certain
@@ -5545,14 +5667,12 @@ freestanding environments and non-C-based languages.</p>
 <p>To learn how to add an intrinsic function, please see the
    <a href="ExtendingLLVM.html">Extending LLVM Guide</a>.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="int_varargs">Variable Argument Handling Intrinsics</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Variable argument support is defined in LLVM with
    the <a href="#i_va_arg"><tt>va_arg</tt></a> instruction and these three
@@ -5594,15 +5714,13 @@ declare void @llvm.va_copy(i8*, i8*)
 declare void @llvm.va_end(i8*)
 </pre>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_va_start">'<tt>llvm.va_start</tt>' Intrinsic</a>
-</div>
+</h4>
 
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -5628,11 +5746,11 @@ declare void @llvm.va_end(i8*)
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
  <a name="int_va_end">'<tt>llvm.va_end</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -5659,11 +5777,11 @@ declare void @llvm.va_end(i8*)
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_va_copy">'<tt>llvm.va_copy</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -5689,12 +5807,14 @@ declare void @llvm.va_end(i8*)
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="int_gc">Accurate Garbage Collection Intrinsics</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM support for <a href="GarbageCollection.html">Accurate Garbage
 Collection</a> (GC) requires the implementation and generation of these
@@ -5709,14 +5829,12 @@ LLVM</a>.</p>
 <p>The garbage collection intrinsics only operate on objects in the generic
    address space (address space zero).</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_gcroot">'<tt>llvm.gcroot</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -5743,11 +5861,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_gcread">'<tt>llvm.gcread</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -5775,11 +5893,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_gcwrite">'<tt>llvm.gcwrite</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -5806,24 +5924,24 @@ LLVM</a>.</p>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="int_codegen">Code Generator Intrinsics</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>These intrinsics are provided by LLVM to expose special features that may
    only be implemented with code generator support.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_returnaddress">'<tt>llvm.returnaddress</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -5854,11 +5972,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_frameaddress">'<tt>llvm.frameaddress</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -5888,11 +6006,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_stacksave">'<tt>llvm.stacksave</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -5918,11 +6036,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_stackrestore">'<tt>llvm.stackrestore</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -5943,15 +6061,15 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_prefetch">'<tt>llvm.prefetch</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
-  declare void @llvm.prefetch(i8* &lt;address&gt;, i32 &lt;rw&gt;, i32 &lt;locality&gt;)
+  declare void @llvm.prefetch(i8* &lt;address&gt;, i32 &lt;rw&gt;, i32 &lt;locality&gt;, i32 &lt;cache type&gt;)
 </pre>
 
 <h5>Overview:</h5>
@@ -5964,8 +6082,10 @@ LLVM</a>.</p>
 <p><tt>address</tt> is the address to be prefetched, <tt>rw</tt> is the
    specifier determining if the fetch should be for a read (0) or write (1),
    and <tt>locality</tt> is a temporal locality specifier ranging from (0) - no
-   locality, to (3) - extremely local keep in cache.  The <tt>rw</tt>
-   and <tt>locality</tt> arguments must be constant integers.</p>
+   locality, to (3) - extremely local keep in cache. The <tt>cache type</tt>
+   specifies whether the prefetch is performed on the data (1) or instruction (0)
+   cache. The <tt>rw</tt>, <tt>locality</tt> and <tt>cache type</tt> arguments
+   must be constant integers.</p>
 
 <h5>Semantics:</h5>
 <p>This intrinsic does not modify the behavior of the program.  In particular,
@@ -5976,11 +6096,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_pcmarker">'<tt>llvm.pcmarker</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -6007,11 +6127,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_readcyclecounter">'<tt>llvm.readcyclecounter</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -6033,26 +6153,26 @@ LLVM</a>.</p>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="int_libc">Standard C Library Intrinsics</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM provides intrinsics for a few important standard C library functions.
    These intrinsics allow source-language front-ends to pass information about
    the alignment of the pointer arguments to the code generator, providing
    opportunity for more efficient code generation.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_memcpy">'<tt>llvm.memcpy</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use <tt>llvm.memcpy</tt> on any
@@ -6102,11 +6222,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_memmove">'<tt>llvm.memmove</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use llvm.memmove on any integer bit
@@ -6158,11 +6278,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_memset">'<tt>llvm.memset.*</tt>' Intrinsics</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use llvm.memset on any integer bit
@@ -6208,11 +6328,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_sqrt">'<tt>llvm.sqrt.*</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use <tt>llvm.sqrt</tt> on any
@@ -6246,11 +6366,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_powi">'<tt>llvm.powi.*</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use <tt>llvm.powi</tt> on any
@@ -6282,11 +6402,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_sin">'<tt>llvm.sin.*</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use <tt>llvm.sin</tt> on any
@@ -6316,11 +6436,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_cos">'<tt>llvm.cos.*</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use <tt>llvm.cos</tt> on any
@@ -6350,11 +6470,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_pow">'<tt>llvm.pow.*</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use <tt>llvm.pow</tt> on any
@@ -6384,24 +6504,90 @@ LLVM</a>.</p>
 
 </div>
 
+</div>
+
+<!-- _______________________________________________________________________ -->
+<h4>
+  <a name="int_exp">'<tt>llvm.exp.*</tt>' Intrinsic</a>
+</h4>
+
+<div>
+
+<h5>Syntax:</h5>
+<p>This is an overloaded intrinsic. You can use <tt>llvm.exp</tt> on any
+   floating point or vector of floating point type. Not all targets support all
+   types however.</p>
+
+<pre>
+  declare float     @llvm.exp.f32(float  %Val)
+  declare double    @llvm.exp.f64(double %Val)
+  declare x86_fp80  @llvm.exp.f80(x86_fp80  %Val)
+  declare fp128     @llvm.exp.f128(fp128 %Val)
+  declare ppc_fp128 @llvm.exp.ppcf128(ppc_fp128  %Val)
+</pre>
+
+<h5>Overview:</h5>
+<p>The '<tt>llvm.exp.*</tt>' intrinsics perform the exp function.</p>
+
+<h5>Arguments:</h5>
+<p>The argument and return value are floating point numbers of the same
+   type.</p>
+
+<h5>Semantics:</h5>
+<p>This function returns the same values as the libm <tt>exp</tt> functions
+   would, and handles error conditions in the same way.</p>
+
+</div>
+
+<!-- _______________________________________________________________________ -->
+<h4>
+  <a name="int_log">'<tt>llvm.log.*</tt>' Intrinsic</a>
+</h4>
+
+<div>
+
+<h5>Syntax:</h5>
+<p>This is an overloaded intrinsic. You can use <tt>llvm.log</tt> on any
+   floating point or vector of floating point type. Not all targets support all
+   types however.</p>
+
+<pre>
+  declare float     @llvm.log.f32(float  %Val)
+  declare double    @llvm.log.f64(double %Val)
+  declare x86_fp80  @llvm.log.f80(x86_fp80  %Val)
+  declare fp128     @llvm.log.f128(fp128 %Val)
+  declare ppc_fp128 @llvm.log.ppcf128(ppc_fp128  %Val)
+</pre>
+
+<h5>Overview:</h5>
+<p>The '<tt>llvm.log.*</tt>' intrinsics perform the log function.</p>
+
+<h5>Arguments:</h5>
+<p>The argument and return value are floating point numbers of the same
+   type.</p>
+
+<h5>Semantics:</h5>
+<p>This function returns the same values as the libm <tt>log</tt> functions
+   would, and handles error conditions in the same way.</p>
+
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="int_manip">Bit Manipulation Intrinsics</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM provides intrinsics for a few important bit manipulation operations.
    These allow efficient code generation for some algorithms.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_bswap">'<tt>llvm.bswap.*</tt>' Intrinsics</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic function. You can use bswap on any integer
@@ -6432,11 +6618,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_ctpop">'<tt>llvm.ctpop.*</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use llvm.ctpop on any integer bit
@@ -6464,11 +6650,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_ctlz">'<tt>llvm.ctlz.*</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use <tt>llvm.ctlz</tt> on any
@@ -6498,11 +6684,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_cttz">'<tt>llvm.cttz.*</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use <tt>llvm.cttz</tt> on any
@@ -6531,23 +6717,25 @@ LLVM</a>.</p>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="int_overflow">Arithmetic with Overflow Intrinsics</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM provides intrinsics for some arithmetic with overflow operations.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="int_sadd_overflow">'<tt>llvm.sadd.with.overflow.*</tt>' Intrinsics</a>
-</div>
+<h4>
+  <a name="int_sadd_overflow">
+    '<tt>llvm.sadd.with.overflow.*</tt>' Intrinsics
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use <tt>llvm.sadd.with.overflow</tt>
@@ -6589,11 +6777,13 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="int_uadd_overflow">'<tt>llvm.uadd.with.overflow.*</tt>' Intrinsics</a>
-</div>
+<h4>
+  <a name="int_uadd_overflow">
+    '<tt>llvm.uadd.with.overflow.*</tt>' Intrinsics
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use <tt>llvm.uadd.with.overflow</tt>
@@ -6634,11 +6824,13 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="int_ssub_overflow">'<tt>llvm.ssub.with.overflow.*</tt>' Intrinsics</a>
-</div>
+<h4>
+  <a name="int_ssub_overflow">
+    '<tt>llvm.ssub.with.overflow.*</tt>' Intrinsics
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use <tt>llvm.ssub.with.overflow</tt>
@@ -6680,11 +6872,13 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="int_usub_overflow">'<tt>llvm.usub.with.overflow.*</tt>' Intrinsics</a>
-</div>
+<h4>
+  <a name="int_usub_overflow">
+    '<tt>llvm.usub.with.overflow.*</tt>' Intrinsics
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use <tt>llvm.usub.with.overflow</tt>
@@ -6726,11 +6920,13 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="int_smul_overflow">'<tt>llvm.smul.with.overflow.*</tt>' Intrinsics</a>
-</div>
+<h4>
+  <a name="int_smul_overflow">
+    '<tt>llvm.smul.with.overflow.*</tt>' Intrinsics
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use <tt>llvm.smul.with.overflow</tt>
@@ -6773,11 +6969,13 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="int_umul_overflow">'<tt>llvm.umul.with.overflow.*</tt>' Intrinsics</a>
-</div>
+<h4>
+  <a name="int_umul_overflow">
+    '<tt>llvm.umul.with.overflow.*</tt>' Intrinsics
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use <tt>llvm.umul.with.overflow</tt>
@@ -6818,12 +7016,14 @@ LLVM</a>.</p>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="int_fp16">Half Precision Floating Point Intrinsics</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Half precision floating point is a storage-only format. This means that it is
    a dense encoding (in memory) but does not support computation in the
@@ -6837,14 +7037,15 @@ LLVM</a>.</p>
    float if needed, then converted to i16 with
    <a href="#int_convert_to_fp16"><tt>llvm.convert.to.fp16</tt></a>, then
    storing as an i16 value.</p>
-</div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="int_convert_to_fp16">'<tt>llvm.convert.to.fp16</tt>' Intrinsic</a>
-</div>
+<h4>
+  <a name="int_convert_to_fp16">
+    '<tt>llvm.convert.to.fp16</tt>' Intrinsic
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -6875,11 +7076,13 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
- <a name="int_convert_from_fp16">'<tt>llvm.convert.from.fp16</tt>' Intrinsic</a>
-</div>
+<h4>
+  <a name="int_convert_from_fp16">
+    '<tt>llvm.convert.from.fp16</tt>' Intrinsic
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -6909,12 +7112,14 @@ LLVM</a>.</p>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="int_debugger">Debugger Intrinsics</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The LLVM debugger intrinsics (which all start with <tt>llvm.dbg.</tt>
    prefix), are described in
@@ -6924,11 +7129,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="int_eh">Exception Handling Intrinsics</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The LLVM exception handling intrinsics (which all start with
    <tt>llvm.eh.</tt> prefix), are described in
@@ -6938,11 +7143,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="int_trampoline">Trampoline Intrinsic</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>This intrinsic makes it possible to excise one parameter, marked with
    the <a href="#nest"><tt>nest</tt></a> attribute, from a function.
@@ -6968,14 +7173,14 @@ LLVM</a>.</p>
 <p>The call <tt>%val = call i32 %fp(i32 %x, i32 %y)</tt> is then equivalent
    to <tt>%val = call i32 %f(i8* %nval, i32 %x, i32 %y)</tt>.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="int_it">'<tt>llvm.init.trampoline</tt>' Intrinsic</a>
-</div>
+<h4>
+  <a name="int_it">
+    '<tt>llvm.init.trampoline</tt>' Intrinsic
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -7012,12 +7217,14 @@ LLVM</a>.</p>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="int_atomics">Atomic Operations and Synchronization Intrinsics</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>These intrinsic functions expand the "universal IR" of LLVM to represent
    hardware constructs for atomic operations and memory synchronization.  This
@@ -7037,13 +7244,12 @@ LLVM</a>.</p>
    No one model or paradigm should be selected above others unless the hardware
    itself ubiquitously does so.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_memory_barrier">'<tt>llvm.memory.barrier</tt>' Intrinsic</a>
-</div>
-<div class="doc_text">
+</h4>
+
+<div>
 <h5>Syntax:</h5>
 <pre>
   declare void @llvm.memory.barrier(i1 &lt;ll&gt;, i1 &lt;ls&gt;, i1 &lt;sl&gt;, i1 &lt;ss&gt;, i1 &lt;device&gt;)
@@ -7111,11 +7317,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_atomic_cmp_swap">'<tt>llvm.atomic.cmp.swap.*</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use <tt>llvm.atomic.cmp.swap</tt> on
@@ -7171,10 +7377,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_atomic_swap">'<tt>llvm.atomic.swap.*</tt>' Intrinsic</a>
-</div>
-<div class="doc_text">
+</h4>
+
+<div>
 <h5>Syntax:</h5>
 
 <p>This is an overloaded intrinsic. You can use <tt>llvm.atomic.swap</tt> on any
@@ -7227,12 +7434,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_atomic_load_add">'<tt>llvm.atomic.load.add.*</tt>' Intrinsic</a>
+</h4>
 
-</div>
-
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use <tt>llvm.atomic.load.add</tt> on
@@ -7277,12 +7483,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_atomic_load_sub">'<tt>llvm.atomic.load.sub.*</tt>' Intrinsic</a>
+</h4>
 
-</div>
-
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use <tt>llvm.atomic.load.sub</tt> on
@@ -7329,14 +7534,25 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="int_atomic_load_and">'<tt>llvm.atomic.load.and.*</tt>' Intrinsic</a><br>
-  <a name="int_atomic_load_nand">'<tt>llvm.atomic.load.nand.*</tt>' Intrinsic</a><br>
-  <a name="int_atomic_load_or">'<tt>llvm.atomic.load.or.*</tt>' Intrinsic</a><br>
-  <a name="int_atomic_load_xor">'<tt>llvm.atomic.load.xor.*</tt>' Intrinsic</a><br>
-</div>
-
-<div class="doc_text">
+<h4>
+  <a name="int_atomic_load_and">
+    '<tt>llvm.atomic.load.and.*</tt>' Intrinsic
+  </a>
+  <br>
+  <a name="int_atomic_load_nand">
+    '<tt>llvm.atomic.load.nand.*</tt>' Intrinsic
+  </a>
+  <br>
+  <a name="int_atomic_load_or">
+    '<tt>llvm.atomic.load.or.*</tt>' Intrinsic
+  </a>
+  <br>
+  <a name="int_atomic_load_xor">
+    '<tt>llvm.atomic.load.xor.*</tt>' Intrinsic
+  </a>
+</h4>
+
+<div>
 
 <h5>Syntax:</h5>
 <p>These are overloaded intrinsics. You can
@@ -7409,14 +7625,25 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="int_atomic_load_max">'<tt>llvm.atomic.load.max.*</tt>' Intrinsic</a><br>
-  <a name="int_atomic_load_min">'<tt>llvm.atomic.load.min.*</tt>' Intrinsic</a><br>
-  <a name="int_atomic_load_umax">'<tt>llvm.atomic.load.umax.*</tt>' Intrinsic</a><br>
-  <a name="int_atomic_load_umin">'<tt>llvm.atomic.load.umin.*</tt>' Intrinsic</a><br>
-</div>
-
-<div class="doc_text">
+<h4>
+  <a name="int_atomic_load_max">
+    '<tt>llvm.atomic.load.max.*</tt>' Intrinsic
+  </a>
+  <br>
+  <a name="int_atomic_load_min">
+    '<tt>llvm.atomic.load.min.*</tt>' Intrinsic
+  </a>
+  <br>
+  <a name="int_atomic_load_umax">
+    '<tt>llvm.atomic.load.umax.*</tt>' Intrinsic
+  </a>
+  <br>
+  <a name="int_atomic_load_umin">
+    '<tt>llvm.atomic.load.umin.*</tt>' Intrinsic
+  </a>
+</h4>
+
+<div>
 
 <h5>Syntax:</h5>
 <p>These are overloaded intrinsics. You can use <tt>llvm.atomic.load_max</tt>,
@@ -7487,25 +7714,24 @@ LLVM</a>.</p>
 
 </div>
 
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="int_memorymarkers">Memory Use Markers</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>This class of intrinsics exists to information about the lifetime of memory
    objects and ranges where variables are immutable.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_lifetime_start">'<tt>llvm.lifetime.start</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -7531,11 +7757,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_lifetime_end">'<tt>llvm.lifetime.end</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -7560,11 +7786,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_invariant_start">'<tt>llvm.invariant.start</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -7588,11 +7814,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_invariant_end">'<tt>llvm.invariant.end</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -7614,24 +7840,24 @@ LLVM</a>.</p>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="int_general">General Intrinsics</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>This class of intrinsics is designed to be generic and has no specific
    purpose.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_var_annotation">'<tt>llvm.var.annotation</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -7655,11 +7881,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_annotation">'<tt>llvm.annotation.*</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <p>This is an overloaded intrinsic. You can use '<tt>llvm.annotation</tt>' on
@@ -7691,11 +7917,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_trap">'<tt>llvm.trap</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -7716,11 +7942,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_stackprotector">'<tt>llvm.stackprotector</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -7750,11 +7976,11 @@ LLVM</a>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="int_objectsize">'<tt>llvm.objectsize</tt>' Intrinsic</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <h5>Syntax:</h5>
 <pre>
@@ -7784,6 +8010,10 @@ LLVM</a>.</p>
 
 </div>
 
+</div>
+
+</div>
+
 <!-- *********************************************************************** -->
 <hr>
 <address>
@@ -7793,7 +8023,7 @@ LLVM</a>.</p>
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 
diff --git a/docs/Lexicon.html b/docs/Lexicon.html
index 82a58aa..b1c2638 100644
--- a/docs/Lexicon.html
+++ b/docs/Lexicon.html
@@ -9,12 +9,12 @@
   content="A glossary of terms used with the LLVM project.">
 </head>
 <body>
-<div class="doc_title">The LLVM Lexicon</div>
+<h1>The LLVM Lexicon</h1>
 <p class="doc_warning">NOTE: This document is a work in progress!</p>
 <!-- *********************************************************************** -->
-<div class="doc_section">Table Of Contents</div>
+<h2>Table Of Contents</h2>
 <!-- *********************************************************************** -->
-<div class="doc_text">
+<div>
   <table>
     <tr><th colspan="8"><b>- <a href="#A">A</a> -</b></th></tr>
     <tr>
@@ -83,19 +83,20 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">Definitions</div>
+<h2>Definitions</h2>
 <!-- *********************************************************************** -->
+<div>
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="A">- A -</a></div>
-<div class="doc_text">
+<h3><a name="A">- A -</a></h3>
+<div>
   <dl>
     <dt><a name="ADCE"><b>ADCE</b></a></dt>
     <dd>Aggressive Dead Code Elimination</dd>
   </dl>
 </div>
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="B">- B -</a></div>
-<div class="doc_text">
+<h3><a name="B">- B -</a></h3>
+<div>
   <dl>
     <dt><a name="BURS"><b>BURS</b></a></dt>
     <dd>Bottom Up Rewriting System&mdash;A method of instruction selection for
@@ -104,8 +105,8 @@ href="http://www.program-transformation.org/Transform/BURG">BURG</a> tool.</dd>
   </dl>
 </div>
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="C">- C -</a></div>
-<div class="doc_text">
+<h3><a name="C">- C -</a></h3>
+<div>
   <dl>
     <dt><a name="CSE"><b>CSE</b></a></dt>
     <dd>Common Subexpression Elimination. An optimization that removes common
@@ -116,8 +117,8 @@ href="http://www.program-transformation.org/Transform/BURG">BURG</a> tool.</dd>
   </dl>
 </div>
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="D">- D -</a></div>
-<div class="doc_text">
+<h3><a name="D">- D -</a></h3>
+<div>
   <dl>
     <dt><a name="DAG"><b>DAG</b></a></dt>
     <dd>Directed Acyclic Graph</dd>
@@ -136,8 +137,8 @@ href="http://www.program-transformation.org/Transform/BURG">BURG</a> tool.</dd>
   </dl>
 </div>
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="G">- G -</a></div>
-<div class="doc_text">
+<h3><a name="G">- G -</a></h3>
+<div>
   <dl>
     <dt><a name="GC"><b>GC</b></a></dt>
     <dd>Garbage Collection. The practice of using reachability analysis instead
@@ -145,8 +146,8 @@ href="http://www.program-transformation.org/Transform/BURG">BURG</a> tool.</dd>
   </dl>
 </div>
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="H">- H -</a></div>
-<div class="doc_text">
+<h3><a name="H">- H -</a></h3>
+<div>
   <dl>
     <dt><a name="Heap"><b>Heap</b></a></dt>
     <dd>In garbage collection, the region of memory which is managed using
@@ -154,8 +155,8 @@ href="http://www.program-transformation.org/Transform/BURG">BURG</a> tool.</dd>
   </dl>
 </div>
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="I">- I -</a></div>
-<div class="doc_text">
+<h3><a name="I">- I -</a></h3>
+<div>
   <dl>
     <dt><a name="IPA"><b>IPA</b></a></dt>
     <dd>Inter-Procedural Analysis. Refers to any variety of code analysis that
@@ -169,8 +170,8 @@ href="http://www.program-transformation.org/Transform/BURG">BURG</a> tool.</dd>
   </dl>
 </div>
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="L">- L -</a></div>
-<div class="doc_text">
+<h3><a name="L">- L -</a></h3>
+<div>
   <dl>
     <dt><a name="LCSSA"><b>LCSSA</b></a></dt>
     <dd>Loop-Closed Static Single Assignment Form</dd>
@@ -183,16 +184,16 @@ href="http://www.program-transformation.org/Transform/BURG">BURG</a> tool.</dd>
   </dl>
 </div>
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="M">- M -</a></div>
-<div class="doc_text">
+<h3><a name="M">- M -</a></h3>
+<div>
   <dl>
     <dt><a name="MC"><b>MC</b></a></dt>
     <dd>Machine Code</dd>
   </dl>
 </div>
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="O">- O -</a></div>
-<div class="doc_text">
+<h3><a name="O">- O -</a></h3>
+<div>
   <dl>
     <dt><a name="Object_Pointer"><b>Object Pointer</b></a></dt>
     <dd>A pointer to an object such that the garbage collector is able to trace
@@ -202,8 +203,8 @@ href="http://www.program-transformation.org/Transform/BURG">BURG</a> tool.</dd>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="P">- P -</a></div>
-<div class="doc_text">
+<h3><a name="P">- P -</a></h3>
+<div>
   <dl>
     <dt><a name="PRE"><b>PRE</b></a></dt>
     <dd>Partial Redundancy Elimination</dd>
@@ -211,8 +212,8 @@ href="http://www.program-transformation.org/Transform/BURG">BURG</a> tool.</dd>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="R">- R -</a></div>
-<div class="doc_text">
+<h3><a name="R">- R -</a></h3>
+<div>
   <dl>
   	<dt><a name="RAUW"><b>RAUW</b></a></dt> <dd>An abbreviation for Replace
   	All Uses With. The functions User::replaceUsesOfWith(), 
@@ -234,8 +235,8 @@ href="http://www.program-transformation.org/Transform/BURG">BURG</a> tool.</dd>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="S">- S -</a></div>
-<div class="doc_text">
+<h3><a name="S">- S -</a></h3>
+<div>
   <dl>
     <dt><a name="Safe_Point"><b>Safe Point</b></a></dt>
     <dd>In garbage collection, it is necessary to identify <a href="#Root">stack
@@ -261,6 +262,8 @@ href="http://www.program-transformation.org/Transform/BURG">BURG</a> tool.</dd>
     function.</dd>
   </dl>
 </div>
+
+</div>
 <!-- *********************************************************************** -->
 <hr>
 <address> <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
@@ -268,7 +271,7 @@ href="http://www.program-transformation.org/Transform/BURG">BURG</a> tool.</dd>
  href="http://validator.w3.org/check/referer"><img
  src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a><a
  href="http://llvm.org/">The LLVM Team</a><br>
-<a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
 Last modified: $Date$
 </address>
 <!-- vim: sw=2
diff --git a/docs/LinkTimeOptimization.html b/docs/LinkTimeOptimization.html
index dbe8f38..b3bc481 100644
--- a/docs/LinkTimeOptimization.html
+++ b/docs/LinkTimeOptimization.html
@@ -6,9 +6,9 @@
   <link rel="stylesheet" href="llvm.css" type="text/css">
 </head>
 
-<div class="doc_title">
+<h1>
   LLVM Link Time Optimization: Design and Implementation
-</div>
+</h1>
 
 <ul>
   <li><a href="#desc">Description</a></li>
@@ -36,12 +36,12 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
 <a name="desc">Description</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 <p>
 LLVM features powerful intermodular optimizations which can be used at link 
 time.  Link Time Optimization (LTO) is another name for intermodular optimization 
@@ -50,12 +50,12 @@ and design between the LTO optimizer and the linker.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
 <a name="design">Design Philosophy</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 <p>
 The LLVM Link Time Optimizer provides complete transparency, while doing 
 intermodular optimization, in the compiler tool chain. Its main goal is to let 
@@ -69,14 +69,13 @@ the linker and LLVM optimizer helps to do optimizations that are not possible
 in other models. The linker input allows the optimizer to avoid relying on 
 conservative escape analysis.
 </p>
-</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="example1">Example of link time optimization</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
   <p>The following example illustrates the advantages of LTO's integrated
   approach and clean interface. This example requires a system linker which
   supports LTO through the interface described in this document.  Here,
@@ -145,11 +144,11 @@ $ llvm-gcc a.o main.o -o main # &lt;-- standard link command without any modific
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="alternative_approaches">Alternative Approaches</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
   <dl>
     <dt><b>Compiler driver invokes link time optimizer separately.</b></dt>
     <dd>In this model the link time optimizer is not able to take advantage of 
@@ -175,12 +174,14 @@ $ llvm-gcc a.o main.o -o main # &lt;-- standard link command without any modific
   </dl>
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="multiphase">Multi-phase communication between libLTO and linker</a>
-</div>
+</h2>
 
-<div class="doc_text">
+<div>
   <p>The linker collects information about symbol defininitions and uses in 
   various link objects which is more accurate than any information collected 
   by other tools during typical build cycles.  The linker collects this 
@@ -192,14 +193,13 @@ $ llvm-gcc a.o main.o -o main # &lt;-- standard link command without any modific
   Our goal is to take advantage of tight integration between the linker and 
   the optimizer by sharing this information during various linking phases.
 </p>
-</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="phase1">Phase 1 : Read LLVM Bitcode Files</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
   <p>The linker first reads all object files in natural order and collects 
   symbol information. This includes native object files as well as LLVM bitcode 
   files.  To minimize the cost to the linker in the case that all .o files
@@ -219,11 +219,11 @@ $ llvm-gcc a.o main.o -o main # &lt;-- standard link command without any modific
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="phase2">Phase 2 : Symbol Resolution</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
   <p>In this stage, the linker resolves symbols using global symbol table. 
   It may report undefined symbol errors, read archive members, replace 
   weak symbols, etc.  The linker is able to do this seamlessly even though it 
@@ -233,10 +233,10 @@ $ llvm-gcc a.o main.o -o main # &lt;-- standard link command without any modific
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="phase3">Phase 3 : Optimize Bitcode Files</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>After symbol resolution, the linker tells the LTO shared object which
   symbols are needed by native object files.  In the example above, the linker 
   reports that only <tt>foo1()</tt> is used by native object files using 
@@ -248,11 +248,11 @@ $ llvm-gcc a.o main.o -o main # &lt;-- standard link command without any modific
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="phase4">Phase 4 : Symbol Resolution after optimization</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
   <p>In this phase, the linker reads optimized a native object file and 
   updates the internal global symbol table to reflect any changes. The linker 
   also collects information about any changes in use of external symbols by 
@@ -264,12 +264,14 @@ $ llvm-gcc a.o main.o -o main # &lt;-- standard link command without any modific
   bitcode files.</p>
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
 <a name="lto">libLTO</a>
-</div>
+</h2>
 
-<div class="doc_text">
+<div>
   <p><tt>libLTO</tt> is a shared object that is part of the LLVM tools, and 
   is intended for use by a linker. <tt>libLTO</tt> provides an abstract C 
   interface to use the LLVM interprocedural optimizer without exposing details 
@@ -278,14 +280,13 @@ $ llvm-gcc a.o main.o -o main # &lt;-- standard link command without any modific
   be possible for a completely different compilation technology to provide
   a different libLTO that works with their object files and the standard
   linker tool.</p>
-</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="lto_module_t">lto_module_t</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>A non-native object file is handled via an <tt>lto_module_t</tt>.  
 The following functions allow the linker to check if a file (on disk
@@ -325,11 +326,11 @@ lto_module_get_symbol_attribute(lto_module_t, unsigned int)
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="lto_code_gen_t">lto_code_gen_t</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Once the linker has loaded each non-native object files into an
 <tt>lto_module_t</tt>, it can request libLTO to process them all and
@@ -371,6 +372,8 @@ of the native object files.</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
 
 <hr>
@@ -381,7 +384,7 @@ of the native object files.</p>
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   Devang Patel and Nick Kledzik<br>
-  <a href="http://llvm.org">LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 
diff --git a/docs/MakefileGuide.html b/docs/MakefileGuide.html
index 2ef0954..ee0115d 100644
--- a/docs/MakefileGuide.html
+++ b/docs/MakefileGuide.html
@@ -7,7 +7,7 @@
 </head>
 <body>
 
-<div class="doc_title">LLVM Makefile Guide</div>
+<h1>LLVM Makefile Guide</h1>
 
 <ol>
   <li><a href="#introduction">Introduction</a></li>
@@ -77,10 +77,10 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="introduction">Introduction </a></div>
+<h2><a name="introduction">Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
   <p>This document provides <em>usage</em> information about the LLVM makefile 
   system. While loosely patterned after the BSD makefile system, LLVM has taken 
   a departure from BSD in order to implement additional features needed by LLVM.
@@ -99,20 +99,19 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="general">General Concepts</a></div>
+<h2><a name="general">General Concepts</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
   <p>The LLVM Makefile System is the component of LLVM that is responsible for
   building the software, testing it,  generating distributions, checking those
   distributions, installing and uninstalling, etc. It consists of a several
   files throughout the source tree. These files and other general concepts are
   described in this section.</p>
-</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="projects">Projects</a></div>
-<div class="doc_text">
+<h3><a name="projects">Projects</a></h3>
+<div>
   <p>The LLVM Makefile System is quite generous. It not only builds its own
   software, but it can build yours too. Built into the system is knowledge of
   the <tt>llvm/projects</tt> directory. Any directory under <tt>projects</tt>
@@ -129,8 +128,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="varvalues">Variable Values</a></div>
-<div class="doc_text">
+<h3><a name="varvalues">Variable Values</a></h3>
+<div>
   <p>To use the makefile system, you simply create a file named 
   <tt>Makefile</tt> in your directory and declare values for certain variables. 
   The variables and values that you select determine what the makefile system
@@ -139,16 +138,15 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="including">Including Makefiles</a></div>
-<div class="doc_text">
+<h3><a name="including">Including Makefiles</a></h3>
+<div>
   <p>Setting variables alone is not enough. You must include into your Makefile
   additional files that provide the rules of the LLVM Makefile system. The 
   various files involved are described in the sections that follow.</p>
-</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="Makefile">Makefile</a></div>
-<div class="doc_text">
+<h4><a name="Makefile">Makefile</a></h4>
+<div>
   <p>Each directory to participate in the build needs to have a file named
   <tt>Makefile</tt>. This is the file first read by <tt>make</tt>. It has three
   sections:</p>
@@ -163,9 +161,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="Makefile.common">Makefile.common</a>
-</div>
-<div class="doc_text">
+<h4><a name="Makefile.common">Makefile.common</a></h4>
+<div>
   <p>Every project must have a <tt>Makefile.common</tt> file at its top source 
   directory. This file serves three purposes:</p>
   <ol>
@@ -182,9 +179,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="Makefile.config">Makefile.config</a>
-</div>
-<div class="doc_text">
+<h4><a name="Makefile.config">Makefile.config</a></h4>
+<div>
   <p>Every project must have a <tt>Makefile.config</tt> at the top of its
   <em>build</em> directory. This file is <b>generated</b> by the
   <tt>configure</tt> script from the pattern provided by the
@@ -196,8 +192,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="Makefile.rules">Makefile.rules</a></div>
-<div class="doc_text">
+<h4><a name="Makefile.rules">Makefile.rules</a></h4>
+<div>
   <p>This file, located at <tt>$(LLVM_SRC_ROOT)/Makefile.rules</tt> is the heart
   of the LLVM Makefile System. It provides all the logic, dependencies, and
   rules for building the targets supported by the system. What it does largely
@@ -205,9 +201,11 @@
   have been set <em>before</em> <tt>Makefile.rules</tt> is included.
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="Comments">Comments</a></div>
-<div class="doc_text">
+<h3><a name="Comments">Comments</a></h3>
+<div>
   <p>User Makefiles need not have comments in them unless the construction is
   unusual or it does not strictly follow the rules and patterns of the LLVM
   makefile system. Makefile comments are invoked with the pound (#) character.
@@ -215,19 +213,20 @@
   by <tt>make</tt>.</p>
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="tutorial">Tutorial</a></div>
+<h2><a name="tutorial">Tutorial</a></h2>
 <!-- *********************************************************************** -->
-<div class="doc_text">
+<div>
   <p>This section provides some examples of the different kinds of modules you
   can build with the LLVM makefile system. In general, each directory you 
   provide will build a single object although that object may be composed of
   additionally compiled components.</p>
-</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="libraries">Libraries</a></div>
-<div class="doc_text">
+<h3><a name="libraries">Libraries</a></h3>
+<div>
   <p>Only a few variable definitions are needed to build a regular library.
   Normally, the makefile system will build all the software into a single
   <tt>libname.o</tt> (pre-linked) object. This means the library is not
@@ -256,11 +255,10 @@
   <tt>-load</tt> option. See the 
   <a href="WritingAnLLVMPass.html#makefile">WritingAnLLVMPass.html</a> document
   for an example of why you might want to do this.
-</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="BCModules">Bitcode Modules</a></div>
-<div class="doc_text">
+<h4><a name="BCModules">Bitcode Modules</a></h4>
+<div>
   <p>In some situations, it is desirable to build a single bitcode module from
   a variety of sources, instead of an archive, shared library, or bitcode 
   library. Bitcode modules can be specified in addition to any of the other
@@ -280,10 +278,10 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="LoadableModules">Loadable Modules</a>
-</div>
-<div class="doc_text">
+</h4>
+<div>
   <p>In some situations, you need to create a loadable module. Loadable modules
   can be loaded into programs like <tt>opt</tt> or <tt>llc</tt> to specify
   additional passes to run or targets to support.  Loadable modules are also
@@ -311,9 +309,11 @@
   library which is part of <tt>lib/System</tt> implementation.</p>
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="tools">Tools</a></div>
-<div class="doc_text">
+<h3><a name="tools">Tools</a></h3>
+<div>
   <p>For building executable programs (tools), you must provide the name of the
   tool and the names of the libraries you wish to link with the tool. For
   example:</p>
@@ -344,11 +344,10 @@
   syntax is used. Note that in order to use the <tt>.a</tt> suffix, the library
   in question must have been built with the <tt>ARCHIVE_LIBRARY</tt> option set.
   </p>
-</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="JIT">JIT Tools</a></div>
-<div class="doc_text">
+<h4><a name="JIT">JIT Tools</a></h4>
+<div>
   <p>Many tools will want to use the JIT features of LLVM.  To do this, you
      simply specify that you want an execution 'engine', and the makefiles will
      automatically link in the appropriate JIT for the host or an interpreter
@@ -367,11 +366,15 @@
   </tt></pre>
 </div>
 
+</div>
+
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="targets">Targets Supported</a></div>
+<h2><a name="targets">Targets Supported</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
   <p>This section describes each of the targets that can be built using the LLVM
   Makefile system. Any target can be invoked from any directory but not all are
   applicable to a given directory (e.g. "check", "dist" and "install" will
@@ -426,11 +429,10 @@
       <td>Remove built objects from installation directory.
     </td></tr>
   </table>
-</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="all">all (default)</a></div>
-<div class="doc_text">
+<h3><a name="all">all (default)</a></h3>
+<div>
   <p>When you invoke <tt>make</tt> with no arguments, you are implicitly
   instructing it to seek the "all" target (goal). This target is used for
   building the software recursively and will do different things in different 
@@ -440,15 +442,15 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="all-local">all-local</a></div>
-<div class="doc_text">
+<h3><a name="all-local">all-local</a></h3>
+<div>
   <p>This target is the same as <a href="#all">all</a> but it operates only on
   the current directory instead of recursively.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="check">check</a></div>
-<div class="doc_text">
+<h3><a name="check">check</a></h3>
+<div>
   <p>This target can be invoked from anywhere within a project's directories
   but always invokes the <a href="#check-local"><tt>check-local</tt></a> target 
   in the project's <tt>test</tt> directory, if it exists and has a 
@@ -464,8 +466,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="check-local">check-local</a></div>
-<div class="doc_text">
+<h3><a name="check-local">check-local</a></h3>
+<div>
   <p>This target should be implemented by the <tt>Makefile</tt> in the project's
   <tt>test</tt> directory. It is invoked by the <tt>check</tt> target elsewhere.
   Each project is free to define the actions of <tt>check-local</tt> as 
@@ -475,8 +477,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="clean">clean</a></div>
-<div class="doc_text">
+<h3><a name="clean">clean</a></h3>
+<div>
   <p>This target cleans the build directory, recursively removing all things
   that the Makefile builds. The cleaning rules have been made guarded so they 
   shouldn't go awry (via <tt>rm -f $(UNSET_VARIABLE)/*</tt> which will attempt
@@ -484,15 +486,15 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="clean-local">clean-local</a></div>
-<div class="doc_text">
+<h3><a name="clean-local">clean-local</a></h3>
+<div>
   <p>This target does the same thing as <tt>clean</tt> but only for the current
   (local) directory.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="dist">dist</a></div>
-<div class="doc_text">
+<h3><a name="dist">dist</a></h3>
+<div>
   <p>This target builds a distribution tarball. It first builds the entire
   project using the <tt>all</tt> target and then tars up the necessary files and
   compresses it. The generated tarball is sufficient for a casual source 
@@ -500,8 +502,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="dist-check">dist-check</a></div>
-<div class="doc_text">
+<h3><a name="dist-check">dist-check</a></h3>
+<div>
   <p>This target does the same thing as the <tt>dist</tt> target but also checks
   the distribution tarball. The check is made by unpacking the tarball to a new
   directory, configuring it, building it, installing it, and then verifying that
@@ -512,16 +514,16 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="dist-clean">dist-clean</a></div>
-<div class="doc_text">
+<h3><a name="dist-clean">dist-clean</a></h3>
+<div>
   <p>This is a special form of the <tt>clean</tt> clean target. It performs a
   normal <tt>clean</tt> but also removes things pertaining to building the
   distribution.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="install">install</a></div>
-<div class="doc_text">
+<h3><a name="install">install</a></h3>
+<div>
   <p>This target finalizes shared objects and executables and copies all
   libraries, headers, executables and documentation to the directory given 
   with the <tt>--prefix</tt> option to <tt>configure</tt>.  When completed, 
@@ -538,8 +540,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="preconditions">preconditions</a></div>
-<div class="doc_text">
+<h3><a name="preconditions">preconditions</a></h3>
+<div>
   <p>This utility target checks to see if the <tt>Makefile</tt> in the object
   directory is older than the <tt>Makefile</tt> in the source directory and
   copies it if so. It also reruns the <tt>configure</tt> script if that needs to
@@ -549,15 +551,15 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="printvars">printvars</a></div>
-<div class="doc_text">
+<h3><a name="printvars">printvars</a></h3>
+<div>
   <p>This utility target just causes the LLVM makefiles to print out some of 
   the makefile variables so that you can double check how things are set. </p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="reconfigure">reconfigure</a></div>
-<div class="doc_text">
+<h3><a name="reconfigure">reconfigure</a></h3>
+<div>
   <p>This utility target will force a reconfigure of LLVM or your project. It 
   simply runs <tt>$(PROJ_OBJ_ROOT)/config.status --recheck</tt> to rerun the
   configuration tests and rebuild the configured files. This isn't generally
@@ -566,8 +568,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="spotless">spotless</a></div>
-<div class="doc_text">
+<h3><a name="spotless">spotless</a></h3>
+<div>
   <p>This utility target, only available when <tt>$(PROJ_OBJ_ROOT)</tt> is not 
   the same as <tt>$(PROJ_SRC_ROOT)</tt>, will completely clean the
   <tt>$(PROJ_OBJ_ROOT)</tt> directory by removing its content entirely and 
@@ -578,8 +580,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="tags">tags</a></div>
-<div class="doc_text">
+<h3><a name="tags">tags</a></h3>
+<div>
   <p>This target will generate a <tt>TAGS</tt> file in the top-level source
   directory. It is meant for use with emacs, XEmacs, or ViM. The TAGS file
   provides an index of symbol definitions so that the editor can jump you to the
@@ -587,18 +589,20 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="uninstall">uninstall</a></div>
-<div class="doc_text">
+<h3><a name="uninstall">uninstall</a></h3>
+<div>
   <p>This target is the opposite of the <tt>install</tt> target. It removes the
   header, library and executable files from the installation directories. Note
   that the directories themselves are not removed because it is not guaranteed
   that LLVM is the only thing installing there (e.g. --prefix=/usr).</p>
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="variables">Variables</a></div>
+<h2><a name="variables">Variables</a></h2>
 <!-- *********************************************************************** -->
-<div class="doc_text">
+<div>
   <p>Variables are used to tell the LLVM Makefile System what to do and to
   obtain information from it. Variables are also used internally by the LLVM
   Makefile System. Variable names that contain only the upper case alphabetic
@@ -606,11 +610,10 @@
   variables are internal to the LLVM Makefile System and should not be relied
   upon nor modified. The sections below describe how to use the LLVM Makefile 
   variables.</p>
-</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="setvars">Control Variables</a></div>
-<div class="doc_text">
+<h3><a name="setvars">Control Variables</a></h3>
+<div>
   <p>Variables listed in the table below should be set <em>before</em> the 
   inclusion of <a href="#Makefile.common"><tt>$(LEVEL)/Makefile.common</tt></a>.
   These variables provide input to the LLVM make system that tell it what to do 
@@ -762,8 +765,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="overvars">Override Variables</a></div>
-<div class="doc_text">
+<h3><a name="overvars">Override Variables</a></h3>
+<div>
   <p>Override variables can be used to override the default
   values provided by the LLVM makefile system. These variables can be set in 
   several ways:</p>
@@ -868,8 +871,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="getvars">Readable Variables</a></div>
-<div class="doc_text">
+<h3><a name="getvars">Readable Variables</a></h3>
+<div>
   <p>Variables listed in the table below can be used by the user's Makefile but
   should not be changed. Changing the value will generally cause the build to go
   wrong, so don't do it.</p>
@@ -939,8 +942,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="intvars">Internal Variables</a></div>
-<div class="doc_text">
+<h3><a name="intvars">Internal Variables</a></h3>
+<div>
   <p>Variables listed below are used by the LLVM Makefile System 
   and considered internal. You should not use these variables under any
   circumstances.</p>
@@ -1018,6 +1021,8 @@
   </tt></p>
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
 <hr>
 <address>
@@ -1027,7 +1032,7 @@
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="mailto:rspencer@x10sys.com">Reid Spencer</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/Packaging.html b/docs/Packaging.html
index 217590e..7261cc2 100644
--- a/docs/Packaging.html
+++ b/docs/Packaging.html
@@ -7,7 +7,7 @@
 </head>
 <body>
 
-<div class="doc_title">Advice on Packaging LLVM</div>
+<h1>Advice on Packaging LLVM</h1>
 <ol>
   <li><a href="#overview">Overview</a></li>
   <li><a href="#compilation">Compile Flags</a></li>
@@ -17,9 +17,9 @@
 </ol>
 
 <!--=========================================================================-->
-<div class="doc_section"><a name="overview">Overview</a></div>
+<h2><a name="overview">Overview</a></h2>
 <!--=========================================================================-->
-<div class="doc_text">
+<div>
 
 <p>LLVM sets certain default configure options to make sure our developers don't
 break things for constrained platforms.  These settings are not optimal for most
@@ -34,9 +34,9 @@ developed against each.
 </div>
 
 <!--=========================================================================-->
-<div class="doc_section"><a name="compilation">Compile Flags</a></div>
+<h2><a name="compilation">Compile Flags</a></h2>
 <!--=========================================================================-->
-<div class="doc_text">
+<div>
 
 <p>LLVM runs much more quickly when it's optimized and assertions are removed.
 However, such a build is currently incompatible with users who build without
@@ -65,9 +65,9 @@ versions of LLVM in parallel.  The following configure flags are relevant:
 </div>
 
 <!--=========================================================================-->
-<div class="doc_section"><a name="cxx-features">C++ Features</a></div>
+<h2><a name="cxx-features">C++ Features</a></h2>
 <!--=========================================================================-->
-<div class="doc_text">
+<div>
 
 <dl>
   <dt>RTTI</dt><dd>LLVM disables RTTI by default.  Add <tt>REQUIRES_RTTI=1</tt>
@@ -78,9 +78,9 @@ versions of LLVM in parallel.  The following configure flags are relevant:
 </div>
 
 <!--=========================================================================-->
-<div class="doc_section"><a name="shared-library">Shared Library</a></div>
+<h2><a name="shared-library">Shared Library</a></h2>
 <!--=========================================================================-->
-<div class="doc_text">
+<div>
 
 <p>Configure with <tt>--enable-shared</tt> to build
 <tt>libLLVM-<var>major</var>.<var>minor</var>.(so|dylib)</tt> and link the tools
@@ -89,9 +89,9 @@ against it.  This saves lots of binary size at the cost of some startup time.
 </div>
 
 <!--=========================================================================-->
-<div class="doc_section"><a name="deps">Dependencies</a></div>
+<h2><a name="deps">Dependencies</a></h2>
 <!--=========================================================================-->
-<div class="doc_text">
+<div>
 
 <dl>
 <dt><tt>--enable-libffi</tt></dt><dd>Depend on <a
@@ -111,7 +111,7 @@ line numbers.</dd>
   src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
   <a href="http://validator.w3.org/check/referer"><img
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/Passes.html b/docs/Passes.html
index 2be32ef..b7f70b9 100644
--- a/docs/Passes.html
+++ b/docs/Passes.html
@@ -40,7 +40,7 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 
   -->
 
-<div class="doc_title">LLVM's Analysis and Transform Passes</div>
+<h1>LLVM's Analysis and Transform Passes</h1>
 
 <ol>
   <li><a href="#intro">Introduction</a></li>
@@ -55,8 +55,8 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_section"> <a name="intro">Introduction</a> </div>
-<div class="doc_text">
+<h2><a name="intro">Introduction</a></h2>
+<div>
   <p>This document serves as a high level summary of the optimization features 
   that LLVM provides. Optimizations are implemented as Passes that traverse some
   portion of a program to either collect information or transform the program.
@@ -69,8 +69,7 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
   bitcode are neither analysis nor transform passes.
   <p>The table below provides a quick summary of each pass and links to the more
   complete pass description later in the document.</p>
-</div>
-<div class="doc_text" >
+
 <table>
 <tr><th colspan="2"><b>ANALYSIS PASSES</b></th></tr>
 <tr><th>Option</th><th>Name</th></tr>
@@ -201,19 +200,19 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 <tr><td><a href="#view-postdom">-view-postdom</a></td><td>View postdominance tree of function</td></tr>
 <tr><td><a href="#view-postdom-only">-view-postdom-only</a></td><td>View postdominance tree of function (with no function bodies)</td></tr>
 </table>
+
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_section"> <a name="example">Analysis Passes</a></div>
-<div class="doc_text">
+<h2><a name="analyses">Analysis Passes</a></h2>
+<div>
   <p>This section describes the LLVM Analysis Passes.</p>
-</div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="aa-eval">-aa-eval: Exhaustive Alias Analysis Precision Evaluator</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>This is a simple N^2 alias analysis accuracy evaluator.
   Basically, for each function in the program, it simply queries to see how the
   alias analysis implementation answers alias queries between each pair of
@@ -224,10 +223,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="basicaa">-basicaa: Basic Alias Analysis (stateless AA impl)</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This is the default implementation of the Alias Analysis interface
   that simply implements a few identities (two different globals cannot alias,
@@ -236,18 +235,18 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="basiccg">-basiccg: Basic CallGraph Construction</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>Yet to be written.</p>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="count-aa">-count-aa: Count Alias Analysis Query Responses</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   A pass which can be used to count how many alias queries
   are being made and how the alias analysis implementation being used responds.
@@ -255,10 +254,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="debug-aa">-debug-aa: AA use debugger</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This simple pass checks alias analysis users to ensure that if they
   create a new value, they do not query AA without informing it of the value.
@@ -272,10 +271,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="domfrontier">-domfrontier: Dominance Frontier Construction</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass is a simple dominator construction algorithm for finding forward
   dominator frontiers.
@@ -283,10 +282,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="domtree">-domtree: Dominator Tree Construction</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass is a simple dominator construction algorithm for finding forward
   dominators.
@@ -294,10 +293,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="dot-callgraph">-dot-callgraph: Print Call Graph to 'dot' file</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass, only available in <code>opt</code>, prints the call graph into a
   <code>.dot</code> graph.  This graph can then be processed with the "dot" tool
@@ -306,10 +305,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="dot-cfg">-dot-cfg: Print CFG of function to 'dot' file</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass, only available in <code>opt</code>, prints the control flow graph
   into a <code>.dot</code> graph.  This graph can then be processed with the
@@ -318,10 +317,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="dot-cfg-only">-dot-cfg-only: Print CFG of function to 'dot' file (with no function bodies)</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass, only available in <code>opt</code>, prints the control flow graph
   into a <code>.dot</code> graph, omitting the function bodies.  This graph can
@@ -331,10 +330,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="dot-dom">-dot-dom: Print dominance tree of function to 'dot' file</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass, only available in <code>opt</code>, prints the dominator tree
   into a <code>.dot</code> graph.  This graph can then be processed with the
@@ -343,10 +342,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="dot-dom-only">-dot-dom-only: Print dominance tree of function to 'dot' file (with no function bodies)</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass, only available in <code>opt</code>, prints the dominator tree
   into a <code>.dot</code> graph, omitting the function bodies.  This graph can
@@ -356,10 +355,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="dot-postdom">-dot-postdom: Print postdominance tree of function to 'dot' file</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass, only available in <code>opt</code>, prints the post dominator tree
   into a <code>.dot</code> graph.  This graph can then be processed with the
@@ -368,10 +367,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="dot-postdom-only">-dot-postdom-only: Print postdominance tree of function to 'dot' file (with no function bodies)</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass, only available in <code>opt</code>, prints the post dominator tree
   into a <code>.dot</code> graph, omitting the function bodies.  This graph can
@@ -381,10 +380,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="globalsmodref-aa">-globalsmodref-aa: Simple mod/ref analysis for globals</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This simple pass provides alias and mod/ref information for global values
   that do not have their address taken, and keeps track of whether functions
@@ -394,20 +393,20 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="instcount">-instcount: Counts the various types of Instructions</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass collects the count of all instructions and reports them
   </p>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="intervals">-intervals: Interval Partition Construction</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This analysis calculates and represents the interval partition of a function,
   or a preexisting interval partition.
@@ -420,44 +419,44 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="iv-users">-iv-users: Induction Variable Users</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>Bookkeeping for "interesting" users of expressions computed from 
   induction variables.</p>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="lazy-value-info">-lazy-value-info: Lazy Value Information Analysis</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>Interface for lazy computation of value constraint information.</p>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="lda">-lda: Loop Dependence Analysis</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>Loop dependence analysis framework, which is used to detect dependences in
   memory accesses in loops.</p>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="libcall-aa">-libcall-aa: LibCall Alias Analysis</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>LibCall Alias Analysis.</p>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="lint">-lint: Statically lint-checks LLVM IR</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>This pass statically checks for common and easily-identified constructs
   which produce undefined or likely unintended behavior in LLVM IR.</p>
  
@@ -485,10 +484,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="loops">-loops: Natural Loop Information</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This analysis is used to identify natural loops and determine the loop depth
   of various nodes of the CFG.  Note that the loops identified may actually be
@@ -498,10 +497,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="memdep">-memdep: Memory Dependence Analysis</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   An analysis that determines, for a given memory operation, what preceding 
   memory operations it depends on.  It builds on alias analysis information, and 
@@ -511,10 +510,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="module-debuginfo">-module-debuginfo: Decodes module-level debug info</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>This pass decodes the debug info metadata in a module and prints in a
  (sufficiently-prepared-) human-readable form.
 
@@ -524,10 +523,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="no-aa">-no-aa: No Alias Analysis (always returns 'may' alias)</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Always returns "I don't know" for alias queries.  NoAA is unlike other alias
   analysis implementations, in that it does not chain to a previous analysis. As
@@ -536,10 +535,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="no-profile">-no-profile: No Profile Information</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   The default "no profile" implementation of the abstract
   <code>ProfileInfo</code> interface.
@@ -547,10 +546,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="postdomfrontier">-postdomfrontier: Post-Dominance Frontier Construction</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass is a simple post-dominator construction algorithm for finding
   post-dominator frontiers.
@@ -558,10 +557,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="postdomtree">-postdomtree: Post-Dominator Tree Construction</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass is a simple post-dominator construction algorithm for finding
   post-dominators.
@@ -569,18 +568,18 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="print-alias-sets">-print-alias-sets: Alias Set Printer</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>Yet to be written.</p>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="print-callgraph">-print-callgraph: Print a call graph</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass, only available in <code>opt</code>, prints the call graph to
   standard error in a human-readable form.
@@ -588,10 +587,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="print-callgraph-sccs">-print-callgraph-sccs: Print SCCs of the Call Graph</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass, only available in <code>opt</code>, prints the SCCs of the call
   graph to standard error in a human-readable form.
@@ -599,10 +598,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="print-cfg-sccs">-print-cfg-sccs: Print SCCs of each function CFG</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass, only available in <code>opt</code>, prints the SCCs of each
   function CFG to standard error in a human-readable form.
@@ -610,10 +609,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="print-dbginfo">-print-dbginfo: Print debug info in human readable form</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>Pass that prints instructions, and associated debug info:</p>
   <ul>
   
@@ -624,18 +623,18 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="print-dom-info">-print-dom-info: Dominator Info Printer</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>Dominator Info Printer.</p>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="print-externalfnconstants">-print-externalfnconstants: Print external fn callsites passed constants</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass, only available in <code>opt</code>, prints out call sites to
   external functions that are called with constant arguments.  This can be
@@ -645,10 +644,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="print-function">-print-function: Print function to stderr</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   The <code>PrintFunctionPass</code> class is designed to be pipelined with
   other <code>FunctionPass</code>es, and prints out the functions of the module
@@ -657,20 +656,20 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="print-module">-print-module: Print module to stderr</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass simply prints out the entire module when it is executed.
   </p>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="print-used-types">-print-used-types: Find Used Types</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass is used to seek out all of the types in use by the program.  Note
   that this analysis explicitly does not include types only used by the symbol
@@ -678,20 +677,20 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="profile-estimator">-profile-estimator: Estimate profiling information</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>Profiling information that estimates the profiling information 
   in a very crude and unimaginative way.
   </p>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="profile-loader">-profile-loader: Load profile information from llvmprof.out</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   A concrete implementation of profiling information that loads the information
   from a profile dump file.
@@ -699,16 +698,16 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="profile-verifier">-profile-verifier: Verify profiling information</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>Pass that checks profiling information for plausibility.</p>
 </div>
-<div class="doc_subsection">
+<h3>
   <a name="regions">-regions: Detect single entry single exit regions</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   The <code>RegionInfo</code> pass detects single entry single exit regions in a
   function, where a region is defined as any subgraph that is connected to the
@@ -718,10 +717,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="scalar-evolution">-scalar-evolution: Scalar Evolution Analysis</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   The <code>ScalarEvolution</code> analysis can be used to analyze and
   catagorize scalar expressions in loops.  It specializes in recognizing general
@@ -737,10 +736,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="scev-aa">-scev-aa: ScalarEvolution-based Alias Analysis</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>Simple alias analysis implemented in terms of ScalarEvolution queries.
  
   This differs from traditional loop dependence analysis in that it tests
@@ -753,25 +752,26 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="targetdata">-targetdata: Target Data Layout</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>Provides other passes access to information on how the size and alignment
   required by the the target ABI for various data types.</p>
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_section"> <a name="transform">Transform Passes</a></div>
-<div class="doc_text">
+<h2><a name="transforms">Transform Passes</a></h2>
+<div>
   <p>This section describes the LLVM Transform Passes.</p>
-</div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="adce">-adce: Aggressive Dead Code Elimination</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>ADCE aggressively tries to eliminate code. This pass is similar to
   <a href="#dce">DCE</a> but it assumes that values are dead until proven 
   otherwise. This is similar to <a href="#sccp">SCCP</a>, except applied to 
@@ -779,19 +779,19 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="always-inline">-always-inline: Inliner for always_inline functions</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>A custom inliner that handles only functions that are marked as 
   "always inline".</p>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="argpromotion">-argpromotion: Promote 'by reference' arguments to scalars</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass promotes "by reference" arguments to be "by value" arguments.  In
   practice, this means looking for internal functions that have pointer
@@ -819,10 +819,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="block-placement">-block-placement: Profile Guided Basic Block Placement</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>This pass is a very simple profile guided basic block placement algorithm.
   The idea is to put frequently executed blocks together at the start of the
   function and hopefully increase the number of fall-through conditional
@@ -831,10 +831,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="break-crit-edges">-break-crit-edges: Break critical edges in CFG</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Break all of the critical edges in the CFG by inserting a dummy basic block.
   It may be "required" by passes that cannot deal with critical edges. This
@@ -844,20 +844,20 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="codegenprepare">-codegenprepare: Optimize for code generation</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   This pass munges the code in the input function to better prepare it for
   SelectionDAG-based code generation. This works around limitations in it's
   basic-block-at-a-time approach. It should eventually be removed.
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="constmerge">-constmerge: Merge Duplicate Global Constants</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Merges duplicate global constants together into a single constant that is
   shared.  This is useful because some passes (ie TraceValues) insert a lot of
@@ -867,10 +867,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="constprop">-constprop: Simple constant propagation</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>This file implements constant propagation and merging. It looks for
   instructions involving only constant operands and replaces them with a
   constant value instead of an instruction. For example:</p>
@@ -883,10 +883,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="dce">-dce: Dead Code Elimination</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Dead code elimination is similar to <a href="#die">dead instruction
   elimination</a>, but it rechecks instructions that were used by removed
@@ -895,10 +895,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="deadargelim">-deadargelim: Dead Argument Elimination</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass deletes dead arguments from internal functions.  Dead argument
   elimination removes arguments which are directly dead, as well as arguments
@@ -913,10 +913,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="deadtypeelim">-deadtypeelim: Dead Type Elimination</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass is used to cleanup the output of GCC.  It eliminate names for types
   that are unused in the entire translation unit, using the <a
@@ -925,10 +925,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="die">-die: Dead Instruction Elimination</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Dead instruction elimination performs a single pass over the function,
   removing instructions that are obviously dead.
@@ -936,10 +936,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="dse">-dse: Dead Store Elimination</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   A trivial dead store elimination that only considers basic-block local
   redundant stores.
@@ -947,10 +947,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="functionattrs">-functionattrs: Deduce function attributes</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>A simple interprocedural pass which walks the call-graph, looking for 
   functions which do not access or only read non-local memory, and marking them 
   readnone/readonly.  In addition, it marks function arguments (of pointer type) 
@@ -962,10 +962,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="globaldce">-globaldce: Dead Global Elimination</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This transform is designed to eliminate unreachable internal globals from the
   program.  It uses an aggressive algorithm, searching out globals that are
@@ -976,10 +976,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="globalopt">-globalopt: Global Variable Optimizer</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass transforms simple global variables that never have their address
   taken.  If obviously true, it marks read/write globals as constant, deletes
@@ -988,10 +988,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="gvn">-gvn: Global Value Numbering</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass performs global value numbering to eliminate fully and partially
   redundant instructions.  It also performs redundant load elimination.
@@ -999,10 +999,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="indvars">-indvars: Canonicalize Induction Variables</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This transformation analyzes and transforms the induction variables (and
   computations derived from them) into simpler forms suitable for subsequent
@@ -1050,20 +1050,20 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="inline">-inline: Function Integration/Inlining</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Bottom-up inlining of functions into callees.
   </p>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="insert-edge-profiling">-insert-edge-profiling: Insert instrumentation for edge profiling</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass instruments the specified program with counters for edge profiling.
   Edge profiling can give a reasonable approximation of the hot paths through a
@@ -1078,10 +1078,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="insert-optimal-edge-profiling">-insert-optimal-edge-profiling: Insert optimal instrumentation for edge profiling</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>This pass instruments the specified program with counters for edge profiling.
   Edge profiling can give a reasonable approximation of the hot paths through a
   program, and is used for a wide variety of program transformations.
@@ -1089,10 +1089,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="instcombine">-instcombine: Combine redundant instructions</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Combine instructions to form fewer, simple
   instructions.  This pass does not modify the CFG This pass is where algebraic
@@ -1143,10 +1143,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="internalize">-internalize: Internalize Global Symbols</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass loops over all of the functions in the input module, looking for a
   main function.  If a main function is found, all other functions and all
@@ -1155,10 +1155,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="ipconstprop">-ipconstprop: Interprocedural constant propagation</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass implements an <em>extremely</em> simple interprocedural constant
   propagation pass.  It could certainly be improved in many different ways,
@@ -1169,10 +1169,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="ipsccp">-ipsccp: Interprocedural Sparse Conditional Constant Propagation</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   An interprocedural variant of <a href="#sccp">Sparse Conditional Constant 
   Propagation</a>.
@@ -1180,10 +1180,10 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="jump-threading">-jump-threading: Jump Threading</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Jump threading tries to find distinct threads of control flow running through
   a basic block. This pass looks at blocks that have multiple predecessors and
@@ -1209,10 +1209,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="lcssa">-lcssa: Loop-Closed SSA Form Pass</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass transforms loops by placing phi nodes at the end of the loops for
   all values that are live across the loop boundary.  For example, it turns
@@ -1238,10 +1238,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="licm">-licm: Loop Invariant Code Motion</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass performs loop invariant code motion, attempting to remove as much
   code from the body of a loop as possible.  It does this by either hoisting
@@ -1275,10 +1275,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="loop-deletion">-loop-deletion: Delete dead loops</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This file implements the Dead Loop Deletion Pass.  This pass is responsible
   for eliminating loops with non-infinite computable trip counts that have no
@@ -1288,10 +1288,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="loop-extract">-loop-extract: Extract loops into new functions</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   A pass wrapper around the <code>ExtractLoop()</code> scalar transformation to 
   extract each top-level loop into its own new function. If the loop is the
@@ -1301,10 +1301,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="loop-extract-single">-loop-extract-single: Extract at most one loop into a new function</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Similar to <a href="#loop-extract">Extract loops into new functions</a>,
   this pass extracts one natural loop from the program into a function if it
@@ -1313,10 +1313,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="loop-reduce">-loop-reduce: Loop Strength Reduction</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass performs a strength reduction on array references inside loops that
   have as one or more of their components the loop induction variable.  This is
@@ -1327,18 +1327,18 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="loop-rotate">-loop-rotate: Rotate Loops</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>A simple loop rotation transformation.</p>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="loop-simplify">-loop-simplify: Canonicalize natural loops</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass performs several transformations to transform natural loops into a
   simpler form, which makes subsequent analyses and transformations simpler and
@@ -1376,10 +1376,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="loop-unroll">-loop-unroll: Unroll loops</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass implements a simple loop unroller.  It works best when loops have
   been canonicalized by the <a href="#indvars"><tt>-indvars</tt></a> pass,
@@ -1388,10 +1388,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="loop-unswitch">-loop-unswitch: Unswitch loops</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass transforms loops that contain branches on loop-invariant conditions
   to have multiple loops.  For example, it turns the left into the right code:
@@ -1418,10 +1418,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="loweratomic">-loweratomic: Lower atomic intrinsics to non-atomic form</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass lowers atomic intrinsics to non-atomic form for use in a known
   non-preemptible environment.
@@ -1436,10 +1436,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="lowerinvoke">-lowerinvoke: Lower invoke and unwind, for unwindless code generators</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This transformation is designed for use by code generators which do not yet
   support stack unwinding.  This pass supports two models of exception handling
@@ -1477,10 +1477,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="lowersetjmp">-lowersetjmp: Lower Set Jump</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
    Lowers <tt>setjmp</tt> and <tt>longjmp</tt> to use the LLVM invoke and unwind
    instructions as necessary.
@@ -1506,10 +1506,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="lowerswitch">-lowerswitch: Lower SwitchInst's to branches</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Rewrites <tt>switch</tt> instructions with a sequence of branches, which
   allows targets to get away with not implementing the switch instruction until
@@ -1518,10 +1518,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="mem2reg">-mem2reg: Promote Memory to Register</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This file promotes memory references to be register references.  It promotes
   <tt>alloca</tt> instructions which only have <tt>load</tt>s and
@@ -1534,10 +1534,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="memcpyopt">-memcpyopt: MemCpy Optimization</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass performs various transformations related to eliminating memcpy
   calls, or transforming sets of stores into memset's.
@@ -1545,10 +1545,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="mergefunc">-mergefunc: Merge Functions</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>This pass looks for equivalent functions that are mergable and folds them.
  
   A hash is computed from the function, based on its type and number of
@@ -1566,10 +1566,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="mergereturn">-mergereturn: Unify function exit nodes</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Ensure that functions have at most one <tt>ret</tt> instruction in them.
   Additionally, it keeps track of which node is the new exit node of the CFG.
@@ -1577,20 +1577,20 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="partial-inliner">-partial-inliner: Partial Inliner</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>This pass performs partial inlining, typically by inlining an if 
   statement that surrounds the body of the function.
   </p>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="prune-eh">-prune-eh: Remove unused exception handling info</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This file implements a simple interprocedural pass which walks the call-graph,
   turning <tt>invoke</tt> instructions into <tt>call</tt> instructions if and
@@ -1600,10 +1600,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="reassociate">-reassociate: Reassociate expressions</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass reassociates commutative expressions in an order that is designed
   to promote better constant propagation, GCSE, LICM, PRE, etc.
@@ -1623,10 +1623,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="reg2mem">-reg2mem: Demote all values to stack slots</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This file demotes all registers to memory references.  It is intented to be
   the inverse of <a href="#mem2reg"><tt>-mem2reg</tt></a>.  By converting to
@@ -1640,10 +1640,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="scalarrepl">-scalarrepl: Scalar Replacement of Aggregates (DT)</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   The well-known scalar replacement of aggregates transformation.  This
   transform breaks up <tt>alloca</tt> instructions of aggregate type (structure
@@ -1662,10 +1662,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="sccp">-sccp: Sparse Conditional Constant Propagation</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Sparse conditional constant propagation and merging, which can be summarized
   as:
@@ -1685,10 +1685,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="simplify-libcalls">-simplify-libcalls: Simplify well-known library calls</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Applies a variety of small optimizations for calls to specific well-known 
   function calls (e.g. runtime library functions). For example, a call
@@ -1698,10 +1698,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="simplifycfg">-simplifycfg: Simplify the CFG</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Performs dead code elimination and basic block merging. Specifically:
   </p>
@@ -1717,20 +1717,20 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="sink">-sink: Code sinking</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>This pass moves instructions into successor blocks, when possible, so that
  they aren't executed on paths where their results aren't needed.
   </p>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="sretpromotion">-sretpromotion: Promote sret arguments to multiple ret values</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass finds functions that return a struct (using a pointer to the struct
   as the first argument of the function, marked with the '<tt>sret</tt>' attribute) and
@@ -1750,10 +1750,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="strip">-strip: Strip all symbols from a module</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   performs code stripping. this transformation can delete:
   </p>
@@ -1772,10 +1772,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="strip-dead-debug-info">-strip-dead-debug-info: Strip debug info for unused symbols</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   performs code stripping. this transformation can delete:
   </p>
@@ -1794,10 +1794,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="strip-dead-prototypes">-strip-dead-prototypes: Strip Unused Function Prototypes</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass loops over all of the functions in the input module, looking for
   dead declarations and removes them. Dead declarations are declarations of
@@ -1807,10 +1807,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="strip-debug-declare">-strip-debug-declare: Strip all llvm.dbg.declare intrinsics</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>This pass implements code stripping. Specifically, it can delete:</p>
   <ul>
   <li>names for virtual registers</li>
@@ -1825,10 +1825,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="strip-nondebug">-strip-nondebug: Strip all symbols, except dbg symbols, from a module</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>This pass implements code stripping. Specifically, it can delete:</p>
   <ul>
   <li>names for virtual registers</li>
@@ -1843,10 +1843,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="tailcallelim">-tailcallelim: Tail Call Elimination</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This file transforms calls of the current function (self recursion) followed
   by a return instruction with a branch to the entry of the function, creating
@@ -1875,10 +1875,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="tailduplicate">-tailduplicate: Tail Duplication</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass performs a limited form of tail duplication, intended to simplify
   CFGs by removing some unconditional branches.  This pass is necessary to
@@ -1888,17 +1888,18 @@ if (X &lt; 3) {</pre>
   </p>
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_section"> <a name="transform">Utility Passes</a></div>
-<div class="doc_text">
+<h2><a name="utilities">Utility Passes</a></h2>
+<div>
   <p>This section describes the LLVM Utility Passes.</p>
-</div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="deadarghaX0r">-deadarghaX0r: Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Same as dead argument elimination, but deletes arguments to functions which
   are external.  This is only for use by <a
@@ -1906,20 +1907,20 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="extract-blocks">-extract-blocks: Extract Basic Blocks From Module (for bugpoint use)</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   This pass is used by bugpoint to extract all blocks from the module into their
   own functions.</p>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="instnamer">-instnamer: Assign names to anonymous instructions</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>This is a little utility pass that gives instructions names, this is mostly
  useful when diffing the effect of an optimization because deleting an
  unnamed instruction can change all other instruction numbering, making the
@@ -1928,10 +1929,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="preverify">-preverify: Preliminary module verification</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Ensures that the module is in the form required by the <a
   href="#verifier">Module Verifier</a> pass.
@@ -1944,10 +1945,10 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="verify">-verify: Module Verifier</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Verifies an LLVM IR code. This is useful to run after an optimization which is
   undergoing testing. Note that <tt>llvm-as</tt> verifies its input before
@@ -1995,20 +1996,20 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="view-cfg">-view-cfg: View CFG of function</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Displays the control flow graph using the GraphViz tool.
   </p>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="view-cfg-only">-view-cfg-only: View CFG of function (with no function bodies)</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Displays the control flow graph using the GraphViz tool, but omitting function
   bodies.
@@ -2016,20 +2017,20 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="view-dom">-view-dom: View dominance tree of function</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Displays the dominator tree using the GraphViz tool.
   </p>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="view-dom-only">-view-dom-only: View dominance tree of function (with no function bodies)</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Displays the dominator tree using the GraphViz tool, but omitting function
   bodies.
@@ -2037,26 +2038,28 @@ if (X &lt; 3) {</pre>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="view-postdom">-view-postdom: View postdominance tree of function</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Displays the post dominator tree using the GraphViz tool.
   </p>
 </div>
 
 <!-------------------------------------------------------------------------- -->
-<div class="doc_subsection">
+<h3>
   <a name="view-postdom-only">-view-postdom-only: View postdominance tree of function (with no function bodies)</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
   <p>
   Displays the post dominator tree using the GraphViz tool, but omitting
   function bodies.
   </p>
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
 
 <hr>
@@ -2067,7 +2070,7 @@ if (X &lt; 3) {</pre>
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="mailto:rspencer@x10sys.com">Reid Spencer</a><br>
-  <a href="http://llvm.org">LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 
diff --git a/docs/ProgrammersManual.html b/docs/ProgrammersManual.html
index c46f596..49a76ee 100644
--- a/docs/ProgrammersManual.html
+++ b/docs/ProgrammersManual.html
@@ -8,9 +8,9 @@
 </head>
 <body>
 
-<div class="doc_title">
+<h1>
   LLVM Programmer's Manual
-</div>
+</h1>
 
 <ol>
   <li><a href="#introduction">Introduction</a></li>
@@ -64,6 +64,7 @@ option</a></li>
       <li><a href="#dss_deque">&lt;deque&gt;</a></li>
       <li><a href="#dss_list">&lt;list&gt;</a></li>
       <li><a href="#dss_ilist">llvm/ADT/ilist.h</a></li>
+      <li><a href="#dss_packedvector">llvm/ADT/PackedVector.h</a></li>
       <li><a href="#dss_other">Other Sequential Container Options</a></li>
     </ul></li>
     <li><a href="#ds_set">Set-Like Containers (std::set, SmallSet, SetVector, etc)</a>
@@ -210,12 +211,12 @@ with another <tt>Value</tt></a> </li>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="introduction">Introduction </a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This document is meant to highlight some of the important classes and
 interfaces available in the LLVM source-base.  This manual is not
@@ -242,24 +243,22 @@ href="/doxygen/InstVisitor_8h-source.html">InstVisitor</a></tt> template.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="general">General Information</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This section contains general information that is useful if you are working
 in the LLVM source-base, but that isn't specific to any particular API.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="stl">The C++ Standard Template Library</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM makes heavy use of the C++ Standard Template Library (STL),
 perhaps much more than you are used to, or have seen before.  Because of
@@ -305,11 +304,11 @@ to write maintainable code more than where to put your curly braces.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="stl">Other useful references</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <ol>
 <li><a href="http://www.fortran-2000.com/ArnaudRecipes/sharedlib.html">Using
@@ -318,26 +317,26 @@ static and shared libraries across platforms</a></li>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="apis">Important and useful LLVM APIs</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Here we highlight some LLVM APIs that are generally useful and good to
 know about when writing transformations.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="isa">The <tt>isa&lt;&gt;</tt>, <tt>cast&lt;&gt;</tt> and
   <tt>dyn_cast&lt;&gt;</tt> templates</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The LLVM source-base makes extensive use of a custom form of RTTI.
 These templates have many similarities to the C++ <tt>dynamic_cast&lt;&gt;</tt>
@@ -442,12 +441,12 @@ are lots of examples in the LLVM source base.</p>
 
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="string_apis">Passing strings (the <tt>StringRef</tt>
 and <tt>Twine</tt> classes)</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Although LLVM generally does not do much string manipulation, we do have
 several important APIs which take strings.  Two important examples are the
@@ -461,14 +460,12 @@ clients to perform a heap allocation which is usually unnecessary.  Instead,
 many LLVM APIs use a <tt>StringRef</tt> or a <tt>const Twine&amp;</tt> for
 passing strings efficiently.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="StringRef">The <tt>StringRef</tt> class</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>StringRef</tt> data type represents a reference to a constant string
 (a character array and a length) and supports the common operations available
@@ -504,11 +501,11 @@ small and pervasive enough in LLVM that it should always be passed by value.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="Twine">The <tt>Twine</tt> class</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>Twine</tt> class is an efficient way for APIs to accept concatenated
 strings.  For example, a common LLVM paradigm is to name one instruction based on
@@ -539,13 +536,14 @@ accept concatenated strings.</p>
 
 </div>
 
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="DEBUG">The <tt>DEBUG()</tt> macro and <tt>-debug</tt> option</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Often when working on your pass you will put a bunch of debugging printouts
 and other code into your pass.  After you get it working, you want to remove
@@ -591,15 +589,13 @@ enable or disable it directly in gdb.  Just use "<tt>set DebugFlag=0</tt>" or
 program hasn't been started yet, you can always just run it with
 <tt>-debug</tt>.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="DEBUG_TYPE">Fine grained debug info with <tt>DEBUG_TYPE</tt> and
   the <tt>-debug-only</tt> option</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Sometimes you may find yourself in a situation where enabling <tt>-debug</tt>
 just turns on <b>too much</b> information (such as when working on the code
@@ -667,13 +663,15 @@ DEBUG_WITH_TYPE("", errs() &lt;&lt; "No debug type (2)\n");
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="Statistic">The <tt>Statistic</tt> class &amp; <tt>-stats</tt>
   option</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The "<tt><a
 href="/doxygen/Statistic_8h-source.html">llvm/ADT/Statistic.h</a></tt>" file
@@ -768,11 +766,11 @@ maintainable and useful.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ViewGraph">Viewing graphs while debugging code</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Several of the important data structures in LLVM are graphs: for example
 CFGs made out of LLVM <a href="#BasicBlock">BasicBlock</a>s, CFGs made out of
@@ -814,15 +812,21 @@ found at <a href="http://www.graphviz.org/doc/info/attrs.html">Graph
 Attributes</a>.)  If you want to restart and clear all the current graph
 attributes, then you can <tt>call DAG.clearGraphAttrs()</tt>. </p>
 
+<p>Note that graph visualization features are compiled out of Release builds
+to reduce file size.  This means that you need a Debug+Asserts or 
+Release+Asserts build to use these features.</p>
+
+</div>
+
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="datastructure">Picking the Right Data Structure for a Task</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>LLVM has a plethora of data structures in the <tt>llvm/ADT/</tt> directory,
  and we commonly use STL data structures.  This section describes the trade-offs
@@ -878,24 +882,21 @@ elements (but could contain many), for example, it's much better to use
 .  Doing so avoids (relatively) expensive malloc/free calls, which dwarf the
 cost of adding the elements to the container. </p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ds_sequential">Sequential Containers (std::vector, std::list, etc)</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 There are a variety of sequential containers available for you, based on your
 needs.  Pick the first in this section that will do what you want.
-</div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_arrayref">llvm/ADT/ArrayRef.h</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p>The llvm::ArrayRef class is the preferred class to use in an interface that
    accepts a sequential list of elements in memory and just reads from them.  By
    taking an ArrayRef, the API can be passed a fixed size array, an std::vector,
@@ -906,22 +907,22 @@ needs.  Pick the first in this section that will do what you want.
 
   
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_fixedarrays">Fixed Size Arrays</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p>Fixed size arrays are very simple and very fast.  They are good if you know
 exactly how many elements you have, or you have a (low) upper bound on how many
 you have.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_heaparrays">Heap Allocated Arrays</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p>Heap allocated arrays (new[] + delete[]) are also simple.  They are good if
 the number of elements is variable, if you know how many elements you will need
 before the array is allocated, and if the array is usually large (if not,
@@ -933,11 +934,11 @@ construct those elements actually used).</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_smallvector">"llvm/ADT/SmallVector.h"</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p><tt>SmallVector&lt;Type, N&gt;</tt> is a simple class that looks and smells
 just like <tt>vector&lt;Type&gt;</tt>:
 it supports efficient iteration, lays out elements in memory order (so you can
@@ -962,11 +963,11 @@ SmallVectors are most useful when on the stack.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_vector">&lt;vector&gt;</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p>
 std::vector is well loved and respected.  It is useful when SmallVector isn't:
 when the size of the vector is often large (thus the small optimization will
@@ -1004,11 +1005,11 @@ the loop.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_deque">&lt;deque&gt;</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p>std::deque is, in some senses, a generalized version of std::vector.  Like
 std::vector, it provides constant time random access and other similar
 properties, but it also provides efficient access to the front of the list.  It
@@ -1020,11 +1021,11 @@ something cheaper.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_list">&lt;list&gt;</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p>std::list is an extremely inefficient class that is rarely useful.
 It performs a heap allocation for every element inserted into it, thus having an
 extremely high constant factor, particularly for small data types.  std::list
@@ -1038,11 +1039,11 @@ not invalidate iterator or pointers to other elements in the list.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_ilist">llvm/ADT/ilist.h</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p><tt>ilist&lt;T&gt;</tt> implements an 'intrusive' doubly-linked list.  It is
 intrusive, because it requires the element to store and provide access to the
 prev/next pointers for the list.</p>
@@ -1068,22 +1069,60 @@ Related classes of interest are explained in the following subsections:
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="dss_ilist_traits">ilist_traits</a>
+<h4>
+  <a name="dss_packedvector">llvm/ADT/PackedVector.h</a>
+</h4>
+
+<div>
+<p>
+Useful for storing a vector of values using only a few number of bits for each
+value. Apart from the standard operations of a vector-like container, it can
+also perform an 'or' set operation. 
+</p>
+
+<p>For example:</p>
+
+<div class="doc_code">
+<pre>
+enum State {
+    None = 0x0,
+    FirstCondition = 0x1,
+    SecondCondition = 0x2,
+    Both = 0x3
+};
+
+State get() {
+    PackedVector&lt;State, 2&gt; Vec1;
+    Vec1.push_back(FirstCondition);
+
+    PackedVector&lt;State, 2&gt; Vec2;
+    Vec2.push_back(SecondCondition);
+
+    Vec1 |= Vec2;
+    return Vec1[0]; // returns 'Both'.
+}
+</pre>
+</div>
+
 </div>
 
-<div class="doc_text">
+<!-- _______________________________________________________________________ -->
+<h4>
+  <a name="dss_ilist_traits">ilist_traits</a>
+</h4>
+
+<div>
 <p><tt>ilist_traits&lt;T&gt;</tt> is <tt>ilist&lt;T&gt;</tt>'s customization
 mechanism. <tt>iplist&lt;T&gt;</tt> (and consequently <tt>ilist&lt;T&gt;</tt>)
 publicly derive from this traits class.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_iplist">iplist</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p><tt>iplist&lt;T&gt;</tt> is <tt>ilist&lt;T&gt;</tt>'s base and as such
 supports a slightly narrower interface. Notably, inserters from
 <tt>T&amp;</tt> are absent.</p>
@@ -1093,11 +1132,11 @@ used for a wide variety of customizations.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_ilist_node">llvm/ADT/ilist_node.h</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p><tt>ilist_node&lt;T&gt;</tt> implements a the forward and backward links
 that are expected by the <tt>ilist&lt;T&gt;</tt> (and analogous containers)
 in the default manner.</p>
@@ -1108,11 +1147,11 @@ in the default manner.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_ilist_sentinel">Sentinels</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p><tt>ilist</tt>s have another specialty that must be considered. To be a good
 citizen in the C++ ecosystem, it needs to support the standard container
 operations, such as <tt>begin</tt> and <tt>end</tt> iterators, etc. Also, the
@@ -1146,11 +1185,11 @@ field in the ghostly sentinel which can be legally accessed.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_other">Other Sequential Container options</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p>Other STL containers are available, such as std::string.</p>
 
 <p>There are also various STL adapter classes such as std::queue,
@@ -1159,27 +1198,25 @@ underlying container but don't affect the cost of the container itself.</p>
 
 </div>
 
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ds_set">Set-Like Containers (std::set, SmallSet, SetVector, etc)</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Set-like containers are useful when you need to canonicalize multiple values
 into a single representation.  There are several different choices for how to do
 this, providing various trade-offs.</p>
 
-</div>
-
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_sortedvectorset">A sorted 'vector'</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>If you intend to insert a lot of elements, then do a lot of queries, a
 great approach is to use a vector (or other sequential container) with
@@ -1197,11 +1234,11 @@ efficiently queried with a standard binary or radix search.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_smallset">"llvm/ADT/SmallSet.h"</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>If you have a set-like data structure that is usually small and whose elements
 are reasonably small, a <tt>SmallSet&lt;Type, N&gt;</tt> is a good choice.  This set
@@ -1220,11 +1257,11 @@ and erasing, but does not support iteration.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_smallptrset">"llvm/ADT/SmallPtrSet.h"</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>SmallPtrSet has all the advantages of <tt>SmallSet</tt> (and a <tt>SmallSet</tt> of pointers is 
 transparently implemented with a <tt>SmallPtrSet</tt>), but also supports iterators.  If
@@ -1240,11 +1277,11 @@ visited in sorted order.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_denseset">"llvm/ADT/DenseSet.h"</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 DenseSet is a simple quadratically probed hash table.  It excels at supporting
@@ -1259,11 +1296,11 @@ href="#dss_densemap">DenseMap</a> has.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_FoldingSet">"llvm/ADT/FoldingSet.h"</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 FoldingSet is an aggregate class that is really good at uniquing
@@ -1296,11 +1333,11 @@ elements.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_set">&lt;set&gt;</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p><tt>std::set</tt> is a reasonable all-around set class, which is decent at
 many things but great at nothing.  std::set allocates memory for each element
@@ -1321,11 +1358,11 @@ std::set is almost never a good choice.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_setvector">"llvm/ADT/SetVector.h"</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p>LLVM's SetVector&lt;Type&gt; is an adapter class that combines your choice of
 a set-like container along with a <a href="#ds_sequential">Sequential 
 Container</a>.  The important property
@@ -1361,11 +1398,11 @@ heap traffic.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_uniquevector">"llvm/ADT/UniqueVector.h"</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 UniqueVector is similar to <a href="#dss_setvector">SetVector</a>, but it
@@ -1381,11 +1418,11 @@ factors, and produces a lot of malloc traffic.  It should be avoided.</p>
 
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_otherset">Other Set-Like Container Options</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 The STL provides several other options, such as std::multiset and the various 
@@ -1401,22 +1438,23 @@ better.</p>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ds_map">Map-Like Containers (std::map, DenseMap, etc)</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 Map-like containers are useful when you want to associate data to a key.  As
 usual, there are a lot of different ways to do this. :)
-</div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_sortedvectormap">A sorted 'vector'</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 If your usage pattern follows a strict insert-then-query approach, you can
@@ -1429,11 +1467,11 @@ vectors for sets.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_stringmap">"llvm/ADT/StringMap.h"</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 Strings are commonly used as keys in maps, and they are difficult to support
@@ -1463,11 +1501,11 @@ copies a string if a value is inserted into the table.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_indexedmap">"llvm/ADT/IndexedMap.h"</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p>
 IndexedMap is a specialized container for mapping small dense integers (or
 values that can be mapped to small dense integers) to some other type.  It is
@@ -1483,11 +1521,11 @@ virtual register ID).</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_densemap">"llvm/ADT/DenseMap.h"</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 DenseMap is a simple quadratically probed hash table.  It excels at supporting
@@ -1509,11 +1547,11 @@ inserted into the map) that it needs internally.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_valuemap">"llvm/ADT/ValueMap.h"</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 ValueMap is a wrapper around a <a href="#dss_densemap">DenseMap</a> mapping
@@ -1526,11 +1564,11 @@ a <code>Config</code> parameter to the ValueMap template.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_intervalmap">"llvm/ADT/IntervalMap.h"</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p> IntervalMap is a compact map for small keys and values. It maps key
 intervals instead of single keys, and it will automatically coalesce adjacent
@@ -1543,11 +1581,11 @@ as STL iterators. The heavyweight iterators allow a smaller data structure.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_map">&lt;map&gt;</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 std::map has similar characteristics to <a href="#dss_set">std::set</a>: it uses
@@ -1563,11 +1601,11 @@ another element takes place).</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_inteqclasses">"llvm/ADT/IntEqClasses.h"</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>IntEqClasses provides a compact representation of equivalence classes of
 small integers. Initially, each integer in the range 0..n-1 has its own
@@ -1583,11 +1621,11 @@ it can be edited again.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_othermap">Other Map-Like Container Options</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 The STL provides several other options, such as std::multimap and the various 
@@ -1601,12 +1639,14 @@ always better.</p>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ds_string">String-like containers</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 TODO: const char* vs stringref vs smallstring vs std::string.  Describe twine,
@@ -1616,11 +1656,11 @@ xref to #string_apis.
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ds_bit">Bit storage containers (BitVector, SparseBitVector)</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>Unlike the other containers, there are only two bit storage containers, and 
 choosing when to use each is relatively straightforward.</p>
 
@@ -1630,14 +1670,13 @@ implementation in many common compilers (e.g. commonly available versions of
 GCC) is extremely inefficient and 2) the C++ standards committee is likely to
 deprecate this container and/or change it significantly somehow.  In any case,
 please don't use it.</p>
-</div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_bitvector">BitVector</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p> The BitVector container provides a dynamic size set of bits for manipulation.
 It supports individual bit setting/testing, as well as set operations.  The set
 operations take time O(size of bitvector), but operations are performed one word
@@ -1648,11 +1687,11 @@ the number of set bits to be high (IE a dense set).
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_smallbitvector">SmallBitVector</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p> The SmallBitVector container provides the same interface as BitVector, but
 it is optimized for the case where only a small number of bits, less than
 25 or so, are needed. It also transparently supports larger bit counts, but
@@ -1667,11 +1706,11 @@ and its operator[] does not provide an assignable lvalue.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="dss_sparsebitvector">SparseBitVector</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p> The SparseBitVector container is much like BitVector, with one major
 difference: Only the bits that are set, are stored.  This makes the
 SparseBitVector much more space efficient than BitVector when the set is sparse,
@@ -1681,13 +1720,17 @@ universe).  The downside to the SparseBitVector is that setting and testing of r
 </p>
 </div>
 
+</div>
+
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="common">Helpful Hints for Common Operations</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This section describes how to perform some very simple transformations of
 LLVM code.  This is meant to give examples of common idioms used, showing the
@@ -1696,15 +1739,13 @@ you should also read about the main classes that you will be working with.  The
 <a href="#coreclasses">Core LLVM Class Hierarchy Reference</a> contains details
 and descriptions of the main classes that you should know about.</p>
 
-</div>
-
 <!-- NOTE: this section should be heavy on example code -->
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="inspection">Basic Inspection and Traversal Routines</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The LLVM compiler infrastructure have many different data structures that may
 be traversed.  Following the example of the C++ standard template library, the
@@ -1721,16 +1762,14 @@ on them, and it is easier to remember how to iterate. First we show a few common
 examples of the data structures that need to be traversed.  Other data
 structures are traversed in very similar ways.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="iterate_function">Iterating over the </a><a
   href="#BasicBlock"><tt>BasicBlock</tt></a>s in a <a
   href="#Function"><tt>Function</tt></a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>It's quite common to have a <tt>Function</tt> instance that you'd like to
 transform in some way; in particular, you'd like to manipulate its
@@ -1759,13 +1798,13 @@ exactly equivalent to <tt>(*i).size()</tt> just like you'd expect.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="iterate_basicblock">Iterating over the </a><a
   href="#Instruction"><tt>Instruction</tt></a>s in a <a
   href="#BasicBlock"><tt>BasicBlock</tt></a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Just like when dealing with <tt>BasicBlock</tt>s in <tt>Function</tt>s, it's
 easy to iterate over the individual instructions that make up
@@ -1790,13 +1829,13 @@ basic block itself: <tt>errs() &lt;&lt; *blk &lt;&lt; "\n";</tt>.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="iterate_institer">Iterating over the </a><a
   href="#Instruction"><tt>Instruction</tt></a>s in a <a
   href="#Function"><tt>Function</tt></a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>If you're finding that you commonly iterate over a <tt>Function</tt>'s
 <tt>BasicBlock</tt>s and then that <tt>BasicBlock</tt>'s <tt>Instruction</tt>s,
@@ -1836,12 +1875,12 @@ for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="iterate_convert">Turning an iterator into a class pointer (and
   vice-versa)</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Sometimes, it'll be useful to grab a reference (or pointer) to a class
 instance when all you've got at hand is an iterator.  Well, extracting
@@ -1913,12 +1952,12 @@ and <tt>operator*</tt> changed to return a pointer instead of a reference.</p>
 </div>
 
 <!--_______________________________________________________________________-->
-<div class="doc_subsubsection">
+<h4>
   <a name="iterate_complex">Finding call sites: a slightly more complex
   example</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Say that you're writing a FunctionPass and would like to count all the
 locations in the entire module (that is, across every <tt>Function</tt>) where a
@@ -1975,11 +2014,11 @@ class OurFunctionPass : public FunctionPass {
 </div>
 
 <!--_______________________________________________________________________-->
-<div class="doc_subsubsection">
+<h4>
   <a name="calls_and_invokes">Treating calls and invokes the same way</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>You may have noticed that the previous example was a bit oversimplified in
 that it did not deal with call sites generated by 'invoke' instructions. In
@@ -2002,11 +2041,11 @@ If you look at its definition, it has only a single pointer member.</p>
 </div>
 
 <!--_______________________________________________________________________-->
-<div class="doc_subsubsection">
+<h4>
   <a name="iterate_chains">Iterating over def-use &amp; use-def chains</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Frequently, we might have an instance of the <a
 href="/doxygen/classllvm_1_1Value.html">Value Class</a> and we want to
@@ -2063,12 +2102,12 @@ calling <tt>use/op_begin()</tt> on <tt>const Value*</tt>s or
 </div>
 
 <!--_______________________________________________________________________-->
-<div class="doc_subsubsection">
+<h4>
   <a name="iterate_preds">Iterating over predecessors &amp;
 successors of blocks</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Iterating over the predecessors and successors of a block is quite easy
 with the routines defined in <tt>"llvm/Support/CFG.h"</tt>.  Just use code like
@@ -2091,13 +2130,14 @@ succ_iterator/succ_begin/succ_end.</p>
 
 </div>
 
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="simplechanges">Making simple changes</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>There are some primitive transformation operations present in the LLVM
 infrastructure that are worth knowing about.  When performing
@@ -2105,15 +2145,13 @@ transformations, it's fairly common to manipulate the contents of basic
 blocks. This section describes some of the common methods for doing so
 and gives example code.</p>
 
-</div>
-
 <!--_______________________________________________________________________-->
-<div class="doc_subsubsection">
+<h4>
   <a name="schanges_creating">Creating and inserting new
   <tt>Instruction</tt>s</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p><i>Instantiating Instructions</i></p>
 
@@ -2249,11 +2287,11 @@ Instruction* newInst = new Instruction(..., pi);
 </div>
 
 <!--_______________________________________________________________________-->
-<div class="doc_subsubsection">
+<h4>
   <a name="schanges_deleting">Deleting <tt>Instruction</tt>s</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Deleting an instruction from an existing sequence of instructions that form a
 <a href="#BasicBlock"><tt>BasicBlock</tt></a> is very straight-forward: just
@@ -2273,12 +2311,12 @@ block but not delete it, you can use the <tt>removeFromParent()</tt> method.</p>
 </div>
 
 <!--_______________________________________________________________________-->
-<div class="doc_subsubsection">
+<h4>
   <a name="schanges_replacing">Replacing an <tt>Instruction</tt> with another
   <tt>Value</tt></a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p><i>Replacing individual instructions</i></p>
 
@@ -2286,7 +2324,7 @@ block but not delete it, you can use the <tt>removeFromParent()</tt> method.</p>
 permits use of two very useful replace functions: <tt>ReplaceInstWithValue</tt>
 and <tt>ReplaceInstWithInst</tt>.</p>
 
-<h4><a name="schanges_deleting">Deleting <tt>Instruction</tt>s</a></h4>
+<h5><a name="schanges_deleting">Deleting <tt>Instruction</tt>s</a></h5>
 
 <ul>
   <li><tt>ReplaceInstWithValue</tt>
@@ -2339,11 +2377,11 @@ ReplaceInstWithValue, ReplaceInstWithInst -->
 </div>
 
 <!--_______________________________________________________________________-->
-<div class="doc_subsubsection">
+<h4>
   <a name="schanges_deletingGV">Deleting <tt>GlobalVariable</tt>s</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Deleting a global variable from a module is just as easy as deleting an 
 Instruction. First, you must have a pointer to the global variable that you wish
@@ -2360,12 +2398,14 @@ GV-&gt;eraseFromParent();
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="create_types">How to Create Types</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>In generating IR, you may need some complex types.  If you know these types
 statically, you can use <tt>TypeBuilder&lt;...&gt;::get()</tt>, defined
@@ -2400,13 +2440,15 @@ comment</a> for more details.</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="threading">Threads and LLVM</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 <p>
 This section describes the interaction of the LLVM APIs with multithreading,
 both on the part of client applications, and in the JIT, in the hosted
@@ -2429,14 +2471,13 @@ compiler, consider compiling LLVM and LLVM-GCC in single-threaded mode, and
 using the resultant compiler to build a copy of LLVM with multithreading
 support.
 </p>
-</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="startmultithreaded">Entering and Exiting Multithreaded Mode</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 In order to properly protect its internal data structures while avoiding 
@@ -2469,11 +2510,11 @@ result in concurrent LLVM API calls.
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="shutdown">Ending Execution with <tt>llvm_shutdown()</tt></a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>
 When you are done using the LLVM APIs, you should call <tt>llvm_shutdown()</tt>
 to deallocate memory used for internal structures.  This will also invoke 
@@ -2489,11 +2530,11 @@ destructor.
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="managedstatic">Lazy Initialization with <tt>ManagedStatic</tt></a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>
 <tt>ManagedStatic</tt> is a utility class in LLVM used to implement static
 initialization of static resources, such as the global type tables.  Before the
@@ -2518,11 +2559,11 @@ and only if you know what you're doing!
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="llvmcontext">Achieving Isolation with <tt>LLVMContext</tt></a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>
 <tt>LLVMContext</tt> is an opaque class in the LLVM API which clients can use
 to operate multiple, isolated instances of LLVM concurrently within the same
@@ -2562,11 +2603,11 @@ isolation is not a concern.
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="jitthreading">Threads and the JIT</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>
 LLVM's "eager" JIT compiler is safe to use in threaded programs.  Multiple
 threads can call <tt>ExecutionEngine::getPointerToFunction()</tt> or
@@ -2589,26 +2630,27 @@ access, but we suggest using only the eager JIT in threaded programs.
 </p>
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="advanced">Advanced Topics</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 <p>
 This section describes some of the advanced or obscure API's that most clients
 do not need to be aware of.  These API's tend manage the inner workings of the
 LLVM system, and only need to be accessed in unusual circumstances.
 </p>
-</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="TypeResolve">LLVM Type Resolution</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 The LLVM type system has a very simple goal: allow clients to compare types for
@@ -2637,14 +2679,12 @@ Third, a concrete type is a type that is not an abstract type (e.g. "<tt>{ i32,
 float }</tt>").
 </p>
 
-</div>
-
 <!-- ______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="BuildRecType">Basic Recursive Type Construction</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 Because the most common question is "how do I build a recursive type with LLVM",
@@ -2696,11 +2736,11 @@ href="#PATypeHolder">PATypeHolder class</a>.
 </div>
 
 <!-- ______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="refineAbstractTypeTo">The <tt>refineAbstractTypeTo</tt> method</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p>
 The <tt>refineAbstractTypeTo</tt> method starts the type unification process.
 While this method is actually a member of the DerivedType class, it is most
@@ -2726,11 +2766,11 @@ complex datastructures.
 </div>
 
 <!-- ______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="PATypeHolder">The PATypeHolder Class</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p>
 PATypeHolder is a form of a "smart pointer" for Type objects.  When VMCore
 happily goes about nuking types that become isomorphic to existing types, it
@@ -2748,11 +2788,11 @@ Type is maintained by PATypeHolder objects.
 </div>
 
 <!-- ______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="AbstractTypeUser">The AbstractTypeUser Class</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 Some data structures need more to perform more complex updates when types get
@@ -2766,14 +2806,15 @@ objects) can never be refined.
 </p>
 </div>
 
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="SymbolTable">The <tt>ValueSymbolTable</tt> and
    <tt>TypeSymbolTable</tt> classes</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>The <tt><a href="http://llvm.org/doxygen/classllvm_1_1ValueSymbolTable.html">
 ValueSymbolTable</a></tt> class provides a symbol table that the <a
 href="#Function"><tt>Function</tt></a> and <a href="#Module">
@@ -2804,11 +2845,11 @@ insert entries into the symbol table.</p>
 
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="UserLayout">The <tt>User</tt> and owned <tt>Use</tt> classes' memory layout</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>The <tt><a href="http://llvm.org/doxygen/classllvm_1_1User.html">
 User</a></tt> class provides a basis for expressing the ownership of <tt>User</tt>
 towards other <tt><a href="http://llvm.org/doxygen/classllvm_1_1Value.html">
@@ -2817,18 +2858,19 @@ Use</a></tt> helper class is employed to do the bookkeeping and to facilitate <i
 addition and removal.</p>
 
 <!-- ______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="Use2User">Interaction and relationship between <tt>User</tt> and <tt>Use</tt> objects</a>
-</div>
+<h4>
+  <a name="Use2User">
+    Interaction and relationship between <tt>User</tt> and <tt>Use</tt> objects
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 <p>
 A subclass of <tt>User</tt> can choose between incorporating its <tt>Use</tt> objects
 or refer to them out-of-line by means of a pointer. A mixed variant
 (some <tt>Use</tt>s inline others hung off) is impractical and breaks the invariant
 that the <tt>Use</tt> objects belonging to the same <tt>User</tt> form a contiguous array.
 </p>
-</div>
 
 <p>
 We have 2 different layouts in the <tt>User</tt> (sub)classes:
@@ -2877,17 +2919,18 @@ enforce the following memory layouts:</p>
 <i>(In the above figures '<tt>P</tt>' stands for the <tt>Use**</tt> that
     is stored in each <tt>Use</tt> object in the member <tt>Use::Prev</tt>)</i>
 
+</div>
+
 <!-- ______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="Waymarking">The waymarking algorithm</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p>
 Since the <tt>Use</tt> objects are deprived of the direct (back)pointer to
 their <tt>User</tt> objects, there must be a fast and exact method to
 recover it. This is accomplished by the following scheme:</p>
-</div>
 
 A bit-encoding in the 2 LSBits (least significant bits) of the <tt>Use::Prev</tt> allows to find the
 start of the <tt>User</tt> object:
@@ -2918,15 +2961,16 @@ Only the significant number of bits need to be stored between the
 stops, so that the <i>worst case is 20 memory accesses</i> when there are
 1000 <tt>Use</tt> objects associated with a <tt>User</tt>.</p>
 
+</div>
+
 <!-- ______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ReferenceImpl">Reference implementation</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p>
 The following literate Haskell fragment demonstrates the concept:</p>
-</div>
 
 <div class="doc_code">
 <pre>
@@ -3008,10 +3052,14 @@ And here is the result of &lt;deepCheck identityProp&gt;:</p>
 OK, passed 500 tests.
 </pre>
 
+</div>
+
 <!-- ______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="Tagging">Tagging considerations</a>
-</div>
+</h4>
+
+<div>
 
 <p>
 To maintain the invariant that the 2 LSBits of each <tt>Use**</tt> in <tt>Use</tt>
@@ -3027,13 +3075,17 @@ the LSBit set. (Portability is relying on the fact that all known compilers plac
 
 </div>
 
-  <!-- *********************************************************************** -->
-<div class="doc_section">
-  <a name="coreclasses">The Core LLVM Class Hierarchy Reference </a>
 </div>
+
+</div>
+
+<!-- *********************************************************************** -->
+<h2>
+  <a name="coreclasses">The Core LLVM Class Hierarchy Reference </a>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 <p><tt>#include "<a href="/doxygen/Type_8h-source.html">llvm/Type.h</a>"</tt>
 <br>doxygen info: <a href="/doxygen/classllvm_1_1Type.html">Type Class</a></p>
 
@@ -3042,14 +3094,12 @@ being inspected or transformed.  The core LLVM classes are defined in
 header files in the <tt>include/llvm/</tt> directory, and implemented in
 the <tt>lib/VMCore</tt> directory.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="Type">The <tt>Type</tt> class and Derived Types</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
   <p><tt>Type</tt> is a superclass of all type classes. Every <tt>Value</tt> has
   a <tt>Type</tt>. <tt>Type</tt> cannot be instantiated directly but only
@@ -3064,14 +3114,13 @@ the <tt>lib/VMCore</tt> directory.</p>
   be performed with address equality of the Type Instance. That is, given two 
   <tt>Type*</tt> values, the types are identical if the pointers are identical.
   </p>
-</div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="m_Type">Important Public Methods</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <ul>
   <li><tt>bool isIntegerTy() const</tt>: Returns true for any integer type.</li>
@@ -3089,10 +3138,10 @@ the <tt>lib/VMCore</tt> directory.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="derivedtypes">Important Derived Types</a>
-</div>
-<div class="doc_text">
+</h4>
+<div>
 <dl>
   <dt><tt>IntegerType</tt></dt>
   <dd>Subclass of DerivedType that represents integer types of any bit width. 
@@ -3154,14 +3203,14 @@ the <tt>lib/VMCore</tt> directory.</p>
 </dl>
 </div>
 
-
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="Module">The <tt>Module</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p><tt>#include "<a
 href="/doxygen/Module_8h-source.html">llvm/Module.h</a>"</tt><br> doxygen info:
@@ -3176,14 +3225,12 @@ href="#GlobalVariable"><tt>GlobalVariable</tt></a>s, and a <a
 href="#SymbolTable"><tt>SymbolTable</tt></a>.  Additionally, it contains a few
 helpful member functions that try to make common operations easy.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="m_Module">Important Public Members of the <tt>Module</tt> class</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <ul>
   <li><tt>Module::Module(std::string name = "")</tt></li>
@@ -3282,13 +3329,14 @@ provide a name for it (probably based on the name of the translation unit).</p>
 
 </div>
 
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="Value">The <tt>Value</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p><tt>#include "<a href="/doxygen/Value_8h-source.html">llvm/Value.h</a>"</tt>
 <br> 
@@ -3339,14 +3387,12 @@ the class that
 represents this value.  Although this may take some getting used to, it
 simplifies the representation and makes it easier to manipulate.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="m_Value">Important Public Members of the <tt>Value</tt> class</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <ul>
   <li><tt>Value::use_iterator</tt> - Typedef for iterator over the
@@ -3393,12 +3439,14 @@ Inst-&gt;replaceAllUsesWith(ConstVal);
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="User">The <tt>User</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
   
 <p>
 <tt>#include "<a href="/doxygen/User_8h-source.html">llvm/User.h</a>"</tt><br>
@@ -3417,14 +3465,12 @@ Single Assignment (SSA) form, there can only be one definition referred to,
 allowing this direct connection.  This connection provides the use-def
 information in LLVM.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="m_User">Important Public Members of the <tt>User</tt> class</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>User</tt> class exposes the operand list in two ways: through
 an index access interface and through an iterator based interface.</p>
@@ -3447,12 +3493,14 @@ the operands of a <tt>User</tt>.</p></li>
 
 </div>    
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="Instruction">The <tt>Instruction</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p><tt>#include "</tt><tt><a
 href="/doxygen/Instruction_8h-source.html">llvm/Instruction.h</a>"</tt><br>
@@ -3483,14 +3531,13 @@ href="#CmpInst">CmpInst</a></tt>).  Unfortunately, the use of macros in
 this file confuses doxygen, so these enum values don't show up correctly in the
 <a href="/doxygen/classllvm_1_1Instruction.html">doxygen output</a>.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="s_Instruction">Important Subclasses of the <tt>Instruction</tt>
-  class</a>
-</div>
-<div class="doc_text">
+<h4>
+  <a name="s_Instruction">
+    Important Subclasses of the <tt>Instruction</tt> class
+  </a>
+</h4>
+<div>
   <ul>
     <li><tt><a name="BinaryOperator">BinaryOperator</a></tt>
     <p>This subclasses represents all two operand instructions whose operands
@@ -3509,12 +3556,13 @@ this file confuses doxygen, so these enum values don't show up correctly in the
   </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="m_Instruction">Important Public Members of the <tt>Instruction</tt>
-  class</a>
-</div>
+<h4>
+  <a name="m_Instruction">
+    Important Public Members of the <tt>Instruction</tt> class
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <ul>
   <li><tt><a href="#BasicBlock">BasicBlock</a> *getParent()</tt>
@@ -3534,12 +3582,14 @@ and it has no name</p></li>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="Constant">The <tt>Constant</tt> class and subclasses</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Constant represents a base class for different types of constants. It
 is subclassed by ConstantInt, ConstantArray, etc. for representing 
@@ -3547,11 +3597,9 @@ the various types of Constants.  <a href="#GlobalValue">GlobalValue</a> is also
 a subclass, which represents the address of a global variable or function.
 </p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">Important Subclasses of Constant </div>
-<div class="doc_text">
+<h4>Important Subclasses of Constant</h4>
+<div>
 <ul>
   <li>ConstantInt : This subclass of Constant represents an integer constant of
   any width.
@@ -3599,13 +3647,14 @@ a subclass, which represents the address of a global variable or function.
 </ul>
 </div>
 
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="GlobalValue">The <tt>GlobalValue</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p><tt>#include "<a
 href="/doxygen/GlobalValue_8h-source.html">llvm/GlobalValue.h</a>"</tt><br>
@@ -3645,15 +3694,14 @@ dereference the pointer with <tt>GetElementPtrInst</tt> first, then its elements
 can be accessed. This is explained in the <a href="LangRef.html#globalvars">LLVM
 Language Reference Manual</a>.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="m_GlobalValue">Important Public Members of the <tt>GlobalValue</tt>
-  class</a>
-</div>
+<h4>
+  <a name="m_GlobalValue">
+    Important Public Members of the <tt>GlobalValue</tt> class
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <ul>
   <li><tt>bool hasInternalLinkage() const</tt><br>
@@ -3669,12 +3717,14 @@ GlobalValue is currently embedded into.</p></li>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="Function">The <tt>Function</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p><tt>#include "<a
 href="/doxygen/Function_8h-source.html">llvm/Function.h</a>"</tt><br> doxygen
@@ -3721,15 +3771,15 @@ href="#Argument"><tt>Argument</tt></a>s in the function body.</p>
 <p>Note that <tt>Function</tt> is a <a href="#GlobalValue">GlobalValue</a>
 and therefore also a <a href="#Constant">Constant</a>. The value of the function
 is its address (after linking) which is guaranteed to be constant.</p>
-</div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="m_Function">Important Public Members of the <tt>Function</tt>
-  class</a>
-</div>
+<h4>
+  <a name="m_Function">
+    Important Public Members of the <tt>Function</tt> class
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <ul>
   <li><tt>Function(const </tt><tt><a href="#FunctionType">FunctionType</a>
@@ -3807,12 +3857,14 @@ iterator<br>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="GlobalVariable">The <tt>GlobalVariable</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p><tt>#include "<a
 href="/doxygen/GlobalVariable_8h-source.html">llvm/GlobalVariable.h</a>"</tt>
@@ -3834,15 +3886,15 @@ variables may have an initial value (which must be a
 <a href="#Constant"><tt>Constant</tt></a>), and if they have an initializer, 
 they may be marked as "constant" themselves (indicating that their contents 
 never change at runtime).</p>
-</div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="m_GlobalVariable">Important Public Members of the
-  <tt>GlobalVariable</tt> class</a>
-</div>
+<h4>
+  <a name="m_GlobalVariable">
+    Important Public Members of the <tt>GlobalVariable</tt> class
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <ul>
   <li><tt>GlobalVariable(const </tt><tt><a href="#Type">Type</a> *Ty, bool
@@ -3880,13 +3932,14 @@ never change at runtime).</p>
 
 </div>
 
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="BasicBlock">The <tt>BasicBlock</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p><tt>#include "<a
 href="/doxygen/BasicBlock_8h-source.html">llvm/BasicBlock.h</a>"</tt><br>
@@ -3911,15 +3964,14 @@ href="#Value"><tt>Value</tt></a>s, because they are referenced by instructions
 like branches and can go in the switch tables. <tt>BasicBlock</tt>s have type
 <tt>label</tt>.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="m_BasicBlock">Important Public Members of the <tt>BasicBlock</tt>
-  class</a>
-</div>
+<h4>
+  <a name="m_BasicBlock">
+    Important Public Members of the <tt>BasicBlock</tt> class
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 <ul>
 
 <li><tt>BasicBlock(const std::string &amp;Name = "", </tt><tt><a
@@ -3971,13 +4023,14 @@ returned.</p></li>
 
 </div>
 
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="Argument">The <tt>Argument</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>This subclass of Value defines the interface for incoming formal
 arguments to a function. A Function maintains a list of its formal
@@ -3985,6 +4038,8 @@ arguments. An argument has a pointer to the parent Function.</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
 <hr>
 <address>
@@ -3995,7 +4050,7 @@ arguments. An argument has a pointer to the parent Function.</p>
 
   <a href="mailto:dhurjati@cs.uiuc.edu">Dinakar Dhurjati</a> and
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 
diff --git a/docs/Projects.html b/docs/Projects.html
index ada6196..910ebdb 100644
--- a/docs/Projects.html
+++ b/docs/Projects.html
@@ -7,7 +7,7 @@
 </head>
 <body>
 
-<div class="doc_title">Creating an LLVM Project</div>
+<h1>Creating an LLVM Project</h1>
 
 <ol>
 <li><a href="#overview">Overview</a></li>
@@ -30,10 +30,10 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="overview">Overview</a></div>
+<h2><a name="overview">Overview</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>The LLVM build system is designed to facilitate the building of third party
 projects that use LLVM header files, libraries, and tools.  In order to use
@@ -49,7 +49,7 @@ these facilities, a Makefile from a project must do the following things:</p>
     <li><tt>PROJ_SRC_ROOT</tt> - The root of the project's source tree.</li>
     <li><tt>PROJ_OBJ_ROOT</tt> - The root of the project's object tree.</li>
     <li><tt>PROJ_INSTALL_ROOT</tt> - The root installation directory.</li>
-    <li><tt>LEVEL</tt> - The relative path from the current directory to the 
+    <li><tt>LEVEL</tt> - The relative path from the current directory to the
     project's root ($PROJ_OBJ_ROOT).</li>
   </ul></li>
   <li>Include <tt>Makefile.config</tt> from <tt>$(LLVM_OBJ_ROOT)</tt>.</li>
@@ -59,9 +59,9 @@ these facilities, a Makefile from a project must do the following things:</p>
 <p>There are two ways that you can set all of these variables:</p>
 <ol>
   <li>You can write your own Makefiles which hard-code these values.</li>
-  <li>You can use the pre-made LLVM sample project. This sample project 
-  includes Makefiles, a configure script that can be used to configure the 
-  location of LLVM, and the ability to support multiple object directories 
+  <li>You can use the pre-made LLVM sample project. This sample project
+  includes Makefiles, a configure script that can be used to configure the
+  location of LLVM, and the ability to support multiple object directories
   from a single source directory.</li>
 </ol>
 
@@ -73,12 +73,12 @@ provide enough information on how to write your own Makefiles.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="create">Create a Project from the Sample Project</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Follow these simple steps to start your project:</p>
 
@@ -88,9 +88,9 @@ choosing.  You can place it anywhere you like.  Rename the directory to match
 the name of your project.</li>
 
 <li>
-If you downloaded LLVM using Subversion, remove all the directories named .svn 
-(and all the files therein) from your project's new source tree.  This will 
-keep Subversion from thinking that your project is inside 
+If you downloaded LLVM using Subversion, remove all the directories named .svn
+(and all the files therein) from your project's new source tree.  This will
+keep Subversion from thinking that your project is inside
 <tt>llvm/trunk/projects/sample</tt>.</li>
 
 <li>Add your source code and Makefiles to your source tree.</li>
@@ -139,18 +139,18 @@ can find LLVM:
 </ol>
 
 <p>That's it!  Now all you have to do is type <tt>gmake</tt> (or <tt>make</tt>
-if your on a GNU/Linux system) in the root of your object directory, and your 
+if your on a GNU/Linux system) in the root of your object directory, and your
 project should build.</p>
 
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="source">Source Tree Layout</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>In order to use the LLVM build system, you will want to organize your
 source code so that it can benefit from the build system's features.
@@ -209,7 +209,7 @@ directories:</p>
     test procedure uses RUN lines in the actual test case to determine
     how to run the test.  See the <a
     href="TestingGuide.html">TestingGuide</a> for more details. You
-    can easily write Makefile support similar to the Makefiles in 
+    can easily write Makefile support similar to the Makefiles in
     <tt>llvm/test</tt> to use Dejagnu to run your project's tests.<br></li>
     <li>
     LLVM contains an optional package called <tt>llvm-test</tt>
@@ -230,26 +230,24 @@ your <b>tools</b> directory.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="makefiles">Writing LLVM Style Makefiles</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>The LLVM build system provides a convenient way to build libraries and
 executables.  Most of your project Makefiles will only need to define a few
 variables.  Below is a list of the variables one can set and what they can
 do:</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="reqVars">Required Variables</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <dl>
   <dt>LEVEL
@@ -263,11 +261,11 @@ do:</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="varsBuildDir">Variables for Building Subdirectories</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <dl>
   <dt>DIRS
@@ -294,11 +292,11 @@ do:</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="varsBuildLib">Variables for Building Libraries</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <dl>
   <dt>LIBRARYNAME
@@ -325,11 +323,11 @@ do:</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="varsBuildProg">Variables for Building Programs</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <dl>
   <dt>TOOLNAME
@@ -341,16 +339,41 @@ do:</p>
 
   <dt>USEDLIBS
   <dd>
-  This variable holds a space separated list of libraries that
-  should be linked into the program.  These libraries must either
-  be LLVM libraries or libraries that come from your <b>lib</b>
-  directory.  The libraries must be specified by their base name.
-  For example, to link libsample.a, you would set USEDLIBS to
-  <tt>sample</tt>.
+  This variable holds a space separated list of libraries that should
+  be linked into the program.  These libraries must be libraries that
+  come from your <b>lib</b> directory.  The libraries must be
+  specified without their "lib" prefix.  For example, to link
+  libsample.a, you would set USEDLIBS to
+  <tt>sample.a</tt>.
   <p>
   Note that this works only for statically linked libraries.
   <p>
 
+  <dt>LLVMLIBS
+  <dd>
+  This variable holds a space separated list of libraries that should
+  be linked into the program.  These libraries must be LLVM libraries.
+  The libraries must be specified without their "lib" prefix.  For
+  example, to link with a driver that performs an IR transformation
+  you might set LLVMLIBS to this minimal set of libraries
+  <tt>LLVMSupport.a LLVMCore.a LLVMBitReader.a LLVMAsmParser.a LLVMAnalysis.a LLVMTransformUtils.a LLVMScalarOpts.a LLVMTarget.a</tt>.
+  <p>
+  Note that this works only for statically linked libraries. LLVM is
+  split into a large number of static libraries, and the list of libraries you
+  require may be much longer than the list above. To see a full list
+  of libraries use:
+  <tt>llvm-config --libs all</tt>.
+  Using LINK_COMPONENTS as described below, obviates the need to set LLVMLIBS.
+  <p>
+
+  <dt>LINK_COMPONENTS
+  <dd>This variable holds a space separated list of components that
+  the LLVM Makefiles pass to the <tt>llvm-config</tt> tool to generate
+  a link line for the program. For example, to link with all LLVM
+  libraries use
+  <tt>LINK_COMPONENTS = all</tt>.
+  <p>
+
   <dt>LIBS
   <dd>
   To link dynamic libraries, add <tt>-l&lt;library base name&gt;</tt> to
@@ -363,16 +386,19 @@ do:</p>
   <tt>
   LIBS += -lsample
   </tt>
+  <p>
+  Note that LIBS must occur in the Makefile after the inclusion of Makefile.common.
+  <p>
 </dl>
 
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="miscVars">Miscellaneous Variables</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <dl>
   <dt>ExtraSource
@@ -398,13 +424,15 @@ do:</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="objcode">Placement of Object Code</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>The final location of built libraries and executables will depend upon
 whether you do a Debug, Release, or Profile build.</p>
@@ -427,12 +455,12 @@ whether you do a Debug, Release, or Profile build.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="help">Further Help</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>If you have any questions or need any help creating an LLVM project,
 the LLVM team would be more than happy to help.  You can always post your
@@ -441,7 +469,7 @@ href="http://mail.cs.uiuc.edu/mailman/listinfo/llvmdev">LLVM Developers
 Mailing List</a>.</p>
 
 </div>
-  
+
 <!-- *********************************************************************** -->
 <hr>
 <address>
@@ -451,7 +479,7 @@ Mailing List</a>.</p>
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="mailto:criswell@uiuc.edu">John Criswell</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a>
   <br>
   Last modified: $Date$
 </address>
diff --git a/docs/ReleaseNotes.html b/docs/ReleaseNotes.html
index 8e72077..a5610db 100644
--- a/docs/ReleaseNotes.html
+++ b/docs/ReleaseNotes.html
@@ -3,13 +3,12 @@
 <html>
 <head>
   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
-  <meta encoding="utf8">
   <link rel="stylesheet" href="llvm.css" type="text/css">
-  <title>LLVM 2.9 Release Notes</title>
+  <title>LLVM 3.0 Release Notes</title>
 </head>
 <body>
 
-<h1 class="doc_title">LLVM 2.9 Release Notes</h1>
+<h1>LLVM 3.0 Release Notes</h1>
 
 <img align=right src="http://llvm.org/img/DragonSmall.png"
     width="136" height="136" alt="LLVM Dragon Logo">
@@ -17,35 +16,35 @@
 <ol>
   <li><a href="#intro">Introduction</a></li>
   <li><a href="#subproj">Sub-project Status Update</a></li>
-  <li><a href="#externalproj">External Projects Using LLVM 2.9</a></li>
-  <li><a href="#whatsnew">What's New in LLVM 2.9?</a></li>
+  <li><a href="#externalproj">External Projects Using LLVM 3.0</a></li>
+  <li><a href="#whatsnew">What's New in LLVM 3.0?</a></li>
   <li><a href="GettingStarted.html">Installation Instructions</a></li>
   <li><a href="#knownproblems">Known Problems</a></li>
   <li><a href="#additionalinfo">Additional Information</a></li>
 </ol>
 
 <div class="doc_author">
-  <p>Written by the <a href="http://llvm.org">LLVM Team</a></p>
+  <p>Written by the <a href="http://llvm.org/">LLVM Team</a></p>
 </div>
 
 <!--
-<h1 style="color:red">These are in-progress notes for the upcoming LLVM 2.9
+<h1 style="color:red">These are in-progress notes for the upcoming LLVM 3.0
 release.<br>
 You may prefer the
-<a href="http://llvm.org/releases/2.8/docs/ReleaseNotes.html">LLVM 2.8
+<a href="http://llvm.org/releases/2.9/docs/ReleaseNotes.html">LLVM 2.9
 Release Notes</a>.</h1>
  -->
 
 <!-- *********************************************************************** -->
-<h1>
+<h2>
   <a name="intro">Introduction</a>
-</h1>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This document contains the release notes for the LLVM Compiler
-Infrastructure, release 2.9.  Here we describe the status of LLVM, including
+Infrastructure, release 3.0.  Here we describe the status of LLVM, including
 major improvements from the previous release and significant known problems.
 All LLVM releases may be downloaded from the <a
 href="http://llvm.org/releases/">LLVM releases web site</a>.</p>
@@ -73,29 +72,26 @@ current one.  To see the release notes for a specific release, please see the
  -->
  
 <!-- *********************************************************************** -->
-<h1>
+<h2>
   <a name="subproj">Sub-project Status Update</a>
-</h1>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 <p>
-The LLVM 2.9 distribution currently consists of code from the core LLVM
+The LLVM 3.0 distribution currently consists of code from the core LLVM
 repository (which roughly includes the LLVM optimizers, code generators
 and supporting tools), the Clang repository and the llvm-gcc repository.  In
 addition to this code, the LLVM Project includes other sub-projects that are in
 development.  Here we include updates on these subprojects.
 </p>
 
-</div>
-
-
 <!--=========================================================================-->
-<h2>
+<h3>
 <a name="clang">Clang: C/C++/Objective-C Frontend Toolkit</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p><a href="http://clang.llvm.org/">Clang</a> is an LLVM front end for the C,
 C++, and Objective-C languages. Clang aims to provide a better user experience
@@ -106,29 +102,21 @@ integrating with other development tools. Clang is considered a
 production-quality compiler for C, Objective-C, C++ and Objective-C++ on x86
 (32- and 64-bit), and for darwin/arm targets.</p>
 
-<p>In the LLVM 2.9 time-frame, the Clang team has made many improvements in C,
-C++ and Objective-C support.  C++ support is now generally rock solid, has
-been exercised on a broad variety of code, and has several new <a 
-href="http://clang.llvm.org/cxx_status.html#cxx0x">C++'0x features</a>
-implemented (such as rvalue references and variadic templates).  LLVM 2.9 has
-also brought in a large range of bug fixes and minor features (e.g. __label__
-support), and is much more compatible with the Linux Kernel.</p>  
+<p>In the LLVM 3.0 time-frame, the Clang team has made many improvements:</p>
   
 <p>If Clang rejects your code but another compiler accepts it, please take a
 look at the <a href="http://clang.llvm.org/compatibility.html">language
 compatibility</a> guide to make sure this is not intentional or a known issue.
 </p>
 
-<ul>
-</ul>
 </div>
 
 <!--=========================================================================-->
-<h2>
+<h3>
 <a name="dragonegg">DragonEgg: GCC front-ends, LLVM back-end</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>
 <a href="http://dragonegg.llvm.org/">DragonEgg</a> is a
 <a href="http://gcc.gnu.org/wiki/plugins">gcc plugin</a> that replaces GCC's
@@ -142,25 +130,21 @@ not known whether the compiled code actually works or not!
 </p>
 
 <p>
-The 2.9 release has the following notable changes:
+The 3.0 release has the following notable changes:
 <ul>
-<li>The plugin is much more stable when compiling Fortran.</li>
-<li>Inline assembly where an asm output is tied to an input of a different size
-is now supported in many more cases.</li>
-<li>Basic support for the __float128 type was added.  It is now possible to
-generate LLVM IR from programs using __float128 but code generation does not
-work yet.</li>
-<li>Compiling Java programs no longer systematically crashes the plugin.</li>
+<!--
+<li></li>
+-->
 </ul>
 
 </div>
 
 <!--=========================================================================-->
-<h2>
+<h3>
 <a name="compiler-rt">compiler-rt: Compiler Runtime Library</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>
 The new LLVM <a href="http://compiler-rt.llvm.org/">compiler-rt project</a>
 is a simple library that provides an implementation of the low-level
@@ -171,22 +155,16 @@ function. The compiler-rt library provides highly optimized implementations of
 this and other low-level routines (some are 3x faster than the equivalent
 libgcc routines).</p>
 
-<p>In the LLVM 2.9 timeframe, compiler_rt has had several minor changes for
-  better ARM support, and a fairly major license change.  All of the code in the
-  compiler-rt project is now <a href="DeveloperPolicy.html#license">dual
-  licensed</a> under MIT and UIUC license, which allows you to use compiler-rt
-  in applications without the binary copyright reproduction clause.  If you
-  prefer the LLVM/UIUC license, you are free to continue using it under that
-  license as well.</p>
+<p>In the LLVM 3.0 timeframe,</p>
 
 </div>
 
 <!--=========================================================================-->
-<h2>
+<h3>
 <a name="lldb">LLDB: Low Level Debugger</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>
 <a href="http://lldb.llvm.org/">LLDB</a> is a brand new member of the LLVM
 umbrella of projects. LLDB is a next generation, high-performance debugger. It
@@ -195,7 +173,7 @@ libraries in the larger LLVM Project, such as the Clang expression parser, the
 LLVM disassembler and the LLVM JIT.</p>
 
 <p>
-LLDB is has advanced by leaps and bounds in the 2.9 timeframe.  It is
+LLDB is has advanced by leaps and bounds in the 3.0 timeframe.  It is
 dramatically more stable and useful, and includes both a new <a 
 href="http://lldb.llvm.org/tutorial.html">tutorial</a> and a <a
 href="http://lldb.llvm.org/lldb-gdb.html">side-by-side comparison with 
@@ -204,11 +182,11 @@ GDB</a>.</p>
 </div>
 
 <!--=========================================================================-->
-<h2>
+<h3>
 <a name="libc++">libc++: C++ Standard Library</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>
 <a href="http://libcxx.llvm.org/">libc++</a> is another new member of the LLVM
 family.  It is an implementation of the C++ standard library, written from the
@@ -216,8 +194,7 @@ ground up to specifically target the forthcoming C++'0X standard and focus on
 delivering great performance.</p>
 
 <p>
-In the LLVM 2.9 timeframe, libc++ has had numerous bugs fixed, and is now being
-co-developed with Clang's C++'0x mode.</p>
+In the LLVM 3.0 timeframe,</p>
   
 <p>
 Like compiler_rt, libc++ is now <a href="DeveloperPolicy.html#license">dual
@@ -229,11 +206,11 @@ Like compiler_rt, libc++ is now <a href="DeveloperPolicy.html#license">dual
 
 
 <!--=========================================================================-->
-<h2>
+<h3>
 <a name="LLBrowse">LLBrowse: IR Browser</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>
 <a href="http://llvm.org/svn/llvm-project/llbrowse/trunk/doc/LLBrowse.html">
   LLBrowse</a> is an interactive viewer for LLVM modules. It can load any LLVM
@@ -244,14 +221,14 @@ Like compiler_rt, libc++ is now <a href="DeveloperPolicy.html#license">dual
 </div>
 
 <!--=========================================================================-->
-<h2>
+<h3>
 <a name="vmkit">VMKit</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>The <a href="http://vmkit.llvm.org/">VMKit project</a> is an implementation
   of a Java Virtual Machine (Java VM or JVM) that uses LLVM for static and
-  just-in-time compilation. As of LLVM 2.9, VMKit now supports generational
+  just-in-time compilation. As of LLVM 3.0, VMKit now supports generational
   garbage collectors. The garbage collectors are provided by the MMTk framework,
   and VMKit can be configured to use one of the numerous implemented collectors
   of MMTk.
@@ -261,11 +238,11 @@ Like compiler_rt, libc++ is now <a href="DeveloperPolicy.html#license">dual
   
 <!--=========================================================================-->
 <!--
-<h2>
+<h3>
 <a name="klee">KLEE: A Symbolic Execution Virtual Machine</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>
 <a href="http://klee.llvm.org/">KLEE</a> is a symbolic execution framework for
 programs in LLVM bitcode form. KLEE tries to symbolically evaluate "all" paths
@@ -277,25 +254,24 @@ be used to verify some algorithms.
 <p>UPDATE!</p>
 </div>-->
 
+</div>
 
 <!-- *********************************************************************** -->
-<h1>
-  <a name="externalproj">External Open Source Projects Using LLVM 2.9</a>
-</h1>
+<h2>
+  <a name="externalproj">External Open Source Projects Using LLVM 3.0</a>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>An exciting aspect of LLVM is that it is used as an enabling technology for
    a lot of other language and tools projects.  This section lists some of the
-   projects that have already been updated to work with LLVM 2.9.</p>
-</div>
-
+   projects that have already been updated to work with LLVM 3.0.</p>
 
 <!--=========================================================================-->
-<h2>Crack Programming Language</h2>
+<h3>Crack Programming Language</h3>
 
-<div class="doc_text">
+<div>
 <p>
 <a href="http://code.google.com/p/crack-language/">Crack</a> aims to provide the
 ease of development of a scripting language with the performance of a compiled
@@ -305,9 +281,9 @@ object-oriented programming, operator overloading and strong typing.</p>
   
   
 <!--=========================================================================-->
-<h2>TTA-based Codesign Environment (TCE)</h2>
+<h3>TTA-based Codesign Environment (TCE)</h3>
   
-<div class="doc_text">
+<div>
 <p>TCE is a toolset for designing application-specific processors (ASP) based on
 the Transport triggered architecture (TTA). The toolset provides a complete
 co-design flow from C/C++ programs down to synthesizable VHDL and parallel
@@ -324,9 +300,9 @@ of larger parts of the compiler chain.</p>
 
   
 <!--=========================================================================-->
-<h2>PinaVM</h2>
+<h3>PinaVM</h3>
   
-<div class="doc_text">
+<div>
 <p><a href="http://gitorious.org/pinavm/pages/Home">PinaVM</a> is an open
 source, <a href="http://www.systemc.org/">SystemC</a> front-end. Unlike many
 other front-ends, PinaVM actually executes the elaboration of the
@@ -335,9 +311,9 @@ bitcode with SystemC-specific information.</p>
 </div>
 
 <!--=========================================================================-->
-<h2>Pure</h2>
+<h3>Pure</h3>
   
-<div class="doc_text">
+<div>
 <p><a href="http://pure-lang.googlecode.com/">Pure</a> is an
   algebraic/functional
   programming language based on term rewriting. Programs are collections
@@ -351,14 +327,14 @@ bitcode with SystemC-specific information.</p>
   modules, and inline C, C++, Fortran and Faust code in Pure programs if
   the corresponding LLVM-enabled compilers are installed).</p>
   
-<p>Pure version 0.47 has been tested and is known to work with LLVM 2.9
+<p>Pure version 0.47 has been tested and is known to work with LLVM 3.0
   (and continues to work with older LLVM releases &gt;= 2.5).</p>
 </div>
 
 <!--=========================================================================-->
-<h2 id="icedtea">IcedTea Java Virtual Machine Implementation</h2>
+<h3 id="icedtea">IcedTea Java Virtual Machine Implementation</h3>
 
-<div class="doc_text">
+<div>
 <p>
 <a href="http://icedtea.classpath.org/wiki/Main_Page">IcedTea</a> provides a
 harness to build OpenJDK using only free software build tools and to provide
@@ -370,14 +346,14 @@ code.
 </p>
 
 <p> OpenJDK 7 b112, IcedTea6 1.9 and IcedTea7 1.13 and later have been tested
-and are known to work with LLVM 2.9 (and continue to work with older LLVM
+and are known to work with LLVM 3.0 (and continue to work with older LLVM
 releases &gt;= 2.6 as well).</p>
 </div>
 
 <!--=========================================================================-->
-<h2>Glasgow Haskell Compiler (GHC)</h2>
+<h3>Glasgow Haskell Compiler (GHC)</h3>
   
-<div class="doc_text">
+<div>
 <p>GHC is an open source, state-of-the-art programming suite for Haskell,
 a standard lazy functional programming language. It includes an
 optimizing static compiler generating good code for a variety of
@@ -389,9 +365,9 @@ supports an LLVM code generator. GHC supports LLVM 2.7 and later.</p>
 </div>
 
 <!--=========================================================================-->
-<h2>Polly - Polyhedral optimizations for LLVM</h2>
+<h3>Polly - Polyhedral optimizations for LLVM</h3>
   
-<div class="doc_text">
+<div>
 <p>Polly is a project that aims to provide advanced memory access optimizations
 to better take advantage of SIMD units, cache hierarchies, multiple cores or
 even vector accelerators for LLVM. Built around an abstract mathematical
@@ -404,9 +380,9 @@ and parallelism.</p>
 </div>
 
 <!--=========================================================================-->
-<h2>Rubinius</h2>
+<h3>Rubinius</h3>
 
-<div class="doc_text">
+<div>
   <p><a href="http://github.com/evanphx/rubinius">Rubinius</a> is an environment
   for running Ruby code which strives to write as much of the implementation in
   Ruby as possible. Combined with a bytecode interpreting VM, it uses LLVM to
@@ -417,154 +393,85 @@ and parallelism.</p>
 
 
 <!--=========================================================================-->
-<div class="doc_subsection">
+<h3>
 <a name="FAUST">FAUST Real-Time Audio Signal Processing Language</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>
 <a href="http://faust.grame.fr">FAUST</a> is a compiled language for real-time
 audio signal processing. The name FAUST stands for Functional AUdio STream. Its
 programming model combines two approaches: functional programming and block
 diagram composition. In addition with the C, C++, JAVA output formats, the
-Faust compiler can now generate LLVM bitcode, and works with LLVM 2.7-2.9.</p>
+Faust compiler can now generate LLVM bitcode, and works with LLVM 2.7-3.0.</p>
 
 </div>
   
+</div>
+
 <!-- *********************************************************************** -->
-<h1>
-  <a name="whatsnew">What's New in LLVM 2.9?</a>
-</h1>
+<h2>
+  <a name="whatsnew">What's New in LLVM 3.0?</a>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This release includes a huge number of bug fixes, performance tweaks and
 minor improvements.  Some of the major improvements and new features are listed
 in this section.
 </p>
 
-</div>
-
 <!--=========================================================================-->
-<h2>
+<h3>
 <a name="majorfeatures">Major New Features</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 
-<p>LLVM 2.9 includes several major new capabilities:</p>
+<p>LLVM 3.0 includes several major new capabilities:</p>
 
 <ul>
-  
-<li>Type Based Alias Analysis (TBAA) is now implemented and turned on by default
-  in Clang.  This allows substantially better load/store optimization in some
-  cases.  TBAA can be disabled by passing -fno-strict-aliasing.
-</li>
-
-<li>This release has seen a continued focus on quality of debug information. 
-  LLVM now generates much higher fidelity debug information, particularly when
-  debugging optimized code.</li>
 
-<li>Inline assembly now supports multiple alternative constraints.</li>  
-
-<li>A new backend for the NVIDIA PTX virtual ISA (used to target its GPUs) is
-  under rapid development.  It is not generally useful in 2.9, but is making
-  rapid progress.</li>
+<!--
+<li></li>
+-->
   
 </ul>
   
 </div>
 
 <!--=========================================================================-->
-<h2>
+<h3>
 <a name="coreimprovements">LLVM IR and Core Improvements</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>LLVM IR has several new features for better support of new targets and that
 expose new optimization opportunities:</p>
 
 <ul>
-<li>The <a href="LangRef.html#bitwiseops">udiv, ashr, lshr, and shl</a>
-  instructions now have support exact and nuw/nsw bits to indicate that they
-  don't overflow or shift out bits.  This is useful for optimization of <a
-    href="http://llvm.org/PR8862">pointer differences</a> and other cases.</li>
-  
-<li>LLVM IR now supports the <a href="LangRef.html#globalvars">unnamed_addr</a>
-  attribute to indicate that constant global variables with identical
-  initializers can be merged.  This fixed <a href="http://llvm.org/PR8927">an
-  issue</a> where LLVM would incorrectly merge two globals which were supposed
-  to have distinct addresses.</li>
-  
-<li>The new <a href="LangRef.html#fnattrs">hotpatch attribute</a> has been added
-  to allow runtime patching of functions.</li> 
+<!--
+<li></li>
+-->
 </ul>
 
 </div>
 
 <!--=========================================================================-->
-<h2>
+<h3>
 <a name="optimizer">Optimizer Improvements</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>In addition to a large array of minor performance tweaks and bug fixes, this
 release includes a few major enhancements and additions to the optimizers:</p>
 
 <ul>
-<li>Link Time Optimization (LTO) has been improved to use MC for parsing inline
-  assembly and now can build large programs like Firefox 4 on both Mac OS X and
-  Linux.</li>
-  
-<li>The new -loop-idiom pass recognizes memset/memcpy loops (and memset_pattern
-  on darwin), turning them into library calls, which are typically better
-  optimized than inline code.  If you are building a libc and notice that your
-  memcpy and memset functions are compiled into infinite recursion, please build
-  with -ffreestanding or -fno-builtin to disable this pass.</li>
-  
-<li>A new -early-cse pass does a fast pass over functions to fold constants,
-  simplify expressions, perform simple dead store elimination, and perform
-  common subexpression elimination.  It does a good job at catching some of the
-  trivial redundancies that exist in unoptimized code, making later passes more
-  effective.</li>
-
-<li>A new -loop-instsimplify pass is used to clean up loop bodies in the loop
-  optimizer.</li>
-  
-<li>The new TargetLibraryInfo interface allows mid-level optimizations to know
-  whether the current target's runtime library has certain functions.  For
-  example, the optimizer can now transform integer-only printf calls to call
-  iprintf, allowing reduced code size for embedded C libraries (e.g. newlib).
-</li>
-    
-<li>LLVM has a new <a href="WritingAnLLVMPass.html#RegionPass">RegionPass</a>
-  infrastructure for region-based optimizations.</li>
-
-<li>Several optimizer passes have been substantially sped up:
-  GVN is much faster on functions with deep dominator trees and lots of basic
-  blocks.  The dominator tree and dominance frontier passes are much faster to
-  compute, and preserved by more passes (so they are computed less often).  The
-  -scalar-repl pass is also much faster and doesn't use DominanceFrontier.
-</li>
-
-<li>The Dead Store Elimination pass is more aggressive optimizing stores of
-  different types: e.g. a large store following a small one to the same address.
-  The MemCpyOptimizer pass handles several new forms of memcpy elimination.</li>
-  
-<li>LLVM now optimizes various idioms for overflow detection into check of the
-  flag register on various CPUs.  For example, we now compile:
-  
-  <pre>
-   unsigned long t = a+b;
-   if (t &lt; a) ...
-  </pre>
-  into:
-  <pre>
-   addq %rdi, %rbx
-   jno  LBB0_2
-  </pre>
+<!--
+<li></li>
+-->
 </li>
   
 </ul>
@@ -572,11 +479,11 @@ release includes a few major enhancements and additions to the optimizers:</p>
 </div>
 
 <!--=========================================================================-->
-<h2>
+<h3>
 <a name="mc">MC Level Improvements</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>
 The LLVM Machine Code (aka MC) subsystem was created to solve a number
 of problems in the realm of assembly, disassembly, object file format handling,
@@ -584,38 +491,9 @@ and a number of other related areas that CPU instruction-set level tools work
 in.</p>
 
 <ul>
-<li>ELF MC support has matured enough for the integrated assembler to be turned
-  on by default in Clang on X86-32 and X86-64 ELF systems.</li>
-  
-<li>MC supports and CodeGen uses the <tt>.file</tt> and <tt>.loc</tt> directives
-  for producing line number debug info. This produces more compact line
-  tables and easier to read .s files.</li>
-  
-<li>MC supports the <tt>.cfi_*</tt> directives for producing DWARF
-  frame information, but it is still not used by CodeGen by default.</li>
-
-  
-<li>The MC assembler now generates much better diagnostics for common errors,
-  is much faster at matching instructions, is much more bug-compatible with
-  the GAS assembler, and is now generally useful for a broad range of X86
-  assembly.</li>
-  
-<li>We now have some basic <a href="CodeGenerator.html#mc">internals
-  documentation</a> for MC.</li>
-  
-<li>.td files can now specify assembler aliases directly with the <a 
-   href="CodeGenerator.html#na_instparsing">MnemonicAlias and InstAlias</a>
-   tblgen classes.</li>
-  
-<li>LLVM now has an experimental format-independent object file manipulation
-  library (lib/Object).  It supports both PE/COFF and ELF.  The llvm-nm tool has
-  been extended to work with native object files, and the new llvm-objdump tool
-  supports disassembly of object files (but no relocations are displayed yet).
-</li>
-  
-<li>Win32 PE-COFF support in the MC assembler has made a lot of progress in the
-  2.9 timeframe, but is still not generally useful.</li>
-
+<!--
+<li></li>
+-->
 </ul>
 
 <p>For more information, please see the <a
@@ -626,219 +504,128 @@ LLVM MC Project Blog Post</a>.
 </div>
 
 <!--=========================================================================-->
-<h2>
+<h3>
 <a name="codegen">Target Independent Code Generator Improvements</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>We have put a significant amount of work into the code generator
 infrastructure, which allows us to implement more aggressive algorithms and make
 it run faster:</p>
 
 <ul>
-<li>The pre-register-allocation (preRA) instruction scheduler models register
-  pressure much more accurately in some cases. This allows the adoption of more
-  aggressive scheduling heuristics without causing spills to be generated.
-</li>
-  
-<li>LiveDebugVariables is a new pass that keeps track of debugging information
-  for user variables that are promoted to registers in optimized builds.</li>  
-
-<li>The scheduler now models operand latency and pipeline forwarding.</li>
-
-<li>A major register allocator infrastructure rewrite is underway.  It is not on
-    by default for 2.9 and you are not advised to use it, but it has made
-    substantial progress in the 2.9 timeframe:
-  <ul>
-  <li>A new -regalloc=basic "basic" register allocator can be used as a simple
-      fallback when debugging.  It uses the new infrastructure.</li>
-  <li>New infrastructure is in place for live range splitting.  "SplitKit" can
-      break a live interval into smaller pieces while preserving SSA form, and
-      SpillPlacement can help find the best split points. This is a work in
-      progress so the API is changing quickly.</li>
-   <li>The inline spiller has learned to clean up after live range splitting. It
-      can hoist spills out of loops, and it can eliminate redundant spills.</li>
-   <li>Rematerialization works with live range splitting.</li>
-   <li>The new "greedy" register allocator using live range splitting. This will
-     be the default register allocator in the next LLVM release, but it is not
-     turned on by default in 2.9.</li>
-   </ul>
-</li>
+<!--
+<li></li>
+-->
 </ul>
 </div>
 
 <!--=========================================================================-->
-<h2>
+<h3>
 <a name="x86">X86-32 and X86-64 Target Improvements</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>New features and major changes in the X86 target include:
 </p>
 
 <ul>
-<li>LLVM 2.9 includes a complete reimplementation of the MMX instruction set.
-  The reimplementation uses a new LLVM IR <a 
-  href="LangRef.html#t_x86mmx">x86_mmx</a> type to ensure that MMX operations
-  are <em>only</em> generated from source that uses MMX builtin operations. With
-  this, random types like &lt;2 x i32&gt; are not turned into MMX operations
-  (which can be catastrophic without proper "emms" insertion).  Because the X86
-  code generator always generates reliable code, the -disable-mmx flag is now
-  removed.
-</li>
-  
-<li>X86 support for FS/GS relative loads and stores using <a 
-    href="CodeGenerator.html#x86_memory">address space 256/257</a> works reliably
-    now.</li>
-  
-<li>LLVM 2.9 generates much better code in several cases by using adc/sbb to
-   avoid generation of conditional move instructions for conditional increment
-   and other idioms.</li>
+<li>The CRC32 intrinsics have been renamed.  The intrinsics were previously
+  @llvm.x86.sse42.crc32.[8|16|32] and @llvm.x86.sse42.crc64.[8|64].  They have
+  been renamed to @llvm.x86.sse42.crc32.32.[8|16|32] and 
+  @llvm.x86.sse42.crc32.64.[8|64].</li>
 
-<li>The X86 backend has adopted a new preRA scheduling mode, "list-ilp", to
-  shorten the height of instruction schedules without inducing register spills.
-</li>
-
-<li>The MC assembler supports 3dNow! and 3DNowA instructions.</li>
-  
-<li>Several bugs have been fixed for Windows x64 code generator.</li>
 </ul>
 
 </div>
 
 <!--=========================================================================-->
-<h2>
+<h3>
 <a name="ARM">ARM Target Improvements</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>New features of the ARM target include:
 </p>
 
 <ul>
-<li>The ARM backend now has a fast instruction selector, which dramatically
-     improves -O0 compile times.</li>
-<li>The ARM backend has new tuning for Cortex-A8 and Cortex-A9 CPUs.</li>
-<li>The __builtin_prefetch builtin (and llvm.prefetch intrinsic) is compiled
-    into prefetch instructions instead of being discarded.</li>
-
-<li>  The ARM backend preRA scheduler now models machine resources at cycle
-  granularity. This allows the scheduler to both accurately model
-  instruction latency and avoid overcommitting functional units.</li>
-
-<li>Countless ARM microoptimizations have landed in LLVM 2.9.</li>
+<!--
+<li></li>
+-->
 </ul>
 </div>
   
 <!--=========================================================================-->
-<h2>
+<h3>
 <a name="OtherTS">Other Target Specific Improvements</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 <ul>
-<li>MicroBlaze: major updates for aggressive delay slot filler, MC-based
-  assembly printing, assembly instruction parsing, ELF .o file emission, and MC
-  instruction disassembler have landed.</li>
-
-<li>SPARC: Many improvements, including using the Y registers for
-  multiplications and addition of a simple delay slot filler.</li>
-
-<li>PowerPC: The backend has been largely MC'ized and is ready to support
-  directly writing out mach-o object files.  No one seems interested in finishing
-  this final step though.</li>
-
-<li>Mips: Improved o32 ABI support, including better varags handling.
-More instructions supported in codegen: madd, msub, rotr, rotrv and clo.
-It also now supports lowering block addresses.</li>
-
+<!--
+<li></li>
+-->
 </ul>
 </div>
 
 <!--=========================================================================-->
-<h2>
+<h3>
 <a name="changes">Major Changes and Removed Features</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>If you're already an LLVM user or developer with out-of-tree changes based
-on LLVM 2.8, this section lists some "gotchas" that you may run into upgrading
+on LLVM 2.9, this section lists some "gotchas" that you may run into upgrading
 from the previous release.</p>
 
 <ul>
-<li><b>This is the last release to support the llvm-gcc frontend.</b></li>
-
-<li>LLVM has a new <a href="CodingStandards.html#ll_naming">naming
-  convention standard</a>, though the codebase hasn't fully adopted it yet.</li>
-  
-<li>The new DIBuilder class provides a simpler interface for front ends to
-    encode debug info in LLVM IR, and has replaced DIFactory.</li>
-
-<li>LLVM IR and other tools always work on normalized target triples (which have
-  been run through <tt>Triple::normalize</tt>).</li>
-
-<li>The target triple x86_64--mingw64 is obsoleted. Use x86_64--mingw32 
-  instead.</li>
-
-<li>The PointerTracking pass has been removed from mainline, and moved to The
-  ClamAV project (its only client).</li>
-    
-<li>The LoopIndexSplit, LiveValues, SimplifyHalfPowrLibCalls, GEPSplitter, and
-  PartialSpecialization passes were removed.  They were unmaintained,
-  buggy, or deemed to be a bad idea.</li>
+<!--
+<li></li>
+-->
 </ul>
 
 </div>
 
 <!--=========================================================================-->
-<h2>
+<h3>
 <a name="api_changes">Internal API Changes</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>In addition, many APIs have changed in this release.  Some of the major
   LLVM API changes are:</p>
 
 <ul>
-<li>include/llvm/System merged into include/llvm/Support.</li>
-<li>The <a href="http://llvm.org/PR5207">llvm::APInt API</a> was significantly
-  cleaned up.</li>
-
-<li>In the code generator, MVT::Flag was renamed to MVT::Glue to more accurately
-  describe its behavior.</li>
-
-<li>The system_error header from C++0x was added, and is now pervasively used to
-  capture and handle i/o and other errors in LLVM.</li>
-  
-<li>The old sys::Path API has been deprecated in favor of the new PathV2 API,
-    which is more efficient and flexible.</li>
+<!--
+<li></ld>
+-->
 </ul>
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<h1>
+<h2>
   <a name="knownproblems">Known Problems</a>
-</h1>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This section contains significant known problems with the LLVM system,
 listed by component.  If you run into a problem, please check the <a
 href="http://llvm.org/bugs/">LLVM bug database</a> and submit a bug if
 there isn't already one.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<h2>
+<h3>
   <a name="experimental">Experimental features included with this release</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The following components of this LLVM release are either untested, known to
 be broken or unreliable, or are in early development.  These components should
@@ -858,11 +645,11 @@ href="http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev">LLVMdev list</a>.</p>
 </div>
 
 <!-- ======================================================================= -->
-<h2>
+<h3>
   <a name="x86-be">Known problems with the X86 back-end</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <ul>
   <li>The X86 backend does not yet support
@@ -891,11 +678,11 @@ href="http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev">LLVMdev list</a>.</p>
 </div>
 
 <!-- ======================================================================= -->
-<h2>
+<h3>
   <a name="ppc-be">Known problems with the PowerPC back-end</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <ul>
 <li>The Linux PPC32/ABI support needs testing for the interpreter and static
@@ -905,11 +692,11 @@ compilation, and lacks support for debug information.</li>
 </div>
 
 <!-- ======================================================================= -->
-<h2>
+<h3>
   <a name="arm-be">Known problems with the ARM back-end</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <ul>
 <li>Thumb mode works only on ARMv6 or higher processors. On sub-ARMv6
@@ -922,11 +709,11 @@ results (<a href="http://llvm.org/PR1388">PR1388</a>).</li>
 </div>
 
 <!-- ======================================================================= -->
-<h2>
+<h3>
   <a name="sparc-be">Known problems with the SPARC back-end</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <ul>
 <li>The SPARC backend only supports the 32-bit SPARC ABI (-m32); it does not
@@ -936,11 +723,11 @@ results (<a href="http://llvm.org/PR1388">PR1388</a>).</li>
 </div>
 
 <!-- ======================================================================= -->
-<h2>
+<h3>
   <a name="mips-be">Known problems with the MIPS back-end</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <ul>
 <li>64-bit MIPS targets are not supported yet.</li>
@@ -949,11 +736,11 @@ results (<a href="http://llvm.org/PR1388">PR1388</a>).</li>
 </div>
 
 <!-- ======================================================================= -->
-<h2>
+<h3>
   <a name="alpha-be">Known problems with the Alpha back-end</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <ul>
 
@@ -964,11 +751,11 @@ appropriate nops inserted to ensure restartability.</li>
 </div>
 
 <!-- ======================================================================= -->
-<h2>
+<h3>
   <a name="c-be">Known problems with the C back-end</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The C backend has numerous problems and is not being actively maintained.
 Depending on it for anything serious is not advised.</p>
@@ -987,13 +774,13 @@ Depending on it for anything serious is not advised.</p>
 
 
 <!-- ======================================================================= -->
-<h2>
+<h3>
   <a name="llvm-gcc">Known problems with the llvm-gcc front-end</a>
-</h2>
+</h3>
 
-<div class="doc_text">
+<div>
 
-<p><b>LLVM 2.9 will be the last release of llvm-gcc.</b></p>
+<p><b>LLVM 3.0 will be the last release of llvm-gcc.</b></p>
 
 <p>llvm-gcc is generally very stable for the C family of languages.  The only
    major language feature of GCC not supported by llvm-gcc is the
@@ -1014,16 +801,18 @@ actively maintained.  If you are interested in Ada, we recommend that you
 consider using <a href="#dragonegg">dragonegg</a> instead.</p>
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<h1>
+<h2>
   <a name="additionalinfo">Additional Information</a>
-</h1>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>A wide variety of additional information is available on the <a
-href="http://llvm.org">LLVM web page</a>, in particular in the <a
+href="http://llvm.org/">LLVM web page</a>, in particular in the <a
 href="http://llvm.org/docs/">documentation</a> section.  The web page also
 contains versions of the API documentation which is up-to-date with the
 Subversion version of the source code.
diff --git a/docs/SourceLevelDebugging.html b/docs/SourceLevelDebugging.html
index 79ea71a..0fc7730 100644
--- a/docs/SourceLevelDebugging.html
+++ b/docs/SourceLevelDebugging.html
@@ -8,7 +8,7 @@
 </head>
 <body>
 
-<div class="doc_title">Source Level Debugging with LLVM</div>
+<h1>Source Level Debugging with LLVM</h1>
 
 <table class="layout" style="width:100%">
   <tr class="layout">
@@ -68,10 +68,10 @@ height="369">
 
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="introduction">Introduction</a></div> 
+<h2><a name="introduction">Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This document is the central repository for all information pertaining to
    debug information in LLVM.  It describes the <a href="#format">actual format
@@ -80,14 +80,12 @@ height="369">
    Further, this document provides specific examples of what debug information
    for C/C++ looks like.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="phil">Philosophy behind LLVM debugging information</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The idea of the LLVM debugging information is to capture how the important
    pieces of the source-language's Abstract Syntax Tree map onto LLVM code.
@@ -133,11 +131,11 @@ height="369">
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="consumers">Debug information consumers</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The role of debug information is to provide meta information normally
    stripped away during the compilation process.  This meta information provides
@@ -157,11 +155,11 @@ height="369">
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="debugopt">Debugging optimized code</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>An extremely high priority of LLVM debugging information is to make it
    interact well with optimizations and analysis.  In particular, the LLVM debug
@@ -176,22 +174,15 @@ height="369">
       as setting program variables, or calling functions that have been
       deleted.</li>
 
-  <li>LLVM optimizations gracefully interact with debugging information.  If
-      they are not aware of debug information, they are automatically disabled
-      as necessary in the cases that would invalidate the debug info.  This
-      retains the LLVM features, making it easy to write new
-      transformations.</li>
-
   <li>As desired, LLVM optimizations can be upgraded to be aware of the LLVM
       debugging information, allowing them to update the debugging information
       as they perform aggressive optimizations.  This means that, with effort,
       the LLVM optimizers could optimize debug code just as well as non-debug
       code.</li>
 
-  <li>LLVM debug information does not prevent many important optimizations from
+  <li>LLVM debug information does not prevent optimizations from
       happening (for example inlining, basic block reordering/merging/cleanup,
-      tail duplication, etc), further reducing the amount of the compiler that
-      eventually is "aware" of debugging information.</li>
+      tail duplication, etc).</li>
 
   <li>LLVM debug information is automatically optimized along with the rest of
       the program, using existing facilities.  For example, duplicate
@@ -226,13 +217,15 @@ height="369">
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="format">Debugging information format</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>LLVM debugging information has been carefully designed to make it possible
    for the optimizer to optimize the program and debugging information without
@@ -265,14 +258,12 @@ height="369">
    common to any source-language.  The <a href="#ccxx_frontend">next section</a>
    describes the data layout conventions used by the C and C++ front-ends.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="debug_info_descriptors">Debug information descriptors</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>In consideration of the complexity and volume of debug information, LLVM
    provides a specification for well formed debug descriptors. </p>
@@ -312,14 +303,12 @@ height="369">
 
 <p>The details of the various descriptors follow.</p>  
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="format_compile_units">Compile unit descriptors</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -346,16 +335,18 @@ height="369">
    that produced it.</p>
 
 <p>Compile unit descriptors provide the root context for objects declared in a
-   specific compilation unit. File descriptors are defined using this context.</p>
+   specific compilation unit. File descriptors are defined using this context.
+   These descriptors are collected by a named metadata 
+   <tt>!llvm.dbg.cu</tt>.
 
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="format_files">File descriptors</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -380,11 +371,11 @@ height="369">
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="format_global_variables">Global variable descriptors</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -413,11 +404,11 @@ global variables are collected by named metadata <tt>!llvm.dbg.gv</tt>.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="format_subprograms">Subprogram descriptors</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -433,15 +424,16 @@ global variables are collected by named metadata <tt>!llvm.dbg.gv</tt>.</p>
   i32,      ;; Line number where defined
   metadata, ;; Reference to type descriptor
   i1,       ;; True if the global is local to compile unit (static)
-  i1        ;; True if the global is defined in the compile unit (not extern)
-  i32       ;; Virtuality, e.g. dwarf::DW_VIRTUALITY__virtual
-  i32       ;; Index into a virtual function
+  i1,       ;; True if the global is defined in the compile unit (not extern)
+  i32,      ;; Virtuality, e.g. dwarf::DW_VIRTUALITY__virtual
+  i32,      ;; Index into a virtual function
   metadata, ;; indicates which base type contains the vtable pointer for the 
             ;; derived class
-  i1        ;; isArtificial
-  i1        ;; isOptimized
-  Function *;; Pointer to LLVM function
-  metadata  ;; Lists function template parameters
+  i1,       ;; isArtificial
+  i1,       ;; isOptimized
+  Function *,;; Pointer to LLVM function
+  metadata, ;; Lists function template parameters
+  metadata  ;; Function declaration descriptor
 }
 </pre>
 </div>
@@ -456,11 +448,11 @@ global variables are collected by named metadata <tt>!llvm.dbg.gv</tt>.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="format_blocks">Block descriptors</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -482,11 +474,11 @@ global variables are collected by named metadata <tt>!llvm.dbg.gv</tt>.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="format_basic_type">Basic type descriptors</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -534,11 +526,11 @@ DW_ATE_unsigned_char = 8
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="format_derived_type">Derived type descriptors</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -551,7 +543,12 @@ DW_ATE_unsigned_char = 8
   i64,      ;; Size in bits
   i64,      ;; Alignment in bits
   i64,      ;; Offset in bits
-  metadata  ;; Reference to type derived from
+  metadata, ;; Reference to type derived from
+  metadata, ;; (optional) Name of the Objective C property assoicated with 
+            ;; Objective-C an ivar 
+  metadata, ;; (optional) Name of the Objective C property getter selector.
+  metadata, ;; (optional) Name of the Objective C property setter selector.
+  i32       ;; (optional) Objective C property attributes.
 }
 </pre>
 </div>
@@ -601,11 +598,11 @@ DW_TAG_restrict_type    = 55
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="format_composite_type">Composite type descriptors</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -688,11 +685,11 @@ DW_TAG_inheritance      = 28
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="format_subrange">Subrange descriptors</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -708,16 +705,17 @@ DW_TAG_inheritance      = 28
    <a href="#format_composite_type">composite type</a>.  The low value defines
    the lower bounds typically zero for C/C++.  The high value is the upper
    bounds.  Values are 64 bit.  High - low + 1 is the size of the array.  If low
-   == high the array will be unbounded.</p>
+   > high the array bounds are not included in generated debugging information.
+</p>
 
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="format_enumeration">Enumerator descriptors</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -737,11 +735,11 @@ DW_TAG_inheritance      = 28
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="format_variables">Local variables</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -780,39 +778,39 @@ DW_TAG_return_variable = 258
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="format_common_intrinsics">Debugger intrinsic functions</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>LLVM uses several intrinsic functions (name prefixed with "llvm.dbg") to
    provide debug information at various points in generated code.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="format_common_declare">llvm.dbg.declare</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <pre>
   void %<a href="#format_common_declare">llvm.dbg.declare</a>(metadata, metadata)
 </pre>
 
 <p>This intrinsic provides information about a local element (ex. variable.) The
-   first argument is metadata holding alloca for the variable.</tt>. The
+   first argument is metadata holding alloca for the variable. The
    second argument is metadata containing description of the variable. </p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="format_common_value">llvm.dbg.value</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <pre>
   void %<a href="#format_common_value">llvm.dbg.value</a>(metadata, i64, metadata)
 </pre>
@@ -824,12 +822,14 @@ DW_TAG_return_variable = 258
    user source variable. </p>
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="format_common_lifetime">Object lifetimes and scoping</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 <p>In many languages, the local variables in functions can have their lifetimes
    or scopes limited to a subset of a function.  In the C family of languages,
    for example, variables are only live (readable and writable) within the
@@ -987,13 +987,15 @@ call void @llvm.dbg.declare(metadata, metadata !12), !dbg !14
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="ccxx_frontend">C/C++ front-end specific debug information</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>The C and C++ front-ends represent information about the program in a format
    that is effectively identical
@@ -1014,14 +1016,12 @@ call void @llvm.dbg.declare(metadata, metadata !12), !dbg !14
 <p>The following sections provide examples of various C/C++ constructs and the
    debug information that would best describe those constructs.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ccxx_compile_units">C/C++ source file information</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Given the source files <tt>MySource.cpp</tt> and <tt>MyHeader.h</tt> located
    in the directory <tt>/Users/mine/sources</tt>, the following code:</p>
@@ -1095,11 +1095,11 @@ using <tt>Instruction::getMetadata()</tt> and
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ccxx_global_variable">C/C++ global variable information</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Given an integer global variable declared as follows:</p>
 
@@ -1165,11 +1165,11 @@ int MyGlobal = 100;
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ccxx_subprogram">C/C++ function information</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Given a function declared as follows:</p>
 
@@ -1222,22 +1222,20 @@ define i32 @main(i32 %argc, i8** %argv) {
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ccxx_basic_types">C/C++ basic types</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The following are the basic type descriptors for C/C++ core types:</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ccxx_basic_type_bool">bool</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -1259,11 +1257,11 @@ define i32 @main(i32 %argc, i8** %argv) {
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ccxx_basic_char">char</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -1285,11 +1283,11 @@ define i32 @main(i32 %argc, i8** %argv) {
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ccxx_basic_unsigned_char">unsigned char</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -1311,11 +1309,11 @@ define i32 @main(i32 %argc, i8** %argv) {
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ccxx_basic_short">short</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -1337,11 +1335,11 @@ define i32 @main(i32 %argc, i8** %argv) {
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ccxx_basic_unsigned_short">unsigned short</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -1363,11 +1361,11 @@ define i32 @main(i32 %argc, i8** %argv) {
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ccxx_basic_int">int</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -1388,11 +1386,11 @@ define i32 @main(i32 %argc, i8** %argv) {
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ccxx_basic_unsigned_int">unsigned int</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -1414,11 +1412,11 @@ define i32 @main(i32 %argc, i8** %argv) {
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ccxx_basic_long_long">long long</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -1440,11 +1438,11 @@ define i32 @main(i32 %argc, i8** %argv) {
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ccxx_basic_unsigned_long_long">unsigned long long</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -1466,11 +1464,11 @@ define i32 @main(i32 %argc, i8** %argv) {
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ccxx_basic_float">float</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -1492,11 +1490,11 @@ define i32 @main(i32 %argc, i8** %argv) {
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection">
+<h4>
   <a name="ccxx_basic_double">double</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code">
 <pre>
@@ -1517,12 +1515,14 @@ define i32 @main(i32 %argc, i8** %argv) {
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ccxx_derived_types">C/C++ derived types</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Given the following as an example of C/C++ derived type:</p>
 
@@ -1603,11 +1603,11 @@ typedef const int *IntPtr;
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ccxx_composite_types">C/C++ struct/union types</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Given the following as an example of C/C++ struct type:</p>
 
@@ -1716,11 +1716,11 @@ struct Color {
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ccxx_enumeration_types">C/C++ enumeration types</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Given the following as an example of C/C++ enumeration type:</p>
 
@@ -1781,6 +1781,8 @@ enum Trees {
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
 
 <hr>
@@ -1791,7 +1793,7 @@ enum Trees {
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 
diff --git a/docs/SystemLibrary.html b/docs/SystemLibrary.html
index 0289a55..614737e 100644
--- a/docs/SystemLibrary.html
+++ b/docs/SystemLibrary.html
@@ -7,7 +7,7 @@
 </head>
 <body>
 
-<div class="doc_title">System Library</div>
+<h1>System Library</h1>
 <ul>
   <li><a href="#abstract">Abstract</a></li>
   <li><a href="#requirements">Keeping LLVM Portable</a>
@@ -36,8 +36,8 @@
 
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="abstract">Abstract</a></div>
-<div class="doc_text">
+<h2><a name="abstract">Abstract</a></h2>
+<div>
   <p>This document provides some details on LLVM's System Library, located in
   the source at <tt>lib/System</tt> and <tt>include/llvm/System</tt>. The
   library's purpose is to shield LLVM from the differences between operating
@@ -63,21 +63,19 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="requirements">Keeping LLVM Portable</a>
-</div>
-<div class="doc_text">
+</h2>
+<div>
   <p>In order to keep LLVM portable, LLVM developers should adhere to a set of
   portability rules associated with the System Library. Adherence to these rules
   should help the System Library achieve its goal of shielding LLVM from the
   variations in operating system interfaces and doing so efficiently.  The 
   following sections define the rules needed to fulfill this objective.</p>
-</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="headers">Don't Include System Headers</a>
-</div>
-<div class="doc_text">
+<h3><a name="headers">Don't Include System Headers</a></h3>
+<div>
   <p>Except in <tt>lib/System</tt>, no LLVM source code should directly
   <tt>#include</tt> a system header. Care has been taken to remove all such
   <tt>#includes</tt> from LLVM while <tt>lib/System</tt> was being
@@ -91,9 +89,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="expose">Don't Expose System Headers</a>
-</div>
-<div class="doc_text">
+<h3><a name="expose">Don't Expose System Headers</a></h3>
+<div>
   <p>The System Library must shield LLVM from <em>all</em> system headers. To 
   obtain system level functionality, LLVM source must 
   <tt>#include "llvm/System/Thing.h"</tt> and nothing else. This means that 
@@ -103,8 +100,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="c_headers">Use Standard C Headers</a></div>
-<div class="doc_text">
+<h3><a name="c_headers">Use Standard C Headers</a></h3>
+<div>
   <p>The <em>standard</em> C headers (the ones beginning with "c") are allowed
   to be exposed through the <tt>lib/System</tt> interface. These headers and 
   the things they declare are considered to be platform agnostic. LLVM source 
@@ -113,9 +110,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="cpp_headers">Use Standard C++ Headers</a>
-</div>
-<div class="doc_text">
+<h3><a name="cpp_headers">Use Standard C++ Headers</a></h3>
+<div>
   <p>The <em>standard</em> C++ headers from the standard C++ library and
   standard template library may be exposed through the <tt>lib/System</tt>
   interface. These headers and the things they declare are considered to be
@@ -124,8 +120,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="highlev">High Level Interface</a></div>
-<div class="doc_text">
+<h3><a name="highlev">High Level Interface</a></h3>
+<div>
   <p>The entry points specified in the interface of lib/System must be aimed at 
   completing some reasonably high level task needed by LLVM. We do not want to
   simply wrap each operating system call. It would be preferable to wrap several
@@ -143,8 +139,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="nounused">No Unused Functionality</a></div>
-<div class="doc_text">
+<h3><a name="nounused">No Unused Functionality</a></h3>
+<div>
   <p>There must be no functionality specified in the interface of lib/System 
   that isn't actually used by LLVM. We're not writing a general purpose
   operating system wrapper here, just enough to satisfy LLVM's needs. And, LLVM
@@ -153,9 +149,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="nodupl">No Duplicate Implementations</a>
-</div>
-<div class="doc_text">
+<h3><a name="nodupl">No Duplicate Implementations</a></h3>
+<div>
   <p>The implementation of a function for a given platform must be written
   exactly once. This implies that it must be possible to apply a function's 
   implementation to multiple operating systems if those operating systems can
@@ -165,8 +160,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="virtuals">No Virtual Methods</a></div>
-<div class="doc_text">
+<h3><a name="virtuals">No Virtual Methods</a></h3>
+<div>
   <p>The System Library interfaces can be called quite frequently by LLVM. In
   order to make those calls as efficient as possible, we discourage the use of
   virtual methods. There is no need to use inheritance for implementation
@@ -175,8 +170,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="nofunc">No Exposed Functions</a></div>
-<div class="doc_text">
+<h3><a name="nofunc">No Exposed Functions</a></h3>
+<div>
   <p>Any functions defined by system libraries (i.e. not defined by lib/System) 
   must not be exposed through the lib/System interface, even if the header file 
   for that function is not exposed. This prevents inadvertent use of system
@@ -191,8 +186,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="nodata">No Exposed Data</a></div>
-<div class="doc_text">
+<h3><a name="nodata">No Exposed Data</a></h3>
+<div>
   <p>Any data defined by system libraries (i.e. not defined by lib/System) must
   not be exposed through the lib/System interface, even if the header file for
   that function is not exposed. As with functions, this prevents inadvertent use
@@ -200,8 +195,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="softerrors">Minimize Soft Errors</a></div>
-<div class="doc_text">
+<h3><a name="softerrors">Minimize Soft Errors</a></h3>
+<div>
   <p>Operating system interfaces will generally provide error results for every
   little thing that could go wrong. In almost all cases, you can divide these
   error results into two groups: normal/good/soft and abnormal/bad/hard. That
@@ -239,9 +234,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="throw_spec">No throw Specifications</a>
-</div>
-<div class="doc_text">
+<h3><a name="throw_spec">No throw Specifications</a></h3>
+<div>
   <p>None of the lib/System interface functions may be declared with C++ 
   <tt>throw()</tt> specifications on them. This requirement makes sure that the
   compiler does not insert additional exception handling code into the interface
@@ -252,8 +246,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="organization">Code Organization</a></div>
-<div class="doc_text">
+<h3><a name="organization">Code Organization</a></h3>
+<div>
   <p>Implementations of the System Library interface are separated by their
   general class of operating system. Currently only Unix and Win32 classes are
   defined but more could be added for other operating system classifications.
@@ -281,8 +275,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="semantics">Consistent Semantics</a></div>
-<div class="doc_text">
+<h3><a name="semantics">Consistent Semantics</a></h3>
+<div>
   <p>The implementation of a lib/System interface can vary drastically between
   platforms. That's okay as long as the end result of the interface function 
   is the same. For example, a function to create a directory is pretty straight
@@ -296,12 +290,14 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="bug">Bug 351</a></div>
-<div class="doc_text">
+<h3><a name="bug">Bug 351</a></h3>
+<div>
   <p>See <a href="http://llvm.org/PR351">bug 351</a>
   for further details on the progress of this work</p>
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
 
 <hr>
@@ -312,7 +308,7 @@
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="mailto:rspencer@x10sys.com">Reid Spencer</a><br>
-  <a href="http://llvm.org">LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/TableGenFundamentals.html b/docs/TableGenFundamentals.html
index de46a39..e8fca32 100644
--- a/docs/TableGenFundamentals.html
+++ b/docs/TableGenFundamentals.html
@@ -7,9 +7,9 @@
 </head>
 <body>
 
-<div class="doc_title">TableGen Fundamentals</div>
+<h1>TableGen Fundamentals</h1>
 
-<div class="doc_text">
+<div>
 <ul>
   <li><a href="#introduction">Introduction</a>
   <ol>
@@ -50,10 +50,10 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="introduction">Introduction</a></div>
+<h2><a name="introduction">Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>TableGen's purpose is to help a human develop and maintain records of
 domain-specific information.  Because there may be a large number of these
@@ -72,12 +72,10 @@ find an emacs "TableGen mode" and a vim language file in the
 <tt>llvm/utils/emacs</tt> and <tt>llvm/utils/vim</tt> directories of your LLVM
 distribution, respectively.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="concepts">Basic concepts</a></div>
+<h3><a name="concepts">Basic concepts</a></h3>
 
-<div class="doc_text">
+<div>
 
 <p>TableGen files consist of two key parts: 'classes' and 'definitions', both
 of which are considered 'records'.</p>
@@ -112,9 +110,9 @@ multiclass, as if they were declared in the current multiclass.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="example">An example record</a></div>
+<h3><a name="example">An example record</a></h3>
 
-<div class="doc_text">
+<div>
 
 <p>With no other arguments, TableGen parses the specified file and prints out
 all of the classes, then all of the definitions.  This is a good way to see what
@@ -212,9 +210,9 @@ abstractions they prefer to use when describing their information.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="running">Running TableGen</a></div>
+<h3><a name="running">Running TableGen</a></h3>
 
-<div class="doc_text">
+<div>
 
 <p>TableGen runs just like any other LLVM tool.  The first (optional) argument
 specifies the file to read.  If a filename is not specified, <tt>tblgen</tt>
@@ -256,27 +254,28 @@ what you need and formats it in the appropriate way.</p>
 
 </div>
 
+</div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="syntax">TableGen syntax</a></div>
+<h2><a name="syntax">TableGen syntax</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>TableGen doesn't care about the meaning of data (that is up to the backend to
 define), but it does care about syntax, and it enforces a simple type system.
 This section describes the syntax and the constructs allowed in a TableGen file.
 </p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="primitives">TableGen primitives</a></div>
+<h3><a name="primitives">TableGen primitives</a></h3>
+
+<div>
 
 <!-- -------------------------------------------------------------------------->
-<div class="doc_subsubsection"><a name="comments">TableGen comments</a></div>
+<h4><a name="comments">TableGen comments</a></h4>
 
-<div class="doc_text">
+<div>
 
 <p>TableGen supports BCPL style "<tt>//</tt>" comments, which run to the end of
 the line, and it also supports <b>nestable</b> "<tt>/* */</tt>" comments.</p>
@@ -284,11 +283,11 @@ the line, and it also supports <b>nestable</b> "<tt>/* */</tt>" comments.</p>
 </div>
 
 <!-- -------------------------------------------------------------------------->
-<div class="doc_subsubsection">
+<h4>
   <a name="types">The TableGen type system</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>TableGen files are strongly typed, in a simple (but complete) type-system.
 These types are used to perform automatic conversions, check for errors, and to
@@ -344,11 +343,11 @@ needed.</p>
 </div>
 
 <!-- -------------------------------------------------------------------------->
-<div class="doc_subsubsection">
+<h4>
   <a name="values">TableGen values and expressions</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>TableGen allows for a pretty reasonable number of different expression forms
 when building up values.  These forms allow the TableGen file to be written in a
@@ -433,12 +432,14 @@ to a "<tt>bits&lt;4&gt;</tt>" value, for example.</p>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="classesdefs">Classes and definitions</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>As mentioned in the <a href="#concepts">intro</a>, classes and definitions
 (collectively known as 'records') in TableGen are the main high-level unit of
@@ -473,14 +474,12 @@ between a group of records and isolating it in a single place.  Also, classes
 permit the specification of default values for their subclasses, allowing the
 subclasses to override them as they wish.</p>
 
-</div>
-
 <!---------------------------------------------------------------------------->
-<div class="doc_subsubsection">
+<h4>
   <a name="valuedef">Value definitions</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Value definitions define named entries in records.  A value must be defined
 before it can be referred to as the operand for another value definition or
@@ -492,11 +491,11 @@ equal sign.  Value definitions require terminating semicolons.</p>
 </div>
 
 <!-- -------------------------------------------------------------------------->
-<div class="doc_subsubsection">
+<h4>
   <a name="recordlet">'let' expressions</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>A record-level let expression is used to change the value of a value
 definition in a record.  This is primarily useful when a superclass defines a
@@ -519,11 +518,11 @@ because the <tt>D</tt> class overrode its value.</p>
 </div>
 
 <!-- -------------------------------------------------------------------------->
-<div class="doc_subsubsection">
+<h4>
   <a name="templateargs">Class template arguments</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>TableGen permits the definition of parameterized classes as well as normal
 concrete classes.  Parameterized TableGen classes specify a list of variable
@@ -610,11 +609,11 @@ X86 backend.</p>
 </div>
 
 <!-- -------------------------------------------------------------------------->
-<div class="doc_subsubsection">
+<h4>
   <a name="multiclass">Multiclass definitions and instances</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 While classes with template arguments are a good way to factor commonality
@@ -772,17 +771,21 @@ before them.
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="filescope">File scope entities</a>
-</div>
+</h3>
+
+<div>
 
 <!-- -------------------------------------------------------------------------->
-<div class="doc_subsubsection">
+<h4>
   <a name="include">File inclusion</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 <p>TableGen supports the '<tt>include</tt>' token, which textually substitutes
 the specified file in place of the include directive.  The filename should be
 specified as a double quoted string immediately after the '<tt>include</tt>'
@@ -797,11 +800,11 @@ keyword.  Example:</p>
 </div>
 
 <!-- -------------------------------------------------------------------------->
-<div class="doc_subsubsection">
+<h4>
   <a name="globallet">'let' expressions</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>"Let" expressions at file scope are similar to <a href="#recordlet">"let"
 expressions within a record</a>, except they can specify a value binding for
@@ -864,11 +867,15 @@ several levels of multiclass instanciations. This also avoids the need of using
 </pre>
 </div>
 
+</div>
+
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="codegen">Code Generator backend info</a></div>
+<h2><a name="codegen">Code Generator backend info</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Expressions used by code generator to describe instructions and isel
 patterns:</p>
@@ -882,10 +889,10 @@ patterns:</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="backends">TableGen backends</a></div>
+<h2><a name="backends">TableGen backends</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>TODO: How they work, how to write one.  This section should not contain
 details about any particular backend, except maybe -print-enums as an example.
@@ -903,7 +910,7 @@ This should highlight the APIs in <tt>TableGen/Record.h</tt>.</p>
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 
diff --git a/docs/TestingGuide.html b/docs/TestingGuide.html
index b048b72..b4fa672 100644
--- a/docs/TestingGuide.html
+++ b/docs/TestingGuide.html
@@ -7,9 +7,9 @@
 </head>
 <body>
       
-<div class="doc_title">
+<h1>
   LLVM Testing Infrastructure Guide
-</div>
+</h1>
 
 <ol>
   <li><a href="#overview">Overview</a></li>
@@ -52,10 +52,10 @@
 </div>
 
 <!--=========================================================================-->
-<div class="doc_section"><a name="overview">Overview</a></div>
+<h2><a name="overview">Overview</a></h2>
 <!--=========================================================================-->
 
-<div class="doc_text">
+<div>
 
 <p>This document is the reference manual for the LLVM testing infrastructure. It
 documents the structure of the LLVM testing infrastructure, the tools needed to
@@ -64,10 +64,10 @@ use it, and how to add and run tests.</p>
 </div>
 
 <!--=========================================================================-->
-<div class="doc_section"><a name="requirements">Requirements</a></div>
+<h2><a name="requirements">Requirements</a></h2>
 <!--=========================================================================-->
 
-<div class="doc_text">
+<div>
 
 <p>In order to use the LLVM testing infrastructure, you will need all of the
 software required to build LLVM, as well
@@ -76,10 +76,10 @@ as <a href="http://python.org">Python</a> 2.4 or later.</p>
 </div>
 
 <!--=========================================================================-->
-<div class="doc_section"><a name="org">LLVM testing infrastructure organization</a></div>
+<h2><a name="org">LLVM testing infrastructure organization</a></h2>
 <!--=========================================================================-->
 
-<div class="doc_text">
+<div>
 
 <p>The LLVM testing infrastructure contains two major categories of tests:
 regression tests and whole programs. The regression tests are contained inside
@@ -89,13 +89,11 @@ referred to as the "LLVM test suite" and are in the <tt>test-suite</tt> module
 in subversion.
 </p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="regressiontests">Regression tests</a></div>
+<h3><a name="regressiontests">Regression tests</a></h3>
 <!-- _______________________________________________________________________ -->
 
-<div class="doc_text">
+<div>
 
 <p>The regression tests are small pieces of code that test a specific feature of
 LLVM or trigger a specific bug in LLVM.  They are usually written in LLVM
@@ -119,10 +117,10 @@ application or benchmark.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="testsuite">Test suite</a></div>
+<h3><a name="testsuite">Test suite</a></h3>
 <!-- _______________________________________________________________________ -->
 
-<div class="doc_text">
+<div>
 
 <p>The test suite contains whole programs, which are pieces of
 code which can be compiled and linked into a stand-alone program that can be
@@ -144,11 +142,10 @@ generates code.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="debuginfotests">Debugging Information 
-tests</a></div>
+<h3><a name="debuginfotests">Debugging Information tests</a></h3>
 <!-- _______________________________________________________________________ -->
 
-<div class="doc_text">
+<div>
 
 <p>The test suite contains tests to check quality of debugging information.
 The test are written in C based languages or in LLVM assembly language. </p>
@@ -160,11 +157,13 @@ test suite for more information . This test suite is located in the
 
 </div>
 
+</div>
+
 <!--=========================================================================-->
-<div class="doc_section"><a name="quick">Quick start</a></div>
+<h2><a name="quick">Quick start</a></h2>
 <!--=========================================================================-->
 
-<div class="doc_text">
+<div>
 
   <p>The tests are located in two separate Subversion modules. The regressions
   tests are in the main "llvm" module under the directory
@@ -179,7 +178,8 @@ the <tt>test-suite</tt> directory will be automatically configured.
 Alternatively, you can configure the <tt>test-suite</tt> module manually.</p>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="quickregressiontests">Regression tests</a></div>
+<h3><a name="quickregressiontests">Regression tests</a></h3>
+<div>
 <!-- _______________________________________________________________________ -->
 <p>To run all of the LLVM regression tests, use master Makefile in
  the <tt>llvm/test</tt> directory:</p>
@@ -198,7 +198,7 @@ Alternatively, you can configure the <tt>test-suite</tt> module manually.</p>
 </pre>
 </div>
 
-<p>If you have <a href="http://clang.llvm.org">Clang</a> checked out and built,
+<p>If you have <a href="http://clang.llvm.org/">Clang</a> checked out and built,
 you can run the LLVM and Clang tests simultaneously using:</p>
 
 <p>or</p>
@@ -239,10 +239,14 @@ script which is built as part of LLVM. For example, to run the
 <p>For more information on using the 'lit' tool, see 'llvm-lit --help' or the
 'lit' man page.</p>
 
+</div>
+
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="quicktestsuite">Test suite</a></div>
+<h3><a name="quicktestsuite">Test suite</a></h3>
 <!-- _______________________________________________________________________ -->
 
+<div>
+
 <p>To run the comprehensive test suite (tests that compile and execute whole 
 programs), first checkout and setup the <tt>test-suite</tt> module:</p>
 
@@ -292,9 +296,10 @@ that subdirectory.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="quickdebuginfotests">Debugging Information 
-tests</a></div>
+<h3><a name="quickdebuginfotests">Debugging Information tests</a></h3>
+<div>
 <!-- _______________________________________________________________________ -->
+<div>
 
 <p> To run debugging information tests simply checkout the tests inside
 clang/test directory. </p>
@@ -310,10 +315,14 @@ clang/test directory. </p>
 
 </div>
 
+</div>
+
+</div>
+
 <!--=========================================================================-->
-<div class="doc_section"><a name="rtstructure">Regression test structure</a></div>
+<h2><a name="rtstructure">Regression test structure</a></h2>
 <!--=========================================================================-->
-<div class="doc_text">
+<div>
   <p>The LLVM regression tests are driven by 'lit' and are located in
   the <tt>llvm/test</tt> directory.
 
@@ -335,12 +344,10 @@ clang/test directory. </p>
     <li><tt>Verifier</tt>: tests the IR verifier.</li>
   </ul>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="rtcustom">Writing new regression tests</a></div>
+<h3><a name="rtcustom">Writing new regression tests</a></h3>
 <!-- _______________________________________________________________________ -->
-<div class="doc_text">
+<div>
   <p>The regression test structure is very simple, but does require some
   information to be set. This information is gathered via <tt>configure</tt> and
   is written to a file, <tt>lit.site.cfg</tt>
@@ -358,8 +365,8 @@ clang/test directory. </p>
   obtained by using Tcl's glob command.  Any directory that contains only
   directories does not need the <tt>dg.exp</tt> file.</p>
 
-  <p>The <tt>llvm-runtests</tt> function lookas at each file that is passed to
-  it and gathers any lines together that match "RUN:". This are the "RUN" lines
+  <p>The <tt>llvm-runtests</tt> function looks at each file that is passed to
+  it and gathers any lines together that match "RUN:". These are the "RUN" lines
   that specify how the test is to be run. So, each test script must contain
   RUN lines if it is to do anything. If there are no RUN lines, the
   <tt>llvm-runtests</tt> function will issue an error and the test will
@@ -492,10 +499,10 @@ negatives).</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="FileCheck">The FileCheck utility</a></div>
+<h3><a name="FileCheck">The FileCheck utility</a></h3>
 <!-- _______________________________________________________________________ -->
 
-<div class="doc_text">
+<div>
 
 <p>A powerful feature of the RUN: lines is that it allows any arbitrary commands
    to be executed as part of the test harness.  While standard (portable) unix
@@ -561,13 +568,12 @@ is a "subl" in between those labels.  If it existed somewhere else in the file,
 that would not count: "grep subl" matches if subl exists anywhere in the
 file.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a 
-name="FileCheck-check-prefix">The FileCheck -check-prefix option</a></div>
+<h4>
+  <a name="FileCheck-check-prefix">The FileCheck -check-prefix option</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The FileCheck -check-prefix option allows multiple test configurations to be
 driven from one .ll file.  This is useful in many circumstances, for example,
@@ -598,10 +604,11 @@ both 32-bit and 64-bit code generation.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a 
-name="FileCheck-CHECK-NEXT">The "CHECK-NEXT:" directive</a></div>
+<h4>
+  <a name="FileCheck-CHECK-NEXT">The "CHECK-NEXT:" directive</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Sometimes you want to match lines and would like to verify that matches
 happen on exactly consecutive lines with no other lines in between them.  In
@@ -638,10 +645,11 @@ directive in a file.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a 
-name="FileCheck-CHECK-NOT">The "CHECK-NOT:" directive</a></div>
+<h4>
+  <a name="FileCheck-CHECK-NOT">The "CHECK-NOT:" directive</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The CHECK-NOT: directive is used to verify that a string doesn't occur
 between two matches (or the first match and the beginning of the file).  For
@@ -668,10 +676,11 @@ define i8 @coerce_offset0(i32 %V, i32* %P) {
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a 
-name="FileCheck-Matching">FileCheck Pattern Matching Syntax</a></div>
+<h4>
+  <a name="FileCheck-Matching">FileCheck Pattern Matching Syntax</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The CHECK: and CHECK-NOT: directives both take a pattern to match.  For most
 uses of FileCheck, fixed string matching is perfectly sufficient.  For some
@@ -700,10 +709,11 @@ braces explicitly from the input, you can use something ugly like
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection"><a 
-name="FileCheck-Variables">FileCheck Variables</a></div>
+<h4>
+  <a name="FileCheck-Variables">FileCheck Variables</a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>It is often useful to match a pattern and then verify that it occurs again
 later in the file.  For codegen tests, this can be useful to allow any register,
@@ -738,11 +748,12 @@ define two separate CHECK lines that match on the same line.
 
 </div>
 
+</div>
+
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="rtvars">Variables and
-substitutions</a></div>
+<h3><a name="rtvars">Variables and substitutions</a></h3>
 <!-- _______________________________________________________________________ -->
-<div class="doc_text">
+<div>
   <p>With a RUN line there are a number of substitutions that are permitted. In
   general, any Tcl variable that is available in the <tt>substitute</tt> 
   function (in <tt>test/lib/llvm.exp</tt>) can be substituted into a RUN line.
@@ -835,9 +846,9 @@ substitutions</a></div>
 </div>
   
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection"><a name="rtfeatures">Other Features</a></div>
+<h3><a name="rtfeatures">Other Features</a></h3>
 <!-- _______________________________________________________________________ -->
-<div class="doc_text">
+<div>
   <p>To make RUN line writing easier, there are several shell scripts located
   in the <tt>llvm/test/Scripts</tt> directory. This directory is in the PATH
   when running tests, so you can just call these scripts using their name. For
@@ -892,12 +903,13 @@ substitutions</a></div>
 
 </div>
 
+</div>
+
 <!--=========================================================================-->
-<div class="doc_section"><a name="testsuitestructure">Test suite
-Structure</a></div>
+<h2><a name="testsuitestructure">Test suite Structure</a></h2>
 <!--=========================================================================-->
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>test-suite</tt> module contains a number of programs that can be compiled 
 with LLVM and executed. These programs are compiled using the native compiler
@@ -962,10 +974,10 @@ will help you separate benign warnings from actual test failures.</p>
 </div>
 
 <!--=========================================================================-->
-<div class="doc_section"><a name="testsuiterun">Running the test suite</a></div>
+<h2><a name="testsuiterun">Running the test suite</a></h2>
 <!--=========================================================================-->
 
-<div class="doc_text">
+<div>
 
 <p>First, all tests are executed within the LLVM object directory tree.  They
 <i>are not</i> executed inside of the LLVM source tree. This is because the
@@ -1020,14 +1032,13 @@ test suite creates temporary files during execution.</p>
 have the suite checked out and configured, you don't need to do it again (unless
 the test code or configure script changes).</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection">
-<a name="testsuiteexternal">Configuring External Tests</a></div>
+<h3>
+  <a name="testsuiteexternal">Configuring External Tests</a>
+</h3>
 <!-- _______________________________________________________________________ -->
 
-<div class="doc_text">
+<div>
 <p>In order to run the External tests in the <tt>test-suite</tt>
   module, you must specify <i>--with-externals</i>.  This
   must be done during the <em>re-configuration</em> step (see above),
@@ -1055,10 +1066,11 @@ the test code or configure script changes).</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection">
-<a name="testsuitetests">Running different tests</a></div>
+<h3>
+  <a name="testsuitetests">Running different tests</a>
+</h3>
 <!-- _______________________________________________________________________ -->
-<div class="doc_text">
+<div>
 <p>In addition to the regular "whole program" tests, the <tt>test-suite</tt>
 module also provides a mechanism for compiling the programs in different ways.
 If the variable TEST is defined on the <tt>gmake</tt> command line, the test system will
@@ -1078,10 +1090,11 @@ LLVM.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection">
-<a name="testsuiteoutput">Generating test output</a></div>
+<h3>
+  <a name="testsuiteoutput">Generating test output</a>
+</h3>
 <!-- _______________________________________________________________________ -->
-<div class="doc_text">
+<div>
   <p>There are a number of ways to run the tests and generate output. The most
   simple one is simply running <tt>gmake</tt> with no arguments. This will
   compile and run all programs in the tree using a number of different methods
@@ -1109,11 +1122,12 @@ LLVM.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection">
-<a name="testsuitecustom">Writing custom tests for the test suite</a></div>
+<h3>
+  <a name="testsuitecustom">Writing custom tests for the test suite</a>
+</h3>
 <!-- _______________________________________________________________________ -->
 
-<div class="doc_text">
+<div>
 
 <p>Assuming you can run the test suite, (e.g. "<tt>gmake TEST=nightly report</tt>"
 should work), it is really easy to run optimizations or code generator
@@ -1179,6 +1193,8 @@ example reports that can do fancy stuff.</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
 
 <hr>
@@ -1189,7 +1205,7 @@ example reports that can do fancy stuff.</p>
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   John T. Criswell, Daniel Dunbar, Reid Spencer, and Tanya Lattner<br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/UsingLibraries.html b/docs/UsingLibraries.html
index e06838b..2973452 100644
--- a/docs/UsingLibraries.html
+++ b/docs/UsingLibraries.html
@@ -5,7 +5,7 @@
   <link rel="stylesheet" href="llvm.css" type="text/css">
 </head>
 <body>
-<div class="doc_title">Using The LLVM Libraries</div>
+<h1>Using The LLVM Libraries</h1>
 <ol>
   <li><a href="#abstract">Abstract</a></li>
   <li><a href="#introduction">Introduction</a></li>
@@ -26,12 +26,12 @@
 <p class="doc_warning">Warning: This document is out of date, for more
   information please
   see <a href="CommandGuide/html/llvm-config.html">llvm-config</a> or,
-  if you use CMake, <a href=CMake.html#embedding>the CMake LLVM
+  if you use CMake, <a href="CMake.html#embedding">the CMake LLVM
   guide</a>.</p>
 
 <!-- ======================================================================= -->
-<div class="doc_section"><a name="abstract">Abstract</a></div>
-<div class="doc_text">
+<h2><a name="abstract">Abstract</a></h2>
+<div>
   <p>Amongst other things, LLVM is a toolkit for building compilers, linkers,
   runtime executives, virtual machines, and other program execution related
   tools. In addition to the LLVM tool set, the functionality of LLVM is
@@ -45,8 +45,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_section"> <a name="introduction">Introduction</a></div>
-<div class="doc_text">
+<h2><a name="introduction">Introduction</a></h2>
+<div>
   <p>If you're writing a compiler, virtual machine, or any other utility based 
   on LLVM, you'll need to figure out which of the many libraries files you will 
   need to link with to be successful. An understanding of the contents of these 
@@ -74,8 +74,8 @@
   correct for your tool can sometimes be challenging.
 </div>
 <!-- ======================================================================= -->
-<div class="doc_section"><a name="descriptions"></a>Library Descriptions</div>
-<div class="doc_text">
+<h2><a name="descriptions">Library Descriptions</a></h2>
+<div>
   <p>The table below categorizes each library
 <table style="text-align:left">
   <tr><th>Library</th><th>Forms</th><th>Description</th></tr>
@@ -152,8 +152,8 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_section"><a name="dependencies"></a>Using llvm-config</div>
-<div class="doc_text">
+<h2><a name="dependencies">Using llvm-config</a></h2>
+<div>
   <p>The <tt>llvm-config</tt> tool is a perl script that produces on its output
   various kinds of information. For example, the source or object directories 
   used to build LLVM can be accessed by passing options to <tt>llvm-config</tt>.
@@ -187,16 +187,16 @@
   <!-- === This should be updated whenever new libraries are added,       ===-->
   <!-- === removed, or changed                                            ===-->
   <!-- =======NOTE: =========================================================-->
-  <h2>Dependency Relationships Of Libraries</h2>
+  <h3>Dependency Relationships Of Libraries</h3>
   <p>This graph shows the dependency of archive libraries on other archive 
   libraries or objects. Where a library has both archive and object forms, only
   the archive form is shown.</p>
-  <img src="img/libdeps.gif" alt="Library Dependencies"/>
-  <h2>Dependency Relationships Of Object Files</h2>
+  <img src="img/libdeps.gif" alt="Library Dependencies">
+  <h3>Dependency Relationships Of Object Files</h3>
   <p>This graph shows the dependency of object files on archive libraries or 
   other objects. Where a library has both object and archive forms, only the 
   dependency to the archive form is shown.</p> 
-  <img src="img/objdeps.gif" alt="Object File Dependencies"/>
+  <img src="img/objdeps.gif" alt="Object File Dependencies">
   <p>The following list shows the dependency relationships between libraries in
   textual form. The information is the same as shown on the graphs but arranged
   alphabetically.</p>
@@ -280,8 +280,8 @@
     <li>libLLVMSystem.a</li>
     <li>libLLVMbzip2.a</li>
   </ul></dd>
-  <dt><b>libLLVMSystem.a</b></dt><dd><ul>
-  </ul></dd>
+  <dt><b>libLLVMSystem.a</b></dt><dd>
+  </dd>
   <dt><b>libLLVMTarget.a</b></dt><dd><ul>
     <li>libLLVMCore.a</li>
     <li>libLLVMSupport.a</li>
@@ -295,8 +295,8 @@
     <li>libLLVMTarget.a</li>
     <li>libLLVMipa.a</li>
   </ul></dd>
-  <dt><b>libLLVMbzip2.a</b></dt><dd><ul>
-  </ul></dd>
+  <dt><b>libLLVMbzip2.a</b></dt><dd>
+  </dd>
   <dt><b>libLLVMipa.a</b></dt><dd><ul>
     <li>libLLVMAnalysis.a</li>
     <li>libLLVMCore.a</li>
@@ -401,41 +401,45 @@
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_section"><a name="rot">Linkage Rules Of Thumb</a></div>
-<div class="doc_text">
+<h2><a name="rot">Linkage Rules Of Thumb</a></h2>
+<div>
 	<p>This section contains various "rules of thumb" about what files you
 	should link into your programs.</p>
-</div>
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="always">Always Link LLVMCore, LLVMSupport,
-    and LLVMSystem</a></div>
-<div class="doc_text">
+<h3>
+  <a name="always">Always Link LLVMCore, LLVMSupport, and LLVMSystem</a>
+</h3>
+<div>
   <p>No matter what you do with LLVM, the last three entries in the value of 
   your LLVMLIBS make variable should always be: 
   <tt>LLVMCore LLVMSupport.a LLVMSystem.a</tt>. There are no <tt>LLVM</tt> 
   programs that don't depend on these three.</p>
 </div>
 <!-- ======================================================================= -->
-<div class="doc_subsection"><a name="onlyone">Never link both archive and
-    re-linked library</a></div>
-<div class="doc_text">
+<h3>
+  <a name="onlyone">Never link both archive and re-linked library</a>
+</h3>
+<div>
   <p>There is never any point to linking both the re-linked (<tt>.o</tt>) and
   the archive (<tt>.a</tt>) versions of a library. Since the re-linked version
   includes the entire library, the archive version will not resolve any symbols.
   You could even end up with link error if you place the archive version before
   the re-linked version on the linker's command line.</p>
 </div>
+
+</div>
+
 <!-- ======================================================================= -->
 <hr>
 <div class="doc_footer">
 <address>
   <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
-    src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"/></a>
+    src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
   <a href="http://validator.w3.org/check/referer"><img
     src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
   <a href="mailto:rspencer@x10sys.com">Reid Spencer</a>
 </address>
-<a href="http://llvm.org">The LLVM Compiler Infrastructure</a> 
+<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a> 
 <br>Last modified: $Date$ </div>
 </body>
 </html>
diff --git a/docs/WritingAnLLVMBackend.html b/docs/WritingAnLLVMBackend.html
index 7290232..5e3d070 100644
--- a/docs/WritingAnLLVMBackend.html
+++ b/docs/WritingAnLLVMBackend.html
@@ -9,9 +9,9 @@
 
 <body>
 
-<div class="doc_title">
+<h1>
   Writing an LLVM Compiler Backend
-</div>
+</h1>
 
 <ol>
   <li><a href="#intro">Introduction</a>
@@ -61,12 +61,12 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="intro">Introduction</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 This document describes techniques for writing compiler backends that convert
@@ -91,13 +91,11 @@ characteristics, such as a RISC instruction set and straightforward calling
 conventions.
 </p>
 
-</div>
-
-<div class="doc_subsection">
+<h3>
   <a name="Audience">Audience</a>
-</div>  
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 The audience for this document is anyone who needs to write an LLVM backend to
@@ -106,21 +104,21 @@ generate code for a specific hardware or software target.
 
 </div>
 
-<div class="doc_subsection">
+<h3>
   <a name="Prerequisite">Prerequisite Reading</a>
-</div>  
+</h3>
 
-<div class="doc_text">  
+<div>  
 
 <p>
 These essential documents must be read before reading this document:
 </p>
 
 <ul>
-<li><i><a href="http://www.llvm.org/docs/LangRef.html">LLVM Language Reference
+<li><i><a href="LangRef.html">LLVM Language Reference
     Manual</a></i> &mdash; a reference manual for the LLVM assembly language.</li>
 
-<li><i><a href="http://www.llvm.org/docs/CodeGenerator.html">The LLVM
+<li><i><a href="CodeGenerator.html">The LLVM
     Target-Independent Code Generator</a></i> &mdash; a guide to the components
     (classes and code generation algorithms) for translating the LLVM internal
     representation into machine code for a specified target.  Pay particular
@@ -129,14 +127,14 @@ These essential documents must be read before reading this document:
     Allocation, Prolog/Epilog Code Insertion, Late Machine Code Optimizations,
     and Code Emission.</li>
 
-<li><i><a href="http://www.llvm.org/docs/TableGenFundamentals.html">TableGen
+<li><i><a href="TableGenFundamentals.html">TableGen
     Fundamentals</a></i> &mdash;a document that describes the TableGen
     (<tt>tblgen</tt>) application that manages domain-specific information to
     support LLVM code generation. TableGen processes input from a target
     description file (<tt>.td</tt> suffix) and generates C++ code that can be
     used for code generation.</li>
 
-<li><i><a href="http://www.llvm.org/docs/WritingAnLLVMPass.html">Writing an LLVM
+<li><i><a href="WritingAnLLVMPass.html">Writing an LLVM
     Pass</a></i> &mdash; The assembly printer is a <tt>FunctionPass</tt>, as are
     several SelectionDAG processing steps.</li>
 </ul>
@@ -155,11 +153,11 @@ machine dependent features.
 
 </div>
 
-<div class="doc_subsection">
+<h3>
   <a name="Basic">Basic Steps</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 To write a compiler backend for LLVM that converts the LLVM IR to code for a
@@ -220,17 +218,17 @@ that the class will need and which components will need to be subclassed.
 
 </div>
 
-<div class="doc_subsection">
+<h3>
   <a name="Preliminaries">Preliminaries</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 To actually create your compiler backend, you need to create and modify a few
 files. The absolute minimum is discussed here. But to actually use the LLVM
 target-independent code generator, you must perform the steps described in
-the <a href="http://www.llvm.org/docs/CodeGenerator.html">LLVM
+the <a href="CodeGenerator.html">LLVM
 Target-Independent Code Generator</a> document.
 </p>
 
@@ -281,13 +279,15 @@ regenerate configure by running <tt>./autoconf/AutoRegen.sh</tt>.
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="TargetMachine">Target Machine</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 <tt>LLVMTargetMachine</tt> is designed as a base class for targets implemented
@@ -360,11 +360,6 @@ public:
 </pre>
 </div>
 
-</div>
-
-
-<div class="doc_text">
-
 <ul>
 <li><tt>getInstrInfo()</tt></li>
 <li><tt>getRegisterInfo()</tt></li>
@@ -398,10 +393,6 @@ SparcTargetMachine::SparcTargetMachine(const Module &amp;M, const std::string &a
 </pre>
 </div>
 
-</div>
-
-<div class="doc_text">
-
 <p>Hyphens separate portions of the <tt>TargetDescription</tt> string.</p>
 
 <ul>
@@ -424,12 +415,12 @@ SparcTargetMachine::SparcTargetMachine(const Module &amp;M, const std::string &a
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="TargetRegistration">Target Registration</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 You must also register your target with the <tt>TargetRegistry</tt>, which is
@@ -480,12 +471,12 @@ For more information, see
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="RegisterSet">Register Set and Register Classes</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 You should describe a concrete target-specific class that represents the
@@ -514,14 +505,12 @@ input files and placed in <tt>XXXGenRegisterInfo.h.inc</tt> and
 implementation of <tt>XXXRegisterInfo</tt> requires hand-coding.
 </p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="RegisterDef">Defining a Register</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 The <tt>XXXRegisterInfo.td</tt> file typically starts with register definitions
@@ -700,11 +689,11 @@ fields of a register's TargetRegisterDesc.
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="RegisterClassDef">Defining a Register Class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 The <tt>RegisterClass</tt> class (specified in <tt>Target.td</tt>) is used to
@@ -717,8 +706,7 @@ classes using the following class:
 <div class="doc_code">
 <pre>
 class RegisterClass&lt;string namespace,
-list&lt;ValueType&gt; regTypes, int alignment,
-                    list&lt;Register&gt; regList&gt; {
+list&lt;ValueType&gt; regTypes, int alignment, dag regList&gt; {
   string Namespace = namespace;
   list&lt;ValueType&gt; RegTypes = regTypes;
   int Size = 0;  // spill size, in bits; zero lets tblgen pick the size
@@ -728,7 +716,7 @@ list&lt;ValueType&gt; regTypes, int alignment,
   // default value 1 means a single instruction
   // A negative value means copying is extremely expensive or impossible
   int CopyCost = 1;  
-  list&lt;Register&gt; MemberList = regList;
+  dag MemberList = regList;
   
   // for register classes that are subregisters of this class
   list&lt;RegisterClass&gt; SubRegClassList = [];  
@@ -760,9 +748,11 @@ list&lt;ValueType&gt; regTypes, int alignment,
     memory.</li>
 
 <li>The final argument, <tt>regList</tt>, specifies which registers are in this
-    class.  If an <tt>allocation_order_*</tt> method is not specified,
-    then <tt>regList</tt> also defines the order of allocation used by the
-    register allocator.</li>
+    class. If an alternative allocation order method is not specified, then
+    <tt>regList</tt> also defines the order of allocation used by the register
+    allocator. Besides simply listing registers with <tt>(add R0, R1, ...)</tt>,
+    more advanced set operators are available. See
+    <tt>include/llvm/Target/Target.td</tt> for more information.</li>
 </ul>
 
 <p>
@@ -772,44 +762,31 @@ classes, the first argument defines the namespace with the string
 '<tt>SP</tt>'. <tt>FPRegs</tt> defines a group of 32 single-precision
 floating-point registers (<tt>F0</tt> to <tt>F31</tt>); <tt>DFPRegs</tt> defines
 a group of 16 double-precision registers
-(<tt>D0-D15</tt>). For <tt>IntRegs</tt>, the <tt>MethodProtos</tt>
-and <tt>MethodBodies</tt> methods are used by TableGen to insert the specified
-code into generated output.
+(<tt>D0-D15</tt>).
 </p>
 
 <div class="doc_code">
 <pre>
-def FPRegs : RegisterClass&lt;"SP", [f32], 32,
-  [F0, F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11, F12, F13, F14, F15,
-   F16, F17, F18, F19, F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30, F31]&gt;;
+// F0, F1, F2, ..., F31
+def FPRegs : RegisterClass&lt;"SP", [f32], 32, (sequence "F%u", 0, 31)&gt;;
 
 def DFPRegs : RegisterClass&lt;"SP", [f64], 64,
-  [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15]&gt;;
+                            (add D0, D1, D2, D3, D4, D5, D6, D7, D8,
+                                 D9, D10, D11, D12, D13, D14, D15)&gt;;
 &nbsp;
 def IntRegs : RegisterClass&lt;"SP", [i32], 32,
-    [L0, L1, L2, L3, L4, L5, L6, L7,
-     I0, I1, I2, I3, I4, I5,
-     O0, O1, O2, O3, O4, O5, O7,
-     G1,
-     // Non-allocatable regs:
-     G2, G3, G4, 
-     O6,        // stack ptr
-    I6,        // frame ptr
-     I7,        // return address
-     G0,        // constant zero
-     G5, G6, G7 // reserved for kernel
-    ]&gt; {
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &amp;MF) const;
-  }];
-  let MethodBodies = [{
-    IntRegsClass::iterator
-    IntRegsClass::allocation_order_end(const MachineFunction &amp;MF) const {
-      return end() - 10  // Don't allocate special registers
-         -1;
-    }
-  }];
-}
+    (add L0, L1, L2, L3, L4, L5, L6, L7,
+         I0, I1, I2, I3, I4, I5,
+         O0, O1, O2, O3, O4, O5, O7,
+         G1,
+         // Non-allocatable regs:
+         G2, G3, G4,
+         O6,        // stack ptr
+         I6,        // frame ptr
+         I7,        // return address
+         G0,        // constant zero
+         G5, G6, G7 // reserved for kernel
+    )&gt;;
 </pre>
 </div>
 
@@ -831,10 +808,7 @@ which is included at the bottom of <tt>SparcRegisterInfo.cpp</tt>, the SPARC
 register implementation. The code below shows only the generated integer
 registers and associated register classes. The order of registers
 in <tt>IntRegs</tt> reflects the order in the definition of <tt>IntRegs</tt> in
-the target description file. Take special note of the use
-of <tt>MethodBodies</tt> in <tt>SparcRegisterInfo.td</tt> to create code in
-<tt>SparcGenRegisterInfo.inc</tt>. <tt>MethodProtos</tt> generates similar code
-in <tt>SparcGenRegisterInfo.h.inc</tt>.
+the target description file.
 </p>
 
 <div class="doc_code">
@@ -877,13 +851,7 @@ namespace SP {   // Register class instances
   static const TargetRegisterClass* const IntRegsSuperclasses [] = {
     NULL
   };
-...
-  IntRegsClass::iterator
-  IntRegsClass::allocation_order_end(const MachineFunction &amp;MF) const {
-     return end()-10  // Don't allocate special registers
-         -1;
-  }
-  
+
   IntRegsClass::IntRegsClass() : TargetRegisterClass(IntRegsRegClassID, 
     IntRegsVTs, IntRegsSubclasses, IntRegsSuperclasses, IntRegsSubRegClasses, 
     IntRegsSuperRegClasses, 4, 4, 1, IntRegs, IntRegs + 32) {}
@@ -891,15 +859,22 @@ namespace SP {   // Register class instances
 </pre>
 </div>
 
+<p>
+The register allocators will avoid using reserved registers, and callee saved
+registers are not used until all the volatile registers have been used.  That
+is usually good enough, but in some cases it may be necessary to provide custom
+allocation orders.
+</p>
+
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="implementRegister">Implement a subclass of</a> 
-  <a href="http://www.llvm.org/docs/CodeGenerator.html#targetregisterinfo">TargetRegisterInfo</a>
-</div>
+  <a href="CodeGenerator.html#targetregisterinfo">TargetRegisterInfo</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 The final step is to hand code portions of <tt>XXXRegisterInfo</tt>, which
@@ -933,13 +908,15 @@ implementation in <tt>SparcRegisterInfo.cpp</tt>:
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="InstructionSet">Instruction Set</a>
-</div>
+</h2>
 
 <!-- *********************************************************************** -->
-<div class="doc_text">
+<div>
 
 <p>
 During the early stages of code generation, the LLVM IR code is converted to a
@@ -1103,7 +1080,7 @@ The fifth parameter is a string that is used by the assembly printer and can be
 left as an empty string until the assembly printer interface is implemented. The
 sixth and final parameter is the pattern used to match the instruction during
 the SelectionDAG Select Phase described in
-(<a href="http://www.llvm.org/docs/CodeGenerator.html">The LLVM
+(<a href="CodeGenerator.html">The LLVM
 Target-Independent Code Generator</a>).  This parameter is detailed in the next
 section, <a href="#InstructionSelector">Instruction Selector</a>.
 </p>
@@ -1188,14 +1165,12 @@ correspond to the values in <tt>SparcInstrInfo.td</tt>. I.e.,
 <tt>SPCC::ICC_NE = 9</tt>, <tt>SPCC::FCC_U = 23</tt> and so on.)
 </p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="operandMapping">Instruction Operand Mapping</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 The code generator backend maps instruction operands to fields in the
@@ -1283,12 +1258,12 @@ the <tt>rd</tt>, <tt>rs1</tt>, and <tt>rs2</tt> fields respectively.
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="implementInstr">Implement a subclass of </a>
-  <a href="http://www.llvm.org/docs/CodeGenerator.html#targetinstrinfo">TargetInstrInfo</a>
-</div>
+  <a href="CodeGenerator.html#targetinstrinfo">TargetInstrInfo</a>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 The final step is to hand code portions of <tt>XXXInstrInfo</tt>, which
@@ -1327,10 +1302,10 @@ implementation in <tt>SparcInstrInfo.cpp</tt>:
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="branchFolding">Branch Folding and If Conversion</a>
-</div>
-<div class="doc_text">
+</h3>
+<div>
 
 <p>
 Performance can be improved by combining instructions or by eliminating
@@ -1485,13 +1460,15 @@ branch.
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="InstructionSelector">Instruction Selector</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 LLVM uses a <tt>SelectionDAG</tt> to represent LLVM IR instructions, and nodes
@@ -1533,7 +1510,7 @@ selection pass into the queue of passes to run.
 The LLVM static compiler (<tt>llc</tt>) is an excellent tool for visualizing the
 contents of DAGs. To display the <tt>SelectionDAG</tt> before or after specific
 processing phases, use the command line options for <tt>llc</tt>, described
-at <a href="http://llvm.org/docs/CodeGenerator.html#selectiondag_process">
+at <a href="CodeGenerator.html#selectiondag_process">
 SelectionDAG Instruction Selection Process</a>.
 </p>
 
@@ -1642,14 +1619,12 @@ SDNode *Select_ISD_STORE(const SDValue &amp;N) {
 </pre>
 </div>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="LegalizePhase">The SelectionDAG Legalize Phase</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 The Legalize phase converts a DAG to use types and operations that are natively
@@ -1716,14 +1691,12 @@ a <tt>LegalAction</tt> type enum value: <tt>Promote</tt>, <tt>Expand</tt>,
 contains examples of all four <tt>LegalAction</tt> values.
 </p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="promote">Promote</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 For an operation without native support for a given type, the specified type may
@@ -1742,11 +1715,11 @@ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="expand">Expand</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 For a type without native support, a value may need to be broken down further,
@@ -1767,11 +1740,11 @@ setOperationAction(ISD::FCOS, MVT::f32, Expand);
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="custom">Custom</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 For some operations, simple type promotion or operation expansion may be
@@ -1833,11 +1806,11 @@ static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &amp;DAG) {
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="legal">Legal</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>
 The <tt>Legal</tt> LegalizeAction enum value simply indicates that an
@@ -1865,12 +1838,14 @@ if (TM.getSubtarget&lt;SparcSubtarget&gt;().isV9())
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="callingConventions">Calling Conventions</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 To support target-specific calling conventions, <tt>XXXGenCallingConv.td</tt>
@@ -2015,13 +1990,15 @@ def RetCC_X86_32 : CallingConv&lt;[
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="assemblyPrinter">Assembly Printer</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 During the code emission stage, the code generator may utilize an LLVM pass to
@@ -2171,12 +2148,12 @@ output.
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="subtargetSupport">Subtarget Support</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 Subtarget support is used to inform the code generation process of instruction
@@ -2289,12 +2266,12 @@ XXXSubtarget::XXXSubtarget(const Module &amp;M, const std::string &amp;FS) {
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="jitSupport">JIT Support</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 The implementation of a target machine optionally includes a Just-In-Time (JIT)
@@ -2333,14 +2310,12 @@ Both <tt>XXXJITInfo.cpp</tt> and <tt>XXXCodeEmitter.cpp</tt> must include the
 that write data (in bytes, words, strings, etc.) to the output stream.
 </p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="mce">Machine Code Emitter</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 In <tt>XXXCodeEmitter.cpp</tt>, a target-specific of the <tt>Emitter</tt> class
@@ -2478,11 +2453,11 @@ enum RelocationType {
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="targetJITInfo">Target JIT Info</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>
 <tt>XXXJITInfo.cpp</tt> implements the JIT interfaces for target-specific
@@ -2537,6 +2512,8 @@ with assembler.
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
 
 <hr>
@@ -2547,7 +2524,7 @@ with assembler.
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="http://www.woo.com">Mason Woo</a> and <a href="http://misha.brukman.net">Misha Brukman</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a>
   <br>
   Last modified: $Date$
 </address>
diff --git a/docs/WritingAnLLVMPass.html b/docs/WritingAnLLVMPass.html
index fe93a87..136f8fb 100644
--- a/docs/WritingAnLLVMPass.html
+++ b/docs/WritingAnLLVMPass.html
@@ -8,9 +8,9 @@
 </head>
 <body>
 
-<div class="doc_title">
+<h1>
   Writing an LLVM Pass
-</div>
+</h1>
 
 <ol>
   <li><a href="#introduction">Introduction - What is a pass?</a></li>
@@ -121,12 +121,12 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="introduction">Introduction - What is a pass?</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>The LLVM Pass Framework is an important part of the LLVM system, because LLVM
 passes are where most of the interesting parts of the compiler exist.  Passes
@@ -156,12 +156,12 @@ more advanced features are discussed.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="quickstart">Quick Start - Writing hello world</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Here we describe how to write the "hello world" of passes.  The "Hello" pass
 is designed to simply print out the name of non-external functions that exist in
@@ -169,14 +169,12 @@ the program being compiled.  It does not modify the program at all, it just
 inspects it.  The source code and files for this pass are available in the LLVM
 source tree in the <tt>lib/Transforms/Hello</tt> directory.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="makefile">Setting up the build environment</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
   <p>First, configure and build LLVM.  This needs to be done directly inside the
   LLVM source tree rather than in a separate objects directory.
@@ -185,7 +183,7 @@ source tree in the <tt>lib/Transforms/Hello</tt> directory.</p>
   <tt>lib/Transforms/Hello</tt>.  Finally, you must set up a build script 
   (Makefile) that will compile the source code for the new pass.  To do this, 
   copy the following into <tt>Makefile</tt>:</p>
-  <hr/>
+  <hr>
 
 <div class="doc_code"><pre>
 # Makefile for hello pass
@@ -211,17 +209,20 @@ the <tt>opt</tt> or <tt>bugpoint</tt> tools via their <tt>-load</tt> options.
 If your operating system uses a suffix other than .so (such as windows or 
 Mac OS/X), the appropriate extension will be used.</p>
 
+<p>If you are used CMake to build LLVM, see
+<a href="CMake.html#passdev">Developing an LLVM pass with CMake</a>.</p>
+
 <p>Now that we have the build scripts set up, we just need to write the code for
 the pass itself.</p>
 
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="basiccode">Basic code required</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Now that we have a way to compile our new pass, we just have to write it.
 Start out with:</p>
@@ -301,7 +302,7 @@ function.</p>
 initialization value is not important.</p>
 
 <div class="doc_code"><pre>
-  static RegisterPass<Hello> X("<i>hello</i>", "<i>Hello World Pass</i>",
+  static RegisterPass&lt;Hello&gt; X("<i>hello</i>", "<i>Hello World Pass</i>",
                         false /* Only looks at CFG */,
                         false /* Analysis Pass */);
 }  <i>// end of anonymous namespace</i>
@@ -337,7 +338,7 @@ is supplied as fourth argument. </p>
   };
   
   char Hello::ID = 0;
-  static RegisterPass<Hello> X("hello", "Hello World Pass", false, false);
+  static RegisterPass&lt;Hello&gt; X("hello", "Hello World Pass", false, false);
 }
 
 </pre></div>
@@ -353,11 +354,11 @@ them) to be useful.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="running">Running a pass with <tt>opt</tt></a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>Now that you have a brand new shiny shared object file, we can use the
 <tt>opt</tt> command to run an LLVM program through your pass.  Because you
@@ -443,13 +444,15 @@ about some more details of how they work and how to use them.</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="passtype">Pass classes and requirements</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>One of the first things that you should do when designing a new pass is to
 decide what class you should subclass for your pass.  The <a
@@ -464,14 +467,12 @@ listed.  This gives the LLVM Pass Infrastructure information necessary to
 optimize how passes are run, so that the resultant compiler isn't unnecessarily
 slow.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ImmutablePass">The <tt>ImmutablePass</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The most plain and boring type of pass is the "<tt><a
 href="http://llvm.org/doxygen/classllvm_1_1ImmutablePass.html">ImmutablePass</a></tt>"
@@ -490,11 +491,11 @@ invalidated, and are never "run".</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="ModulePass">The <tt>ModulePass</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The "<tt><a
 href="http://llvm.org/doxygen/classllvm_1_1ModulePass.html">ModulePass</a></tt>"
@@ -516,14 +517,12 @@ DominatorTree for function definitions, not declarations.</p>
 <tt>ModulePass</tt> and overload the <tt>runOnModule</tt> method with the
 following signature:</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="runOnModule">The <tt>runOnModule</tt> method</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <b>virtual bool</b> runOnModule(Module &amp;M) = 0;
@@ -535,12 +534,14 @@ false otherwise.</p>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="CallGraphSCCPass">The <tt>CallGraphSCCPass</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The "<tt><a
 href="http://llvm.org/doxygen/classllvm_1_1CallGraphSCCPass.html">CallGraphSCCPass</a></tt>"
@@ -581,15 +582,14 @@ because it has to handle SCCs with more than one node in it.  All of the virtual
 methods described below should return true if they modified the program, or
 false if they didn't.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="doInitialization_scc">The <tt>doInitialization(CallGraph &amp;)</tt>
-  method</a>
-</div>
+<h4>
+  <a name="doInitialization_scc">
+    The <tt>doInitialization(CallGraph &amp;)</tt> method
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <b>virtual bool</b> doInitialization(CallGraph &amp;CG);
@@ -606,11 +606,11 @@ fast).</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="runOnSCC">The <tt>runOnSCC</tt> method</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <b>virtual bool</b> runOnSCC(CallGraphSCC &amp;SCC) = 0;
@@ -623,12 +623,13 @@ otherwise.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="doFinalization_scc">The <tt>doFinalization(CallGraph
-   &amp;)</tt> method</a>
-</div>
+<h4>
+  <a name="doFinalization_scc">
+    The <tt>doFinalization(CallGraph &amp;)</tt> method
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <b>virtual bool</b> doFinalization(CallGraph &amp;CG);
@@ -641,12 +642,14 @@ program being compiled.</p>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="FunctionPass">The <tt>FunctionPass</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>In contrast to <tt>ModulePass</tt> subclasses, <tt><a
 href="http://llvm.org/doxygen/classllvm_1_1Pass.html">FunctionPass</a></tt>
@@ -671,15 +674,14 @@ href="#basiccode">Hello World</a> pass for example).  <tt>FunctionPass</tt>'s
 may overload three virtual methods to do their work.  All of these methods
 should return true if they modified the program, or false if they didn't.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="doInitialization_mod">The <tt>doInitialization(Module &amp;)</tt>
-  method</a>
-</div>
+<h4>
+  <a name="doInitialization_mod">
+    The <tt>doInitialization(Module &amp;)</tt> method
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <b>virtual bool</b> doInitialization(Module &amp;M);
@@ -703,11 +705,11 @@ free functions that it needs, adding prototypes to the module if necessary.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="runOnFunction">The <tt>runOnFunction</tt> method</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <b>virtual bool</b> runOnFunction(Function &amp;F) = 0;
@@ -720,12 +722,13 @@ be returned if the function is modified.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="doFinalization_mod">The <tt>doFinalization(Module
-  &amp;)</tt> method</a>
-</div>
+<h4>
+  <a name="doFinalization_mod">
+    The <tt>doFinalization(Module &amp;)</tt> method
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <b>virtual bool</b> doFinalization(Module &amp;M);
@@ -738,12 +741,14 @@ program being compiled.</p>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="LoopPass">The <tt>LoopPass</tt> class </a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p> All <tt>LoopPass</tt> execute on each loop in the function independent of
 all of the other loops in the function. <tt>LoopPass</tt> processes loops in
@@ -754,16 +759,15 @@ loop nest order such that outer most loop is processed last. </p>
 straightforward. <tt>LoopPass</tt>'s may overload three virtual methods to
 do their work. All these methods should return true if they modified the 
 program, or false if they didn't. </p>
-</div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="doInitialization_loop">The <tt>doInitialization(Loop *,
-                                                 LPPassManager &amp;)</tt>
-  method</a>
-</div>
+<h4>
+  <a name="doInitialization_loop">
+    The <tt>doInitialization(Loop *,LPPassManager &amp;)</tt> method
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <b>virtual bool</b> doInitialization(Loop *, LPPassManager &amp;LPM);
@@ -780,11 +784,11 @@ information.</p>
 
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="runOnLoop">The <tt>runOnLoop</tt> method</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <b>virtual bool</b> runOnLoop(Loop *, LPPassManager &amp;LPM) = 0;
@@ -798,11 +802,11 @@ should be used to update loop nest.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="doFinalization_loop">The <tt>doFinalization()</tt> method</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <b>virtual bool</b> doFinalization();
@@ -815,12 +819,14 @@ program being compiled. </p>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="RegionPass">The <tt>RegionPass</tt> class </a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p> <tt>RegionPass</tt> is similar to <a href="#LoopPass"><tt>LoopPass</tt></a>,
 but executes on each single entry single exit region in the function.
@@ -832,16 +838,15 @@ the <tt>RGPassManager</tt> interface. You may overload three virtual methods of
 <tt>RegionPass</tt> to implement your own region pass. All these
 methods should return true if they modified the program, or false if they didn not.
 </p>
-</div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="doInitialization_region">The <tt>doInitialization(Region *,
-                                                 RGPassManager &amp;)</tt>
-  method</a>
-</div>
+<h4>
+  <a name="doInitialization_region">
+    The <tt>doInitialization(Region *, RGPassManager &amp;)</tt> method
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <b>virtual bool</b> doInitialization(Region *, RGPassManager &amp;RGM);
@@ -858,11 +863,11 @@ information.</p>
 
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="runOnRegion">The <tt>runOnRegion</tt> method</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <b>virtual bool</b> runOnRegion(Region *, RGPassManager &amp;RGM) = 0;
@@ -876,11 +881,11 @@ should be used to update region tree.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="doFinalization_region">The <tt>doFinalization()</tt> method</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <b>virtual bool</b> doFinalization();
@@ -893,14 +898,14 @@ program being compiled. </p>
 
 </div>
 
-
+</div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="BasicBlockPass">The <tt>BasicBlockPass</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p><tt>BasicBlockPass</tt>'s are just like <a
 href="#FunctionPass"><tt>FunctionPass</tt></a>'s, except that they must limit
@@ -922,15 +927,14 @@ href="#doInitialization_mod"><tt>doInitialization(Module &amp;)</tt></a> and <a
 href="#doFinalization_mod"><tt>doFinalization(Module &amp;)</tt></a> methods that <a
 href="#FunctionPass"><tt>FunctionPass</tt></a>'s have, but also have the following virtual methods that may also be implemented:</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="doInitialization_fn">The <tt>doInitialization(Function
-  &amp;)</tt> method</a>
-</div>
+<h4>
+  <a name="doInitialization_fn">
+    The <tt>doInitialization(Function &amp;)</tt> method
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <b>virtual bool</b> doInitialization(Function &amp;F);
@@ -947,11 +951,11 @@ fast).</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="runOnBasicBlock">The <tt>runOnBasicBlock</tt> method</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <b>virtual bool</b> runOnBasicBlock(BasicBlock &amp;BB) = 0;
@@ -965,12 +969,13 @@ if the basic block is modified.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="doFinalization_fn">The <tt>doFinalization(Function &amp;)</tt> 
-  method</a>
-</div>
+<h4>
+  <a name="doFinalization_fn">
+    The <tt>doFinalization(Function &amp;)</tt> method
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <b>virtual bool</b> doFinalization(Function &amp;F);
@@ -984,12 +989,14 @@ finalization.</p>
 
 </div>
 
+</div>
+
 <!-- ======================================================================= -->
-<div class="doc_subsection">
+<h3>
   <a name="MachineFunctionPass">The <tt>MachineFunctionPass</tt> class</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>A <tt>MachineFunctionPass</tt> is a part of the LLVM code generator that
 executes on the machine-dependent representation of each LLVM function in the
@@ -1014,15 +1021,14 @@ href="#runOnMachineFunction"><tt>runOnMachineFunction</tt></a> (including global
 data)</li>
 </ol>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="runOnMachineFunction">The <tt>runOnMachineFunction(MachineFunction
-  &amp;MF)</tt> method</a>
-</div>
+<h4>
+  <a name="runOnMachineFunction">
+    The <tt>runOnMachineFunction(MachineFunction &amp;MF)</tt> method
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <b>virtual bool</b> runOnMachineFunction(MachineFunction &amp;MF) = 0;
@@ -1043,13 +1049,17 @@ remember, you may not modify the LLVM <tt>Function</tt> or its contents from a
 
 </div>
 
+</div>
+
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="registration">Pass registration</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>In the <a href="#basiccode">Hello World</a> example pass we illustrated how
 pass registration works, and discussed some of the reasons that it is used and
@@ -1066,14 +1076,12 @@ well as for debug output generated by the <tt>--debug-pass</tt> option.</p>
 <p>If you want your pass to be easily dumpable, you should 
 implement the virtual <tt>print</tt> method:</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="print">The <tt>print</tt> method</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <b>virtual void</b> print(std::ostream &amp;O, <b>const</b> Module *M) <b>const</b>;
@@ -1093,13 +1101,15 @@ depended on.</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="interaction">Specifying interactions between passes</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>One of the main responsibilities of the <tt>PassManager</tt> is to make sure
 that passes interact with each other correctly.  Because <tt>PassManager</tt>
@@ -1116,14 +1126,12 @@ specifies.  If a pass does not implement the <tt><a
 href="#getAnalysisUsage">getAnalysisUsage</a></tt> method, it defaults to not
 having any prerequisite passes, and invalidating <b>all</b> other passes.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="getAnalysisUsage">The <tt>getAnalysisUsage</tt> method</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <b>virtual void</b> getAnalysisUsage(AnalysisUsage &amp;Info) <b>const</b>;
@@ -1139,11 +1147,14 @@ object:</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="AU::addRequired">The <tt>AnalysisUsage::addRequired&lt;&gt;</tt> and <tt>AnalysisUsage::addRequiredTransitive&lt;&gt;</tt> methods</a>
-</div>
-
-<div class="doc_text">
+<h4>
+  <a name="AU::addRequired">
+    The <tt>AnalysisUsage::addRequired&lt;&gt;</tt>
+    and <tt>AnalysisUsage::addRequiredTransitive&lt;&gt;</tt> methods
+  </a>
+</h4>
+
+<div>
 <p>
 If your pass requires a previous pass to be executed (an analysis for example),
 it can use one of these methods to arrange for it to be run before your pass.
@@ -1165,11 +1176,13 @@ pass is.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="AU::addPreserved">The <tt>AnalysisUsage::addPreserved&lt;&gt;</tt> method</a>
-</div>
+<h4>
+  <a name="AU::addPreserved">
+    The <tt>AnalysisUsage::addPreserved&lt;&gt;</tt> method
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 <p>
 One of the jobs of the PassManager is to optimize how and when analyses are run.
 In particular, it attempts to avoid recomputing data unless it needs to.  For
@@ -1200,11 +1213,13 @@ the fact that it hacks on the CFG.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="AU::examples">Example implementations of <tt>getAnalysisUsage</tt></a>
-</div>
+<h4>
+  <a name="AU::examples">
+    Example implementations of <tt>getAnalysisUsage</tt>
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <i>// This example modifies the program, but does not modify the CFG</i>
@@ -1217,12 +1232,14 @@ the fact that it hacks on the CFG.
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
-  <a name="getAnalysis">The <tt>getAnalysis&lt;&gt;</tt> and
-<tt>getAnalysisIfAvailable&lt;&gt;</tt> methods</a>
-</div>
+<h4>
+  <a name="getAnalysis">
+    The <tt>getAnalysis&lt;&gt;</tt> and
+    <tt>getAnalysisIfAvailable&lt;&gt;</tt> methods
+  </a>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>Pass::getAnalysis&lt;&gt;</tt> method is automatically inherited by
 your class, providing you with access to the passes that you declared that you
@@ -1274,13 +1291,15 @@ if it is active.  For example:</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="analysisgroup">Implementing Analysis Groups</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Now that we understand the basics of how passes are defined, how they are
 used, and how they are required from other passes, it's time to get a little bit
@@ -1299,14 +1318,12 @@ between these two extremes for other implementations).  To cleanly support
 situations like this, the LLVM Pass Infrastructure supports the notion of
 Analysis Groups.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="agconcepts">Analysis Group Concepts</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>An Analysis Group is a single simple interface that may be implemented by
 multiple different passes.  Analysis Groups can be given human readable names
@@ -1353,11 +1370,11 @@ hypothetical example) instead.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="registerag">Using <tt>RegisterAnalysisGroup</tt></a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>The <tt>RegisterAnalysisGroup</tt> template is used to register the analysis
 group itself, while the <tt>INITIALIZE_AG_PASS</tt> is used to add pass
@@ -1414,13 +1431,15 @@ pass is the default implementation for the interface.</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="passStatistics">Pass Statistics</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 <p>The <a
 href="http://llvm.org/doxygen/Statistic_8h-source.html"><tt>Statistic</tt></a>
 class is designed to be an easy way to expose various success
@@ -1432,12 +1451,12 @@ line. See the <a href="http://llvm.org/docs/ProgrammersManual.html#Statistic">St
 
 
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="passmanager">What PassManager does</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>The <a
 href="http://llvm.org/doxygen/PassManager_8h-source.html"><tt>PassManager</tt></a>
@@ -1604,14 +1623,12 @@ Hello: main
 <p>Which shows that we don't accidentally invalidate dominator information
 anymore, and therefore do not have to compute it twice.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="releaseMemory">The <tt>releaseMemory</tt> method</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <div class="doc_code"><pre>
   <b>virtual void</b> releaseMemory();
@@ -1632,13 +1649,15 @@ class, before the next call of <tt>run*</tt> in your pass.</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="registering">Registering dynamically loaded passes</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p><i>Size matters</i> when constructing production quality tools using llvm, 
 both for the purposes of distribution, and for regulating the resident code size
@@ -1665,14 +1684,12 @@ the static destructor <i>unregisters</i>. Thus a pass that is statically linked
 in the tool will be registered at start up. A dynamically loaded pass will
 register on load and unregister at unload.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection">
+<h3>
   <a name="registering_existing">Using existing registries</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>There are predefined registries to track instruction scheduling
 (<tt>RegisterScheduler</tt>) and register allocation (<tt>RegisterRegAlloc</tt>)
@@ -1733,11 +1750,11 @@ call line to <tt>llvm/Codegen/LinkAllCodegenComponents.h</tt>.</p>
 
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsection">
+<h3>
   <a name="registering_new">Creating new registries</a>
-</div>
+</h3>
 
-<div class="doc_text">
+<div>
 
 <p>The easiest way to get started is to clone one of the existing registries; we
 recommend <tt>llvm/CodeGen/RegAllocRegistry.h</tt>.  The key things to modify
@@ -1765,13 +1782,15 @@ creator.</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="debughints">Using GDB with dynamically loaded passes</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Unfortunately, using GDB with dynamically loaded passes is not as easy as it
 should be.  First of all, you can't set a breakpoint in a shared object that has
@@ -1783,14 +1802,12 @@ GDB.</p>
 transformation invoked by <tt>opt</tt>, although nothing described here depends
 on that.</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="breakpoint">Setting a breakpoint in your pass</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>First thing you do is start <tt>gdb</tt> on the <tt>opt</tt> process:</p>
 
@@ -1831,11 +1848,11 @@ or do other standard debugging stuff.</p>
 </div>
 
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="debugmisc">Miscellaneous Problems</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Once you have the basics down, there are a couple of problems that GDB has,
 some with solutions, some without.</p>
@@ -1863,26 +1880,26 @@ href="mailto:sabre@nondot.org">Chris</a>.</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section">
+<h2>
   <a name="future">Future extensions planned</a>
-</div>
+</h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Although the LLVM Pass Infrastructure is very capable as it stands, and does
 some nifty stuff, there are things we'd like to add in the future.  Here is
 where we are going:</p>
 
-</div>
-
 <!-- _______________________________________________________________________ -->
-<div class="doc_subsubsection">
+<h4>
   <a name="SMP">Multithreaded LLVM</a>
-</div>
+</h4>
 
-<div class="doc_text">
+<div>
 
 <p>Multiple CPU machines are becoming more common and compilation can never be
 fast enough: obviously we should allow for a multithreaded compiler.  Because of
@@ -1900,6 +1917,8 @@ Despite that, we have kept the LLVM passes SMP ready, and you should too.</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
 <hr>
 <address>
@@ -1909,7 +1928,7 @@ Despite that, we have kept the LLVM passes SMP ready, and you should too.</p>
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 
diff --git a/docs/doxygen.css b/docs/doxygen.css
index 83b049b..80c6cad 100644
--- a/docs/doxygen.css
+++ b/docs/doxygen.css
@@ -370,9 +370,39 @@ H2 {
 H3 {
  font-size: 100%;
 }
+
+H2, H3 {
+  border-bottom: 2px solid;
+  margin-top: 2em;
+}
+
 A.qindex {}
 A.qindexRef {}
 A.el { text-decoration: none; font-weight: bold }
 A.elRef { font-weight: bold }
 A.code { text-decoration: none; font-weight: normal; color: #4444ee }
 A.codeRef { font-weight: normal; color: #4444ee }
+
+div.memitem {
+  border: 1px solid #999999;
+  margin-top: 1.0em;
+  margin-bottom: 1.0em;
+  -webkit-border-radius: 0.5em;
+  -webkit-box-shadow: 3px 3px 6px #777777;
+  -moz-border-radius: 0.5em;
+  -moz-box-shadow: black 3px 3px 3px;
+}
+
+div.memproto {
+  background-color: #E3E4E5;
+  padding: 0.25em 0.5em;
+  -webkit-border-top-left-radius: 0.5em;
+  -webkit-border-top-right-radius: 0.5em;
+  -moz-border-radius-topleft: 0.5em;
+  -moz-border-radius-topright: 0.5em;
+}
+
+div.memdoc {
+  padding-left: 1em;
+  padding-right: 1em;
+}
diff --git a/docs/doxygen.footer b/docs/doxygen.footer
index d75fff5..15585b8 100644
--- a/docs/doxygen.footer
+++ b/docs/doxygen.footer
@@ -1,6 +1,6 @@
 <hr>
 <p class="footer">
-Generated on $datetime for <a href="http://llvm.org">$projectname</a> by
+Generated on $datetime for <a href="http://llvm.org/">$projectname</a> by
 <a href="http://www.doxygen.org"><img src="doxygen.png" alt="Doxygen"
 align="middle" border="0"/>$doxygenversion</a><br>
 Copyright &copy; 2003-2009 University of Illinois at Urbana-Champaign.
diff --git a/docs/index.html b/docs/index.html
index bf12f71..f9ebaa1 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -7,13 +7,12 @@
 </head>
 <body>
 
-<div class="doc_title">Documentation for the LLVM System at SVN head</div>
+<h1>Documentation for the LLVM System at SVN head</h1>
 
 <p class="doc_warning">If you are using a released version of LLVM,
 see <a href="http://llvm.org/releases/">the download page</a> to find
 your documentation.</p>
 
-<div class="doc_text">
 <table class="layout" width="95%"><tr class="layout"><td class="left">
 <ul>
   <li><a href="#llvmdesign">LLVM Design</a></li>
@@ -32,22 +31,20 @@ your documentation.</p>
     </p>
   </form>
 </td></tr></table>
-</div>
 
 <div class="doc_author">
-  <p>Written by <a href="http://llvm.org">The LLVM Team</a></p>
+  <p>Written by <a href="http://llvm.org/">The LLVM Team</a></p>
 </div>
 
 <!--=======================================================================-->
-<div class="doc_section"><a name="llvmdesign">LLVM Design &amp; Overview</a></div>
+<h2><a name="llvmdesign">LLVM Design &amp; Overview</a></h2>
 <!--=======================================================================-->
 
 <ul>
 <li><a href="LangRef.html">LLVM Language Reference Manual</a> - Defines the LLVM
 intermediate representation.</li>
-<li><a href="http://llvm.org/pubs/2008-10-04-ACAT-LLVM-Intro.html">Introduction to the LLVM Compiler </a> - Presentation describing LLVM.</li>
-<li><a href="http://llvm.org/pubs/2004-09-22-LCPCLLVMTutorial.html">The LLVM Compiler Framework and
-Infrastructure Tutorial</a> - Tutorial for writing passes, exploring the system.</li>
+<li><a href="http://llvm.org/pubs/2008-10-04-ACAT-LLVM-Intro.html">Introduction to the LLVM Compiler </a> - Presentation providing a users introduction to LLVM.</li>
+<li><a href="http://www.aosabook.org/en/llvm.html">Intro to LLVM</a> - book chapter providing a compiler hacker's introduction to LLVM.</li>
 <li><a href="http://llvm.org/pubs/2004-01-30-CGO-LLVM.html">LLVM: A Compilation Framework for
 Lifelong Program Analysis &amp; Transformation</a> - Design overview.</li>
 <li><a href="http://llvm.org/pubs/2002-12-LattnerMSThesis.html">LLVM: An Infrastructure for
@@ -57,7 +54,7 @@ frequent questions about LLVM's most frequently misunderstood instruction.</li>
 </ul>
 
 <!--=======================================================================-->
-<div class="doc_section"><a name="userguide">LLVM User Guides</a></div>
+<h2><a name="userguide">LLVM User Guides</a></h2>
 <!--=======================================================================-->
 
 <ul>
@@ -75,7 +72,7 @@ LLVM for a custom language, and the facilities LLVM offers in tutorial form.</li
 <li><a href="DeveloperPolicy.html">Developer Policy</a> - The LLVM project's
 policy towards developers and their contributions.</li>
 
-<li><a href="/docs/CommandGuide/index.html">LLVM Command Guide</a> - A reference
+<li><a href="CommandGuide/index.html">LLVM Command Guide</a> - A reference
 manual for the LLVM command line utilities ("man" pages for LLVM tools).<br>
 Current tools:
  <a href="/cmds/llvm-ar.html">llvm-ar</a>,
@@ -131,7 +128,7 @@ href="irc://irc.oftc.net/llvm">join #llvm on irc.oftc.net</a> directly.</li>
 
 
 <!--=======================================================================-->
-<div class="doc_section"><a name="llvmprog">General LLVM Programming Documentation</a></div>
+<h2><a name="llvmprog">General LLVM Programming Documentation</a></h2>
 <!--=======================================================================-->
 
 <ul>
@@ -179,7 +176,7 @@ href="http://llvm.org/doxygen/inherits.html">classes</a>)
 </ul>
 
 <!--=======================================================================-->
-<div class="doc_section"><a name="subsystems">LLVM Subsystem Documentation</a></div>
+<h2><a name="subsystems">LLVM Subsystem Documentation</a></h2>
 <!--=======================================================================-->
 
 <ul>
@@ -246,7 +243,7 @@ JITed code with GDB.</li>
 
 
 <!--=======================================================================-->
-<div class="doc_section"><a name="maillist">LLVM Mailing Lists</a></div>
+<h2><a name="maillist">LLVM Mailing Lists</a></h2>
 <!--=======================================================================-->
 
 <ul>
@@ -286,7 +283,7 @@ times each day, making it a high volume list.</li>
   <a href="http://validator.w3.org/check/referer"><img
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
-  <a href="http://llvm.org">LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body></html>
diff --git a/docs/llvm.css b/docs/llvm.css
index f572b5e..1222cf1 100644
--- a/docs/llvm.css
+++ b/docs/llvm.css
@@ -23,7 +23,7 @@ th          { border: 2px solid gray; font-weight: bold; font-size: 105%;
  * Documentation
  */
 /* Common for title and header */
-.doc_title, .doc_section, .doc_subsection, h1, h2 {
+.doc_title, .doc_section, .doc_subsection, h1, h2, h3 {
   color: black; background: url("img/lines.gif");
   font-family: "Georgia,Palatino,Times,Roman,SanSerif"; font-weight: bold;
   border-width: 1px;
@@ -35,17 +35,17 @@ th          { border: 2px solid gray; font-weight: bold; font-size: 105%;
   padding-bottom: 2px
 }
 
-h1, .doc_section   { text-align: center; font-size: 22pt;
-                     margin: 20pt 0pt 5pt 0pt; }
+h1, .doc_title, .title { text-align: left;   font-size: 25pt }
 
-.doc_title, .title { text-align: left;   font-size: 25pt }
+h2, .doc_section   { text-align: center; font-size: 22pt;
+                     margin: 20pt 0pt 5pt 0pt; }
 
-h2, .doc_subsection { width: 75%;
+h3, .doc_subsection { width: 75%;
                       text-align: left;  font-size: 12pt;
                       padding: 4pt 4pt 4pt 4pt;
                       margin: 1.5em 0.5em 0.5em 0.5em }
 
-h3, .doc_subsubsection { margin: 2.0em 0.5em 0.5em 0.5em;
+h4, .doc_subsubsection { margin: 2.0em 0.5em 0.5em 0.5em;
                          font-weight: bold; font-style: oblique;
                          border-bottom: 1px solid #999999; font-size: 12pt;
                          width: 75%; }
@@ -70,6 +70,10 @@ h3, .doc_subsubsection { margin: 2.0em 0.5em 0.5em 0.5em;
                   display: table;
                 }
 
+h2+div, h2+p {text-align: left; padding-left: 20pt; padding-right: 10pt;}
+h3+div, h3+p {text-align: left; padding-left: 20pt; padding-right: 10pt;}
+h4+div, h4+p {text-align: left; padding-left: 20pt; padding-right: 10pt;}
+
 /* It is preferrable to use <pre class="doc_code"> everywhere instead of the
  * <div class="doc_code"><pre>...</ptr></div> construct.
  *
diff --git a/docs/tutorial/LangImpl1.html b/docs/tutorial/LangImpl1.html
index 66843db..2e1746f 100644
--- a/docs/tutorial/LangImpl1.html
+++ b/docs/tutorial/LangImpl1.html
@@ -11,7 +11,7 @@
 
 <body>
 
-<div class="doc_title">Kaleidoscope: Tutorial Introduction and the Lexer</div>
+<h1>Kaleidoscope: Tutorial Introduction and the Lexer</h1>
 
 <ul>
 <li><a href="index.html">Up to Tutorial Index</a></li>
@@ -30,10 +30,10 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="intro">Tutorial Introduction</a></div>
+<h2><a name="intro">Tutorial Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Welcome to the "Implementing a language with LLVM" tutorial.  This tutorial
 runs through the implementation of a simple language, showing how fun and
@@ -123,10 +123,10 @@ languages!</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="language">The Basic Language</a></div>
+<h2><a name="language">The Basic Language</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This tutorial will be illustrated with a toy language that we'll call
 "<a href="http://en.wikipedia.org/wiki/Kaleidoscope">Kaleidoscope</a>" (derived 
@@ -181,10 +181,10 @@ a Mandelbrot Set</a> at various levels of magnification.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="lexer">The Lexer</a></div>
+<h2><a name="lexer">The Lexer</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>When it comes to implementing a language, the first thing needed is
 the ability to process a text file and recognize what it says.  The traditional
@@ -341,7 +341,7 @@ so that you can use the lexer and parser together.
   src="http://www.w3.org/Icons/valid-html401" alt="Valid HTML 4.01!"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/tutorial/LangImpl2.html b/docs/tutorial/LangImpl2.html
index 9c13b48..acccd20 100644
--- a/docs/tutorial/LangImpl2.html
+++ b/docs/tutorial/LangImpl2.html
@@ -11,7 +11,7 @@
 
 <body>
 
-<div class="doc_title">Kaleidoscope: Implementing a Parser and AST</div>
+<h1>Kaleidoscope: Implementing a Parser and AST</h1>
 
 <ul>
 <li><a href="index.html">Up to Tutorial Index</a></li>
@@ -36,10 +36,10 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="intro">Chapter 2 Introduction</a></div>
+<h2><a name="intro">Chapter 2 Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Welcome to Chapter 2 of the "<a href="index.html">Implementing a language
 with LLVM</a>" tutorial.  This chapter shows you how to use the lexer, built in 
@@ -61,10 +61,10 @@ Tree.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="ast">The Abstract Syntax Tree (AST)</a></div>
+<h2><a name="ast">The Abstract Syntax Tree (AST)</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>The AST for a program captures its behavior in such a way that it is easy for
 later stages of the compiler (e.g. code generation) to interpret.  We basically
@@ -178,10 +178,10 @@ bodies in Kaleidoscope.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="parserbasics">Parser Basics</a></div>
+<h2><a name="parserbasics">Parser Basics</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Now that we have an AST to build, we need to define the parser code to build
 it.  The idea here is that we want to parse something like "x+y" (which is
@@ -239,11 +239,10 @@ piece of our grammar: numeric literals.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="parserprimexprs">Basic Expression
- Parsing</a></div>
+<h2><a name="parserprimexprs">Basic Expression Parsing</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>We start with numeric literals, because they are the simplest to process.
 For each production in our grammar, we'll define a function which parses that
@@ -394,11 +393,10 @@ They are a bit more complex.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="parserbinops">Binary Expression
- Parsing</a></div>
+<h2><a name="parserbinops">Binary Expression Parsing</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Binary expressions are significantly harder to parse because they are often
 ambiguous.  For example, when given the string "x+y*z", the parser can choose
@@ -617,10 +615,10 @@ handle function definitions, etc.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="parsertop">Parsing the Rest</a></div>
+<h2><a name="parsertop">Parsing the Rest</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 The next thing missing is handling of function prototypes.  In Kaleidoscope,
@@ -714,10 +712,10 @@ actually <em>execute</em> this code we've built!</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="driver">The Driver</a></div>
+<h2><a name="driver">The Driver</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>The driver for this simply invokes all of the parsing pieces with a top-level
 dispatch loop.  There isn't much interesting here, so I'll just include the
@@ -753,10 +751,10 @@ type "4+5;", and the parser will know you are done.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="conclusions">Conclusions</a></div>
+<h2><a name="conclusions">Conclusions</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>With just under 400 lines of commented code (240 lines of non-comment, 
 non-blank code), we fully defined our minimal language, including a lexer,
@@ -790,10 +788,10 @@ Representation (IR) from the AST.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="code">Full Code Listing</a></div>
+<h2><a name="code">Full Code Listing</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 Here is the complete code listing for this and the previous chapter.  
@@ -1226,7 +1224,7 @@ int main() {
   src="http://www.w3.org/Icons/valid-html401" alt="Valid HTML 4.01!"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/tutorial/LangImpl3.html b/docs/tutorial/LangImpl3.html
index fe09172..c9f4cee 100644
--- a/docs/tutorial/LangImpl3.html
+++ b/docs/tutorial/LangImpl3.html
@@ -11,7 +11,7 @@
 
 <body>
 
-<div class="doc_title">Kaleidoscope: Code generation to LLVM IR</div>
+<h1>Kaleidoscope: Code generation to LLVM IR</h1>
 
 <ul>
 <li><a href="index.html">Up to Tutorial Index</a></li>
@@ -34,10 +34,10 @@ Support</li>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="intro">Chapter 3 Introduction</a></div>
+<h2><a name="intro">Chapter 3 Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Welcome to Chapter 3 of the "<a href="index.html">Implementing a language
 with LLVM</a>" tutorial.  This chapter shows you how to transform the <a 
@@ -57,10 +57,10 @@ releases page</a>.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="basics">Code Generation Setup</a></div>
+<h2><a name="basics">Code Generation Setup</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 In order to generate LLVM IR, we want some simple setup to get started.  First
@@ -147,10 +147,10 @@ has already been done, and we'll just use it to emit code.
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="exprs">Expression Code Generation</a></div>
+<h2><a name="exprs">Expression Code Generation</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Generating LLVM code for expression nodes is very straightforward: less
 than 45 lines of commented code for all four of our expression nodes.  First
@@ -293,10 +293,10 @@ basic framework.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="funcs">Function Code Generation</a></div>
+<h2><a name="funcs">Function Code Generation</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Code generation for prototypes and functions must handle a number of
 details, which make their code less beautiful than expression code
@@ -515,11 +515,10 @@ def bar() foo(1, 2); # error, unknown function "foo"
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="driver">Driver Changes and 
-Closing Thoughts</a></div>
+<h2><a name="driver">Driver Changes and Closing Thoughts</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 For now, code generation to LLVM doesn't really get us much, except that we can
@@ -657,10 +656,10 @@ support</a> to this so we can actually start running code!</p>
 
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="code">Full Code Listing</a></div>
+<h2><a name="code">Full Code Listing</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 Here is the complete code listing for our running example, enhanced with the
@@ -1262,7 +1261,7 @@ int main() {
   src="http://www.w3.org/Icons/valid-html401" alt="Valid HTML 4.01!"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/tutorial/LangImpl4.html b/docs/tutorial/LangImpl4.html
index 3eb5be4..fe54fb5 100644
--- a/docs/tutorial/LangImpl4.html
+++ b/docs/tutorial/LangImpl4.html
@@ -11,7 +11,7 @@
 
 <body>
 
-<div class="doc_title">Kaleidoscope: Adding JIT and Optimizer Support</div>
+<h1>Kaleidoscope: Adding JIT and Optimizer Support</h1>
 
 <ul>
 <li><a href="index.html">Up to Tutorial Index</a></li>
@@ -33,10 +33,10 @@ Flow</li>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="intro">Chapter 4 Introduction</a></div>
+<h2><a name="intro">Chapter 4 Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Welcome to Chapter 4 of the "<a href="index.html">Implementing a language
 with LLVM</a>" tutorial.  Chapters 1-3 described the implementation of a simple
@@ -48,11 +48,10 @@ for the Kaleidoscope language.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="trivialconstfold">Trivial Constant
-Folding</a></div>
+<h2><a name="trivialconstfold">Trivial Constant Folding</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 Our demonstration for Chapter 3 is elegant and easy to extend.  Unfortunately,
@@ -134,11 +133,10 @@ range of optimizations that you can use, in the form of "passes".</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="optimizerpasses">LLVM Optimization
- Passes</a></div>
+<h2><a name="optimizerpasses">LLVM Optimization Passes</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>LLVM provides many optimization passes, which do many different sorts of
 things and have different tradeoffs.  Unlike other systems, LLVM doesn't hold
@@ -266,10 +264,10 @@ executing it!</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="jit">Adding a JIT Compiler</a></div>
+<h2><a name="jit">Adding a JIT Compiler</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Code that is available in LLVM IR can have a wide variety of tools 
 applied to it.  For example, you can run optimizations on it (as we did above),
@@ -474,10 +472,10 @@ tackling some interesting LLVM IR issues along the way.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="code">Full Code Listing</a></div>
+<h2><a name="code">Full Code Listing</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 Here is the complete code listing for our running example, enhanced with the
@@ -1078,7 +1076,7 @@ int main() {
 
   // Create the JIT.  This takes ownership of the module.
   std::string ErrStr;
-  TheExecutionEngine = EngineBuilder(TheModule).setErrorStr(&ErrStr).create();
+TheExecutionEngine = EngineBuilder(TheModule).setErrorStr(&amp;ErrStr).create();
   if (!TheExecutionEngine) {
     fprintf(stderr, "Could not create ExecutionEngine: %s\n", ErrStr.c_str());
     exit(1);
@@ -1130,7 +1128,7 @@ int main() {
   src="http://www.w3.org/Icons/valid-html401" alt="Valid HTML 4.01!"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/tutorial/LangImpl5.html b/docs/tutorial/LangImpl5.html
index e993d96..e46ded1 100644
--- a/docs/tutorial/LangImpl5.html
+++ b/docs/tutorial/LangImpl5.html
@@ -11,7 +11,7 @@
 
 <body>
 
-<div class="doc_title">Kaleidoscope: Extending the Language: Control Flow</div>
+<h1>Kaleidoscope: Extending the Language: Control Flow</h1>
 
 <ul>
 <li><a href="index.html">Up to Tutorial Index</a></li>
@@ -48,10 +48,10 @@ User-defined Operators</li>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="intro">Chapter 5 Introduction</a></div>
+<h2><a name="intro">Chapter 5 Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Welcome to Chapter 5 of the "<a href="index.html">Implementing a language
 with LLVM</a>" tutorial.  Parts 1-4 described the implementation of the simple
@@ -65,10 +65,10 @@ have an if/then/else expression plus a simple 'for' loop.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="ifthen">If/Then/Else</a></div>
+<h2><a name="ifthen">If/Then/Else</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 Extending Kaleidoscope to support if/then/else is quite straightforward.  It
@@ -108,15 +108,12 @@ Since Kaleidoscope allows side-effects, this behavior is important to nail down.
 <p>Now that we know what we "want", lets break this down into its constituent
 pieces.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="iflexer">Lexer Extensions for
-If/Then/Else</a></div>
+<h4><a name="iflexer">Lexer Extensions for If/Then/Else</a></h4>
 <!-- ======================================================================= -->
 
 
-<div class="doc_text">
+<div>
 
 <p>The lexer extensions are straightforward.  First we add new enum values
 for the relevant tokens:</p>
@@ -146,11 +143,10 @@ stuff:</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="ifast">AST Extensions for
- If/Then/Else</a></div>
+<h4><a name="ifast">AST Extensions for If/Then/Else</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>To represent the new expression we add a new AST node for it:</p>
 
@@ -172,11 +168,10 @@ public:
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="ifparser">Parser Extensions for
-If/Then/Else</a></div>
+<h4><a name="ifparser">Parser Extensions for If/Then/Else</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>Now that we have the relevant tokens coming from the lexer and we have the
 AST node to build, our parsing logic is relatively straightforward.  First we
@@ -231,10 +226,10 @@ static ExprAST *ParsePrimary() {
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="ifir">LLVM IR for If/Then/Else</a></div>
+<h4><a name="ifir">LLVM IR for If/Then/Else</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>Now that we have it parsing and building the AST, the final piece is adding
 LLVM code generation support.  This is the most interesting part of the
@@ -347,11 +342,10 @@ directly.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="ifcodegen">Code Generation for 
-If/Then/Else</a></div>
+<h4><a name="ifcodegen">Code Generation for If/Then/Else</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>In order to generate code for this, we implement the <tt>Codegen</tt> method
 for <tt>IfExprAST</tt>:</p>
@@ -500,11 +494,13 @@ another useful expression that is familiar from non-functional languages...</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="for">'for' Loop Expression</a></div>
+<h2><a name="for">'for' Loop Expression</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Now that we know how to add basic control flow constructs to the language,
 we have the tools to add more powerful things.  Lets add something more
@@ -533,14 +529,11 @@ variables, it will get more useful.</p>
 <p>As before, lets talk about the changes that we need to Kaleidoscope to
 support this.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="forlexer">Lexer Extensions for
-the 'for' Loop</a></div>
+<h4><a name="forlexer">Lexer Extensions for the 'for' Loop</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>The lexer extensions are the same sort of thing as for if/then/else:</p>
 
@@ -566,11 +559,10 @@ the 'for' Loop</a></div>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="forast">AST Extensions for
-the 'for' Loop</a></div>
+<h4><a name="forast">AST Extensions for the 'for' Loop</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>The AST node is just as simple.  It basically boils down to capturing
 the variable name and the constituent expressions in the node.</p>
@@ -593,11 +585,10 @@ public:
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="forparser">Parser Extensions for
-the 'for' Loop</a></div>
+<h4><a name="forparser">Parser Extensions for the 'for' Loop</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>The parser code is also fairly standard.  The only interesting thing here is
 handling of the optional step value.  The parser code handles it by checking to
@@ -653,11 +644,10 @@ static ExprAST *ParseForExpr() {
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="forir">LLVM IR for 
-the 'for' Loop</a></div>
+<h4><a name="forir">LLVM IR for the 'for' Loop</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>Now we get to the good part: the LLVM IR we want to generate for this thing.
 With the simple example above, we get this LLVM IR (note that this dump is
@@ -699,11 +689,10 @@ expressions, and some basic blocks.  Lets see how this fits together.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="forcodegen">Code Generation for 
-the 'for' Loop</a></div>
+<h4><a name="forcodegen">Code Generation for the 'for' Loop</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>The first part of Codegen is very simple: we just output the start expression
 for the loop value:</p>
@@ -876,11 +865,13 @@ language.</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="code">Full Code Listing</a></div>
+<h2><a name="code">Full Code Listing</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 Here is the complete code listing for our running example, enhanced with the
@@ -1721,7 +1712,7 @@ int main() {
 
   // Create the JIT.  This takes ownership of the module.
   std::string ErrStr;
-  TheExecutionEngine = EngineBuilder(TheModule).setErrorStr(&ErrStr).create();
+  TheExecutionEngine = EngineBuilder(TheModule).setErrorStr(&amp;ErrStr).create();
   if (!TheExecutionEngine) {
     fprintf(stderr, "Could not create ExecutionEngine: %s\n", ErrStr.c_str());
     exit(1);
@@ -1773,7 +1764,7 @@ int main() {
   src="http://www.w3.org/Icons/valid-html401" alt="Valid HTML 4.01!"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/tutorial/LangImpl6.html b/docs/tutorial/LangImpl6.html
index 3e83cf6..39264cf 100644
--- a/docs/tutorial/LangImpl6.html
+++ b/docs/tutorial/LangImpl6.html
@@ -11,7 +11,7 @@
 
 <body>
 
-<div class="doc_title">Kaleidoscope: Extending the Language: User-defined Operators</div>
+<h1>Kaleidoscope: Extending the Language: User-defined Operators</h1>
 
 <ul>
 <li><a href="index.html">Up to Tutorial Index</a></li>
@@ -34,10 +34,10 @@ Variables / SSA Construction</li>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="intro">Chapter 6 Introduction</a></div>
+<h2><a name="intro">Chapter 6 Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Welcome to Chapter 6 of the "<a href="index.html">Implementing a language
 with LLVM</a>" tutorial.  At this point in our tutorial, we now have a fully
@@ -60,10 +60,10 @@ an example of what you can build with Kaleidoscope and its feature set.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="idea">User-defined Operators: the Idea</a></div>
+<h2><a name="idea">User-defined Operators: the Idea</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 The "operator overloading" that we will add to Kaleidoscope is more general than
@@ -125,10 +125,10 @@ operators.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="binary">User-defined Binary Operators</a></div>
+<h2><a name="binary">User-defined Binary Operators</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Adding support for user-defined binary operators is pretty simple with our
 current framework.  We'll first add support for the unary/binary keywords:</p>
@@ -342,10 +342,10 @@ see what it takes.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="unary">User-defined Unary Operators</a></div>
+<h2><a name="unary">User-defined Unary Operators</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Since we don't currently support unary operators in the Kaleidoscope
 language, we'll need to add everything to support them.  Above, we added simple
@@ -491,10 +491,10 @@ is simpler primarily because it doesn't need to handle any predefined operators.
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="example">Kicking the Tires</a></div>
+<h2><a name="example">Kicking the Tires</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>It is somewhat hard to believe, but with a few simple extensions we've
 covered in the last chapters, we have grown a real-ish language.  With this, we 
@@ -796,10 +796,10 @@ add variable mutation without building SSA in your front-end.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="code">Full Code Listing</a></div>
+<h2><a name="code">Full Code Listing</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 Here is the complete code listing for our running example, enhanced with the
@@ -1758,7 +1758,7 @@ int main() {
 
   // Create the JIT.  This takes ownership of the module.
   std::string ErrStr;
-  TheExecutionEngine = EngineBuilder(TheModule).setErrorStr(&ErrStr).create();
+  TheExecutionEngine = EngineBuilder(TheModule).setErrorStr(&amp;ErrStr).create();
   if (!TheExecutionEngine) {
     fprintf(stderr, "Could not create ExecutionEngine: %s\n", ErrStr.c_str());
     exit(1);
@@ -1810,7 +1810,7 @@ int main() {
   src="http://www.w3.org/Icons/valid-html401" alt="Valid HTML 4.01!"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/tutorial/LangImpl7.html b/docs/tutorial/LangImpl7.html
index 71d5210..b2b26bd 100644
--- a/docs/tutorial/LangImpl7.html
+++ b/docs/tutorial/LangImpl7.html
@@ -12,7 +12,7 @@
 
 <body>
 
-<div class="doc_title">Kaleidoscope: Extending the Language: Mutable Variables</div>
+<h1>Kaleidoscope: Extending the Language: Mutable Variables</h1>
 
 <ul>
 <li><a href="index.html">Up to Tutorial Index</a></li>
@@ -38,10 +38,10 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="intro">Chapter 7 Introduction</a></div>
+<h2><a name="intro">Chapter 7 Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Welcome to Chapter 7 of the "<a href="index.html">Implementing a language
 with LLVM</a>" tutorial.  In chapters 1 through 6, we've built a very
@@ -66,10 +66,10 @@ support for this, though the way it works is a bit unexpected for some.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="why">Why is this a hard problem?</a></div>
+<h2><a name="why">Why is this a hard problem?</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 To understand why mutable variables cause complexities in SSA construction, 
@@ -140,10 +140,10 @@ logic.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="memory">Memory in LLVM</a></div>
+<h2><a name="memory">Memory in LLVM</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>The 'trick' here is that while LLVM does require all register values to be
 in SSA form, it does not require (or permit) memory objects to be in SSA form.
@@ -321,11 +321,10 @@ variables now!
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="kalvars">Mutable Variables in 
-Kaleidoscope</a></div>
+<h2><a name="kalvars">Mutable Variables in Kaleidoscope</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Now that we know the sort of problem we want to tackle, lets see what this
 looks like in the context of our little Kaleidoscope language.  We're going to
@@ -378,11 +377,10 @@ Kaleidoscope to support new variable definitions.
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="adjustments">Adjusting Existing Variables for
-Mutation</a></div>
+<h2><a name="adjustments">Adjusting Existing Variables for Mutation</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 The symbol table in Kaleidoscope is managed at code generation time by the 
@@ -648,10 +646,10 @@ we'll add the assignment operator.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="assignment">New Assignment Operator</a></div>
+<h2><a name="assignment">New Assignment Operator</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>With our current framework, adding a new assignment operator is really
 simple.  We will parse it just like any other binary operator, but handle it
@@ -745,11 +743,10 @@ add this next!
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="localvars">User-defined Local 
-Variables</a></div>
+<h2><a name="localvars">User-defined Local Variables</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Adding var/in is just like any other other extensions we made to 
 Kaleidoscope: we extend the lexer, the parser, the AST and the code generator.
@@ -979,10 +976,10 @@ anywhere in sight.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="code">Full Code Listing</a></div>
+<h2><a name="code">Full Code Listing</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 Here is the complete code listing for our running example, enhanced with mutable
@@ -2160,7 +2157,7 @@ int main() {
   src="http://www.w3.org/Icons/valid-html401" alt="Valid HTML 4.01!"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/tutorial/LangImpl8.html b/docs/tutorial/LangImpl8.html
index 64a6200..eed8c03 100644
--- a/docs/tutorial/LangImpl8.html
+++ b/docs/tutorial/LangImpl8.html
@@ -11,8 +11,7 @@
 
 <body>
 
-<div class="doc_title">Kaleidoscope: Conclusion and other useful LLVM
- tidbits</div>
+<h1>Kaleidoscope: Conclusion and other useful LLVM tidbits</h1>
 
 <ul>
 <li><a href="index.html">Up to Tutorial Index</a></li>
@@ -43,10 +42,10 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="conclusion">Tutorial Conclusion</a></div>
+<h2><a name="conclusion">Tutorial Conclusion</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Welcome to the the final chapter of the "<a href="index.html">Implementing a
 language with LLVM</a>" tutorial.  In the course of this tutorial, we have grown
@@ -154,23 +153,19 @@ are very useful if you want to take advantage of LLVM's capabilities.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="llvmirproperties">Properties of the LLVM 
-IR</a></div>
+<h2><a name="llvmirproperties">Properties of the LLVM IR</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>We have a couple common questions about code in the LLVM IR form - lets just
 get these out of the way right now, shall we?</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="targetindep">Target 
-Independence</a></div>
+<h4><a name="targetindep">Target Independence</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>Kaleidoscope is an example of a "portable language": any program written in
 Kaleidoscope will work the same way on any target that it runs on.  Many other
@@ -221,10 +216,10 @@ in-kernel language.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="safety">Safety Guarantees</a></div>
+<h4><a name="safety">Safety Guarantees</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>Many of the languages above are also "safe" languages: it is impossible for
 a program written in Java to corrupt its address space and crash the process
@@ -243,11 +238,10 @@ list</a> if you are interested in more details.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="langspecific">Language-Specific 
-Optimizations</a></div>
+<h4><a name="langspecific">Language-Specific Optimizations</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>One thing about LLVM that turns off many people is that it does not solve all
 the world's problems in one system (sorry 'world hunger', someone else will have
@@ -297,24 +291,23 @@ language-specific AST.
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="tipsandtricks">Tips and Tricks</a></div>
+<h2><a name="tipsandtricks">Tips and Tricks</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>There is a variety of useful tips and tricks that you come to know after
 working on/with LLVM that aren't obvious at first glance.  Instead of letting
 everyone rediscover them, this section talks about some of these issues.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="offsetofsizeof">Implementing portable
-offsetof/sizeof</a></div>
+<h4><a name="offsetofsizeof">Implementing portable offsetof/sizeof</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>One interesting thing that comes up, if you are trying to keep the code 
 generated by your compiler "target independent", is that you often need to know
@@ -331,11 +324,10 @@ in a portable way.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="gcstack">Garbage Collected 
-Stack Frames</a></div>
+<h4><a name="gcstack">Garbage Collected Stack Frames</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>Some languages want to explicitly manage their stack frames, often so that
 they are garbage collected or to allow easy implementation of closures.  There
@@ -349,6 +341,8 @@ Passing Style</a> and the use of tail calls (which LLVM also supports).</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
 <hr>
 <address>
@@ -358,7 +352,7 @@ Passing Style</a> and the use of tail calls (which LLVM also supports).</p>
   src="http://www.w3.org/Icons/valid-html401" alt="Valid HTML 4.01!"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/tutorial/OCamlLangImpl1.html b/docs/tutorial/OCamlLangImpl1.html
index 98c1124..aa2bd87 100644
--- a/docs/tutorial/OCamlLangImpl1.html
+++ b/docs/tutorial/OCamlLangImpl1.html
@@ -12,7 +12,7 @@
 
 <body>
 
-<div class="doc_title">Kaleidoscope: Tutorial Introduction and the Lexer</div>
+<h1>Kaleidoscope: Tutorial Introduction and the Lexer</h1>
 
 <ul>
 <li><a href="index.html">Up to Tutorial Index</a></li>
@@ -35,10 +35,10 @@ AST</li>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="intro">Tutorial Introduction</a></div>
+<h2><a name="intro">Tutorial Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Welcome to the "Implementing a language with LLVM" tutorial.  This tutorial
 runs through the implementation of a simple language, showing how fun and
@@ -130,10 +130,10 @@ languages!</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="language">The Basic Language</a></div>
+<h2><a name="language">The Basic Language</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>This tutorial will be illustrated with a toy language that we'll call
 "<a href="http://en.wikipedia.org/wiki/Kaleidoscope">Kaleidoscope</a>" (derived
@@ -188,10 +188,10 @@ a Mandelbrot Set</a> at various levels of magnification.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="lexer">The Lexer</a></div>
+<h2><a name="lexer">The Lexer</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>When it comes to implementing a language, the first thing needed is
 the ability to process a text file and recognize what it says.  The traditional
@@ -358,7 +358,7 @@ include a driver so that you can use the lexer and parser together.
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
   <a href="mailto:idadesub@users.sourceforge.net">Erick Tryzelaar</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/tutorial/OCamlLangImpl2.html b/docs/tutorial/OCamlLangImpl2.html
index 6665109..20e006d 100644
--- a/docs/tutorial/OCamlLangImpl2.html
+++ b/docs/tutorial/OCamlLangImpl2.html
@@ -12,7 +12,7 @@
 
 <body>
 
-<div class="doc_title">Kaleidoscope: Implementing a Parser and AST</div>
+<h1>Kaleidoscope: Implementing a Parser and AST</h1>
 
 <ul>
 <li><a href="index.html">Up to Tutorial Index</a></li>
@@ -40,10 +40,10 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="intro">Chapter 2 Introduction</a></div>
+<h2><a name="intro">Chapter 2 Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Welcome to Chapter 2 of the "<a href="index.html">Implementing a language
 with LLVM in Objective Caml</a>" tutorial.  This chapter shows you how to use
@@ -65,10 +65,10 @@ Tree.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="ast">The Abstract Syntax Tree (AST)</a></div>
+<h2><a name="ast">The Abstract Syntax Tree (AST)</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>The AST for a program captures its behavior in such a way that it is easy for
 later stages of the compiler (e.g. code generation) to interpret.  We basically
@@ -146,10 +146,10 @@ bodies in Kaleidoscope.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="parserbasics">Parser Basics</a></div>
+<h2><a name="parserbasics">Parser Basics</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Now that we have an AST to build, we need to define the parser code to build
 it.  The idea here is that we want to parse something like "x+y" (which is
@@ -181,11 +181,10 @@ piece of our grammar: numeric literals.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="parserprimexprs">Basic Expression
- Parsing</a></div>
+<h2><a name="parserprimexprs">Basic Expression Parsing</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>We start with numeric literals, because they are the simplest to process.
 For each production in our grammar, we'll define a function which parses that
@@ -303,11 +302,10 @@ They are a bit more complex.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="parserbinops">Binary Expression
- Parsing</a></div>
+<h2><a name="parserbinops">Binary Expression Parsing</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Binary expressions are significantly harder to parse because they are often
 ambiguous.  For example, when given the string "x+y*z", the parser can choose
@@ -517,10 +515,10 @@ handle function definitions, etc.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="parsertop">Parsing the Rest</a></div>
+<h2><a name="parsertop">Parsing the Rest</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 The next thing missing is handling of function prototypes.  In Kaleidoscope,
@@ -596,10 +594,10 @@ actually <em>execute</em> this code we've built!</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="driver">The Driver</a></div>
+<h2><a name="driver">The Driver</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>The driver for this simply invokes all of the parsing pieces with a top-level
 dispatch loop.  There isn't much interesting here, so I'll just include the
@@ -652,10 +650,10 @@ type "4+5;", and the parser will know you are done.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="conclusions">Conclusions</a></div>
+<h2><a name="conclusions">Conclusions</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>With just under 300 lines of commented code (240 lines of non-comment,
 non-blank code), we fully defined our minimal language, including a lexer,
@@ -689,10 +687,10 @@ Representation (IR) from the AST.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="code">Full Code Listing</a></div>
+<h2><a name="code">Full Code Listing</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 Here is the complete code listing for this and the previous chapter.
@@ -1038,7 +1036,7 @@ main ()
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a>
   <a href="mailto:erickt@users.sourceforge.net">Erick Tryzelaar</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/tutorial/OCamlLangImpl3.html b/docs/tutorial/OCamlLangImpl3.html
index d55fd0f..45ee6e9 100644
--- a/docs/tutorial/OCamlLangImpl3.html
+++ b/docs/tutorial/OCamlLangImpl3.html
@@ -12,7 +12,7 @@
 
 <body>
 
-<div class="doc_title">Kaleidoscope: Code generation to LLVM IR</div>
+<h1>Kaleidoscope: Code generation to LLVM IR</h1>
 
 <ul>
 <li><a href="index.html">Up to Tutorial Index</a></li>
@@ -38,10 +38,10 @@ Support</li>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="intro">Chapter 3 Introduction</a></div>
+<h2><a name="intro">Chapter 3 Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Welcome to Chapter 3 of the "<a href="index.html">Implementing a language
 with LLVM</a>" tutorial.  This chapter shows you how to transform the <a
@@ -57,10 +57,10 @@ LLVM SVN to work.  LLVM 2.2 and before will not work with it.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="basics">Code Generation Setup</a></div>
+<h2><a name="basics">Code Generation Setup</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 In order to generate LLVM IR, we want some simple setup to get started.  First
@@ -128,10 +128,10 @@ that this has already been done, and we'll just use it to emit code.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="exprs">Expression Code Generation</a></div>
+<h2><a name="exprs">Expression Code Generation</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Generating LLVM code for expression nodes is very straightforward: less
 than 30 lines of commented code for all four of our expression nodes.  First
@@ -263,10 +263,10 @@ basic framework.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="funcs">Function Code Generation</a></div>
+<h2><a name="funcs">Function Code Generation</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Code generation for prototypes and functions must handle a number of
 details, which make their code less beautiful than expression code
@@ -466,11 +466,10 @@ def bar() foo(1, 2); # error, unknown function "foo"
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="driver">Driver Changes and
-Closing Thoughts</a></div>
+<h2><a name="driver">Driver Changes and Closing Thoughts</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 For now, code generation to LLVM doesn't really get us much, except that we can
@@ -607,10 +606,10 @@ support</a> to this so we can actually start running code!</p>
 
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="code">Full Code Listing</a></div>
+<h2><a name="code">Full Code Listing</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 Here is the complete code listing for our running example, enhanced with the
@@ -1086,7 +1085,7 @@ main ()
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
   <a href="mailto:idadesub@users.sourceforge.net">Erick Tryzelaar</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/tutorial/OCamlLangImpl4.html b/docs/tutorial/OCamlLangImpl4.html
index 979119a..fd2b5ad 100644
--- a/docs/tutorial/OCamlLangImpl4.html
+++ b/docs/tutorial/OCamlLangImpl4.html
@@ -12,7 +12,7 @@
 
 <body>
 
-<div class="doc_title">Kaleidoscope: Adding JIT and Optimizer Support</div>
+<h1>Kaleidoscope: Adding JIT and Optimizer Support</h1>
 
 <ul>
 <li><a href="index.html">Up to Tutorial Index</a></li>
@@ -37,10 +37,10 @@ Flow</li>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="intro">Chapter 4 Introduction</a></div>
+<h2><a name="intro">Chapter 4 Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Welcome to Chapter 4 of the "<a href="index.html">Implementing a language
 with LLVM</a>" tutorial.  Chapters 1-3 described the implementation of a simple
@@ -52,11 +52,10 @@ for the Kaleidoscope language.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="trivialconstfold">Trivial Constant
-Folding</a></div>
+<h2><a name="trivialconstfold">Trivial Constant Folding</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p><b>Note:</b> the default <tt>IRBuilder</tt> now always includes the constant 
 folding optimisations below.<p>
@@ -148,11 +147,10 @@ range of optimizations that you can use, in the form of "passes".</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="optimizerpasses">LLVM Optimization
- Passes</a></div>
+<h2><a name="optimizerpasses">LLVM Optimization Passes</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>LLVM provides many optimization passes, which do many different sorts of
 things and have different tradeoffs.  Unlike other systems, LLVM doesn't hold
@@ -283,10 +281,10 @@ executing it!</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="jit">Adding a JIT Compiler</a></div>
+<h2><a name="jit">Adding a JIT Compiler</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Code that is available in LLVM IR can have a wide variety of tools
 applied to it.  For example, you can run optimizations on it (as we did above),
@@ -486,10 +484,10 @@ constructs</a>, tackling some interesting LLVM IR issues along the way.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="code">Full Code Listing</a></div>
+<h2><a name="code">Full Code Listing</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 Here is the complete code listing for our running example, enhanced with the
@@ -1022,7 +1020,7 @@ extern double putchard(double X) {
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
   <a href="mailto:idadesub@users.sourceforge.net">Erick Tryzelaar</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/tutorial/OCamlLangImpl5.html b/docs/tutorial/OCamlLangImpl5.html
index 7a21395..d356f12 100644
--- a/docs/tutorial/OCamlLangImpl5.html
+++ b/docs/tutorial/OCamlLangImpl5.html
@@ -12,7 +12,7 @@
 
 <body>
 
-<div class="doc_title">Kaleidoscope: Extending the Language: Control Flow</div>
+<h1>Kaleidoscope: Extending the Language: Control Flow</h1>
 
 <ul>
 <li><a href="index.html">Up to Tutorial Index</a></li>
@@ -52,10 +52,10 @@ User-defined Operators</li>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="intro">Chapter 5 Introduction</a></div>
+<h2><a name="intro">Chapter 5 Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Welcome to Chapter 5 of the "<a href="index.html">Implementing a language
 with LLVM</a>" tutorial.  Parts 1-4 described the implementation of the simple
@@ -69,10 +69,10 @@ have an if/then/else expression plus a simple 'for' loop.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="ifthen">If/Then/Else</a></div>
+<h2><a name="ifthen">If/Then/Else</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 Extending Kaleidoscope to support if/then/else is quite straightforward.  It
@@ -112,15 +112,12 @@ Since Kaleidoscope allows side-effects, this behavior is important to nail down.
 <p>Now that we know what we "want", lets break this down into its constituent
 pieces.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="iflexer">Lexer Extensions for
-If/Then/Else</a></div>
+<h4><a name="iflexer">Lexer Extensions for If/Then/Else</a></h4>
 <!-- ======================================================================= -->
 
 
-<div class="doc_text">
+<div>
 
 <p>The lexer extensions are straightforward.  First we add new variants
 for the relevant tokens:</p>
@@ -153,11 +150,10 @@ stuff:</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="ifast">AST Extensions for
- If/Then/Else</a></div>
+<h4><a name="ifast">AST Extensions for If/Then/Else</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>To represent the new expression we add a new AST variant for it:</p>
 
@@ -175,11 +171,10 @@ type expr =
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="ifparser">Parser Extensions for
-If/Then/Else</a></div>
+<h4><a name="ifparser">Parser Extensions for If/Then/Else</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>Now that we have the relevant tokens coming from the lexer and we have the
 AST node to build, our parsing logic is relatively straightforward.  First we
@@ -214,10 +209,10 @@ let rec parse_primary = parser
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="ifir">LLVM IR for If/Then/Else</a></div>
+<h4><a name="ifir">LLVM IR for If/Then/Else</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>Now that we have it parsing and building the AST, the final piece is adding
 LLVM code generation support.  This is the most interesting part of the
@@ -331,11 +326,10 @@ directly.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="ifcodegen">Code Generation for
-If/Then/Else</a></div>
+<h4><a name="ifcodegen">Code Generation for If/Then/Else</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>In order to generate code for this, we implement the <tt>Codegen</tt> method
 for <tt>IfExprAST</tt>:</p>
@@ -492,11 +486,13 @@ another useful expression that is familiar from non-functional languages...</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="for">'for' Loop Expression</a></div>
+<h2><a name="for">'for' Loop Expression</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Now that we know how to add basic control flow constructs to the language,
 we have the tools to add more powerful things.  Lets add something more
@@ -525,14 +521,11 @@ variables, it will get more useful.</p>
 <p>As before, lets talk about the changes that we need to Kaleidoscope to
 support this.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="forlexer">Lexer Extensions for
-the 'for' Loop</a></div>
+<h4><a name="forlexer">Lexer Extensions for the 'for' Loop</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>The lexer extensions are the same sort of thing as for if/then/else:</p>
 
@@ -559,11 +552,10 @@ the 'for' Loop</a></div>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="forast">AST Extensions for
-the 'for' Loop</a></div>
+<h4><a name="forast">AST Extensions for the 'for' Loop</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>The AST variant is just as simple.  It basically boils down to capturing
 the variable name and the constituent expressions in the node.</p>
@@ -580,11 +572,10 @@ type expr =
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="forparser">Parser Extensions for
-the 'for' Loop</a></div>
+<h4><a name="forparser">Parser Extensions for the 'for' Loop</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>The parser code is also fairly standard.  The only interesting thing here is
 handling of the optional step value.  The parser code handles it by checking to
@@ -628,11 +619,10 @@ let rec parse_primary = parser
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="forir">LLVM IR for
-the 'for' Loop</a></div>
+<h4><a name="forir">LLVM IR for the 'for' Loop</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>Now we get to the good part: the LLVM IR we want to generate for this thing.
 With the simple example above, we get this LLVM IR (note that this dump is
@@ -674,11 +664,10 @@ expressions, and some basic blocks.  Lets see how this fits together.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="forcodegen">Code Generation for
-the 'for' Loop</a></div>
+<h4><a name="forcodegen">Code Generation for the 'for' Loop</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>The first part of Codegen is very simple: we just output the start expression
 for the loop value:</p>
@@ -851,11 +840,13 @@ to our poor innocent language.</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="code">Full Code Listing</a></div>
+<h2><a name="code">Full Code Listing</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 Here is the complete code listing for our running example, enhanced with the
@@ -1562,7 +1553,7 @@ operators</a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
   <a href="mailto:idadesub@users.sourceforge.net">Erick Tryzelaar</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/tutorial/OCamlLangImpl6.html b/docs/tutorial/OCamlLangImpl6.html
index 86210fe..480aab3 100644
--- a/docs/tutorial/OCamlLangImpl6.html
+++ b/docs/tutorial/OCamlLangImpl6.html
@@ -12,7 +12,7 @@
 
 <body>
 
-<div class="doc_title">Kaleidoscope: Extending the Language: User-defined Operators</div>
+<h1>Kaleidoscope: Extending the Language: User-defined Operators</h1>
 
 <ul>
 <li><a href="index.html">Up to Tutorial Index</a></li>
@@ -38,10 +38,10 @@ Variables / SSA Construction</li>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="intro">Chapter 6 Introduction</a></div>
+<h2><a name="intro">Chapter 6 Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Welcome to Chapter 6 of the "<a href="index.html">Implementing a language
 with LLVM</a>" tutorial.  At this point in our tutorial, we now have a fully
@@ -64,10 +64,10 @@ an example of what you can build with Kaleidoscope and its feature set.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="idea">User-defined Operators: the Idea</a></div>
+<h2><a name="idea">User-defined Operators: the Idea</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 The "operator overloading" that we will add to Kaleidoscope is more general than
@@ -129,10 +129,10 @@ operators.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="binary">User-defined Binary Operators</a></div>
+<h2><a name="binary">User-defined Binary Operators</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Adding support for user-defined binary operators is pretty simple with our
 current framework.  We'll first add support for the unary/binary keywords:</p>
@@ -320,10 +320,10 @@ see what it takes.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="unary">User-defined Unary Operators</a></div>
+<h2><a name="unary">User-defined Unary Operators</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Since we don't currently support unary operators in the Kaleidoscope
 language, we'll need to add everything to support them.  Above, we added simple
@@ -472,10 +472,10 @@ is simpler primarily because it doesn't need to handle any predefined operators.
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="example">Kicking the Tires</a></div>
+<h2><a name="example">Kicking the Tires</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>It is somewhat hard to believe, but with a few simple extensions we've
 covered in the last chapters, we have grown a real-ish language.  With this, we
@@ -778,10 +778,10 @@ add variable mutation without building SSA in your front-end.</p>
 
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="code">Full Code Listing</a></div>
+<h2><a name="code">Full Code Listing</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 Here is the complete code listing for our running example, enhanced with the
@@ -1567,7 +1567,7 @@ SSA construction</a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
   <a href="mailto:idadesub@users.sourceforge.net">Erick Tryzelaar</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/tutorial/OCamlLangImpl7.html b/docs/tutorial/OCamlLangImpl7.html
index 7146a5c..51986b5 100644
--- a/docs/tutorial/OCamlLangImpl7.html
+++ b/docs/tutorial/OCamlLangImpl7.html
@@ -13,7 +13,7 @@
 
 <body>
 
-<div class="doc_title">Kaleidoscope: Extending the Language: Mutable Variables</div>
+<h1>Kaleidoscope: Extending the Language: Mutable Variables</h1>
 
 <ul>
 <li><a href="index.html">Up to Tutorial Index</a></li>
@@ -42,10 +42,10 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="intro">Chapter 7 Introduction</a></div>
+<h2><a name="intro">Chapter 7 Introduction</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Welcome to Chapter 7 of the "<a href="index.html">Implementing a language
 with LLVM</a>" tutorial.  In chapters 1 through 6, we've built a very
@@ -70,10 +70,10 @@ support for this, though the way it works is a bit unexpected for some.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="why">Why is this a hard problem?</a></div>
+<h2><a name="why">Why is this a hard problem?</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 To understand why mutable variables cause complexities in SSA construction,
@@ -144,10 +144,10 @@ logic.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="memory">Memory in LLVM</a></div>
+<h2><a name="memory">Memory in LLVM</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>The 'trick' here is that while LLVM does require all register values to be
 in SSA form, it does not require (or permit) memory objects to be in SSA form.
@@ -325,11 +325,10 @@ variables now!
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="kalvars">Mutable Variables in
-Kaleidoscope</a></div>
+<h2><a name="kalvars">Mutable Variables in Kaleidoscope</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Now that we know the sort of problem we want to tackle, lets see what this
 looks like in the context of our little Kaleidoscope language.  We're going to
@@ -382,11 +381,10 @@ Kaleidoscope to support new variable definitions.
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="adjustments">Adjusting Existing Variables for
-Mutation</a></div>
+<h2><a name="adjustments">Adjusting Existing Variables for Mutation</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 The symbol table in Kaleidoscope is managed at code generation time by the
@@ -672,10 +670,10 @@ we'll add the assignment operator.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="assignment">New Assignment Operator</a></div>
+<h2><a name="assignment">New Assignment Operator</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>With our current framework, adding a new assignment operator is really
 simple.  We will parse it just like any other binary operator, but handle it
@@ -773,11 +771,10 @@ add this next!
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="localvars">User-defined Local
-Variables</a></div>
+<h2><a name="localvars">User-defined Local Variables</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Adding var/in is just like any other other extensions we made to
 Kaleidoscope: we extend the lexer, the parser, the AST and the code generator.
@@ -956,10 +953,10 @@ anywhere in sight.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="code">Full Code Listing</a></div>
+<h2><a name="code">Full Code Listing</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>
 Here is the complete code listing for our running example, enhanced with mutable
@@ -1887,7 +1884,7 @@ extern double printd(double X) {
 </dd>
 </dl>
 
-<a href="LangImpl8.html">Next: Conclusion and other useful LLVM tidbits</a>
+<a href="OCamlLangImpl8.html">Next: Conclusion and other useful LLVM tidbits</a>
 </div>
 
 <!-- *********************************************************************** -->
@@ -1899,7 +1896,7 @@ extern double printd(double X) {
   src="http://www.w3.org/Icons/valid-html401" alt="Valid HTML 4.01!"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   <a href="mailto:idadesub@users.sourceforge.net">Erick Tryzelaar</a><br>
   Last modified: $Date$
 </address>
diff --git a/docs/tutorial/OCamlLangImpl8.html b/docs/tutorial/OCamlLangImpl8.html
index 64a6200..eed8c03 100644
--- a/docs/tutorial/OCamlLangImpl8.html
+++ b/docs/tutorial/OCamlLangImpl8.html
@@ -11,8 +11,7 @@
 
 <body>
 
-<div class="doc_title">Kaleidoscope: Conclusion and other useful LLVM
- tidbits</div>
+<h1>Kaleidoscope: Conclusion and other useful LLVM tidbits</h1>
 
 <ul>
 <li><a href="index.html">Up to Tutorial Index</a></li>
@@ -43,10 +42,10 @@
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="conclusion">Tutorial Conclusion</a></div>
+<h2><a name="conclusion">Tutorial Conclusion</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>Welcome to the the final chapter of the "<a href="index.html">Implementing a
 language with LLVM</a>" tutorial.  In the course of this tutorial, we have grown
@@ -154,23 +153,19 @@ are very useful if you want to take advantage of LLVM's capabilities.</p>
 </div>
 
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="llvmirproperties">Properties of the LLVM 
-IR</a></div>
+<h2><a name="llvmirproperties">Properties of the LLVM IR</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>We have a couple common questions about code in the LLVM IR form - lets just
 get these out of the way right now, shall we?</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="targetindep">Target 
-Independence</a></div>
+<h4><a name="targetindep">Target Independence</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>Kaleidoscope is an example of a "portable language": any program written in
 Kaleidoscope will work the same way on any target that it runs on.  Many other
@@ -221,10 +216,10 @@ in-kernel language.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="safety">Safety Guarantees</a></div>
+<h4><a name="safety">Safety Guarantees</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>Many of the languages above are also "safe" languages: it is impossible for
 a program written in Java to corrupt its address space and crash the process
@@ -243,11 +238,10 @@ list</a> if you are interested in more details.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="langspecific">Language-Specific 
-Optimizations</a></div>
+<h4><a name="langspecific">Language-Specific Optimizations</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>One thing about LLVM that turns off many people is that it does not solve all
 the world's problems in one system (sorry 'world hunger', someone else will have
@@ -297,24 +291,23 @@ language-specific AST.
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
-<div class="doc_section"><a name="tipsandtricks">Tips and Tricks</a></div>
+<h2><a name="tipsandtricks">Tips and Tricks</a></h2>
 <!-- *********************************************************************** -->
 
-<div class="doc_text">
+<div>
 
 <p>There is a variety of useful tips and tricks that you come to know after
 working on/with LLVM that aren't obvious at first glance.  Instead of letting
 everyone rediscover them, this section talks about some of these issues.</p>
 
-</div>
-
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="offsetofsizeof">Implementing portable
-offsetof/sizeof</a></div>
+<h4><a name="offsetofsizeof">Implementing portable offsetof/sizeof</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>One interesting thing that comes up, if you are trying to keep the code 
 generated by your compiler "target independent", is that you often need to know
@@ -331,11 +324,10 @@ in a portable way.</p>
 </div>
 
 <!-- ======================================================================= -->
-<div class="doc_subsubsection"><a name="gcstack">Garbage Collected 
-Stack Frames</a></div>
+<h4><a name="gcstack">Garbage Collected Stack Frames</a></h4>
 <!-- ======================================================================= -->
 
-<div class="doc_text">
+<div>
 
 <p>Some languages want to explicitly manage their stack frames, often so that
 they are garbage collected or to allow easy implementation of closures.  There
@@ -349,6 +341,8 @@ Passing Style</a> and the use of tail calls (which LLVM also supports).</p>
 
 </div>
 
+</div>
+
 <!-- *********************************************************************** -->
 <hr>
 <address>
@@ -358,7 +352,7 @@ Passing Style</a> and the use of tail calls (which LLVM also supports).</p>
   src="http://www.w3.org/Icons/valid-html401" alt="Valid HTML 4.01!"></a>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
-  <a href="http://llvm.org">The LLVM Compiler Infrastructure</a><br>
+  <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
   Last modified: $Date$
 </address>
 </body>
diff --git a/docs/tutorial/index.html b/docs/tutorial/index.html
index 11dd5e2..0a8cae2 100644
--- a/docs/tutorial/index.html
+++ b/docs/tutorial/index.html
@@ -12,7 +12,7 @@
 
 <body>
 
-<div class="doc_title"> LLVM Tutorial: Table of Contents </div>
+<h1>LLVM Tutorial: Table of Contents</h1>
 
 <ol>
   <li>Kaleidoscope: Implementing a Language with LLVM
diff --git a/examples/ExceptionDemo/ExceptionDemo.cpp b/examples/ExceptionDemo/ExceptionDemo.cpp
index 95ccd24..e5bd377 100644
--- a/examples/ExceptionDemo/ExceptionDemo.cpp
+++ b/examples/ExceptionDemo/ExceptionDemo.cpp
@@ -1,12 +1,11 @@
-//===-- examples/ExceptionDemo/ExceptionDemo.cpp - 
-//                      An example use of the llvm Exception mechanism --===//
+//===-- ExceptionDemo.cpp - An example using llvm Exceptions --------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-//===--------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
 //
 // Demo program which implements an example LLVM exception implementation, and
 // shows several test cases including the handling of foreign exceptions.
@@ -46,8 +45,7 @@
 // This code uses code from the llvm compiler-rt project and the llvm 
 // Kaleidoscope project.
 //
-//===--------------------------------------------------------------------===//
-
+//===----------------------------------------------------------------------===//
 
 #include "llvm/LLVMContext.h"
 #include "llvm/DerivedTypes.h"
@@ -64,11 +62,17 @@
 #include "llvm/Support/IRBuilder.h"
 #include "llvm/Support/Dwarf.h"
 
+// FIXME: Although all systems tested with (Linux, OS X), do not need this 
+//        header file included. A user on ubuntu reported, undefined symbols 
+//        for stderr, and fprintf, and the addition of this include fixed the
+//        issue for them. Given that LLVM's best practices include the goal 
+//        of reducing the number of redundant header files included, the 
+//        correct solution would be to find out why these symbols are not 
+//        defined for the system in question, and fix the issue by finding out
+//        which LLVM header file, if any, would include these symbols.
 #include <cstdio>
-#include <string>
+
 #include <sstream>
-#include <map>
-#include <vector>
 #include <stdexcept>
 
 
@@ -80,8 +84,8 @@
 //     http://refspecs.freestandards.org/abi-eh-1.21.html
 
 extern "C" {
-
-typedef enum {
+  
+  typedef enum {
     _URC_NO_REASON = 0,
     _URC_FOREIGN_EXCEPTION_CAUGHT = 1,
     _URC_FATAL_PHASE2_ERROR = 2,
@@ -91,43 +95,43 @@ typedef enum {
     _URC_HANDLER_FOUND = 6,
     _URC_INSTALL_CONTEXT = 7,
     _URC_CONTINUE_UNWIND = 8
-} _Unwind_Reason_Code;
-
-typedef enum {
+  } _Unwind_Reason_Code;
+  
+  typedef enum {
     _UA_SEARCH_PHASE = 1,
     _UA_CLEANUP_PHASE = 2,
     _UA_HANDLER_FRAME = 4,
     _UA_FORCE_UNWIND = 8,
     _UA_END_OF_STACK = 16
-} _Unwind_Action;
-
-struct _Unwind_Exception;
-
-typedef void (*_Unwind_Exception_Cleanup_Fn) (_Unwind_Reason_Code,
-                                              struct _Unwind_Exception *);
-
-struct _Unwind_Exception {
+  } _Unwind_Action;
+  
+  struct _Unwind_Exception;
+  
+  typedef void (*_Unwind_Exception_Cleanup_Fn) (_Unwind_Reason_Code,
+                                                struct _Unwind_Exception *);
+  
+  struct _Unwind_Exception {
     uint64_t exception_class;
     _Unwind_Exception_Cleanup_Fn exception_cleanup;
-
+    
     uintptr_t private_1;    
     uintptr_t private_2;    
-
+    
     // @@@ The IA-64 ABI says that this structure must be double-word aligned.
     //  Taking that literally does not make much sense generically.  Instead 
     //  we provide the maximum alignment required by any type for the machine.
-} __attribute__((__aligned__));
-
-struct _Unwind_Context;
-typedef struct _Unwind_Context* _Unwind_Context_t;
-
-extern const uint8_t* _Unwind_GetLanguageSpecificData (_Unwind_Context_t c);
-extern uintptr_t _Unwind_GetGR (_Unwind_Context_t c, int i);
-extern void _Unwind_SetGR (_Unwind_Context_t c, int i, uintptr_t n);
-extern void _Unwind_SetIP (_Unwind_Context_t, uintptr_t new_value);
-extern uintptr_t _Unwind_GetIP (_Unwind_Context_t context);
-extern uintptr_t _Unwind_GetRegionStart (_Unwind_Context_t context);
-
+  } __attribute__((__aligned__));
+  
+  struct _Unwind_Context;
+  typedef struct _Unwind_Context *_Unwind_Context_t;
+  
+  extern const uint8_t *_Unwind_GetLanguageSpecificData (_Unwind_Context_t c);
+  extern uintptr_t _Unwind_GetGR (_Unwind_Context_t c, int i);
+  extern void _Unwind_SetGR (_Unwind_Context_t c, int i, uintptr_t n);
+  extern void _Unwind_SetIP (_Unwind_Context_t, uintptr_t new_value);
+  extern uintptr_t _Unwind_GetIP (_Unwind_Context_t context);
+  extern uintptr_t _Unwind_GetRegionStart (_Unwind_Context_t context);
+  
 } // extern "C"
 
 //
@@ -136,8 +140,8 @@ extern uintptr_t _Unwind_GetRegionStart (_Unwind_Context_t context);
 
 /// This is our simplistic type info
 struct OurExceptionType_t {
-    /// type info type
-    int type;
+  /// type info type
+  int type;
 };
 
 
@@ -148,10 +152,10 @@ struct OurExceptionType_t {
 ///       on a double word boundary. This is necessary to match the standard:
 ///       http://refspecs.freestandards.org/abi-eh-1.21.html
 struct OurBaseException_t {
-    struct OurExceptionType_t type;
-
-    // Note: This is properly aligned in unwind.h
-    struct _Unwind_Exception unwindException;
+  struct OurExceptionType_t type;
+  
+  // Note: This is properly aligned in unwind.h
+  struct _Unwind_Exception unwindException;
 };
 
 
@@ -169,7 +173,7 @@ static std::map<std::string, llvm::Value*> namedValues;
 int64_t ourBaseFromUnwindOffset;
 
 const unsigned char ourBaseExcpClassChars[] = 
-                                {'o', 'b', 'j', '\0', 'b', 'a', 's', '\0'};
+{'o', 'b', 'j', '\0', 'b', 'a', 's', '\0'};
 
 
 static uint64_t ourBaseExceptionClass = 0;
@@ -177,13 +181,13 @@ static uint64_t ourBaseExceptionClass = 0;
 static std::vector<std::string> ourTypeInfoNames;
 static std::map<int, std::string> ourTypeInfoNamesIndex;
 
-static llvm::StructType* ourTypeInfoType;
-static llvm::StructType* ourExceptionType;
-static llvm::StructType* ourUnwindExceptionType;
+static llvm::StructType *ourTypeInfoType;
+static llvm::StructType *ourExceptionType;
+static llvm::StructType *ourUnwindExceptionType;
 
-static llvm::ConstantInt* ourExceptionNotThrownState;
-static llvm::ConstantInt* ourExceptionThrownState;
-static llvm::ConstantInt* ourExceptionCaughtState;
+static llvm::ConstantInt *ourExceptionNotThrownState;
+static llvm::ConstantInt *ourExceptionThrownState;
+static llvm::ConstantInt *ourExceptionCaughtState;
 
 typedef std::vector<std::string> ArgNames;
 typedef std::vector<const llvm::Type*> ArgTypes;
@@ -204,35 +208,32 @@ typedef std::vector<const llvm::Type*> ArgTypes;
 /// @param declarationOnly for function declarations
 /// @param isVarArg function uses vararg arguments
 /// @returns function instance
-llvm::Function *createFunction(llvm::Module& module,
-                               const llvm::Type* retType,
-                               const ArgTypes& theArgTypes,
-                               const ArgNames& theArgNames,
-                               const std::string& functName,
+llvm::Function *createFunction(llvm::Module &module,
+                               const llvm::Type *retType,
+                               const ArgTypes &theArgTypes,
+                               const ArgNames &theArgNames,
+                               const std::string &functName,
                                llvm::GlobalValue::LinkageTypes linkage,
                                bool declarationOnly,
                                bool isVarArg) {
-    llvm::FunctionType* functType = llvm::FunctionType::get(retType, 
-                                                            theArgTypes, 
-                                                            isVarArg);
-    llvm::Function* ret = llvm::Function::Create(functType, 
-                                                 linkage, 
-                                                 functName, 
-                                                 &module);
-    if (!ret || declarationOnly)
-        return(ret);
-
-    namedValues.clear();
-    unsigned i = 0; 
-    for (llvm::Function::arg_iterator argIndex = ret->arg_begin();
-         i != theArgNames.size();
-         ++argIndex, ++i) {
-
-        argIndex->setName(theArgNames[i]);
-        namedValues[theArgNames[i]] = argIndex;
-    }
-
+  llvm::FunctionType *functType =
+    llvm::FunctionType::get(retType, theArgTypes, isVarArg);
+  llvm::Function *ret =
+    llvm::Function::Create(functType, linkage, functName, &module);
+  if (!ret || declarationOnly)
     return(ret);
+  
+  namedValues.clear();
+  unsigned i = 0; 
+  for (llvm::Function::arg_iterator argIndex = ret->arg_begin();
+       i != theArgNames.size();
+       ++argIndex, ++i) {
+    
+    argIndex->setName(theArgNames[i]);
+    namedValues[theArgNames[i]] = argIndex;
+  }
+  
+  return(ret);
 }
 
 
@@ -243,18 +244,18 @@ llvm::Function *createFunction(llvm::Module& module,
 /// @param type stack variable type
 /// @param initWith optional constant initialization value
 /// @returns AllocaInst instance
-static llvm::AllocaInst *createEntryBlockAlloca(llvm::Function& function,
-                                            const std::string &varName,
-                                            const llvm::Type* type,
-                                            llvm::Constant* initWith = NULL) {
-    llvm::BasicBlock& block = function.getEntryBlock(); 
-    llvm::IRBuilder<> tmp(&block, block.begin());
-    llvm::AllocaInst* ret = tmp.CreateAlloca(type, 0, varName.c_str());
-
-    if (initWith) 
-        tmp.CreateStore(initWith, ret);
-
-    return(ret);
+static llvm::AllocaInst *createEntryBlockAlloca(llvm::Function &function,
+                                                const std::string &varName,
+                                                const llvm::Type *type,
+                                                llvm::Constant *initWith = 0) {
+  llvm::BasicBlock &block = function.getEntryBlock(); 
+  llvm::IRBuilder<> tmp(&block, block.begin());
+  llvm::AllocaInst *ret = tmp.CreateAlloca(type, 0, varName.c_str());
+  
+  if (initWith) 
+    tmp.CreateStore(initWith, ret);
+  
+  return(ret);
 }
 
 
@@ -274,15 +275,15 @@ extern "C" {
 /// Prints a 32 bit number, according to the format, to stderr.
 /// @param intToPrint integer to print 
 /// @param format printf like format to use when printing
-void print32Int(int intToPrint, const char* format) {
-    if (format) {
-        // Note: No NULL check
-        fprintf(stderr, format, intToPrint);
-    }
-    else {
-        // Note: No NULL check
-        fprintf(stderr, "::print32Int(...):NULL arg.\n");
-    }
+void print32Int(int intToPrint, const char *format) {
+  if (format) {
+    // Note: No NULL check
+    fprintf(stderr, format, intToPrint);
+  }
+  else {
+    // Note: No NULL check
+    fprintf(stderr, "::print32Int(...):NULL arg.\n");
+  }
 }
 
 
@@ -291,27 +292,27 @@ void print32Int(int intToPrint, const char* format) {
 /// Prints a 64 bit number, according to the format, to stderr.
 /// @param intToPrint integer to print 
 /// @param format printf like format to use when printing
-void print64Int(long int intToPrint, const char* format) {
-    if (format) {
-        // Note: No NULL check
-        fprintf(stderr, format, intToPrint);
-    }
-    else {
-        // Note: No NULL check
-        fprintf(stderr, "::print64Int(...):NULL arg.\n");
-    }
+void print64Int(long int intToPrint, const char *format) {
+  if (format) {
+    // Note: No NULL check
+    fprintf(stderr, format, intToPrint);
+  }
+  else {
+    // Note: No NULL check
+    fprintf(stderr, "::print64Int(...):NULL arg.\n");
+  }
 }
 
 
 /// Prints a C string to stderr
 /// @param toPrint string to print
-void printStr(char* toPrint) {
-    if (toPrint) {
-        fprintf(stderr, "%s", toPrint);
-    }
-    else {
-        fprintf(stderr, "::printStr(...):NULL arg.\n");
-    }
+void printStr(char *toPrint) {
+  if (toPrint) {
+    fprintf(stderr, "%s", toPrint);
+  }
+  else {
+    fprintf(stderr, "::printStr(...):NULL arg.\n");
+  }
 }
 
 
@@ -319,17 +320,17 @@ void printStr(char* toPrint) {
 /// is calculated from the supplied OurBaseException_t::unwindException
 /// member address. Handles (ignores), NULL pointers.
 /// @param expToDelete exception to delete
-void deleteOurException(OurUnwindException* expToDelete) {
+void deleteOurException(OurUnwindException *expToDelete) {
 #ifdef DEBUG
-    fprintf(stderr,
-            "deleteOurException(...).\n");
+  fprintf(stderr,
+          "deleteOurException(...).\n");
 #endif
-
-    if (expToDelete &&
-        (expToDelete->exception_class == ourBaseExceptionClass)) {
-
-        free(((char*) expToDelete) + ourBaseFromUnwindOffset);
-    }
+  
+  if (expToDelete &&
+      (expToDelete->exception_class == ourBaseExceptionClass)) {
+    
+    free(((char*) expToDelete) + ourBaseFromUnwindOffset);
+  }
 }
 
 
@@ -340,27 +341,27 @@ void deleteOurException(OurUnwindException* expToDelete) {
 /// @unlink
 /// @param expToDelete exception instance to delete
 void deleteFromUnwindOurException(_Unwind_Reason_Code reason,
-                                  OurUnwindException* expToDelete) {
+                                  OurUnwindException *expToDelete) {
 #ifdef DEBUG
-    fprintf(stderr,
-            "deleteFromUnwindOurException(...).\n");
+  fprintf(stderr,
+          "deleteFromUnwindOurException(...).\n");
 #endif
-
-    deleteOurException(expToDelete);
+  
+  deleteOurException(expToDelete);
 }
 
 
 /// Creates (allocates on the heap), an exception (OurException instance),
 /// of the supplied type info type.
 /// @param type type info type
-OurUnwindException* createOurException(int type) {
-    size_t size = sizeof(OurException);
-    OurException* ret = (OurException*) memset(malloc(size), 0, size);
-    (ret->type).type = type;
-    (ret->unwindException).exception_class = ourBaseExceptionClass;
-    (ret->unwindException).exception_cleanup = deleteFromUnwindOurException;
-
-    return(&(ret->unwindException));
+OurUnwindException *createOurException(int type) {
+  size_t size = sizeof(OurException);
+  OurException *ret = (OurException*) memset(malloc(size), 0, size);
+  (ret->type).type = type;
+  (ret->unwindException).exception_class = ourBaseExceptionClass;
+  (ret->unwindException).exception_cleanup = deleteFromUnwindOurException;
+  
+  return(&(ret->unwindException));
 }
 
 
@@ -369,22 +370,22 @@ OurUnwindException* createOurException(int type) {
 /// @link http://dwarfstd.org/Dwarf3.pdf @unlink
 /// @param data reference variable holding memory pointer to decode from
 /// @returns decoded value
-static uintptr_t readULEB128(const uint8_t** data) {
-    uintptr_t result = 0;
-    uintptr_t shift = 0;
-    unsigned char byte;
-    const uint8_t* p = *data;
-
-    do {
-        byte = *p++;
-        result |= (byte & 0x7f) << shift;
-        shift += 7;
-    } 
-    while (byte & 0x80);
-
-    *data = p;
-
-    return result;
+static uintptr_t readULEB128(const uint8_t **data) {
+  uintptr_t result = 0;
+  uintptr_t shift = 0;
+  unsigned char byte;
+  const uint8_t *p = *data;
+  
+  do {
+    byte = *p++;
+    result |= (byte & 0x7f) << shift;
+    shift += 7;
+  } 
+  while (byte & 0x80);
+  
+  *data = p;
+  
+  return result;
 }
 
 
@@ -393,26 +394,26 @@ static uintptr_t readULEB128(const uint8_t** data) {
 /// @link http://dwarfstd.org/Dwarf3.pdf @unlink
 /// @param data reference variable holding memory pointer to decode from
 /// @returns decoded value
-static uintptr_t readSLEB128(const uint8_t** data) {
-    uintptr_t result = 0;
-    uintptr_t shift = 0;
-    unsigned char byte;
-    const uint8_t* p = *data;
-
-    do {
-        byte = *p++;
-        result |= (byte & 0x7f) << shift;
-        shift += 7;
-    } 
-    while (byte & 0x80);
-
-    *data = p;
-
-    if ((byte & 0x40) && (shift < (sizeof(result) << 3))) {
-        result |= (~0 << shift);
-    }
-
-    return result;
+static uintptr_t readSLEB128(const uint8_t **data) {
+  uintptr_t result = 0;
+  uintptr_t shift = 0;
+  unsigned char byte;
+  const uint8_t *p = *data;
+  
+  do {
+    byte = *p++;
+    result |= (byte & 0x7f) << shift;
+    shift += 7;
+  } 
+  while (byte & 0x80);
+  
+  *data = p;
+  
+  if ((byte & 0x40) && (shift < (sizeof(result) << 3))) {
+    result |= (~0 << shift);
+  }
+  
+  return result;
 }
 
 
@@ -422,82 +423,82 @@ static uintptr_t readSLEB128(const uint8_t** data) {
 /// @param data reference variable holding memory pointer to decode from
 /// @param encoding dwarf encoding type
 /// @returns decoded value
-static uintptr_t readEncodedPointer(const uint8_t** data, uint8_t encoding) {
-    uintptr_t result = 0;
-    const uint8_t* p = *data;
-
-    if (encoding == llvm::dwarf::DW_EH_PE_omit) 
-        return(result);
-
-    // first get value 
-    switch (encoding & 0x0F) {
-        case llvm::dwarf::DW_EH_PE_absptr:
-            result = *((uintptr_t*)p);
-            p += sizeof(uintptr_t);
-            break;
-        case llvm::dwarf::DW_EH_PE_uleb128:
-            result = readULEB128(&p);
-            break;
-        // Note: This case has not been tested
-        case llvm::dwarf::DW_EH_PE_sleb128:
-            result = readSLEB128(&p);
-            break;
-        case llvm::dwarf::DW_EH_PE_udata2:
-            result = *((uint16_t*)p);
-            p += sizeof(uint16_t);
-            break;
-        case llvm::dwarf::DW_EH_PE_udata4:
-            result = *((uint32_t*)p);
-            p += sizeof(uint32_t);
-            break;
-        case llvm::dwarf::DW_EH_PE_udata8:
-            result = *((uint64_t*)p);
-            p += sizeof(uint64_t);
-            break;
-        case llvm::dwarf::DW_EH_PE_sdata2:
-            result = *((int16_t*)p);
-            p += sizeof(int16_t);
-            break;
-        case llvm::dwarf::DW_EH_PE_sdata4:
-            result = *((int32_t*)p);
-            p += sizeof(int32_t);
-            break;
-        case llvm::dwarf::DW_EH_PE_sdata8:
-            result = *((int64_t*)p);
-            p += sizeof(int64_t);
-            break;
-        default:
-            // not supported 
-            abort();
-            break;
-    }
-
-    // then add relative offset 
-    switch (encoding & 0x70) {
-        case llvm::dwarf::DW_EH_PE_absptr:
-            // do nothing 
-            break;
-        case llvm::dwarf::DW_EH_PE_pcrel:
-            result += (uintptr_t)(*data);
-            break;
-        case llvm::dwarf::DW_EH_PE_textrel:
-        case llvm::dwarf::DW_EH_PE_datarel:
-        case llvm::dwarf::DW_EH_PE_funcrel:
-        case llvm::dwarf::DW_EH_PE_aligned:
-        default:
-            // not supported 
-            abort();
-            break;
-    }
-
-    // then apply indirection 
-    if (encoding & llvm::dwarf::DW_EH_PE_indirect) {
-        result = *((uintptr_t*)result);
-    }
-
-    *data = p;
-
-    return result;
+static uintptr_t readEncodedPointer(const uint8_t **data, uint8_t encoding) {
+  uintptr_t result = 0;
+  const uint8_t *p = *data;
+  
+  if (encoding == llvm::dwarf::DW_EH_PE_omit) 
+    return(result);
+  
+  // first get value 
+  switch (encoding & 0x0F) {
+    case llvm::dwarf::DW_EH_PE_absptr:
+      result = *((uintptr_t*)p);
+      p += sizeof(uintptr_t);
+      break;
+    case llvm::dwarf::DW_EH_PE_uleb128:
+      result = readULEB128(&p);
+      break;
+      // Note: This case has not been tested
+    case llvm::dwarf::DW_EH_PE_sleb128:
+      result = readSLEB128(&p);
+      break;
+    case llvm::dwarf::DW_EH_PE_udata2:
+      result = *((uint16_t*)p);
+      p += sizeof(uint16_t);
+      break;
+    case llvm::dwarf::DW_EH_PE_udata4:
+      result = *((uint32_t*)p);
+      p += sizeof(uint32_t);
+      break;
+    case llvm::dwarf::DW_EH_PE_udata8:
+      result = *((uint64_t*)p);
+      p += sizeof(uint64_t);
+      break;
+    case llvm::dwarf::DW_EH_PE_sdata2:
+      result = *((int16_t*)p);
+      p += sizeof(int16_t);
+      break;
+    case llvm::dwarf::DW_EH_PE_sdata4:
+      result = *((int32_t*)p);
+      p += sizeof(int32_t);
+      break;
+    case llvm::dwarf::DW_EH_PE_sdata8:
+      result = *((int64_t*)p);
+      p += sizeof(int64_t);
+      break;
+    default:
+      // not supported 
+      abort();
+      break;
+  }
+  
+  // then add relative offset 
+  switch (encoding & 0x70) {
+    case llvm::dwarf::DW_EH_PE_absptr:
+      // do nothing 
+      break;
+    case llvm::dwarf::DW_EH_PE_pcrel:
+      result += (uintptr_t)(*data);
+      break;
+    case llvm::dwarf::DW_EH_PE_textrel:
+    case llvm::dwarf::DW_EH_PE_datarel:
+    case llvm::dwarf::DW_EH_PE_funcrel:
+    case llvm::dwarf::DW_EH_PE_aligned:
+    default:
+      // not supported 
+      abort();
+      break;
+  }
+  
+  // then apply indirection 
+  if (encoding & llvm::dwarf::DW_EH_PE_indirect) {
+    result = *((uintptr_t*)result);
+  }
+  
+  *data = p;
+  
+  return result;
 }
 
 
@@ -524,74 +525,74 @@ static bool handleActionValue(int64_t *resultAction,
                               uintptr_t actionEntry, 
                               uint64_t exceptionClass, 
                               struct _Unwind_Exception *exceptionObject) {
-    bool ret = false;
-
-    if (!resultAction || 
-        !exceptionObject || 
-        (exceptionClass != ourBaseExceptionClass))
-        return(ret);
-
-    struct OurBaseException_t* excp = (struct OurBaseException_t*)
-                        (((char*) exceptionObject) + ourBaseFromUnwindOffset);
-    struct OurExceptionType_t *excpType = &(excp->type);
-    int type = excpType->type;
-
+  bool ret = false;
+  
+  if (!resultAction || 
+      !exceptionObject || 
+      (exceptionClass != ourBaseExceptionClass))
+    return(ret);
+  
+  struct OurBaseException_t *excp = (struct OurBaseException_t*)
+  (((char*) exceptionObject) + ourBaseFromUnwindOffset);
+  struct OurExceptionType_t *excpType = &(excp->type);
+  int type = excpType->type;
+  
 #ifdef DEBUG
-    fprintf(stderr,
-            "handleActionValue(...): exceptionObject = <%p>, "
-                "excp = <%p>.\n",
-            exceptionObject,
-            excp);
+  fprintf(stderr,
+          "handleActionValue(...): exceptionObject = <%p>, "
+          "excp = <%p>.\n",
+          exceptionObject,
+          excp);
 #endif
-
-    const uint8_t *actionPos = (uint8_t*) actionEntry,
-                  *tempActionPos;
-    int64_t typeOffset = 0,
-            actionOffset;
-
-    for (int i = 0; true; ++i) {
-        // Each emitted dwarf action corresponds to a 2 tuple of
-        // type info address offset, and action offset to the next
-        // emitted action.
-        typeOffset = readSLEB128(&actionPos);
-        tempActionPos = actionPos;
-        actionOffset = readSLEB128(&tempActionPos);
-
+  
+  const uint8_t *actionPos = (uint8_t*) actionEntry,
+  *tempActionPos;
+  int64_t typeOffset = 0,
+  actionOffset;
+  
+  for (int i = 0; true; ++i) {
+    // Each emitted dwarf action corresponds to a 2 tuple of
+    // type info address offset, and action offset to the next
+    // emitted action.
+    typeOffset = readSLEB128(&actionPos);
+    tempActionPos = actionPos;
+    actionOffset = readSLEB128(&tempActionPos);
+    
 #ifdef DEBUG
-        fprintf(stderr,
-                "handleActionValue(...):typeOffset: <%lld>, "
-                    "actionOffset: <%lld>.\n",
-                typeOffset,
-                actionOffset);
+    fprintf(stderr,
+            "handleActionValue(...):typeOffset: <%lld>, "
+            "actionOffset: <%lld>.\n",
+            typeOffset,
+            actionOffset);
 #endif
-        assert((typeOffset >= 0) && 
-               "handleActionValue(...):filters are not supported.");
-
-        // Note: A typeOffset == 0 implies that a cleanup llvm.eh.selector
-        //       argument has been matched.
-        if ((typeOffset > 0) &&
-            (type == (classInfo[-typeOffset])->type)) {
+    assert((typeOffset >= 0) && 
+           "handleActionValue(...):filters are not supported.");
+    
+    // Note: A typeOffset == 0 implies that a cleanup llvm.eh.selector
+    //       argument has been matched.
+    if ((typeOffset > 0) &&
+        (type == (classInfo[-typeOffset])->type)) {
 #ifdef DEBUG
-            fprintf(stderr,
-                    "handleActionValue(...):actionValue <%d> found.\n",
-                    i);
+      fprintf(stderr,
+              "handleActionValue(...):actionValue <%d> found.\n",
+              i);
 #endif
-            *resultAction = i + 1;
-            ret = true;
-            break;
-        }
-
+      *resultAction = i + 1;
+      ret = true;
+      break;
+    }
+    
 #ifdef DEBUG
-        fprintf(stderr,
-                "handleActionValue(...):actionValue not found.\n");
+    fprintf(stderr,
+            "handleActionValue(...):actionValue not found.\n");
 #endif
-        if (!actionOffset)
-            break;
-
-        actionPos += actionOffset;
-    }
-
-    return(ret);
+    if (!actionOffset)
+      break;
+    
+    actionPos += actionOffset;
+  }
+  
+  return(ret);
 }
 
 
@@ -607,180 +608,177 @@ static bool handleActionValue(int64_t *resultAction,
 /// @param context unwind system context
 /// @returns minimally supported unwinding control indicator 
 static _Unwind_Reason_Code handleLsda(int version, 
-                                  const uint8_t* lsda,
-                                  _Unwind_Action actions,
-                                  uint64_t exceptionClass, 
-                                  struct _Unwind_Exception* exceptionObject,
-                                  _Unwind_Context_t context) {
-    _Unwind_Reason_Code ret = _URC_CONTINUE_UNWIND;
-
-    if (!lsda)
-        return(ret);
-
+                                      const uint8_t *lsda,
+                                      _Unwind_Action actions,
+                                      uint64_t exceptionClass, 
+                                    struct _Unwind_Exception *exceptionObject,
+                                      _Unwind_Context_t context) {
+  _Unwind_Reason_Code ret = _URC_CONTINUE_UNWIND;
+  
+  if (!lsda)
+    return(ret);
+  
 #ifdef DEBUG
-    fprintf(stderr, 
-            "handleLsda(...):lsda is non-zero.\n");
+  fprintf(stderr, 
+          "handleLsda(...):lsda is non-zero.\n");
 #endif
-
-    // Get the current instruction pointer and offset it before next
-    // instruction in the current frame which threw the exception.
-    uintptr_t pc = _Unwind_GetIP(context)-1;
-
-    // Get beginning current frame's code (as defined by the 
-    // emitted dwarf code)
-    uintptr_t funcStart = _Unwind_GetRegionStart(context);
-    uintptr_t pcOffset = pc - funcStart;
-    struct OurExceptionType_t** classInfo = NULL;
-
-    // Note: See JITDwarfEmitter::EmitExceptionTable(...) for corresponding
-    //       dwarf emission
-
-    // Parse LSDA header.
-    uint8_t lpStartEncoding = *lsda++;
-
-    if (lpStartEncoding != llvm::dwarf::DW_EH_PE_omit) {
-        readEncodedPointer(&lsda, lpStartEncoding); 
-    }
-
-    uint8_t ttypeEncoding = *lsda++;
-    uintptr_t classInfoOffset;
-
-    if (ttypeEncoding != llvm::dwarf::DW_EH_PE_omit) {
-        // Calculate type info locations in emitted dwarf code which
-        // were flagged by type info arguments to llvm.eh.selector
-        // intrinsic
-        classInfoOffset = readULEB128(&lsda);
-        classInfo = (struct OurExceptionType_t**) (lsda + classInfoOffset);
-    }
-
-    // Walk call-site table looking for range that 
-    // includes current PC. 
-
-    uint8_t         callSiteEncoding = *lsda++;
-    uint32_t        callSiteTableLength = readULEB128(&lsda);
-    const uint8_t*  callSiteTableStart = lsda;
-    const uint8_t*  callSiteTableEnd = callSiteTableStart + 
-                                                    callSiteTableLength;
-    const uint8_t*  actionTableStart = callSiteTableEnd;
-    const uint8_t*  callSitePtr = callSiteTableStart;
-
-    bool foreignException = false;
-
-    while (callSitePtr < callSiteTableEnd) {
-        uintptr_t start = readEncodedPointer(&callSitePtr, 
-                                             callSiteEncoding);
-        uintptr_t length = readEncodedPointer(&callSitePtr, 
+  
+  // Get the current instruction pointer and offset it before next
+  // instruction in the current frame which threw the exception.
+  uintptr_t pc = _Unwind_GetIP(context)-1;
+  
+  // Get beginning current frame's code (as defined by the 
+  // emitted dwarf code)
+  uintptr_t funcStart = _Unwind_GetRegionStart(context);
+  uintptr_t pcOffset = pc - funcStart;
+  struct OurExceptionType_t **classInfo = NULL;
+  
+  // Note: See JITDwarfEmitter::EmitExceptionTable(...) for corresponding
+  //       dwarf emission
+  
+  // Parse LSDA header.
+  uint8_t lpStartEncoding = *lsda++;
+  
+  if (lpStartEncoding != llvm::dwarf::DW_EH_PE_omit) {
+    readEncodedPointer(&lsda, lpStartEncoding); 
+  }
+  
+  uint8_t ttypeEncoding = *lsda++;
+  uintptr_t classInfoOffset;
+  
+  if (ttypeEncoding != llvm::dwarf::DW_EH_PE_omit) {
+    // Calculate type info locations in emitted dwarf code which
+    // were flagged by type info arguments to llvm.eh.selector
+    // intrinsic
+    classInfoOffset = readULEB128(&lsda);
+    classInfo = (struct OurExceptionType_t**) (lsda + classInfoOffset);
+  }
+  
+  // Walk call-site table looking for range that 
+  // includes current PC. 
+  
+  uint8_t         callSiteEncoding = *lsda++;
+  uint32_t        callSiteTableLength = readULEB128(&lsda);
+  const uint8_t   *callSiteTableStart = lsda;
+  const uint8_t   *callSiteTableEnd = callSiteTableStart + 
+  callSiteTableLength;
+  const uint8_t   *actionTableStart = callSiteTableEnd;
+  const uint8_t   *callSitePtr = callSiteTableStart;
+  
+  bool foreignException = false;
+  
+  while (callSitePtr < callSiteTableEnd) {
+    uintptr_t start = readEncodedPointer(&callSitePtr, 
+                                         callSiteEncoding);
+    uintptr_t length = readEncodedPointer(&callSitePtr, 
+                                          callSiteEncoding);
+    uintptr_t landingPad = readEncodedPointer(&callSitePtr, 
                                               callSiteEncoding);
-        uintptr_t landingPad = readEncodedPointer(&callSitePtr, 
-                                                  callSiteEncoding);
-
-        // Note: Action value
-        uintptr_t actionEntry = readULEB128(&callSitePtr);
-
-        if (exceptionClass != ourBaseExceptionClass) {
-            // We have been notified of a foreign exception being thrown,
-            // and we therefore need to execute cleanup landing pads
-            actionEntry = 0;
-            foreignException = true;
-        }
-
-        if (landingPad == 0) {
+    
+    // Note: Action value
+    uintptr_t actionEntry = readULEB128(&callSitePtr);
+    
+    if (exceptionClass != ourBaseExceptionClass) {
+      // We have been notified of a foreign exception being thrown,
+      // and we therefore need to execute cleanup landing pads
+      actionEntry = 0;
+      foreignException = true;
+    }
+    
+    if (landingPad == 0) {
 #ifdef DEBUG
-            fprintf(stderr,
-                    "handleLsda(...): No landing pad found.\n");
+      fprintf(stderr,
+              "handleLsda(...): No landing pad found.\n");
 #endif
-
-            continue; // no landing pad for this entry
-        }
-
-        if (actionEntry) {
-            actionEntry += ((uintptr_t) actionTableStart) - 1;
-        }
-        else {
+      
+      continue; // no landing pad for this entry
+    }
+    
+    if (actionEntry) {
+      actionEntry += ((uintptr_t) actionTableStart) - 1;
+    }
+    else {
 #ifdef DEBUG
-            fprintf(stderr,
-                    "handleLsda(...):No action table found.\n");
+      fprintf(stderr,
+              "handleLsda(...):No action table found.\n");
 #endif
-        }
-
-        bool exceptionMatched = false;
-
-        if ((start <= pcOffset) && (pcOffset < (start + length))) {
+    }
+    
+    bool exceptionMatched = false;
+    
+    if ((start <= pcOffset) && (pcOffset < (start + length))) {
 #ifdef DEBUG
-            fprintf(stderr,
-                    "handleLsda(...): Landing pad found.\n");
+      fprintf(stderr,
+              "handleLsda(...): Landing pad found.\n");
 #endif
-            int64_t actionValue = 0;
-
-            if (actionEntry) {
-                exceptionMatched = handleActionValue
-                                   (
-                                       &actionValue,
-                                       classInfo, 
-                                       actionEntry, 
-                                       exceptionClass, 
-                                       exceptionObject
-                                   );
-            }
-
-            if (!(actions & _UA_SEARCH_PHASE)) {
+      int64_t actionValue = 0;
+      
+      if (actionEntry) {
+        exceptionMatched = handleActionValue(&actionValue,
+                                             classInfo, 
+                                             actionEntry, 
+                                             exceptionClass, 
+                                             exceptionObject);
+      }
+      
+      if (!(actions & _UA_SEARCH_PHASE)) {
 #ifdef DEBUG
-                fprintf(stderr,
-                        "handleLsda(...): installed landing pad "
-                            "context.\n");
+        fprintf(stderr,
+                "handleLsda(...): installed landing pad "
+                "context.\n");
 #endif
-
-                // Found landing pad for the PC.
-                // Set Instruction Pointer to so we re-enter function 
-                // at landing pad. The landing pad is created by the 
-                // compiler to take two parameters in registers.
-                _Unwind_SetGR(context, 
-                              __builtin_eh_return_data_regno(0), 
-                              (uintptr_t)exceptionObject);
-
-                // Note: this virtual register directly corresponds
-                //       to the return of the llvm.eh.selector intrinsic
-                if (!actionEntry || !exceptionMatched) {
-                    // We indicate cleanup only
-                    _Unwind_SetGR(context, 
-                                  __builtin_eh_return_data_regno(1), 
-                                  0);
-                }
-                else {
-                    // Matched type info index of llvm.eh.selector intrinsic
-                    // passed here.
-                    _Unwind_SetGR(context, 
-                                  __builtin_eh_return_data_regno(1), 
-                                  actionValue);
-                }
-
-                // To execute landing pad set here
-                _Unwind_SetIP(context, funcStart + landingPad);
-                ret = _URC_INSTALL_CONTEXT;
-            }
-            else if (exceptionMatched) {
+        
+        // Found landing pad for the PC.
+        // Set Instruction Pointer to so we re-enter function 
+        // at landing pad. The landing pad is created by the 
+        // compiler to take two parameters in registers.
+        _Unwind_SetGR(context, 
+                      __builtin_eh_return_data_regno(0), 
+                      (uintptr_t)exceptionObject);
+        
+        // Note: this virtual register directly corresponds
+        //       to the return of the llvm.eh.selector intrinsic
+        if (!actionEntry || !exceptionMatched) {
+          // We indicate cleanup only
+          _Unwind_SetGR(context, 
+                        __builtin_eh_return_data_regno(1), 
+                        0);
+        }
+        else {
+          // Matched type info index of llvm.eh.selector intrinsic
+          // passed here.
+          _Unwind_SetGR(context, 
+                        __builtin_eh_return_data_regno(1), 
+                        actionValue);
+        }
+        
+        // To execute landing pad set here
+        _Unwind_SetIP(context, funcStart + landingPad);
+        ret = _URC_INSTALL_CONTEXT;
+      }
+      else if (exceptionMatched) {
 #ifdef DEBUG
-                fprintf(stderr,
-                        "handleLsda(...): setting handler found.\n");
+        fprintf(stderr,
+                "handleLsda(...): setting handler found.\n");
 #endif
-                ret = _URC_HANDLER_FOUND;
-            }
-            else {
-                // Note: Only non-clean up handlers are marked as
-                //       found. Otherwise the clean up handlers will be 
-                //       re-found and executed during the clean up 
-                //       phase.
+        ret = _URC_HANDLER_FOUND;
+      }
+      else {
+        // Note: Only non-clean up handlers are marked as
+        //       found. Otherwise the clean up handlers will be 
+        //       re-found and executed during the clean up 
+        //       phase.
 #ifdef DEBUG
-                fprintf(stderr,
-                        "handleLsda(...): cleanup handler found.\n");
+        fprintf(stderr,
+                "handleLsda(...): cleanup handler found.\n");
 #endif
-            }
-
-            break;
-        }
+      }
+      
+      break;
     }
-
-    return(ret);
+  }
+  
+  return(ret);
 }
 
 
@@ -796,38 +794,38 @@ static _Unwind_Reason_Code handleLsda(int version,
 /// @param context unwind system context
 /// @returns minimally supported unwinding control indicator 
 _Unwind_Reason_Code ourPersonality(int version, 
-                               _Unwind_Action actions,
-                               uint64_t exceptionClass, 
-                               struct _Unwind_Exception* exceptionObject,
-                               _Unwind_Context_t context) {
+                                   _Unwind_Action actions,
+                                   uint64_t exceptionClass, 
+                                   struct _Unwind_Exception *exceptionObject,
+                                   _Unwind_Context_t context) {
 #ifdef DEBUG
-    fprintf(stderr, 
-            "We are in ourPersonality(...):actions is <%d>.\n",
-            actions);
-
-    if (actions & _UA_SEARCH_PHASE) {
-        fprintf(stderr, "ourPersonality(...):In search phase.\n");
-    }
-    else {
-        fprintf(stderr, "ourPersonality(...):In non-search phase.\n");
-    }
+  fprintf(stderr, 
+          "We are in ourPersonality(...):actions is <%d>.\n",
+          actions);
+  
+  if (actions & _UA_SEARCH_PHASE) {
+    fprintf(stderr, "ourPersonality(...):In search phase.\n");
+  }
+  else {
+    fprintf(stderr, "ourPersonality(...):In non-search phase.\n");
+  }
 #endif
-
-    const uint8_t* lsda = _Unwind_GetLanguageSpecificData(context);
-
+  
+  const uint8_t *lsda = _Unwind_GetLanguageSpecificData(context);
+  
 #ifdef DEBUG
-    fprintf(stderr, 
-            "ourPersonality(...):lsda = <%p>.\n",
-            lsda);
+  fprintf(stderr, 
+          "ourPersonality(...):lsda = <%p>.\n",
+          lsda);
 #endif
-
-    // The real work of the personality function is captured here
-    return(handleLsda(version,
-                      lsda,
-                      actions,
-                      exceptionClass,
-                      exceptionObject,
-                      context));
+  
+  // The real work of the personality function is captured here
+  return(handleLsda(version,
+                    lsda,
+                    actions,
+                    exceptionClass,
+                    exceptionObject,
+                    context));
 }
 
 
@@ -840,14 +838,14 @@ _Unwind_Reason_Code ourPersonality(int version,
 /// @returns class value
 uint64_t genClass(const unsigned char classChars[], size_t classCharsSize)
 {
-    uint64_t ret = classChars[0];
-
-    for (unsigned i = 1; i < classCharsSize; ++i) {
-        ret <<= 8;
-        ret += classChars[i];
-    }
-
-    return(ret);
+  uint64_t ret = classChars[0];
+  
+  for (unsigned i = 1; i < classCharsSize; ++i) {
+    ret <<= 8;
+    ret += classChars[i];
+  }
+  
+  return(ret);
 }
 
 } // extern "C"
@@ -869,36 +867,36 @@ uint64_t genClass(const unsigned char classChars[], size_t classCharsSize)
 ///        generated, and is used to hold the constant string. A value of 
 ///        false indicates that the constant string will be stored on the 
 ///        stack.
-void generateStringPrint(llvm::LLVMContext& context, 
-                         llvm::Module& module,
-                         llvm::IRBuilder<>& builder, 
+void generateStringPrint(llvm::LLVMContext &context, 
+                         llvm::Module &module,
+                         llvm::IRBuilder<> &builder, 
                          std::string toPrint,
                          bool useGlobal = true) {
-    llvm::Function *printFunct = module.getFunction("printStr");
-
-    llvm::Value *stringVar;
-    llvm::Constant* stringConstant = 
-        llvm::ConstantArray::get(context, toPrint);
-
-    if (useGlobal) {
-        // Note: Does not work without allocation
-        stringVar = 
-            new llvm::GlobalVariable(module, 
-                                     stringConstant->getType(),
-                                     true, 
-                                     llvm::GlobalValue::LinkerPrivateLinkage, 
-                                     stringConstant, 
-                                     "");
-    }
-    else {
-        stringVar = builder.CreateAlloca(stringConstant->getType());
-        builder.CreateStore(stringConstant, stringVar);
-    }
-
-    llvm::Value* cast = 
-        builder.CreatePointerCast(stringVar, 
-                                  builder.getInt8Ty()->getPointerTo());
-    builder.CreateCall(printFunct, cast);
+  llvm::Function *printFunct = module.getFunction("printStr");
+  
+  llvm::Value *stringVar;
+  llvm::Constant *stringConstant = 
+  llvm::ConstantArray::get(context, toPrint);
+  
+  if (useGlobal) {
+    // Note: Does not work without allocation
+    stringVar = 
+    new llvm::GlobalVariable(module, 
+                             stringConstant->getType(),
+                             true, 
+                             llvm::GlobalValue::LinkerPrivateLinkage, 
+                             stringConstant, 
+                             "");
+  }
+  else {
+    stringVar = builder.CreateAlloca(stringConstant->getType());
+    builder.CreateStore(stringConstant, stringVar);
+  }
+  
+  llvm::Value *cast = 
+  builder.CreatePointerCast(stringVar, 
+                            builder.getInt8Ty()->getPointerTo());
+  builder.CreateCall(printFunct, cast);
 }
 
 
@@ -914,35 +912,35 @@ void generateStringPrint(llvm::LLVMContext& context,
 ///        generated, and is used to hold the constant string. A value of 
 ///        false indicates that the constant string will be stored on the 
 ///        stack.
-void generateIntegerPrint(llvm::LLVMContext& context, 
-                          llvm::Module& module,
-                          llvm::IRBuilder<>& builder, 
-                          llvm::Function& printFunct,
-                          llvm::Value& toPrint,
+void generateIntegerPrint(llvm::LLVMContext &context, 
+                          llvm::Module &module,
+                          llvm::IRBuilder<> &builder, 
+                          llvm::Function &printFunct,
+                          llvm::Value &toPrint,
                           std::string format, 
                           bool useGlobal = true) {
-    llvm::Constant *stringConstant = llvm::ConstantArray::get(context, format);
-    llvm::Value *stringVar;
-
-    if (useGlobal) {
-        // Note: Does not seem to work without allocation
-        stringVar = 
-            new llvm::GlobalVariable(module, 
-                                     stringConstant->getType(),
-                                    true, 
-                                     llvm::GlobalValue::LinkerPrivateLinkage, 
-                                     stringConstant, 
-                                     "");
-    }
-    else {
-        stringVar = builder.CreateAlloca(stringConstant->getType());
-        builder.CreateStore(stringConstant, stringVar);
-    }
-
-    llvm::Value* cast = 
-        builder.CreateBitCast(stringVar, 
-                              builder.getInt8Ty()->getPointerTo());
-    builder.CreateCall2(&printFunct, &toPrint, cast);
+  llvm::Constant *stringConstant = llvm::ConstantArray::get(context, format);
+  llvm::Value *stringVar;
+  
+  if (useGlobal) {
+    // Note: Does not seem to work without allocation
+    stringVar = 
+    new llvm::GlobalVariable(module, 
+                             stringConstant->getType(),
+                             true, 
+                             llvm::GlobalValue::LinkerPrivateLinkage, 
+                             stringConstant, 
+                             "");
+  }
+  else {
+    stringVar = builder.CreateAlloca(stringConstant->getType());
+    builder.CreateStore(stringConstant, stringVar);
+  }
+  
+  llvm::Value *cast = 
+  builder.CreateBitCast(stringVar, 
+                        builder.getInt8Ty()->getPointerTo());
+  builder.CreateCall2(&printFunct, &toPrint, cast);
 }
 
 
@@ -965,64 +963,61 @@ void generateIntegerPrint(llvm::LLVMContext& context,
 /// @param exceptionCaughtFlag reference exception caught/thrown status storage
 /// @param exceptionStorage reference to exception pointer storage
 /// @returns newly created block
-static llvm::BasicBlock* createFinallyBlock(llvm::LLVMContext& context, 
-                             llvm::Module& module, 
-                             llvm::IRBuilder<>& builder, 
-                             llvm::Function& toAddTo,
-                             std::string& blockName,
-                             std::string& functionId,
-                             llvm::BasicBlock& terminatorBlock,
-                             llvm::BasicBlock& unwindResumeBlock,
-                             llvm::Value** exceptionCaughtFlag,
-                             llvm::Value** exceptionStorage) {
-    assert(exceptionCaughtFlag && 
-           "ExceptionDemo::createFinallyBlock(...):exceptionCaughtFlag "
-               "is NULL");
-    assert(exceptionStorage && 
-           "ExceptionDemo::createFinallyBlock(...):exceptionStorage "
-               "is NULL");
-
-    *exceptionCaughtFlag = 
-        createEntryBlockAlloca(toAddTo,
-                               "exceptionCaught",
-                               ourExceptionNotThrownState->getType(),
-                               ourExceptionNotThrownState);
-
-    const llvm::PointerType* exceptionStorageType = 
-                                builder.getInt8Ty()->getPointerTo();
-    *exceptionStorage = 
-        createEntryBlockAlloca(toAddTo,
-                               "exceptionStorage",
-                               exceptionStorageType,
-                               llvm::ConstantPointerNull::get(
-                                   exceptionStorageType));
-
-    llvm::BasicBlock *ret = llvm::BasicBlock::Create(context,
-                                                     blockName,
-                                                     &toAddTo);
-
-    builder.SetInsertPoint(ret);
-   
-    std::ostringstream bufferToPrint;
-    bufferToPrint << "Gen: Executing finally block "
-                  << blockName
-                  << " in "
-                  << functionId
-                  << std::endl;
-    generateStringPrint(context, 
-                        module, 
-                        builder, 
-                        bufferToPrint.str(),
-                        USE_GLOBAL_STR_CONSTS);
-
-    llvm::SwitchInst* theSwitch = 
-        builder.CreateSwitch(builder.CreateLoad(*exceptionCaughtFlag), 
-                             &terminatorBlock,
-                             2);
-    theSwitch->addCase(ourExceptionCaughtState, &terminatorBlock);
-    theSwitch->addCase(ourExceptionThrownState, &unwindResumeBlock);
-
-    return(ret);
+static llvm::BasicBlock *createFinallyBlock(llvm::LLVMContext &context, 
+                                            llvm::Module &module, 
+                                            llvm::IRBuilder<> &builder, 
+                                            llvm::Function &toAddTo,
+                                            std::string &blockName,
+                                            std::string &functionId,
+                                            llvm::BasicBlock &terminatorBlock,
+                                            llvm::BasicBlock &unwindResumeBlock,
+                                            llvm::Value **exceptionCaughtFlag,
+                                            llvm::Value **exceptionStorage) {
+  assert(exceptionCaughtFlag && 
+         "ExceptionDemo::createFinallyBlock(...):exceptionCaughtFlag "
+         "is NULL");
+  assert(exceptionStorage && 
+         "ExceptionDemo::createFinallyBlock(...):exceptionStorage "
+         "is NULL");
+  
+  *exceptionCaughtFlag = 
+  createEntryBlockAlloca(toAddTo,
+                         "exceptionCaught",
+                         ourExceptionNotThrownState->getType(),
+                         ourExceptionNotThrownState);
+  
+  const llvm::PointerType *exceptionStorageType = 
+  builder.getInt8Ty()->getPointerTo();
+  *exceptionStorage = 
+  createEntryBlockAlloca(toAddTo,
+                         "exceptionStorage",
+                         exceptionStorageType,
+                         llvm::ConstantPointerNull::get(
+                                                        exceptionStorageType));
+  
+  llvm::BasicBlock *ret = llvm::BasicBlock::Create(context,
+                                                   blockName,
+                                                   &toAddTo);
+  
+  builder.SetInsertPoint(ret);
+  
+  std::ostringstream bufferToPrint;
+  bufferToPrint << "Gen: Executing finally block "
+    << blockName << " in " << functionId << "\n";
+  generateStringPrint(context, 
+                      module, 
+                      builder, 
+                      bufferToPrint.str(),
+                      USE_GLOBAL_STR_CONSTS);
+  
+  llvm::SwitchInst *theSwitch = 
+  builder.CreateSwitch(builder.CreateLoad(*exceptionCaughtFlag), 
+                       &terminatorBlock,
+                       2);
+  theSwitch->addCase(ourExceptionCaughtState, &terminatorBlock);
+  theSwitch->addCase(ourExceptionThrownState, &unwindResumeBlock);
+  
+  return(ret);
 }
 
 
@@ -1038,36 +1033,36 @@ static llvm::BasicBlock* createFinallyBlock(llvm::LLVMContext& context,
 /// @param terminatorBlock terminator "end" block
 /// @param exceptionCaughtFlag exception caught/thrown status
 /// @returns newly created block
-static llvm::BasicBlock* createCatchBlock(llvm::LLVMContext& context, 
-                                          llvm::Module& module, 
-                                          llvm::IRBuilder<>& builder, 
-                                          llvm::Function& toAddTo,
-                                          std::string& blockName,
-                                          std::string& functionId,
-                                          llvm::BasicBlock& terminatorBlock,
-                                          llvm::Value& exceptionCaughtFlag) {
-
-    llvm::BasicBlock *ret = llvm::BasicBlock::Create(context,
-                                                     blockName,
-                                                     &toAddTo);
-
-    builder.SetInsertPoint(ret);
-
-    std::ostringstream bufferToPrint;
-    bufferToPrint << "Gen: Executing catch block "
-                  << blockName
-                  << " in "
-                  << functionId
-                  << std::endl;
-    generateStringPrint(context, 
-                        module, 
-                        builder, 
-                        bufferToPrint.str(),
-                        USE_GLOBAL_STR_CONSTS);
-    builder.CreateStore(ourExceptionCaughtState, &exceptionCaughtFlag);
-    builder.CreateBr(&terminatorBlock);
-
-    return(ret);
+static llvm::BasicBlock *createCatchBlock(llvm::LLVMContext &context, 
+                                          llvm::Module &module, 
+                                          llvm::IRBuilder<> &builder, 
+                                          llvm::Function &toAddTo,
+                                          std::string &blockName,
+                                          std::string &functionId,
+                                          llvm::BasicBlock &terminatorBlock,
+                                          llvm::Value &exceptionCaughtFlag) {
+  
+  llvm::BasicBlock *ret = llvm::BasicBlock::Create(context,
+                                                   blockName,
+                                                   &toAddTo);
+  
+  builder.SetInsertPoint(ret);
+  
+  std::ostringstream bufferToPrint;
+  bufferToPrint << "Gen: Executing catch block "
+  << blockName
+  << " in "
+  << functionId
+  << std::endl;
+  generateStringPrint(context, 
+                      module, 
+                      builder, 
+                      bufferToPrint.str(),
+                      USE_GLOBAL_STR_CONSTS);
+  builder.CreateStore(ourExceptionCaughtState, &exceptionCaughtFlag);
+  builder.CreateBr(&terminatorBlock);
+  
+  return(ret);
 }
 
 
@@ -1091,275 +1086,269 @@ static llvm::BasicBlock* createCatchBlock(llvm::LLVMContext& context,
 /// @param exceptionTypesToCatch array of type info types to "catch"
 /// @returns generated function
 static
-llvm::Function* createCatchWrappedInvokeFunction(llvm::Module& module, 
-                    llvm::IRBuilder<>& builder, 
-                    llvm::FunctionPassManager& fpm,
-                    llvm::Function& toInvoke,
-                    std::string ourId,
-                    unsigned numExceptionsToCatch,
-                    unsigned exceptionTypesToCatch[]) {
-
-    llvm::LLVMContext& context = module.getContext();
-    llvm::Function *toPrint32Int = module.getFunction("print32Int");
-
-    ArgTypes argTypes;
-    argTypes.push_back(builder.getInt32Ty());
-
-    ArgNames argNames;
-    argNames.push_back("exceptTypeToThrow");
-
-    llvm::Function* ret = createFunction(module, 
-                                         builder.getVoidTy(),
-                                         argTypes, 
-                                         argNames, 
-                                         ourId,
-                                         llvm::Function::ExternalLinkage, 
-                                         false, 
-                                         false);
-
-    // Block which calls invoke
-    llvm::BasicBlock *entryBlock = llvm::BasicBlock::Create(context,
-                                                            "entry", 
-                                                            ret);
-    // Normal block for invoke
-    llvm::BasicBlock *normalBlock = llvm::BasicBlock::Create(context, 
-                                                             "normal", 
-                                                             ret);
-    // Unwind block for invoke
-    llvm::BasicBlock *exceptionBlock = 
-        llvm::BasicBlock::Create(context, "exception", ret);
-
-    // Block which routes exception to correct catch handler block
-    llvm::BasicBlock *exceptionRouteBlock = 
-        llvm::BasicBlock::Create(context, "exceptionRoute", ret);
-
-    // Foreign exception handler
-    llvm::BasicBlock *externalExceptionBlock = 
-        llvm::BasicBlock::Create(context, "externalException", ret);
-
-    // Block which calls _Unwind_Resume
-    llvm::BasicBlock *unwindResumeBlock = 
-        llvm::BasicBlock::Create(context, "unwindResume", ret);
-
-    // Clean up block which delete exception if needed
-    llvm::BasicBlock *endBlock = 
-        llvm::BasicBlock::Create(context, "end", ret);
-
-    std::string nextName;
-    std::vector<llvm::BasicBlock*> catchBlocks(numExceptionsToCatch);
-    llvm::Value* exceptionCaughtFlag = NULL;
-    llvm::Value* exceptionStorage = NULL;
-
-    // Finally block which will branch to unwindResumeBlock if 
-    // exception is not caught. Initializes/allocates stack locations.
-    llvm::BasicBlock* finallyBlock = createFinallyBlock(context, 
-                                                        module, 
-                                                        builder, 
-                                                        *ret, 
-                                                        nextName = "finally", 
-                                                        ourId,
-                                                        *endBlock,
-                                                        *unwindResumeBlock,
-                                                        &exceptionCaughtFlag,
-                                                        &exceptionStorage);
-
-    for (unsigned i = 0; i < numExceptionsToCatch; ++i) {
-        nextName = ourTypeInfoNames[exceptionTypesToCatch[i]];
-
-        // One catch block per type info to be caught
-        catchBlocks[i] = createCatchBlock(context, 
-                                          module, 
-                                          builder, 
-                                          *ret,
-                                          nextName, 
-                                          ourId,
-                                          *finallyBlock,
-                                          *exceptionCaughtFlag);
-    }
-
-    // Entry Block
-
-    builder.SetInsertPoint(entryBlock);
-
-    std::vector<llvm::Value*> args;
-    args.push_back(namedValues["exceptTypeToThrow"]);
-    builder.CreateInvoke(&toInvoke, 
-                         normalBlock, 
-                         exceptionBlock, 
-                         args.begin(), 
-                         args.end());
-
-    // End Block
-
-    builder.SetInsertPoint(endBlock);
-
-    generateStringPrint(context, 
-                        module,
-                        builder, 
-                        "Gen: In end block: exiting in " + ourId + ".\n",
-                        USE_GLOBAL_STR_CONSTS);
-    llvm::Function *deleteOurException = 
-                                    module.getFunction("deleteOurException");
-
-    // Note: function handles NULL exceptions
-    builder.CreateCall(deleteOurException, 
-                       builder.CreateLoad(exceptionStorage));
-    builder.CreateRetVoid();
-
-    // Normal Block
-
-    builder.SetInsertPoint(normalBlock);
-
-    generateStringPrint(context, 
-                        module,
-                        builder, 
-                        "Gen: No exception in " + ourId + "!\n",
-                        USE_GLOBAL_STR_CONSTS);
-
-    // Finally block is always called
-    builder.CreateBr(finallyBlock);
-
-    // Unwind Resume Block
-
-    builder.SetInsertPoint(unwindResumeBlock);
-
-    llvm::Function *resumeOurException = 
-                module.getFunction("_Unwind_Resume");
-    builder.CreateCall(resumeOurException, 
-                       builder.CreateLoad(exceptionStorage));
-    builder.CreateUnreachable();
-
-    // Exception Block
-
-    builder.SetInsertPoint(exceptionBlock);
-
-    llvm::Function *ehException = module.getFunction("llvm.eh.exception");
-
-    // Retrieve thrown exception
-    llvm::Value* unwindException = builder.CreateCall(ehException);
-
-    // Store exception and flag
-    builder.CreateStore(unwindException, exceptionStorage);
-    builder.CreateStore(ourExceptionThrownState, exceptionCaughtFlag);
-    llvm::Function *personality = module.getFunction("ourPersonality");
-    llvm::Value* functPtr = 
-        builder.CreatePointerCast(personality, 
-                              builder.getInt8Ty()->getPointerTo());
-
-    args.clear();
-    args.push_back(unwindException);
-    args.push_back(functPtr);
-
-    // Note: Skipping index 0
-    for (unsigned i = 0; i < numExceptionsToCatch; ++i) {
-        // Set up type infos to be caught
-        args.push_back(
-            module.getGlobalVariable(
-                ourTypeInfoNames[exceptionTypesToCatch[i]]));
-    }
-
-    args.push_back(llvm::ConstantInt::get(builder.getInt32Ty(), 0));
-
-    llvm::Function *ehSelector = module.getFunction("llvm.eh.selector");
-
-    // Set up this exeption block as the landing pad which will handle
-    // given type infos. See case Intrinsic::eh_selector in 
-    // SelectionDAGBuilder::visitIntrinsicCall(...) and AddCatchInfo(...)
-    // implemented in FunctionLoweringInfo.cpp to see how the implementation
-    // handles this call. This landing pad (this exception block), will be 
-    // called either because it nees to cleanup (call finally) or a type 
-    // info was found which matched the thrown exception.
-    llvm::Value* retTypeInfoIndex = builder.CreateCall(ehSelector, 
-                                                       args.begin(), 
-                                                       args.end());
-
-    // Retrieve exception_class member from thrown exception 
-    // (_Unwind_Exception instance). This member tells us whether or not
-    // the exception is foreign.
-    llvm::Value* unwindExceptionClass = 
-        builder.CreateLoad(
-            builder.CreateStructGEP(
-                builder.CreatePointerCast(
-                    unwindException, 
-                    ourUnwindExceptionType->getPointerTo()), 
-                0));
-
-    // Branch to the externalExceptionBlock if the exception is foreign or
-    // to a catch router if not. Either way the finally block will be run.
-    builder.CreateCondBr(
-        builder.CreateICmpEQ(unwindExceptionClass,
-                             llvm::ConstantInt::get(builder.getInt64Ty(), 
-                                                    ourBaseExceptionClass)),
-        exceptionRouteBlock,
-        externalExceptionBlock);
-
-    // External Exception Block
-
-    builder.SetInsertPoint(externalExceptionBlock);
-
-    generateStringPrint(context, 
-                        module,
-                        builder, 
-                        "Gen: Foreign exception received.\n",
-                        USE_GLOBAL_STR_CONSTS);
-
-    // Branch to the finally block
-    builder.CreateBr(finallyBlock);
-
-    // Exception Route Block
-
-    builder.SetInsertPoint(exceptionRouteBlock);
-
-    // Casts exception pointer (_Unwind_Exception instance) to parent 
-    // (OurException instance).
-    //
-    // Note: ourBaseFromUnwindOffset is usually negative
-    llvm::Value* typeInfoThrown = 
-        builder.CreatePointerCast(
-            builder.CreateConstGEP1_64(unwindException,
-                                       ourBaseFromUnwindOffset),
-            ourExceptionType->getPointerTo());
-
-    // Retrieve thrown exception type info type
-    //
-    // Note: Index is not relative to pointer but instead to structure
-    //       unlike a true getelementptr (GEP) instruction
-    typeInfoThrown = builder.CreateStructGEP(typeInfoThrown, 0);
-
-    llvm::Value* typeInfoThrownType = 
-                     builder.CreateStructGEP(typeInfoThrown, 0);
-
-    generateIntegerPrint(context, 
-                         module,
-                         builder, 
-                         *toPrint32Int, 
-                         *(builder.CreateLoad(typeInfoThrownType)),
-                         "Gen: Exception type <%d> received (stack unwound) " 
-                                 " in " + 
-                             ourId + 
-                             ".\n",
-                         USE_GLOBAL_STR_CONSTS);
-
-    // Route to matched type info catch block or run cleanup finally block
-    llvm::SwitchInst* switchToCatchBlock = 
-        builder.CreateSwitch(retTypeInfoIndex, 
-                             finallyBlock, 
-                             numExceptionsToCatch);
-
-    unsigned nextTypeToCatch;
-
-    for (unsigned i = 1; i <= numExceptionsToCatch; ++i) {
-        nextTypeToCatch = i - 1;
-        switchToCatchBlock->addCase(llvm::ConstantInt::get(
-                                        llvm::Type::getInt32Ty(context), 
-                                        i),
-                                    catchBlocks[nextTypeToCatch]);
-    }
-
-    llvm::verifyFunction(*ret);
-    fpm.run(*ret);
-
-    return(ret);
+llvm::Function *createCatchWrappedInvokeFunction(llvm::Module &module, 
+                                             llvm::IRBuilder<> &builder, 
+                                             llvm::FunctionPassManager &fpm,
+                                             llvm::Function &toInvoke,
+                                             std::string ourId,
+                                             unsigned numExceptionsToCatch,
+                                             unsigned exceptionTypesToCatch[]) {
+  
+  llvm::LLVMContext &context = module.getContext();
+  llvm::Function *toPrint32Int = module.getFunction("print32Int");
+  
+  ArgTypes argTypes;
+  argTypes.push_back(builder.getInt32Ty());
+  
+  ArgNames argNames;
+  argNames.push_back("exceptTypeToThrow");
+  
+  llvm::Function *ret = createFunction(module, 
+                                       builder.getVoidTy(),
+                                       argTypes, 
+                                       argNames, 
+                                       ourId,
+                                       llvm::Function::ExternalLinkage, 
+                                       false, 
+                                       false);
+  
+  // Block which calls invoke
+  llvm::BasicBlock *entryBlock = llvm::BasicBlock::Create(context,
+                                                          "entry", 
+                                                          ret);
+  // Normal block for invoke
+  llvm::BasicBlock *normalBlock = llvm::BasicBlock::Create(context, 
+                                                           "normal", 
+                                                           ret);
+  // Unwind block for invoke
+  llvm::BasicBlock *exceptionBlock = 
+  llvm::BasicBlock::Create(context, "exception", ret);
+  
+  // Block which routes exception to correct catch handler block
+  llvm::BasicBlock *exceptionRouteBlock = 
+  llvm::BasicBlock::Create(context, "exceptionRoute", ret);
+  
+  // Foreign exception handler
+  llvm::BasicBlock *externalExceptionBlock = 
+  llvm::BasicBlock::Create(context, "externalException", ret);
+  
+  // Block which calls _Unwind_Resume
+  llvm::BasicBlock *unwindResumeBlock = 
+  llvm::BasicBlock::Create(context, "unwindResume", ret);
+  
+  // Clean up block which delete exception if needed
+  llvm::BasicBlock *endBlock = 
+  llvm::BasicBlock::Create(context, "end", ret);
+  
+  std::string nextName;
+  std::vector<llvm::BasicBlock*> catchBlocks(numExceptionsToCatch);
+  llvm::Value *exceptionCaughtFlag = NULL;
+  llvm::Value *exceptionStorage = NULL;
+  
+  // Finally block which will branch to unwindResumeBlock if 
+  // exception is not caught. Initializes/allocates stack locations.
+  llvm::BasicBlock *finallyBlock = createFinallyBlock(context, 
+                                                      module, 
+                                                      builder, 
+                                                      *ret, 
+                                                      nextName = "finally", 
+                                                      ourId,
+                                                      *endBlock,
+                                                      *unwindResumeBlock,
+                                                      &exceptionCaughtFlag,
+                                                      &exceptionStorage);
+  
+  for (unsigned i = 0; i < numExceptionsToCatch; ++i) {
+    nextName = ourTypeInfoNames[exceptionTypesToCatch[i]];
+    
+    // One catch block per type info to be caught
+    catchBlocks[i] = createCatchBlock(context, 
+                                      module, 
+                                      builder, 
+                                      *ret,
+                                      nextName, 
+                                      ourId,
+                                      *finallyBlock,
+                                      *exceptionCaughtFlag);
+  }
+  
+  // Entry Block
+  
+  builder.SetInsertPoint(entryBlock);
+  
+  std::vector<llvm::Value*> args;
+  args.push_back(namedValues["exceptTypeToThrow"]);
+  builder.CreateInvoke(&toInvoke, 
+                       normalBlock, 
+                       exceptionBlock, 
+                       args.begin(), 
+                       args.end());
+  
+  // End Block
+  
+  builder.SetInsertPoint(endBlock);
+  
+  generateStringPrint(context, 
+                      module,
+                      builder, 
+                      "Gen: In end block: exiting in " + ourId + ".\n",
+                      USE_GLOBAL_STR_CONSTS);
+  llvm::Function *deleteOurException = 
+  module.getFunction("deleteOurException");
+  
+  // Note: function handles NULL exceptions
+  builder.CreateCall(deleteOurException, 
+                     builder.CreateLoad(exceptionStorage));
+  builder.CreateRetVoid();
+  
+  // Normal Block
+  
+  builder.SetInsertPoint(normalBlock);
+  
+  generateStringPrint(context, 
+                      module,
+                      builder, 
+                      "Gen: No exception in " + ourId + "!\n",
+                      USE_GLOBAL_STR_CONSTS);
+  
+  // Finally block is always called
+  builder.CreateBr(finallyBlock);
+  
+  // Unwind Resume Block
+  
+  builder.SetInsertPoint(unwindResumeBlock);
+  
+  llvm::Function *resumeOurException = 
+  module.getFunction("_Unwind_Resume");
+  builder.CreateCall(resumeOurException, 
+                     builder.CreateLoad(exceptionStorage));
+  builder.CreateUnreachable();
+  
+  // Exception Block
+  
+  builder.SetInsertPoint(exceptionBlock);
+  
+  llvm::Function *ehException = module.getFunction("llvm.eh.exception");
+  
+  // Retrieve thrown exception
+  llvm::Value *unwindException = builder.CreateCall(ehException);
+  
+  // Store exception and flag
+  builder.CreateStore(unwindException, exceptionStorage);
+  builder.CreateStore(ourExceptionThrownState, exceptionCaughtFlag);
+  llvm::Function *personality = module.getFunction("ourPersonality");
+  llvm::Value *functPtr = 
+  builder.CreatePointerCast(personality, 
+                            builder.getInt8Ty()->getPointerTo());
+  
+  args.clear();
+  args.push_back(unwindException);
+  args.push_back(functPtr);
+  
+  // Note: Skipping index 0
+  for (unsigned i = 0; i < numExceptionsToCatch; ++i) {
+    // Set up type infos to be caught
+    args.push_back(module.getGlobalVariable(
+                                  ourTypeInfoNames[exceptionTypesToCatch[i]]));
+  }
+  
+  args.push_back(llvm::ConstantInt::get(builder.getInt32Ty(), 0));
+  
+  llvm::Function *ehSelector = module.getFunction("llvm.eh.selector");
+  
+  // Set up this exeption block as the landing pad which will handle
+  // given type infos. See case Intrinsic::eh_selector in 
+  // SelectionDAGBuilder::visitIntrinsicCall(...) and AddCatchInfo(...)
+  // implemented in FunctionLoweringInfo.cpp to see how the implementation
+  // handles this call. This landing pad (this exception block), will be 
+  // called either because it nees to cleanup (call finally) or a type 
+  // info was found which matched the thrown exception.
+  llvm::Value *retTypeInfoIndex = builder.CreateCall(ehSelector, 
+                                                     args.begin(), 
+                                                     args.end());
+  
+  // Retrieve exception_class member from thrown exception 
+  // (_Unwind_Exception instance). This member tells us whether or not
+  // the exception is foreign.
+  llvm::Value *unwindExceptionClass = 
+    builder.CreateLoad(builder.CreateStructGEP(
+             builder.CreatePointerCast(unwindException, 
+                                       ourUnwindExceptionType->getPointerTo()), 
+                                               0));
+  
+  // Branch to the externalExceptionBlock if the exception is foreign or
+  // to a catch router if not. Either way the finally block will be run.
+  builder.CreateCondBr(builder.CreateICmpEQ(unwindExceptionClass,
+                            llvm::ConstantInt::get(builder.getInt64Ty(), 
+                                                   ourBaseExceptionClass)),
+                       exceptionRouteBlock,
+                       externalExceptionBlock);
+  
+  // External Exception Block
+  
+  builder.SetInsertPoint(externalExceptionBlock);
+  
+  generateStringPrint(context, 
+                      module,
+                      builder, 
+                      "Gen: Foreign exception received.\n",
+                      USE_GLOBAL_STR_CONSTS);
+  
+  // Branch to the finally block
+  builder.CreateBr(finallyBlock);
+  
+  // Exception Route Block
+  
+  builder.SetInsertPoint(exceptionRouteBlock);
+  
+  // Casts exception pointer (_Unwind_Exception instance) to parent 
+  // (OurException instance).
+  //
+  // Note: ourBaseFromUnwindOffset is usually negative
+  llvm::Value *typeInfoThrown = 
+  builder.CreatePointerCast(builder.CreateConstGEP1_64(unwindException,
+                                                       ourBaseFromUnwindOffset),
+                            ourExceptionType->getPointerTo());
+  
+  // Retrieve thrown exception type info type
+  //
+  // Note: Index is not relative to pointer but instead to structure
+  //       unlike a true getelementptr (GEP) instruction
+  typeInfoThrown = builder.CreateStructGEP(typeInfoThrown, 0);
+  
+  llvm::Value *typeInfoThrownType = 
+  builder.CreateStructGEP(typeInfoThrown, 0);
+  
+  generateIntegerPrint(context, 
+                       module,
+                       builder, 
+                       *toPrint32Int, 
+                       *(builder.CreateLoad(typeInfoThrownType)),
+                       "Gen: Exception type <%d> received (stack unwound) " 
+                       " in " + 
+                       ourId + 
+                       ".\n",
+                       USE_GLOBAL_STR_CONSTS);
+  
+  // Route to matched type info catch block or run cleanup finally block
+  llvm::SwitchInst *switchToCatchBlock = 
+  builder.CreateSwitch(retTypeInfoIndex, 
+                       finallyBlock, 
+                       numExceptionsToCatch);
+  
+  unsigned nextTypeToCatch;
+  
+  for (unsigned i = 1; i <= numExceptionsToCatch; ++i) {
+    nextTypeToCatch = i - 1;
+    switchToCatchBlock->addCase(llvm::ConstantInt::get(
+                                   llvm::Type::getInt32Ty(context), i),
+                                catchBlocks[nextTypeToCatch]);
+  }
+  
+  llvm::verifyFunction(*ret);
+  fpm.run(*ret);
+  
+  return(ret);
 }
 
 
@@ -1378,107 +1367,107 @@ llvm::Function* createCatchWrappedInvokeFunction(llvm::Module& module,
 ///        if the above nativeThrowType matches generated function's arg.
 /// @returns generated function
 static
-llvm::Function* createThrowExceptionFunction(llvm::Module& module, 
-                                         llvm::IRBuilder<>& builder, 
-                                         llvm::FunctionPassManager& fpm,
-                                         std::string ourId,
-                                         int32_t nativeThrowType,
-                                         llvm::Function& nativeThrowFunct) {
-    llvm::LLVMContext& context = module.getContext();
-    namedValues.clear();
-    ArgTypes unwindArgTypes;
-    unwindArgTypes.push_back(builder.getInt32Ty());
-    ArgNames unwindArgNames;
-    unwindArgNames.push_back("exceptTypeToThrow");
-
-    llvm::Function *ret = createFunction(module,
-                                         builder.getVoidTy(),
-                                         unwindArgTypes,
-                                         unwindArgNames,
-                                         ourId,
-                                         llvm::Function::ExternalLinkage,
-                                         false,
-                                         false);
-
-    // Throws either one of our exception or a native C++ exception depending
-    // on a runtime argument value containing a type info type.
-    llvm::BasicBlock *entryBlock = llvm::BasicBlock::Create(context,
-                                                            "entry", 
-                                                            ret);
-    // Throws a foreign exception
-    llvm::BasicBlock *nativeThrowBlock = 
-                                llvm::BasicBlock::Create(context,
-                                                         "nativeThrow", 
-                                                         ret);
-    // Throws one of our Exceptions
-    llvm::BasicBlock *generatedThrowBlock = 
-                                llvm::BasicBlock::Create(context,
-                                                         "generatedThrow", 
-                                                         ret);
-    // Retrieved runtime type info type to throw
-    llvm::Value* exceptionType = namedValues["exceptTypeToThrow"];
-
-    // nativeThrowBlock block
-
-    builder.SetInsertPoint(nativeThrowBlock);
-
-    // Throws foreign exception
-    builder.CreateCall(&nativeThrowFunct, exceptionType);
-    builder.CreateUnreachable();
-
-    // entry block
-
-    builder.SetInsertPoint(entryBlock);
-
-    llvm::Function *toPrint32Int = module.getFunction("print32Int");
-    generateIntegerPrint(context, 
-                         module,
-                         builder, 
-                         *toPrint32Int, 
-                         *exceptionType, 
-                         "\nGen: About to throw exception type <%d> in " + 
-                             ourId + 
-                             ".\n",
-                         USE_GLOBAL_STR_CONSTS);
-
-    // Switches on runtime type info type value to determine whether or not
-    // a foreign exception is thrown. Defaults to throwing one of our 
-    // generated exceptions.
-    llvm::SwitchInst* theSwitch = builder.CreateSwitch(exceptionType,
-                                                       generatedThrowBlock,
-                                                       1);
-
-    theSwitch->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(context), 
-                                              nativeThrowType),
-                       nativeThrowBlock);
-
-    // generatedThrow block
-
-    builder.SetInsertPoint(generatedThrowBlock);
-
-    llvm::Function *createOurException = 
-                module.getFunction("createOurException");
-    llvm::Function *raiseOurException = 
-                module.getFunction("_Unwind_RaiseException");
-
-    // Creates exception to throw with runtime type info type.
-    llvm::Value* exception = 
-        builder.CreateCall(createOurException, 
-                           namedValues["exceptTypeToThrow"]);
-
-    // Throw generated Exception
-    builder.CreateCall(raiseOurException, exception);
-    builder.CreateUnreachable();
-
-    llvm::verifyFunction(*ret);
-    fpm.run(*ret);
-
-    return(ret);
+llvm::Function *createThrowExceptionFunction(llvm::Module &module, 
+                                             llvm::IRBuilder<> &builder, 
+                                             llvm::FunctionPassManager &fpm,
+                                             std::string ourId,
+                                             int32_t nativeThrowType,
+                                             llvm::Function &nativeThrowFunct) {
+  llvm::LLVMContext &context = module.getContext();
+  namedValues.clear();
+  ArgTypes unwindArgTypes;
+  unwindArgTypes.push_back(builder.getInt32Ty());
+  ArgNames unwindArgNames;
+  unwindArgNames.push_back("exceptTypeToThrow");
+  
+  llvm::Function *ret = createFunction(module,
+                                       builder.getVoidTy(),
+                                       unwindArgTypes,
+                                       unwindArgNames,
+                                       ourId,
+                                       llvm::Function::ExternalLinkage,
+                                       false,
+                                       false);
+  
+  // Throws either one of our exception or a native C++ exception depending
+  // on a runtime argument value containing a type info type.
+  llvm::BasicBlock *entryBlock = llvm::BasicBlock::Create(context,
+                                                          "entry", 
+                                                          ret);
+  // Throws a foreign exception
+  llvm::BasicBlock *nativeThrowBlock = 
+  llvm::BasicBlock::Create(context,
+                           "nativeThrow", 
+                           ret);
+  // Throws one of our Exceptions
+  llvm::BasicBlock *generatedThrowBlock = 
+  llvm::BasicBlock::Create(context,
+                           "generatedThrow", 
+                           ret);
+  // Retrieved runtime type info type to throw
+  llvm::Value *exceptionType = namedValues["exceptTypeToThrow"];
+  
+  // nativeThrowBlock block
+  
+  builder.SetInsertPoint(nativeThrowBlock);
+  
+  // Throws foreign exception
+  builder.CreateCall(&nativeThrowFunct, exceptionType);
+  builder.CreateUnreachable();
+  
+  // entry block
+  
+  builder.SetInsertPoint(entryBlock);
+  
+  llvm::Function *toPrint32Int = module.getFunction("print32Int");
+  generateIntegerPrint(context, 
+                       module,
+                       builder, 
+                       *toPrint32Int, 
+                       *exceptionType, 
+                       "\nGen: About to throw exception type <%d> in " + 
+                       ourId + 
+                       ".\n",
+                       USE_GLOBAL_STR_CONSTS);
+  
+  // Switches on runtime type info type value to determine whether or not
+  // a foreign exception is thrown. Defaults to throwing one of our 
+  // generated exceptions.
+  llvm::SwitchInst *theSwitch = builder.CreateSwitch(exceptionType,
+                                                     generatedThrowBlock,
+                                                     1);
+  
+  theSwitch->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(context), 
+                                            nativeThrowType),
+                     nativeThrowBlock);
+  
+  // generatedThrow block
+  
+  builder.SetInsertPoint(generatedThrowBlock);
+  
+  llvm::Function *createOurException = 
+  module.getFunction("createOurException");
+  llvm::Function *raiseOurException = 
+  module.getFunction("_Unwind_RaiseException");
+  
+  // Creates exception to throw with runtime type info type.
+  llvm::Value *exception = 
+  builder.CreateCall(createOurException, 
+                     namedValues["exceptTypeToThrow"]);
+  
+  // Throw generated Exception
+  builder.CreateCall(raiseOurException, exception);
+  builder.CreateUnreachable();
+  
+  llvm::verifyFunction(*ret);
+  fpm.run(*ret);
+  
+  return(ret);
 }
 
 static void createStandardUtilityFunctions(unsigned numTypeInfos,
-                                           llvm::Module& module, 
-                                           llvm::IRBuilder<>& builder);
+                                           llvm::Module &module, 
+                                           llvm::IRBuilder<> &builder);
 
 /// Creates test code by generating and organizing these functions into the 
 /// test case. The test case consists of an outer function setup to invoke
@@ -1500,81 +1489,80 @@ static void createStandardUtilityFunctions(unsigned numTypeInfos,
 /// @param nativeThrowFunctName name of external function which will throw
 ///        a foreign exception
 /// @returns outermost generated test function.
-llvm::Function* createUnwindExceptionTest(llvm::Module& module, 
-                                          llvm::IRBuilder<>& builder, 
-                                          llvm::FunctionPassManager& fpm,
+llvm::Function *createUnwindExceptionTest(llvm::Module &module, 
+                                          llvm::IRBuilder<> &builder, 
+                                          llvm::FunctionPassManager &fpm,
                                           std::string nativeThrowFunctName) {
-    // Number of type infos to generate
-    unsigned numTypeInfos = 6;
-
-    // Initialze intrisics and external functions to use along with exception
-    // and type info globals.
-    createStandardUtilityFunctions(numTypeInfos,
-                                   module,
-                                   builder);
-    llvm::Function *nativeThrowFunct = 
-        module.getFunction(nativeThrowFunctName);
-
-    // Create exception throw function using the value ~0 to cause 
-    // foreign exceptions to be thrown.
-    llvm::Function* throwFunct = 
-                            createThrowExceptionFunction(module,
-                                                         builder,
-                                                         fpm,
-                                                         "throwFunct",
-                                                         ~0,
-                                                         *nativeThrowFunct);
-    // Inner function will catch even type infos
-    unsigned innerExceptionTypesToCatch[] = {6, 2, 4};
-    size_t numExceptionTypesToCatch = sizeof(innerExceptionTypesToCatch) / 
-                                          sizeof(unsigned);
-
-    // Generate inner function.
-    llvm::Function* innerCatchFunct = 
-        createCatchWrappedInvokeFunction(module,
-                                         builder,
-                                         fpm,
-                                         *throwFunct,
-                                         "innerCatchFunct",
-                                         numExceptionTypesToCatch,
-                                         innerExceptionTypesToCatch);
-
-    // Outer function will catch odd type infos
-    unsigned outerExceptionTypesToCatch[] = {3, 1, 5};
-    numExceptionTypesToCatch = sizeof(outerExceptionTypesToCatch) / 
-                                   sizeof(unsigned);
-
-    // Generate outer function
-    llvm::Function* outerCatchFunct = 
-        createCatchWrappedInvokeFunction(module,
-                                         builder,
-                                         fpm,
-                                         *innerCatchFunct,
-                                         "outerCatchFunct",
-                                         numExceptionTypesToCatch,
-                                         outerExceptionTypesToCatch);
-
-    // Return outer function to run
-    return(outerCatchFunct);
+  // Number of type infos to generate
+  unsigned numTypeInfos = 6;
+  
+  // Initialze intrisics and external functions to use along with exception
+  // and type info globals.
+  createStandardUtilityFunctions(numTypeInfos,
+                                 module,
+                                 builder);
+  llvm::Function *nativeThrowFunct = 
+  module.getFunction(nativeThrowFunctName);
+  
+  // Create exception throw function using the value ~0 to cause 
+  // foreign exceptions to be thrown.
+  llvm::Function *throwFunct = 
+  createThrowExceptionFunction(module,
+                               builder,
+                               fpm,
+                               "throwFunct",
+                               ~0,
+                               *nativeThrowFunct);
+  // Inner function will catch even type infos
+  unsigned innerExceptionTypesToCatch[] = {6, 2, 4};
+  size_t numExceptionTypesToCatch = sizeof(innerExceptionTypesToCatch) / 
+  sizeof(unsigned);
+  
+  // Generate inner function.
+  llvm::Function *innerCatchFunct = 
+  createCatchWrappedInvokeFunction(module,
+                                   builder,
+                                   fpm,
+                                   *throwFunct,
+                                   "innerCatchFunct",
+                                   numExceptionTypesToCatch,
+                                   innerExceptionTypesToCatch);
+  
+  // Outer function will catch odd type infos
+  unsigned outerExceptionTypesToCatch[] = {3, 1, 5};
+  numExceptionTypesToCatch = sizeof(outerExceptionTypesToCatch) / 
+  sizeof(unsigned);
+  
+  // Generate outer function
+  llvm::Function *outerCatchFunct = 
+  createCatchWrappedInvokeFunction(module,
+                                   builder,
+                                   fpm,
+                                   *innerCatchFunct,
+                                   "outerCatchFunct",
+                                   numExceptionTypesToCatch,
+                                   outerExceptionTypesToCatch);
+  
+  // Return outer function to run
+  return(outerCatchFunct);
 }
 
 
 /// Represents our foreign exceptions
 class OurCppRunException : public std::runtime_error {
 public:
-    OurCppRunException(const std::string reason) :
-        std::runtime_error(reason) {}
-
-    OurCppRunException (const OurCppRunException& toCopy) :
-        std::runtime_error(toCopy) {}
-
-    OurCppRunException& operator = (const OurCppRunException& toCopy) {
-        return(reinterpret_cast<OurCppRunException&>(
-                   std::runtime_error::operator = (toCopy)
-               ));
-    }
-
-    ~OurCppRunException (void) throw () {}
+  OurCppRunException(const std::string reason) :
+  std::runtime_error(reason) {}
+  
+  OurCppRunException (const OurCppRunException &toCopy) :
+  std::runtime_error(toCopy) {}
+  
+  OurCppRunException &operator = (const OurCppRunException &toCopy) {
+    return(reinterpret_cast<OurCppRunException&>(
+                                 std::runtime_error::operator=(toCopy)));
+  }
+  
+  ~OurCppRunException (void) throw () {}
 };
 
 
@@ -1583,13 +1571,13 @@ public:
 ///        generated function contract.
 extern "C"
 void throwCppException (int32_t ignoreIt) {
-    throw(OurCppRunException("thrown by throwCppException(...)"));
+  throw(OurCppRunException("thrown by throwCppException(...)"));
 }
 
 typedef void (*OurExceptionThrowFunctType) (int32_t typeToThrow);
 
 /// This is a test harness which runs test by executing generated 
-/// function with a type info type to throw. Harness wraps the excecution 
+/// function with a type info type to throw. Harness wraps the execution
 /// of generated function in a C++ try catch clause.
 /// @param engine execution engine to use for executing generated function.
 ///        This demo program expects this to be a JIT instance for demo
@@ -1598,45 +1586,44 @@ typedef void (*OurExceptionThrowFunctType) (int32_t typeToThrow);
 /// @param typeToThrow type info type of generated exception to throw, or
 ///        indicator to cause foreign exception to be thrown.
 static
-void runExceptionThrow(llvm::ExecutionEngine* engine, 
-                       llvm::Function* function, 
+void runExceptionThrow(llvm::ExecutionEngine *engine, 
+                       llvm::Function *function, 
                        int32_t typeToThrow) {
-
-    // Find test's function pointer
-    OurExceptionThrowFunctType functPtr = 
-          reinterpret_cast<OurExceptionThrowFunctType>(
-              reinterpret_cast<intptr_t>(
-                  engine->getPointerToFunction(function)
-              )
-          );
-
-    try {
-        // Run test
-        (*functPtr)(typeToThrow);
-    }
-    catch (OurCppRunException exc) {
-        // Catch foreign C++ exception
-        fprintf(stderr,
-                "\nrunExceptionThrow(...):In C++ catch OurCppRunException "
-                    "with reason: %s.\n", 
-                exc.what());
-    }
-    catch (...) {
-        // Catch all exceptions including our generated ones. I'm not sure
-        // why this latter functionality should work, as it seems that
-        // our exceptions should be foreign to C++ (the _Unwind_Exception::
-        // exception_class should be different from the one used by C++), and
-        // therefore C++ should ignore the generated exceptions. 
-
-        fprintf(stderr,
-                "\nrunExceptionThrow(...):In C++ catch all.\n");
-    }
+  
+  // Find test's function pointer
+  OurExceptionThrowFunctType functPtr = 
+    reinterpret_cast<OurExceptionThrowFunctType>(
+       reinterpret_cast<intptr_t>(engine->getPointerToFunction(function)));
+  
+  try {
+    // Run test
+    (*functPtr)(typeToThrow);
+  }
+  catch (OurCppRunException exc) {
+    // Catch foreign C++ exception
+    fprintf(stderr,
+            "\nrunExceptionThrow(...):In C++ catch OurCppRunException "
+            "with reason: %s.\n", 
+            exc.what());
+  }
+  catch (...) {
+    // Catch all exceptions including our generated ones. I'm not sure
+    // why this latter functionality should work, as it seems that
+    // our exceptions should be foreign to C++ (the _Unwind_Exception::
+    // exception_class should be different from the one used by C++), and
+    // therefore C++ should ignore the generated exceptions. 
+    
+    fprintf(stderr,
+            "\nrunExceptionThrow(...):In C++ catch all.\n");
+  }
 }
 
 //
 // End test functions
 //
 
+typedef llvm::ArrayRef<const llvm::Type*> TypeArray;
+
 /// This initialization routine creates type info globals and 
 /// adds external function declarations to module.
 /// @param numTypeInfos number of linear type info associated type info types
@@ -1644,287 +1631,285 @@ void runExceptionThrow(llvm::ExecutionEngine* engine,
 /// @param module code for module instance
 /// @param builder builder instance
 static void createStandardUtilityFunctions(unsigned numTypeInfos,
-                                           llvm::Module& module, 
-                                           llvm::IRBuilder<>& builder) {
-
-    llvm::LLVMContext& context = module.getContext();
-
-    // Exception initializations
-
-    // Setup exception catch state
-    ourExceptionNotThrownState = 
-                    llvm::ConstantInt::get(llvm::Type::getInt8Ty(context), 0),
-    ourExceptionThrownState = 
-                    llvm::ConstantInt::get(llvm::Type::getInt8Ty(context), 1),
-    ourExceptionCaughtState = 
-                    llvm::ConstantInt::get(llvm::Type::getInt8Ty(context), 2),
-
-
-    // Create our type info type
-    ourTypeInfoType = llvm::StructType::get(context, 
-                                            builder.getInt32Ty(), 
-                                            NULL);
-
-    // Create OurException type
-    ourExceptionType = llvm::StructType::get(context, 
-                                             ourTypeInfoType,
-                                             NULL);
-
-    // Create portion of _Unwind_Exception type
-    //
-    // Note: Declaring only a portion of the _Unwind_Exception struct.
-    //       Does this cause problems?
-    ourUnwindExceptionType = llvm::StructType::get(context, 
-                                                   builder.getInt64Ty(),
-                                                   NULL);
-    struct OurBaseException_t dummyException;
-
-    // Calculate offset of OurException::unwindException member.
-    ourBaseFromUnwindOffset = ((uintptr_t) &dummyException) - 
-                           ((uintptr_t) &(dummyException.unwindException));
-
+                                           llvm::Module &module, 
+                                           llvm::IRBuilder<> &builder) {
+  
+  llvm::LLVMContext &context = module.getContext();
+  
+  // Exception initializations
+  
+  // Setup exception catch state
+  ourExceptionNotThrownState = 
+  llvm::ConstantInt::get(llvm::Type::getInt8Ty(context), 0),
+  ourExceptionThrownState = 
+  llvm::ConstantInt::get(llvm::Type::getInt8Ty(context), 1),
+  ourExceptionCaughtState = 
+  llvm::ConstantInt::get(llvm::Type::getInt8Ty(context), 2),
+  
+  
+  
+  // Create our type info type
+  ourTypeInfoType = llvm::StructType::get(context, 
+                                          TypeArray(builder.getInt32Ty()));
+  
+  // Create OurException type
+  ourExceptionType = llvm::StructType::get(context, 
+                                           TypeArray(ourTypeInfoType));
+  
+  // Create portion of _Unwind_Exception type
+  //
+  // Note: Declaring only a portion of the _Unwind_Exception struct.
+  //       Does this cause problems?
+  ourUnwindExceptionType =
+    llvm::StructType::get(context, TypeArray(builder.getInt64Ty()));
+  struct OurBaseException_t dummyException;
+  
+  // Calculate offset of OurException::unwindException member.
+  ourBaseFromUnwindOffset = ((uintptr_t) &dummyException) - 
+  ((uintptr_t) &(dummyException.unwindException));
+  
 #ifdef DEBUG
-    fprintf(stderr,
-            "createStandardUtilityFunctions(...):ourBaseFromUnwindOffset "
-                "= %lld, sizeof(struct OurBaseException_t) - "
-                "sizeof(struct _Unwind_Exception) = %lu.\n",
-            ourBaseFromUnwindOffset,
-            sizeof(struct OurBaseException_t) - 
-                sizeof(struct _Unwind_Exception));
+  fprintf(stderr,
+          "createStandardUtilityFunctions(...):ourBaseFromUnwindOffset "
+          "= %lld, sizeof(struct OurBaseException_t) - "
+          "sizeof(struct _Unwind_Exception) = %lu.\n",
+          ourBaseFromUnwindOffset,
+          sizeof(struct OurBaseException_t) - 
+          sizeof(struct _Unwind_Exception));
 #endif
-
-    size_t numChars = sizeof(ourBaseExcpClassChars) / sizeof(char);
-
-    // Create our _Unwind_Exception::exception_class value
-    ourBaseExceptionClass = genClass(ourBaseExcpClassChars, numChars);
-
-    // Type infos
-
-    std::string baseStr = "typeInfo", typeInfoName;
-    std::ostringstream typeInfoNameBuilder;
-    std::vector<llvm::Constant*> structVals;
+  
+  size_t numChars = sizeof(ourBaseExcpClassChars) / sizeof(char);
+  
+  // Create our _Unwind_Exception::exception_class value
+  ourBaseExceptionClass = genClass(ourBaseExcpClassChars, numChars);
+  
+  // Type infos
+  
+  std::string baseStr = "typeInfo", typeInfoName;
+  std::ostringstream typeInfoNameBuilder;
+  std::vector<llvm::Constant*> structVals;
+  
+  llvm::Constant *nextStruct;
+  llvm::GlobalVariable *nextGlobal = NULL;
+  
+  // Generate each type info
+  //
+  // Note: First type info is not used.
+  for (unsigned i = 0; i <= numTypeInfos; ++i) {
+    structVals.clear();
+    structVals.push_back(llvm::ConstantInt::get(builder.getInt32Ty(), i));
+    nextStruct = llvm::ConstantStruct::get(ourTypeInfoType, structVals);
     
-    llvm::Constant *nextStruct;
-    llvm::GlobalVariable* nextGlobal = NULL;
-
-    // Generate each type info
-    //
-    // Note: First type info is not used.
-    for (unsigned i = 0; i <= numTypeInfos; ++i) {
-        structVals.clear();
-        structVals.push_back(llvm::ConstantInt::get(builder.getInt32Ty(), i));
-        nextStruct = llvm::ConstantStruct::get(ourTypeInfoType, structVals);
-
-        typeInfoNameBuilder.str("");
-        typeInfoNameBuilder << baseStr << i;
-        typeInfoName = typeInfoNameBuilder.str();
-
-        // Note: Does not seem to work without allocation
-        nextGlobal = 
-            new llvm::GlobalVariable(module, 
-                                     ourTypeInfoType, 
-                                     true, 
-                                     llvm::GlobalValue::ExternalLinkage, 
-                                     nextStruct, 
-                                     typeInfoName);
-
-        ourTypeInfoNames.push_back(typeInfoName);
-        ourTypeInfoNamesIndex[i] = typeInfoName;
-    }
-
-    ArgNames argNames;
-    ArgTypes argTypes;
-    llvm::Function* funct = NULL;
-
-    // print32Int
-
-    const llvm::Type* retType = builder.getVoidTy();
-
-    argTypes.clear();
-    argTypes.push_back(builder.getInt32Ty());
-    argTypes.push_back(builder.getInt8Ty()->getPointerTo());
-
-    argNames.clear();
-
-    createFunction(module, 
-                   retType, 
-                   argTypes, 
-                   argNames, 
-                   "print32Int", 
-                   llvm::Function::ExternalLinkage, 
-                   true, 
-                   false);
-
-    // print64Int
-
-    retType = builder.getVoidTy();
-
-    argTypes.clear();
-    argTypes.push_back(builder.getInt64Ty());
-    argTypes.push_back(builder.getInt8Ty()->getPointerTo());
-
-    argNames.clear();
-
-    createFunction(module, 
-                   retType, 
-                   argTypes, 
-                   argNames, 
-                   "print64Int", 
-                   llvm::Function::ExternalLinkage, 
-                   true, 
-                   false);
-
-    // printStr
-
-    retType = builder.getVoidTy();
-
-    argTypes.clear();
-    argTypes.push_back(builder.getInt8Ty()->getPointerTo());
-
-    argNames.clear();
-
-    createFunction(module, 
-                   retType, 
-                   argTypes, 
-                   argNames, 
-                   "printStr", 
-                   llvm::Function::ExternalLinkage, 
-                   true, 
-                   false);
-
-    // throwCppException
-
-    retType = builder.getVoidTy();
-
-    argTypes.clear();
-    argTypes.push_back(builder.getInt32Ty());
-
-    argNames.clear();
-
-    createFunction(module, 
-                   retType, 
-                   argTypes, 
-                   argNames, 
-                   "throwCppException", 
-                   llvm::Function::ExternalLinkage, 
-                   true, 
-                   false);
-
-    // deleteOurException
-
-    retType = builder.getVoidTy();
-
-    argTypes.clear();
-    argTypes.push_back(builder.getInt8Ty()->getPointerTo());
-
-    argNames.clear();
-
-    createFunction(module, 
-                   retType, 
-                   argTypes, 
-                   argNames, 
-                   "deleteOurException", 
-                   llvm::Function::ExternalLinkage, 
-                   true, 
-                   false);
-
-    // createOurException
-
-    retType = builder.getInt8Ty()->getPointerTo();
-
-    argTypes.clear();
-    argTypes.push_back(builder.getInt32Ty());
-
-    argNames.clear();
-
-    createFunction(module, 
-                   retType, 
-                   argTypes, 
-                   argNames, 
-                   "createOurException", 
-                   llvm::Function::ExternalLinkage, 
-                   true, 
-                   false);
-
-    // _Unwind_RaiseException
-
-    retType = builder.getInt32Ty();
-
-    argTypes.clear();
-    argTypes.push_back(builder.getInt8Ty()->getPointerTo());
-
-    argNames.clear();
-
-    funct = createFunction(module, 
-                           retType, 
-                           argTypes, 
-                           argNames, 
-                           "_Unwind_RaiseException", 
-                           llvm::Function::ExternalLinkage, 
-                           true, 
-                           false);
-
-    funct->addFnAttr(llvm::Attribute::NoReturn);
-
-    // _Unwind_Resume
-
-    retType = builder.getInt32Ty();
-
-    argTypes.clear();
-    argTypes.push_back(builder.getInt8Ty()->getPointerTo());
-
-    argNames.clear();
-
-    funct = createFunction(module, 
-                           retType, 
-                           argTypes, 
-                           argNames, 
-                           "_Unwind_Resume", 
-                           llvm::Function::ExternalLinkage, 
-                           true, 
-                           false);
-
-    funct->addFnAttr(llvm::Attribute::NoReturn);
-
-    // ourPersonality
-
-    retType = builder.getInt32Ty();
-
-    argTypes.clear();
-    argTypes.push_back(builder.getInt32Ty());
-    argTypes.push_back(builder.getInt32Ty());
-    argTypes.push_back(builder.getInt64Ty());
-    argTypes.push_back(builder.getInt8Ty()->getPointerTo());
-    argTypes.push_back(builder.getInt8Ty()->getPointerTo());
-
-    argNames.clear();
-
-    createFunction(module, 
-                   retType, 
-                   argTypes, 
-                   argNames, 
-                   "ourPersonality", 
-                   llvm::Function::ExternalLinkage, 
-                   true, 
-                   false);
-
-    // llvm.eh.selector intrinsic
-
-    getDeclaration(&module, llvm::Intrinsic::eh_selector);
-
-    // llvm.eh.exception intrinsic
-
-    getDeclaration(&module, llvm::Intrinsic::eh_exception);
-
-    // llvm.eh.typeid.for intrinsic
-
-    getDeclaration(&module, llvm::Intrinsic::eh_typeid_for);
+    typeInfoNameBuilder.str("");
+    typeInfoNameBuilder << baseStr << i;
+    typeInfoName = typeInfoNameBuilder.str();
+    
+    // Note: Does not seem to work without allocation
+    nextGlobal = 
+    new llvm::GlobalVariable(module, 
+                             ourTypeInfoType, 
+                             true, 
+                             llvm::GlobalValue::ExternalLinkage, 
+                             nextStruct, 
+                             typeInfoName);
+    
+    ourTypeInfoNames.push_back(typeInfoName);
+    ourTypeInfoNamesIndex[i] = typeInfoName;
+  }
+  
+  ArgNames argNames;
+  ArgTypes argTypes;
+  llvm::Function *funct = NULL;
+  
+  // print32Int
+  
+  const llvm::Type *retType = builder.getVoidTy();
+  
+  argTypes.clear();
+  argTypes.push_back(builder.getInt32Ty());
+  argTypes.push_back(builder.getInt8Ty()->getPointerTo());
+  
+  argNames.clear();
+  
+  createFunction(module, 
+                 retType, 
+                 argTypes, 
+                 argNames, 
+                 "print32Int", 
+                 llvm::Function::ExternalLinkage, 
+                 true, 
+                 false);
+  
+  // print64Int
+  
+  retType = builder.getVoidTy();
+  
+  argTypes.clear();
+  argTypes.push_back(builder.getInt64Ty());
+  argTypes.push_back(builder.getInt8Ty()->getPointerTo());
+  
+  argNames.clear();
+  
+  createFunction(module, 
+                 retType, 
+                 argTypes, 
+                 argNames, 
+                 "print64Int", 
+                 llvm::Function::ExternalLinkage, 
+                 true, 
+                 false);
+  
+  // printStr
+  
+  retType = builder.getVoidTy();
+  
+  argTypes.clear();
+  argTypes.push_back(builder.getInt8Ty()->getPointerTo());
+  
+  argNames.clear();
+  
+  createFunction(module, 
+                 retType, 
+                 argTypes, 
+                 argNames, 
+                 "printStr", 
+                 llvm::Function::ExternalLinkage, 
+                 true, 
+                 false);
+  
+  // throwCppException
+  
+  retType = builder.getVoidTy();
+  
+  argTypes.clear();
+  argTypes.push_back(builder.getInt32Ty());
+  
+  argNames.clear();
+  
+  createFunction(module, 
+                 retType, 
+                 argTypes, 
+                 argNames, 
+                 "throwCppException", 
+                 llvm::Function::ExternalLinkage, 
+                 true, 
+                 false);
+  
+  // deleteOurException
+  
+  retType = builder.getVoidTy();
+  
+  argTypes.clear();
+  argTypes.push_back(builder.getInt8Ty()->getPointerTo());
+  
+  argNames.clear();
+  
+  createFunction(module, 
+                 retType, 
+                 argTypes, 
+                 argNames, 
+                 "deleteOurException", 
+                 llvm::Function::ExternalLinkage, 
+                 true, 
+                 false);
+  
+  // createOurException
+  
+  retType = builder.getInt8Ty()->getPointerTo();
+  
+  argTypes.clear();
+  argTypes.push_back(builder.getInt32Ty());
+  
+  argNames.clear();
+  
+  createFunction(module, 
+                 retType, 
+                 argTypes, 
+                 argNames, 
+                 "createOurException", 
+                 llvm::Function::ExternalLinkage, 
+                 true, 
+                 false);
+  
+  // _Unwind_RaiseException
+  
+  retType = builder.getInt32Ty();
+  
+  argTypes.clear();
+  argTypes.push_back(builder.getInt8Ty()->getPointerTo());
+  
+  argNames.clear();
+  
+  funct = createFunction(module, 
+                         retType, 
+                         argTypes, 
+                         argNames, 
+                         "_Unwind_RaiseException", 
+                         llvm::Function::ExternalLinkage, 
+                         true, 
+                         false);
+  
+  funct->addFnAttr(llvm::Attribute::NoReturn);
+  
+  // _Unwind_Resume
+  
+  retType = builder.getInt32Ty();
+  
+  argTypes.clear();
+  argTypes.push_back(builder.getInt8Ty()->getPointerTo());
+  
+  argNames.clear();
+  
+  funct = createFunction(module, 
+                         retType, 
+                         argTypes, 
+                         argNames, 
+                         "_Unwind_Resume", 
+                         llvm::Function::ExternalLinkage, 
+                         true, 
+                         false);
+  
+  funct->addFnAttr(llvm::Attribute::NoReturn);
+  
+  // ourPersonality
+  
+  retType = builder.getInt32Ty();
+  
+  argTypes.clear();
+  argTypes.push_back(builder.getInt32Ty());
+  argTypes.push_back(builder.getInt32Ty());
+  argTypes.push_back(builder.getInt64Ty());
+  argTypes.push_back(builder.getInt8Ty()->getPointerTo());
+  argTypes.push_back(builder.getInt8Ty()->getPointerTo());
+  
+  argNames.clear();
+  
+  createFunction(module, 
+                 retType, 
+                 argTypes, 
+                 argNames, 
+                 "ourPersonality", 
+                 llvm::Function::ExternalLinkage, 
+                 true, 
+                 false);
+  
+  // llvm.eh.selector intrinsic
+  
+  getDeclaration(&module, llvm::Intrinsic::eh_selector);
+  
+  // llvm.eh.exception intrinsic
+  
+  getDeclaration(&module, llvm::Intrinsic::eh_exception);
+  
+  // llvm.eh.typeid.for intrinsic
+  
+  getDeclaration(&module, llvm::Intrinsic::eh_typeid_for);
 }
 
 
-//===---------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
 // Main test driver code.
-//===---------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
 
 /// Demo main routine which takes the type info types to throw. A test will
 /// be run for each given type info type. While type info types with the value 
@@ -1932,99 +1917,99 @@ static void createStandardUtilityFunctions(unsigned numTypeInfos,
 /// <= 6 and >= 1 will be caught by test functions; and type info types > 6
 /// will result in exceptions which pass through to the test harness. All other
 /// type info types are not supported and could cause a crash.
-int main(int argc, char* argv[]) {
-    if (argc == 1) {
-        fprintf(stderr,
-                "\nUsage: ExceptionDemo <exception type to throw> "
-                    "[<type 2>...<type n>].\n"
-                    "   Each type must have the value of 1 - 6 for "
-                    "generated exceptions to be caught;\n"
-                    "   the value -1 for foreign C++ exceptions to be "
-                    "generated and thrown;\n"
-                    "   or the values > 6 for exceptions to be ignored.\n"
-                    "\nTry: ExceptionDemo 2 3 7 -1\n"
-                    "   for a full test.\n\n");
-        return(0);
-    }
-
-    // If not set, exception handling will not be turned on
-    llvm::JITExceptionHandling = true;
-
-    llvm::InitializeNativeTarget();
-    llvm::LLVMContext& context = llvm::getGlobalContext();
-    llvm::IRBuilder<> theBuilder(context);
-
-    // Make the module, which holds all the code.
-    llvm::Module* module = new llvm::Module("my cool jit", context);
-
-    // Build engine with JIT
-    llvm::EngineBuilder factory(module);
-    factory.setEngineKind(llvm::EngineKind::JIT);
-    factory.setAllocateGVsWithCode(false);
-    llvm::ExecutionEngine* executionEngine = factory.create();
-
-    {
-        llvm::FunctionPassManager fpm(module);
-
-        // Set up the optimizer pipeline.  
-        // Start with registering info about how the
-        // target lays out data structures.
-        fpm.add(new llvm::TargetData(*executionEngine->getTargetData()));
-
-        // Optimizations turned on
+int main(int argc, char *argv[]) {
+  if (argc == 1) {
+    fprintf(stderr,
+            "\nUsage: ExceptionDemo <exception type to throw> "
+            "[<type 2>...<type n>].\n"
+            "   Each type must have the value of 1 - 6 for "
+            "generated exceptions to be caught;\n"
+            "   the value -1 for foreign C++ exceptions to be "
+            "generated and thrown;\n"
+            "   or the values > 6 for exceptions to be ignored.\n"
+            "\nTry: ExceptionDemo 2 3 7 -1\n"
+            "   for a full test.\n\n");
+    return(0);
+  }
+  
+  // If not set, exception handling will not be turned on
+  llvm::JITExceptionHandling = true;
+  
+  llvm::InitializeNativeTarget();
+  llvm::LLVMContext &context = llvm::getGlobalContext();
+  llvm::IRBuilder<> theBuilder(context);
+  
+  // Make the module, which holds all the code.
+  llvm::Module *module = new llvm::Module("my cool jit", context);
+  
+  // Build engine with JIT
+  llvm::EngineBuilder factory(module);
+  factory.setEngineKind(llvm::EngineKind::JIT);
+  factory.setAllocateGVsWithCode(false);
+  llvm::ExecutionEngine *executionEngine = factory.create();
+  
+  {
+    llvm::FunctionPassManager fpm(module);
+    
+    // Set up the optimizer pipeline.  
+    // Start with registering info about how the
+    // target lays out data structures.
+    fpm.add(new llvm::TargetData(*executionEngine->getTargetData()));
+    
+    // Optimizations turned on
 #ifdef ADD_OPT_PASSES
-
-        // Basic AliasAnslysis support for GVN.
-        fpm.add(llvm::createBasicAliasAnalysisPass());
-
-        // Promote allocas to registers.
-        fpm.add(llvm::createPromoteMemoryToRegisterPass());
-
-        // Do simple "peephole" optimizations and bit-twiddling optzns.
-        fpm.add(llvm::createInstructionCombiningPass());
-
-        // Reassociate expressions.
-        fpm.add(llvm::createReassociatePass());
-
-        // Eliminate Common SubExpressions.
-        fpm.add(llvm::createGVNPass());
-
-        // Simplify the control flow graph (deleting unreachable 
-        // blocks, etc).
-        fpm.add(llvm::createCFGSimplificationPass());
+    
+    // Basic AliasAnslysis support for GVN.
+    fpm.add(llvm::createBasicAliasAnalysisPass());
+    
+    // Promote allocas to registers.
+    fpm.add(llvm::createPromoteMemoryToRegisterPass());
+    
+    // Do simple "peephole" optimizations and bit-twiddling optzns.
+    fpm.add(llvm::createInstructionCombiningPass());
+    
+    // Reassociate expressions.
+    fpm.add(llvm::createReassociatePass());
+    
+    // Eliminate Common SubExpressions.
+    fpm.add(llvm::createGVNPass());
+    
+    // Simplify the control flow graph (deleting unreachable 
+    // blocks, etc).
+    fpm.add(llvm::createCFGSimplificationPass());
 #endif  // ADD_OPT_PASSES
-
-        fpm.doInitialization();
-
-        // Generate test code using function throwCppException(...) as
-        // the function which throws foreign exceptions.
-        llvm::Function* toRun = 
-                          createUnwindExceptionTest(*module, 
-                                                    theBuilder, 
-                                                    fpm,
-                                                    "throwCppException");
-
-        fprintf(stderr, "\nBegin module dump:\n\n");
-
-        module->dump();
-
-        fprintf(stderr, "\nEnd module dump:\n");
-
-        fprintf(stderr, "\n\nBegin Test:\n");
-
-        for (int i = 1; i < argc; ++i) {
-            // Run test for each argument whose value is the exception
-            // type to throw.
-            runExceptionThrow(executionEngine, 
-                              toRun, 
-                              (unsigned) strtoul(argv[i], NULL, 10));
-        }
-
-        fprintf(stderr, "\nEnd Test:\n\n");
-    } 
-
-    delete executionEngine;
+    
+    fpm.doInitialization();
+    
+    // Generate test code using function throwCppException(...) as
+    // the function which throws foreign exceptions.
+    llvm::Function *toRun = 
+    createUnwindExceptionTest(*module, 
+                              theBuilder, 
+                              fpm,
+                              "throwCppException");
+    
+    fprintf(stderr, "\nBegin module dump:\n\n");
+    
+    module->dump();
+    
+    fprintf(stderr, "\nEnd module dump:\n");
+    
+    fprintf(stderr, "\n\nBegin Test:\n");
+    
+    for (int i = 1; i < argc; ++i) {
+      // Run test for each argument whose value is the exception
+      // type to throw.
+      runExceptionThrow(executionEngine, 
+                        toRun, 
+                        (unsigned) strtoul(argv[i], NULL, 10));
+    }
+    
+    fprintf(stderr, "\nEnd Test:\n\n");
+  } 
+  
+  delete executionEngine;
   
-    return 0;
+  return 0;
 }
 
diff --git a/examples/HowToUseJIT/HowToUseJIT.cpp b/examples/HowToUseJIT/HowToUseJIT.cpp
index 8e3b6dc..2fb2b5e 100644
--- a/examples/HowToUseJIT/HowToUseJIT.cpp
+++ b/examples/HowToUseJIT/HowToUseJIT.cpp
@@ -45,6 +45,8 @@
 #include "llvm/Target/TargetSelect.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/IRBuilder.h"
+
 using namespace llvm;
 
 int main() {
@@ -68,8 +70,12 @@ int main() {
   // because of the last argument.
   BasicBlock *BB = BasicBlock::Create(Context, "EntryBlock", Add1F);
 
+  // Create a basic block builder with default parameters.  The builder will
+  // automatically append instructions to the basic block `BB'.
+  IRBuilder<> builder(BB);
+
   // Get pointers to the constant `1'.
-  Value *One = ConstantInt::get(Type::getInt32Ty(Context), 1);
+  Value *One = builder.getInt32(1);
 
   // Get pointers to the integer argument of the add1 function...
   assert(Add1F->arg_begin() != Add1F->arg_end()); // Make sure there's an arg
@@ -77,15 +83,15 @@ int main() {
   ArgX->setName("AnArg");            // Give it a nice symbolic name for fun.
 
   // Create the add instruction, inserting it into the end of BB.
-  Instruction *Add = BinaryOperator::CreateAdd(One, ArgX, "addresult", BB);
+  Value *Add = builder.CreateAdd(One, ArgX);
 
   // Create the return instruction and add it to the basic block
-  ReturnInst::Create(Context, Add, BB);
+  builder.CreateRet(Add);
 
   // Now, function add1 is ready.
 
 
-  // Now we going to create function `foo', which returns an int and takes no
+  // Now we're going to create function `foo', which returns an int and takes no
   // arguments.
   Function *FooF =
     cast<Function>(M->getOrInsertFunction("foo", Type::getInt32Ty(Context),
@@ -94,15 +100,18 @@ int main() {
   // Add a basic block to the FooF function.
   BB = BasicBlock::Create(Context, "EntryBlock", FooF);
 
-  // Get pointers to the constant `10'.
-  Value *Ten = ConstantInt::get(Type::getInt32Ty(Context), 10);
+  // Tell the basic block builder to attach itself to the new basic block
+  builder.SetInsertPoint(BB);
+
+  // Get pointer to the constant `10'.
+  Value *Ten = builder.getInt32(10);
 
-  // Pass Ten to the call call:
-  CallInst *Add1CallRes = CallInst::Create(Add1F, Ten, "add1", BB);
+  // Pass Ten to the call to Add1F
+  CallInst *Add1CallRes = builder.CreateCall(Add1F, Ten);
   Add1CallRes->setTailCall(true);
 
   // Create the return instruction and add it to the basic block.
-  ReturnInst::Create(Context, Add1CallRes, BB);
+  builder.CreateRet(Add1CallRes);
 
   // Now we create the JIT.
   ExecutionEngine* EE = EngineBuilder(M).create();
diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h
index 39c3cb4..2eccc11 100644
--- a/include/llvm-c/Core.h
+++ b/include/llvm-c/Core.h
@@ -282,6 +282,8 @@ typedef enum {
   LLVMRealPredicateTrue   /**< Always true (always folded) */
 } LLVMRealPredicate;
 
+void LLVMInitializeCore(LLVMPassRegistryRef R);
+
 
 /*===-- Error handling ----------------------------------------------------===*/
 
@@ -1164,6 +1166,7 @@ namespace llvm {
     for (LLVMValueRef *I = Vals, *E = Vals + Length; I != E; ++I)
       cast<T>(*I);
     #endif
+    (void)Length;
     return reinterpret_cast<T**>(Vals);
   }
   
diff --git a/include/llvm-c/Disassembler.h b/include/llvm-c/Disassembler.h
index 63ed9df..3a3eb23 100644
--- a/include/llvm-c/Disassembler.h
+++ b/include/llvm-c/Disassembler.h
@@ -7,16 +7,16 @@
 |*                                                                            *|
 |*===----------------------------------------------------------------------===*|
 |*                                                                            *|
-|* This header provides public interface to a disassembler library.           *|
+|* This header provides a public interface to a disassembler library.         *|
 |* LLVM provides an implementation of this interface.                         *|
 |*                                                                            *|
 \*===----------------------------------------------------------------------===*/
 
 #ifndef LLVM_C_DISASSEMBLER_H
-#define LLVM_C_DISASSEMBLER_H  1
+#define LLVM_C_DISASSEMBLER_H
 
-#include <stddef.h>
 #include "llvm/Support/DataTypes.h"
+#include <stddef.h>
 
 /**
  * An opaque reference to a disassembler context.
@@ -31,29 +31,70 @@ typedef void *LLVMDisasmContextRef;
  * the call back in the DisInfo parameter.  The instruction containing operand
  * is at the PC parameter.  For some instruction sets, there can be more than
  * one operand with symbolic information.  To determine the symbolic operand
- * infomation for each operand, the bytes for the specific operand in the
+ * information for each operand, the bytes for the specific operand in the
  * instruction are specified by the Offset parameter and its byte widith is the
  * size parameter.  For instructions sets with fixed widths and one symbolic
  * operand per instruction, the Offset parameter will be zero and Size parameter
  * will be the instruction width.  The information is returned in TagBuf and is 
  * Triple specific with its specific information defined by the value of
  * TagType for that Triple.  If symbolic information is returned the function
- * returns 1 else it returns 0.
+ * returns 1, otherwise it returns 0.
+ */
+typedef int (*LLVMOpInfoCallback)(void *DisInfo, uint64_t PC,
+                                  uint64_t Offset, uint64_t Size,
+                                  int TagType, void *TagBuf);
+
+/**
+ * The initial support in LLVM MC for the most general form of a relocatable
+ * expression is "AddSymbol - SubtractSymbol + Offset".  For some Darwin targets
+ * this full form is encoded in the relocation information so that AddSymbol and
+ * SubtractSymbol can be link edited independent of each other.  Many other
+ * platforms only allow a relocatable expression of the form AddSymbol + Offset
+ * to be encoded.
+ * 
+ * The LLVMOpInfoCallback() for the TagType value of 1 uses the struct
+ * LLVMOpInfo1.  The value of the relocatable expression for the operand,
+ * including any PC adjustment, is passed in to the call back in the Value
+ * field.  The symbolic information about the operand is returned using all
+ * the fields of the structure with the Offset of the relocatable expression
+ * returned in the Value field.  It is possible that some symbols in the
+ * relocatable expression were assembly temporary symbols, for example
+ * "Ldata - LpicBase + constant", and only the Values of the symbols without
+ * symbol names are present in the relocation information.  The VariantKind
+ * type is one of the Target specific #defines below and is used to print
+ * operands like "_foo@GOT", ":lower16:_foo", etc.
+ */
+struct LLVMOpInfoSymbol1 {
+  uint64_t Present;  /* 1 if this symbol is present */
+  char *Name;        /* symbol name if not NULL */
+  uint64_t Value;    /* symbol value if name is NULL */
+};
+
+struct LLVMOpInfo1 {
+  struct LLVMOpInfoSymbol1 AddSymbol;
+  struct LLVMOpInfoSymbol1 SubtractSymbol;
+  uint64_t Value;
+  uint64_t VariantKind;
+};
+
+/**
+ * The operand VariantKinds for symbolic disassembly.
+ */
+#define LLVMDisassembler_VariantKind_None 0 /* all targets */
+
+/**
+ * The ARM target VariantKinds.
  */
-typedef int (*LLVMOpInfoCallback)(void *DisInfo,
-                                  uint64_t PC,
-                                  uint64_t Offset,
-                                  uint64_t Size,
-                                  int TagType,
-                                  void *TagBuf);
+#define LLVMDisassembler_VariantKind_ARM_HI16 1 /* :upper16: */
+#define LLVMDisassembler_VariantKind_ARM_LO16 2 /* :lower16: */
 
 /**
  * The type for the symbol lookup function.  This may be called by the
- * disassembler for such things like adding a comment for a PC plus a constant
+ * disassembler for things like adding a comment for a PC plus a constant
  * offset load instruction to use a symbol name instead of a load address value.
  * It is passed the block information is saved when the disassembler context is
  * created and a value of a symbol to look up.  If no symbol is found NULL is
- * to be returned.
+ * returned.
  */
 typedef const char *(*LLVMSymbolLookupCallback)(void *DisInfo,
                                                 uint64_t SymbolValue);
@@ -64,40 +105,33 @@ extern "C" {
 
 /**
  * Create a disassembler for the TripleName.  Symbolic disassembly is supported
- * by passing a block of information in the DisInfo parameter and specifing the
- * TagType and call back functions as described above.  These can all be passed
- * as NULL.  If successfull this returns a disassembler context if not it
+ * by passing a block of information in the DisInfo parameter and specifying the
+ * TagType and callback functions as described above.  These can all be passed
+ * as NULL.  If successful, this returns a disassembler context.  If not, it
  * returns NULL.
  */
-extern LLVMDisasmContextRef
-LLVMCreateDisasm(const char *TripleName,
-                 void *DisInfo,
-                 int TagType,
-                 LLVMOpInfoCallback GetOpInfo,
-                 LLVMSymbolLookupCallback SymbolLookUp);
+LLVMDisasmContextRef LLVMCreateDisasm(const char *TripleName, void *DisInfo,
+                                      int TagType, LLVMOpInfoCallback GetOpInfo,
+                                      LLVMSymbolLookupCallback SymbolLookUp);
 
 /**
  * Dispose of a disassembler context.
  */
-extern void
-LLVMDisasmDispose(LLVMDisasmContextRef DC);
+void LLVMDisasmDispose(LLVMDisasmContextRef DC);
 
 /**
- * Disassmble a single instruction using the disassembler context specified in
- * the parameter DC.  The bytes of the instuction are specified in the parameter
- * Bytes, and contains at least BytesSize number of bytes.  The instruction is
- * at the address specified by the PC parameter.  If a valid instruction can be
- * disassembled its string is returned indirectly in OutString which whos size
- * is specified in the parameter OutStringSize.  This function returns the
- * number of bytes in the instruction or zero if there was no valid instruction.
+ * Disassemble a single instruction using the disassembler context specified in
+ * the parameter DC.  The bytes of the instruction are specified in the
+ * parameter Bytes, and contains at least BytesSize number of bytes.  The
+ * instruction is at the address specified by the PC parameter.  If a valid
+ * instruction can be disassembled, its string is returned indirectly in
+ * OutString whose size is specified in the parameter OutStringSize.  This
+ * function returns the number of bytes in the instruction or zero if there was
+ * no valid instruction.
  */
-extern size_t
-LLVMDisasmInstruction(LLVMDisasmContextRef DC,
-                      uint8_t *Bytes,
-                      uint64_t BytesSize,
-                      uint64_t PC,
-                      char *OutString,
-                      size_t OutStringSize);
+size_t LLVMDisasmInstruction(LLVMDisasmContextRef DC, uint8_t *Bytes,
+                             uint64_t BytesSize, uint64_t PC,
+                             char *OutString, size_t OutStringSize);
 
 #ifdef __cplusplus
 }
diff --git a/include/llvm-c/Transforms/Scalar.h b/include/llvm-c/Transforms/Scalar.h
index da05a7d..cf8d71f 100644
--- a/include/llvm-c/Transforms/Scalar.h
+++ b/include/llvm-c/Transforms/Scalar.h
@@ -107,6 +107,13 @@ void LLVMAddCorrelatedValuePropagationPass(LLVMPassManagerRef PM);
 /** See llvm::createEarlyCSEPass function */
 void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM);
 
+/** See llvm::createTypeBasedAliasAnalysisPass function */
+void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM);
+
+/** See llvm::createBasicAliasAnalysisPass function */
+void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM);
+
+
 #ifdef __cplusplus
 }
 #endif /* defined(__cplusplus) */
diff --git a/include/llvm-c/lto.h b/include/llvm-c/lto.h
index e4ede9c..7ea7ad0 100644
--- a/include/llvm-c/lto.h
+++ b/include/llvm-c/lto.h
@@ -72,7 +72,7 @@ lto_get_version(void);
 
 
 /**
- * Returns the last error string or NULL if last operation was sucessful.
+ * Returns the last error string or NULL if last operation was successful.
  */
 extern const char*
 lto_get_error_message(void);
@@ -263,7 +263,7 @@ lto_codegen_write_merged_modules(lto_code_gen_t cg, const char* path);
 
 /**
  * Generates code for all added modules into one native object file.
- * On sucess returns a pointer to a generated mach-o/ELF buffer and
+ * On success returns a pointer to a generated mach-o/ELF buffer and
  * length set to the buffer size.  The buffer is owned by the 
  * lto_code_gen_t and will be freed when lto_codegen_dispose()
  * is called, or lto_codegen_compile() is called again.
diff --git a/include/llvm/ADT/APInt.h b/include/llvm/ADT/APInt.h
index 2feef07..e68e579 100644
--- a/include/llvm/ADT/APInt.h
+++ b/include/llvm/ADT/APInt.h
@@ -1241,18 +1241,19 @@ public:
 
   /// toString - Converts an APInt to a string and append it to Str.  Str is
   /// commonly a SmallString.
-  void toString(SmallVectorImpl<char> &Str, unsigned Radix, bool Signed) const;
+  void toString(SmallVectorImpl<char> &Str, unsigned Radix, bool Signed,
+                bool formatAsCLiteral = false) const;
 
   /// Considers the APInt to be unsigned and converts it into a string in the
   /// radix given. The radix can be 2, 8, 10 or 16.
   void toStringUnsigned(SmallVectorImpl<char> &Str, unsigned Radix = 10) const {
-    toString(Str, Radix, false);
+    toString(Str, Radix, false, false);
   }
 
   /// Considers the APInt to be signed and converts it into a string in the
   /// radix given. The radix can be 2, 8, 10 or 16.
   void toStringSigned(SmallVectorImpl<char> &Str, unsigned Radix = 10) const {
-    toString(Str, Radix, true);
+    toString(Str, Radix, true, false);
   }
 
   /// toString - This returns the APInt as a std::string.  Note that this is an
diff --git a/include/llvm/ADT/ArrayRef.h b/include/llvm/ADT/ArrayRef.h
index ebddb12..97e42cb 100644
--- a/include/llvm/ADT/ArrayRef.h
+++ b/include/llvm/ADT/ArrayRef.h
@@ -22,8 +22,8 @@ namespace llvm {
   ///
   /// This class does not own the underlying data, it is expected to be used in
   /// situations where the data resides in some other buffer, whose lifetime
-  /// extends past that of the StringRef. For this reason, it is not in general
-  /// safe to store a ArrayRef.
+  /// extends past that of the ArrayRef. For this reason, it is not in general
+  /// safe to store an ArrayRef.
   ///
   /// This is intended to be trivially copyable, so it should be passed by
   /// value.
diff --git a/include/llvm/ADT/DenseMap.h b/include/llvm/ADT/DenseMap.h
index c0209d9..0f1cfeb 100644
--- a/include/llvm/ADT/DenseMap.h
+++ b/include/llvm/ADT/DenseMap.h
@@ -73,7 +73,7 @@ public:
     }
 #ifndef NDEBUG
     if (NumBuckets)
-      memset(Buckets, 0x5a, sizeof(BucketT)*NumBuckets);
+      memset((void*)Buckets, 0x5a, sizeof(BucketT)*NumBuckets);
 #endif
     operator delete(Buckets);
   }
@@ -252,7 +252,7 @@ private:
 
     if (NumBuckets) {
 #ifndef NDEBUG
-      memset(Buckets, 0x5a, sizeof(BucketT)*NumBuckets);
+      memset((void*)Buckets, 0x5a, sizeof(BucketT)*NumBuckets);
 #endif
       operator delete(Buckets);
     }
@@ -426,7 +426,7 @@ private:
 
 #ifndef NDEBUG
     if (OldNumBuckets)
-      memset(OldBuckets, 0x5a, sizeof(BucketT)*OldNumBuckets);
+      memset((void*)OldBuckets, 0x5a, sizeof(BucketT)*OldNumBuckets);
 #endif
     // Free the old table.
     operator delete(OldBuckets);
@@ -459,13 +459,22 @@ private:
     }
 
 #ifndef NDEBUG
-    memset(OldBuckets, 0x5a, sizeof(BucketT)*OldNumBuckets);
+    memset((void*)OldBuckets, 0x5a, sizeof(BucketT)*OldNumBuckets);
 #endif
     // Free the old table.
     operator delete(OldBuckets);
 
     NumEntries = 0;
   }
+  
+public:
+  /// Return the approximate size (in bytes) of the actual map.
+  /// This is just the raw memory used by DenseMap.
+  /// If entries are pointers to objects, the size of the referenced objects
+  /// are not included.
+  size_t getMemorySize() const {
+    return NumBuckets * sizeof(BucketT);
+  }
 };
 
 template<typename KeyT, typename ValueT,
diff --git a/include/llvm/ADT/DenseMapInfo.h b/include/llvm/ADT/DenseMapInfo.h
index 25e341b..744b6f4 100644
--- a/include/llvm/ADT/DenseMapInfo.h
+++ b/include/llvm/ADT/DenseMapInfo.h
@@ -157,7 +157,10 @@ struct DenseMapInfo<std::pair<T, U> > {
     key ^= (key >> 31);
     return (unsigned)key;
   }
-  static bool isEqual(const Pair& LHS, const Pair& RHS) { return LHS == RHS; }
+  static bool isEqual(const Pair &LHS, const Pair &RHS) {
+    return FirstInfo::isEqual(LHS.first, RHS.first) && 
+           SecondInfo::isEqual(LHS.second, RHS.second);
+  }
 };
 
 } // end namespace llvm
diff --git a/include/llvm/ADT/FoldingSet.h b/include/llvm/ADT/FoldingSet.h
index 879dbd0..d2e0b8f 100644
--- a/include/llvm/ADT/FoldingSet.h
+++ b/include/llvm/ADT/FoldingSet.h
@@ -209,10 +209,10 @@ template<typename T> struct FoldingSetTrait;
 /// for FoldingSetTrait implementations.
 ///
 template<typename T> struct DefaultFoldingSetTrait {
-  static void Profile(const T& X, FoldingSetNodeID& ID) {
+  static void Profile(const T &X, FoldingSetNodeID &ID) {
     X.Profile(ID);
   }
-  static void Profile(T& X, FoldingSetNodeID& ID) {
+  static void Profile(T &X, FoldingSetNodeID &ID) {
     X.Profile(ID);
   }
 
@@ -267,7 +267,7 @@ template<typename T, typename Ctx> struct ContextualFoldingSetTrait
 /// is often much larger than necessary, and the possibility of heap
 /// allocation means it requires a non-trivial destructor call.
 class FoldingSetNodeIDRef {
-  const unsigned* Data;
+  const unsigned *Data;
   size_t Size;
 public:
   FoldingSetNodeIDRef() : Data(0), Size(0) {}
@@ -310,9 +310,10 @@ public:
   void AddInteger(unsigned long long I);
   void AddBoolean(bool B) { AddInteger(B ? 1U : 0U); }
   void AddString(StringRef String);
+  void AddNodeID(const FoldingSetNodeID &ID);
 
   template <typename T>
-  inline void Add(const T& x) { FoldingSetTrait<T>::Profile(x, *this); }
+  inline void Add(const T &x) { FoldingSetTrait<T>::Profile(x, *this); }
 
   /// clear - Clear the accumulated profile, allowing this FoldingSetNodeID
   /// object to be used to compute a new profile.
@@ -548,7 +549,7 @@ public:
     return static_cast<T*>(NodePtr);
   }
 
-  inline FoldingSetIterator& operator++() {          // Preincrement
+  inline FoldingSetIterator &operator++() {          // Preincrement
     advance();
     return *this;
   }
@@ -596,10 +597,10 @@ public:
   FoldingSetBucketIterator(void **Bucket, bool) :
     FoldingSetBucketIteratorImpl(Bucket, true) {}
 
-  T& operator*() const { return *static_cast<T*>(Ptr); }
-  T* operator->() const { return static_cast<T*>(Ptr); }
+  T &operator*() const { return *static_cast<T*>(Ptr); }
+  T *operator->() const { return static_cast<T*>(Ptr); }
 
-  inline FoldingSetBucketIterator& operator++() { // Preincrement
+  inline FoldingSetBucketIterator &operator++() { // Preincrement
     advance();
     return *this;
   }
@@ -615,36 +616,36 @@ template <typename T>
 class FoldingSetNodeWrapper : public FoldingSetNode {
   T data;
 public:
-  explicit FoldingSetNodeWrapper(const T& x) : data(x) {}
+  explicit FoldingSetNodeWrapper(const T &x) : data(x) {}
   virtual ~FoldingSetNodeWrapper() {}
 
   template<typename A1>
-  explicit FoldingSetNodeWrapper(const A1& a1)
+  explicit FoldingSetNodeWrapper(const A1 &a1)
     : data(a1) {}
 
   template <typename A1, typename A2>
-  explicit FoldingSetNodeWrapper(const A1& a1, const A2& a2)
+  explicit FoldingSetNodeWrapper(const A1 &a1, const A2 &a2)
     : data(a1,a2) {}
 
   template <typename A1, typename A2, typename A3>
-  explicit FoldingSetNodeWrapper(const A1& a1, const A2& a2, const A3& a3)
+  explicit FoldingSetNodeWrapper(const A1 &a1, const A2 &a2, const A3 &a3)
     : data(a1,a2,a3) {}
 
   template <typename A1, typename A2, typename A3, typename A4>
-  explicit FoldingSetNodeWrapper(const A1& a1, const A2& a2, const A3& a3,
-                                 const A4& a4)
+  explicit FoldingSetNodeWrapper(const A1 &a1, const A2 &a2, const A3 &a3,
+                                 const A4 &a4)
     : data(a1,a2,a3,a4) {}
 
   template <typename A1, typename A2, typename A3, typename A4, typename A5>
-  explicit FoldingSetNodeWrapper(const A1& a1, const A2& a2, const A3& a3,
-                                 const A4& a4, const A5& a5)
+  explicit FoldingSetNodeWrapper(const A1 &a1, const A2 &a2, const A3 &a3,
+                                 const A4 &a4, const A5 &a5)
   : data(a1,a2,a3,a4,a5) {}
 
 
-  void Profile(FoldingSetNodeID& ID) { FoldingSetTrait<T>::Profile(data, ID); }
+  void Profile(FoldingSetNodeID &ID) { FoldingSetTrait<T>::Profile(data, ID); }
 
-  T& getValue() { return data; }
-  const T& getValue() const { return data; }
+  T &getValue() { return data; }
+  const T &getValue() const { return data; }
 
   operator T&() { return data; }
   operator const T&() const { return data; }
@@ -661,24 +662,19 @@ class FastFoldingSetNode : public FoldingSetNode {
 protected:
   explicit FastFoldingSetNode(const FoldingSetNodeID &ID) : FastID(ID) {}
 public:
-  void Profile(FoldingSetNodeID& ID) const { ID = FastID; }
+  void Profile(FoldingSetNodeID &ID) const { 
+    ID.AddNodeID(FastID); 
+  }
 };
 
 //===----------------------------------------------------------------------===//
 // Partial specializations of FoldingSetTrait.
 
 template<typename T> struct FoldingSetTrait<T*> {
-  static inline void Profile(const T* X, FoldingSetNodeID& ID) {
-    ID.AddPointer(X);
-  }
-};
-
-template<typename T> struct FoldingSetTrait<const T*> {
-  static inline void Profile(const T* X, FoldingSetNodeID& ID) {
+  static inline void Profile(T *X, FoldingSetNodeID &ID) {
     ID.AddPointer(X);
   }
 };
-
 } // End of namespace llvm.
 
 #endif
diff --git a/include/llvm/ADT/ImmutableIntervalMap.h b/include/llvm/ADT/ImmutableIntervalMap.h
index 0d8fcf3..fa7ccb9 100644
--- a/include/llvm/ADT/ImmutableIntervalMap.h
+++ b/include/llvm/ADT/ImmutableIntervalMap.h
@@ -10,6 +10,10 @@
 // This file defines the ImmutableIntervalMap class.
 //
 //===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_IMMUTABLE_INTERVAL_MAP_H
+#define LLVM_ADT_IMMUTABLE_INTERVAL_MAP_H
+
 #include "llvm/ADT/ImmutableMap.h"
 
 namespace llvm {
@@ -240,3 +244,5 @@ private:
 };
 
 } // end namespace llvm
+
+#endif
diff --git a/include/llvm/ADT/PackedVector.h b/include/llvm/ADT/PackedVector.h
new file mode 100644
index 0000000..2eaddc2
--- /dev/null
+++ b/include/llvm/ADT/PackedVector.h
@@ -0,0 +1,158 @@
+//===- llvm/ADT/PackedVector.h - Packed values vector -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PackedVector class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_PACKEDVECTOR_H
+#define LLVM_ADT_PACKEDVECTOR_H
+
+#include "llvm/ADT/BitVector.h"
+#include <limits>
+
+namespace llvm {
+
+template <typename T, unsigned BitNum, bool isSigned>
+class PackedVectorBase;
+
+// This won't be necessary if we can specialize members without specializing
+// the parent template.
+template <typename T, unsigned BitNum>
+class PackedVectorBase<T, BitNum, false> {
+protected:
+  static T getValue(const llvm::BitVector &Bits, unsigned Idx) {
+    T val = T();
+    for (unsigned i = 0; i != BitNum; ++i)
+      val = T(val | ((Bits[(Idx << (BitNum-1)) + i] ? 1UL : 0UL) << i));
+    return val;
+  }
+
+  static void setValue(llvm::BitVector &Bits, unsigned Idx, T val) {
+    assert((val >> BitNum) == 0 && "value is too big");
+    for (unsigned i = 0; i != BitNum; ++i)
+      Bits[(Idx << (BitNum-1)) + i] = val & (T(1) << i);
+  }
+};
+
+template <typename T, unsigned BitNum>
+class PackedVectorBase<T, BitNum, true> {
+protected:
+  static T getValue(const llvm::BitVector &Bits, unsigned Idx) {
+    T val = T();
+    for (unsigned i = 0; i != BitNum-1; ++i)
+      val = T(val | ((Bits[(Idx << (BitNum-1)) + i] ? 1UL : 0UL) << i));
+    if (Bits[(Idx << (BitNum-1)) + BitNum-1])
+      val = ~val;
+    return val;
+  }
+
+  static void setValue(llvm::BitVector &Bits, unsigned Idx, T val) {
+    if (val < 0) {
+      val = ~val;
+      Bits.set((Idx << (BitNum-1)) + BitNum-1);
+    }
+    assert((val >> (BitNum-1)) == 0 && "value is too big");
+    for (unsigned i = 0; i != BitNum-1; ++i)
+      Bits[(Idx << (BitNum-1)) + i] = val & (T(1) << i);
+  }
+};
+
+/// \brief Store a vector of values using a specific number of bits for each
+/// value. Both signed and unsigned types can be used, e.g
+/// @code
+///   PackedVector<signed, 2> vec;
+/// @endcode
+/// will create a vector accepting values -2, -1, 0, 1. Any other value will hit
+/// an assertion.
+template <typename T, unsigned BitNum>
+class PackedVector : public PackedVectorBase<T, BitNum,
+                                            std::numeric_limits<T>::is_signed> {
+  llvm::BitVector Bits;
+  typedef PackedVectorBase<T, BitNum, std::numeric_limits<T>::is_signed> base;
+
+public:
+  class reference {
+    PackedVector &Vec;
+    const unsigned Idx;
+
+    reference();  // Undefined    
+  public:
+    reference(PackedVector &vec, unsigned idx) : Vec(vec), Idx(idx) { }    
+
+    reference &operator=(T val) {
+      Vec.setValue(Vec.Bits, Idx, val);
+      return *this;
+    }
+    operator T() const {
+      return Vec.getValue(Vec.Bits, Idx);
+    }
+  };
+
+  PackedVector() { }
+  explicit PackedVector(unsigned size) : Bits(size << (BitNum-1)) { }
+
+  bool empty() const { return Bits.empty(); }
+
+  unsigned size() const { return Bits.size() >> (BitNum-1); }
+  
+  void clear() { Bits.clear(); }
+  
+  void resize(unsigned N) { Bits.resize(N << (BitNum-1)); }
+
+  void reserve(unsigned N) { Bits.reserve(N << (BitNum-1)); }
+
+  PackedVector &reset() {
+    Bits.reset();
+    return *this;
+  }
+
+  void push_back(T val) {
+    resize(size()+1);
+    (*this)[size()-1] = val;
+  }
+
+  reference operator[](unsigned Idx) {
+    return reference(*this, Idx);
+  }
+
+  T operator[](unsigned Idx) const {
+    return base::getValue(Bits, Idx);
+  }
+
+  bool operator==(const PackedVector &RHS) const {
+    return Bits == RHS.Bits;
+  }
+
+  bool operator!=(const PackedVector &RHS) const {
+    return Bits != RHS.Bits;
+  }
+
+  const PackedVector &operator=(const PackedVector &RHS) {
+    Bits = RHS.Bits;
+    return *this;
+  }
+
+  PackedVector &operator|=(const PackedVector &RHS) {
+    Bits |= RHS.Bits;
+    return *this;
+  }
+
+  void swap(PackedVector &RHS) {
+    Bits.swap(RHS.Bits);
+  }
+};
+
+// Leave BitNum=0 undefined. 
+template <typename T>
+class PackedVector<T, 0>;
+
+} // end llvm namespace
+
+#endif
diff --git a/include/llvm/ADT/StringExtras.h b/include/llvm/ADT/StringExtras.h
index acbed66..5f5c041 100644
--- a/include/llvm/ADT/StringExtras.h
+++ b/include/llvm/ADT/StringExtras.h
@@ -20,7 +20,6 @@
 #include <cctype>
 #include <cstdio>
 #include <string>
-#include <vector>
 
 namespace llvm {
 template<typename T> class SmallVectorImpl;
@@ -153,7 +152,7 @@ void SplitString(StringRef Source,
                  SmallVectorImpl<StringRef> &OutFragments,
                  StringRef Delimiters = " \t\n\v\f\r");
 
-/// HashString - Hash funtion for strings.
+/// HashString - Hash function for strings.
 ///
 /// This is the Bernstein hash function.
 //
diff --git a/include/llvm/ADT/StringMap.h b/include/llvm/ADT/StringMap.h
index 907c72d..934cacc 100644
--- a/include/llvm/ADT/StringMap.h
+++ b/include/llvm/ADT/StringMap.h
@@ -17,7 +17,6 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Allocator.h"
 #include <cstring>
-#include <string>
 
 namespace llvm {
   template<typename ValueT>
diff --git a/include/llvm/ADT/StringRef.h b/include/llvm/ADT/StringRef.h
index 1766d2b..8396921 100644
--- a/include/llvm/ADT/StringRef.h
+++ b/include/llvm/ADT/StringRef.h
@@ -46,7 +46,14 @@ namespace llvm {
     // integer works around this bug.
     static size_t min(size_t a, size_t b) { return a < b ? a : b; }
     static size_t max(size_t a, size_t b) { return a > b ? a : b; }
-
+    
+    // Workaround memcmp issue with null pointers (undefined behavior)
+    // by providing a specialized version
+    static int compareMemory(const char *Lhs, const char *Rhs, size_t Length) {
+      if (Length == 0) { return 0; }
+      return ::memcmp(Lhs,Rhs,Length);
+    }
+    
   public:
     /// @name Constructors
     /// @{
@@ -56,11 +63,17 @@ namespace llvm {
 
     /// Construct a string ref from a cstring.
     /*implicit*/ StringRef(const char *Str)
-      : Data(Str), Length(::strlen(Str)) {}
+      : Data(Str) {
+        assert(Str && "StringRef cannot be built from a NULL argument");
+        Length = ::strlen(Str); // invoking strlen(NULL) is undefined behavior
+      }
 
     /// Construct a string ref from a pointer and length.
     /*implicit*/ StringRef(const char *data, size_t length)
-      : Data(data), Length(length) {}
+      : Data(data), Length(length) {
+        assert((data || length == 0) &&
+        "StringRef cannot be built from a NULL argument with non-null length");
+      }
 
     /// Construct a string ref from an std::string.
     /*implicit*/ StringRef(const std::string &Str)
@@ -104,7 +117,7 @@ namespace llvm {
     /// compare() when the relative ordering of inequal strings isn't needed.
     bool equals(StringRef RHS) const {
       return (Length == RHS.Length &&
-              memcmp(Data, RHS.Data, RHS.Length) == 0);
+              compareMemory(Data, RHS.Data, RHS.Length) == 0);
     }
 
     /// equals_lower - Check for string equality, ignoring case.
@@ -116,7 +129,7 @@ namespace llvm {
     /// is lexicographically less than, equal to, or greater than the \arg RHS.
     int compare(StringRef RHS) const {
       // Check the prefix for a mismatch.
-      if (int Res = memcmp(Data, RHS.Data, min(Length, RHS.Length)))
+      if (int Res = compareMemory(Data, RHS.Data, min(Length, RHS.Length)))
         return Res < 0 ? -1 : 1;
 
       // Otherwise the prefixes match, so we only need to check the lengths.
@@ -183,13 +196,13 @@ namespace llvm {
     /// startswith - Check if this string starts with the given \arg Prefix.
     bool startswith(StringRef Prefix) const {
       return Length >= Prefix.Length &&
-             memcmp(Data, Prefix.Data, Prefix.Length) == 0;
+             compareMemory(Data, Prefix.Data, Prefix.Length) == 0;
     }
 
     /// endswith - Check if this string ends with the given \arg Suffix.
     bool endswith(StringRef Suffix) const {
       return Length >= Suffix.Length &&
-             memcmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
+        compareMemory(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
     }
 
     /// @}
@@ -447,6 +460,10 @@ namespace llvm {
     return LHS.compare(RHS) != -1;
   }
 
+  inline std::string &operator+=(std::string &buffer, llvm::StringRef string) {
+    return buffer.append(string.data(), string.size());
+  }
+
   /// @}
 
   // StringRefs can be treated like a POD type.
diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h
index ff9dd19..078033d 100644
--- a/include/llvm/ADT/Triple.h
+++ b/include/llvm/ADT/Triple.h
@@ -64,7 +64,8 @@ public:
     x86_64,  // X86-64: amd64, x86_64
     xcore,   // XCore: xcore
     mblaze,  // MBlaze: mblaze
-    ptx,     // PTX: ptx
+    ptx32,   // PTX: ptx (32-bit)
+    ptx64,   // PTX: ptx (64-bit)
 
     InvalidArch
   };
@@ -83,8 +84,10 @@ public:
     Darwin,
     DragonFly,
     FreeBSD,
+    IOS,
     Linux,
     Lv2,        // PS3
+    MacOSX,
     MinGW32,    // i*86-pc-mingw32, *-w64-mingw32
     NetBSD,
     OpenBSD,
@@ -222,21 +225,81 @@ public:
   /// if the environment component is present).
   StringRef getOSAndEnvironmentName() const;
 
+  /// getOSVersion - Parse the version number from the OS name component of the
+  /// triple, if present.
+  ///
+  /// For example, "fooos1.2.3" would return (1, 2, 3).
+  ///
+  /// If an entry is not defined, it will be returned as 0.
+  void getOSVersion(unsigned &Major, unsigned &Minor, unsigned &Micro) const;
 
-  /// getDarwinNumber - Parse the 'darwin number' out of the specific target
-  /// triple.  For example, if we have darwin8.5 return 8,5,0.  If any entry is
-  /// not defined, return 0's.  This requires that the triple have an OSType of
-  /// darwin before it is called.
-  void getDarwinNumber(unsigned &Maj, unsigned &Min, unsigned &Revision) const;
-
-  /// getDarwinMajorNumber - Return just the major version number, this is
+  /// getOSMajorVersion - Return just the major version number, this is
   /// specialized because it is a common query.
-  unsigned getDarwinMajorNumber() const {
-    unsigned Maj, Min, Rev;
-    getDarwinNumber(Maj, Min, Rev);
+  unsigned getOSMajorVersion() const {
+    unsigned Maj, Min, Micro;
+    getDarwinNumber(Maj, Min, Micro);
     return Maj;
   }
 
+  void getDarwinNumber(unsigned &Major, unsigned &Minor,
+                       unsigned &Micro) const {
+    return getOSVersion(Major, Minor, Micro);
+  }
+
+  unsigned getDarwinMajorNumber() const {
+    return getOSMajorVersion();
+  }
+
+  /// isOSVersionLT - Helper function for doing comparisons against version
+  /// numbers included in the target triple.
+  bool isOSVersionLT(unsigned Major, unsigned Minor = 0,
+                     unsigned Micro = 0) const {
+    unsigned LHS[3];
+    getOSVersion(LHS[0], LHS[1], LHS[2]);
+
+    if (LHS[0] != Major)
+      return LHS[0] < Major;
+    if (LHS[1] != Minor)
+      return LHS[1] < Minor;
+    if (LHS[2] != Micro)
+      return LHS[1] < Micro;
+
+    return false;
+  }
+
+  /// isMacOSX - Is this a Mac OS X triple. For legacy reasons, we support both
+  /// "darwin" and "osx" as OS X triples.
+  bool isMacOSX() const {
+    return getOS() == Triple::Darwin || getOS() == Triple::MacOSX;
+  }
+
+  /// isOSDarwin - Is this a "Darwin" OS (OS X or iOS).
+  bool isOSDarwin() const {
+    return isMacOSX() ||getOS() == Triple::IOS;
+  }
+
+  /// isOSWindows - Is this a "Windows" OS.
+  bool isOSWindows() const {
+    return getOS() == Triple::Win32 || getOS() == Triple::Cygwin ||
+      getOS() == Triple::MinGW32;
+  }
+
+  /// isMacOSXVersionLT - Comparison function for checking OS X version
+  /// compatibility, which handles supporting skewed version numbering schemes
+  /// used by the "darwin" triples.
+  unsigned isMacOSXVersionLT(unsigned Major, unsigned Minor = 0,
+                          unsigned Micro = 0) const {
+    assert(isMacOSX() && "Not an OS X triple!");
+
+    // If this is OS X, expect a sane version number.
+    if (getOS() == Triple::MacOSX)
+      return isOSVersionLT(Major, Minor, Micro);
+
+    // Otherwise, compare to the "Darwin" number.
+    assert(Major == 10 && "Unexpected major version");
+    return isOSVersionLT(Minor + 4, Micro, 0);
+  }
+    
   /// @}
   /// @name Mutators
   /// @{
diff --git a/include/llvm/ADT/ilist.h b/include/llvm/ADT/ilist.h
index 865fcb3..bcacfd9 100644
--- a/include/llvm/ADT/ilist.h
+++ b/include/llvm/ADT/ilist.h
@@ -289,7 +289,7 @@ template<typename NodeTy> struct simplify_type<const ilist_iterator<NodeTy> > {
 //===----------------------------------------------------------------------===//
 //
 /// iplist - The subset of list functionality that can safely be used on nodes
-/// of polymorphic types, i.e. a heterogenous list with a common base class that
+/// of polymorphic types, i.e. a heterogeneous list with a common base class that
 /// holds the next/prev pointers.  The only state of the list itself is a single
 /// pointer to the head of the list.
 ///
diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h
index 71a5982..5d8edd1 100644
--- a/include/llvm/Analysis/AliasAnalysis.h
+++ b/include/llvm/Analysis/AliasAnalysis.h
@@ -38,7 +38,7 @@
 #define LLVM_ANALYSIS_ALIAS_ANALYSIS_H
 
 #include "llvm/Support/CallSite.h"
-#include <vector>
+#include "llvm/ADT/DenseMap.h"
 
 namespace llvm {
 
@@ -489,6 +489,32 @@ public:
   }
 };
 
+// Specialize DenseMapInfo for Location.
+template<>
+struct DenseMapInfo<AliasAnalysis::Location> {
+  static inline AliasAnalysis::Location getEmptyKey() {
+    return
+      AliasAnalysis::Location(DenseMapInfo<const Value *>::getEmptyKey(),
+                              0, 0);
+  }
+  static inline AliasAnalysis::Location getTombstoneKey() {
+    return
+      AliasAnalysis::Location(DenseMapInfo<const Value *>::getTombstoneKey(),
+                              0, 0);
+  }
+  static unsigned getHashValue(const AliasAnalysis::Location &Val) {
+    return DenseMapInfo<const Value *>::getHashValue(Val.Ptr) ^
+           DenseMapInfo<uint64_t>::getHashValue(Val.Size) ^
+           DenseMapInfo<const MDNode *>::getHashValue(Val.TBAATag);
+  }
+  static bool isEqual(const AliasAnalysis::Location &LHS,
+                      const AliasAnalysis::Location &RHS) {
+    return LHS.Ptr == RHS.Ptr &&
+           LHS.Size == RHS.Size &&
+           LHS.TBAATag == RHS.TBAATag;
+  }
+};
+
 /// isNoAliasCall - Return true if this pointer is returned by a noalias
 /// function.
 bool isNoAliasCall(const Value *V);
diff --git a/include/llvm/Analysis/AliasSetTracker.h b/include/llvm/Analysis/AliasSetTracker.h
index e844d10..03149c66 100644
--- a/include/llvm/Analysis/AliasSetTracker.h
+++ b/include/llvm/Analysis/AliasSetTracker.h
@@ -259,6 +259,7 @@ private:
       if (CallSites[i] == CS.getInstruction()) {
         CallSites[i] = CallSites.back();
         CallSites.pop_back();
+        --i; --e;  // Revisit the moved entry.
       }
   }
   void setVolatile() { Volatile = true; }
@@ -283,6 +284,7 @@ class AliasSetTracker {
   class ASTCallbackVH : public CallbackVH {
     AliasSetTracker *AST;
     virtual void deleted();
+    virtual void allUsesReplacedWith(Value *);
   public:
     ASTCallbackVH(Value *V, AliasSetTracker *AST = 0);
     ASTCallbackVH &operator=(Value *V);
diff --git a/include/llvm/Analysis/BranchProbabilityInfo.h b/include/llvm/Analysis/BranchProbabilityInfo.h
new file mode 100644
index 0000000..5a17a76
--- /dev/null
+++ b/include/llvm/Analysis/BranchProbabilityInfo.h
@@ -0,0 +1,83 @@
+//===--- BranchProbabilityInfo.h - Branch Probability Analysis --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is used to evaluate branch probabilties.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_BRANCHPROBABILITYINFO_H
+#define LLVM_ANALYSIS_BRANCHPROBABILITYINFO_H
+
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Analysis/LoopInfo.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class BranchProbabilityInfo : public FunctionPass {
+
+  // Default weight value. Used when we don't have information about the edge.
+  // TODO: DEFAULT_WEIGHT makes sense during static predication, when none of
+  // the successors have a weight yet. But it doesn't make sense when providing
+  // weight to an edge that may have siblings with non-zero weights. This can
+  // be handled various ways, but it's probably fine for an edge with unknown
+  // weight to just "inherit" the non-zero weight of an adjacent successor.
+  static const uint32_t DEFAULT_WEIGHT = 16;
+
+  typedef std::pair<BasicBlock *, BasicBlock *> Edge;
+
+  DenseMap<Edge, uint32_t> Weights;
+
+  // Get sum of the block successors' weights.
+  uint32_t getSumForBlock(BasicBlock *BB) const;
+
+public:
+  static char ID;
+
+  BranchProbabilityInfo() : FunctionPass(ID) {
+    initializeBranchProbabilityInfoPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<LoopInfo>();
+    AU.setPreservesAll();
+  }
+
+  bool runOnFunction(Function &F);
+
+  // Returned value is between 1 and UINT32_MAX. Look at
+  // BranchProbabilityInfo.cpp for details.
+  uint32_t getEdgeWeight(BasicBlock *Src, BasicBlock *Dst) const;
+
+  // Look at BranchProbabilityInfo.cpp for details. Use it with caution!
+  void setEdgeWeight(BasicBlock *Src, BasicBlock *Dst, uint32_t Weight);
+
+  // A 'Hot' edge is an edge which probability is >= 80%.
+  bool isEdgeHot(BasicBlock *Src, BasicBlock *Dst) const;
+
+  // Return a hot successor for the block BB or null if there isn't one.
+  BasicBlock *getHotSucc(BasicBlock *BB) const;
+
+  // Return a probability as a fraction between 0 (0% probability) and
+  // 1 (100% probability), however the value is never equal to 0, and can be 1
+  // only iff SRC block has only one successor.
+  BranchProbability getEdgeProbability(BasicBlock *Src, BasicBlock *Dst) const;
+
+  // Print value between 0 (0% probability) and 1 (100% probability),
+  // however the value is never equal to 0, and can be 1 only iff SRC block
+  // has only one successor.
+  raw_ostream &printEdgeProbability(raw_ostream &OS, BasicBlock *Src,
+                                    BasicBlock *Dst) const;
+};
+
+}
+
+#endif
diff --git a/include/llvm/Analysis/CFGPrinter.h b/include/llvm/Analysis/CFGPrinter.h
index ac8f596..61614e3 100644
--- a/include/llvm/Analysis/CFGPrinter.h
+++ b/include/llvm/Analysis/CFGPrinter.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_ANALYSIS_CFGPRINTER_H
 #define LLVM_ANALYSIS_CFGPRINTER_H
 
+#include "llvm/Constants.h"
 #include "llvm/Function.h"
 #include "llvm/Instructions.h"
 #include "llvm/Assembly/Writer.h"
diff --git a/include/llvm/Analysis/CallGraph.h b/include/llvm/Analysis/CallGraph.h
index 089f322..fb77da7 100644
--- a/include/llvm/Analysis/CallGraph.h
+++ b/include/llvm/Analysis/CallGraph.h
@@ -259,6 +259,9 @@ public:
   /// addCalledFunction - Add a function to the list of functions called by this
   /// one.
   void addCalledFunction(CallSite CS, CallGraphNode *M) {
+    assert(!CS.getInstruction() ||
+           !CS.getCalledFunction() ||
+           !CS.getCalledFunction()->isIntrinsic());
     CalledFunctions.push_back(std::make_pair(CS.getInstruction(), M));
     M->AddRef();
   }
diff --git a/include/llvm/Analysis/DIBuilder.h b/include/llvm/Analysis/DIBuilder.h
index 329db64..96c6587 100644
--- a/include/llvm/Analysis/DIBuilder.h
+++ b/include/llvm/Analysis/DIBuilder.h
@@ -16,6 +16,7 @@
 #define LLVM_ANALYSIS_DIBUILDER_H
 
 #include "llvm/Support/DataTypes.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 
 namespace llvm {
@@ -116,8 +117,9 @@ namespace llvm {
     /// @param Name        Typedef name.
     /// @param File        File where this type is defined.
     /// @param LineNo      Line number.
+    /// @param Context     The surrounding context for the typedef.
     DIType createTypedef(DIType Ty, StringRef Name, DIFile File, 
-                         unsigned LineNo);
+                         unsigned LineNo, DIDescriptor Context);
 
     /// createFriend - Create debugging information entry for a 'friend'.
     DIType createFriend(DIType Ty, DIType FriendTy);
@@ -146,6 +148,30 @@ namespace llvm {
                             uint64_t AlignInBits, uint64_t OffsetInBits, 
                             unsigned Flags, DIType Ty);
 
+    /// createObjCIVar - Create debugging information entry for Objective-C
+    /// instance variable.
+    /// @param Name         Member name.
+    /// @param File         File where this member is defined.
+    /// @param LineNo       Line number.
+    /// @param SizeInBits   Member size.
+    /// @param AlignInBits  Member alignment.
+    /// @param OffsetInBits Member offset.
+    /// @param Flags        Flags to encode member attribute, e.g. private
+    /// @param Ty           Parent type.
+    /// @param PropertyName Name of the Objective C property assoicated with
+    ///                     this ivar.
+    /// @param GetterName   Name of the Objective C property getter selector.
+    /// @param SetterName   Name of the Objective C property setter selector.
+    /// @param PropertyAttributes Objective C property attributes.
+    DIType createObjCIVar(StringRef Name, DIFile File,
+                          unsigned LineNo, uint64_t SizeInBits, 
+                          uint64_t AlignInBits, uint64_t OffsetInBits, 
+                          unsigned Flags, DIType Ty,
+                          StringRef PropertyName = StringRef(),
+                          StringRef PropertyGetterName = StringRef(),
+                          StringRef PropertySetterName = StringRef(),
+                          unsigned PropertyAttributes = 0);
+
     /// createClassType - Create debugging information entry for a class.
     /// @param Scope        Scope in which this class is defined.
     /// @param Name         class name.
@@ -278,7 +304,7 @@ namespace llvm {
     DIDescriptor createUnspecifiedParameter();
 
     /// getOrCreateArray - Get a DIArray, create one if required.
-    DIArray getOrCreateArray(Value *const *Elements, unsigned NumElements);
+    DIArray getOrCreateArray(ArrayRef<Value *> Elements);
 
     /// getOrCreateSubrange - Create a descriptor for a value range.  This
     /// implicitly uniques the values returned.
@@ -345,14 +371,13 @@ namespace llvm {
     /// @param File        File where this variable is defined.
     /// @param LineNo      Line number.
     /// @param Ty          Variable Type
-    /// @param Addr        A pointer to a vector of complex address operations.
-    /// @param NumAddr     Num of address operations in the vector.
+    /// @param Addr        An array of complex address operations.
     /// @param ArgNo       If this variable is an arugment then this argument's
     ///                    number. 1 indicates 1st argument.
     DIVariable createComplexVariable(unsigned Tag, DIDescriptor Scope,
                                      StringRef Name, DIFile F, unsigned LineNo,
-                                     DIType Ty, Value *const *Addr,
-                                     unsigned NumAddr, unsigned ArgNo = 0);
+                                     DIType Ty, ArrayRef<Value *> Addr,
+                                     unsigned ArgNo = 0);
 
     /// createFunction - Create a new descriptor for the specified subprogram.
     /// See comments in DISubprogram for descriptions of these fields.
@@ -377,7 +402,8 @@ namespace llvm {
                                 unsigned Flags = 0,
                                 bool isOptimized = false,
                                 Function *Fn = 0,
-                                MDNode *TParam = 0);
+                                MDNode *TParam = 0,
+                                MDNode *Decl = 0);
 
     /// createMethod - Create a new descriptor for the specified C++ method.
     /// See comments in DISubprogram for descriptions of these fields.
diff --git a/include/llvm/Analysis/DebugInfo.h b/include/llvm/Analysis/DebugInfo.h
index 276ac45..fbee5a6 100644
--- a/include/llvm/Analysis/DebugInfo.h
+++ b/include/llvm/Analysis/DebugInfo.h
@@ -49,15 +49,16 @@ namespace llvm {
   class DIDescriptor {
   public:
     enum {
-      FlagPrivate          = 1 << 0,
-      FlagProtected        = 1 << 1,
-      FlagFwdDecl          = 1 << 2,
-      FlagAppleBlock       = 1 << 3,
-      FlagBlockByrefStruct = 1 << 4,
-      FlagVirtual          = 1 << 5,
-      FlagArtificial       = 1 << 6,
-      FlagExplicit         = 1 << 7,
-      FlagPrototyped       = 1 << 8
+      FlagPrivate            = 1 << 0,
+      FlagProtected          = 1 << 1,
+      FlagFwdDecl            = 1 << 2,
+      FlagAppleBlock         = 1 << 3,
+      FlagBlockByrefStruct   = 1 << 4,
+      FlagVirtual            = 1 << 5,
+      FlagArtificial         = 1 << 6,
+      FlagExplicit           = 1 << 7,
+      FlagPrototyped         = 1 << 8,
+      FlagObjcClassComplete  = 1 << 9
     };
   protected:
     const MDNode *DbgNode;
@@ -271,6 +272,9 @@ namespace llvm {
     bool isArtificial() const {
       return (getFlags() & FlagArtificial) != 0;
     }
+    bool isObjcClassComplete() const {
+      return (getFlags() & FlagObjcClassComplete) != 0;
+    }
     bool isValid() const {
       return DbgNode && (isBasicType() || isDerivedType() || isCompositeType());
     }
@@ -332,6 +336,32 @@ namespace llvm {
     /// return base type size.
     uint64_t getOriginalTypeSize() const;
 
+    StringRef getObjCPropertyName() const { return getStringField(10); }
+    StringRef getObjCPropertyGetterName() const {
+      return getStringField(11);
+    }
+    StringRef getObjCPropertySetterName() const {
+      return getStringField(12);
+    }
+    bool isReadOnlyObjCProperty() {
+      return (getUnsignedField(13) & dwarf::DW_APPLE_PROPERTY_readonly) != 0;
+    }
+    bool isReadWriteObjCProperty() {
+      return (getUnsignedField(13) & dwarf::DW_APPLE_PROPERTY_readwrite) != 0;
+    }
+    bool isAssignObjCProperty() {
+      return (getUnsignedField(13) & dwarf::DW_APPLE_PROPERTY_assign) != 0;
+    }
+    bool isRetainObjCProperty() {
+      return (getUnsignedField(13) & dwarf::DW_APPLE_PROPERTY_retain) != 0;
+    }
+    bool isCopyObjCProperty() {
+      return (getUnsignedField(13) & dwarf::DW_APPLE_PROPERTY_copy) != 0;
+    }
+    bool isNonAtomicObjCProperty() {
+      return (getUnsignedField(13) & dwarf::DW_APPLE_PROPERTY_nonatomic) != 0;
+    }
+
     /// Verify - Verify that a derived type descriptor is well formed.
     bool Verify() const;
 
@@ -512,6 +542,9 @@ namespace llvm {
 
     Function *getFunction() const { return getFunctionField(16); }
     DIArray getTemplateParams() const { return getFieldAs<DIArray>(17); }
+    DISubprogram getFunctionDeclaration() const {
+      return getFieldAs<DISubprogram>(18);
+    }
   };
 
   /// DIGlobalVariable - This is a wrapper for a global variable.
@@ -593,7 +626,9 @@ namespace llvm {
     unsigned getNumAddrElements() const;
     
     uint64_t getAddrElement(unsigned Idx) const {
-      return getUInt64Field(Idx+6);
+      if (getVersion() <= llvm::LLVMDebugVersion8)
+        return getUInt64Field(Idx+6);
+      return getUInt64Field(Idx+7);
     }
 
     /// isBlockByrefVariable - Return true if the variable was declared as
diff --git a/include/llvm/Analysis/FindUsedTypes.h b/include/llvm/Analysis/FindUsedTypes.h
index fc57e1a..3e5da57 100644
--- a/include/llvm/Analysis/FindUsedTypes.h
+++ b/include/llvm/Analysis/FindUsedTypes.h
@@ -14,8 +14,8 @@
 #ifndef LLVM_ANALYSIS_FINDUSEDTYPES_H
 #define LLVM_ANALYSIS_FINDUSEDTYPES_H
 
+#include "llvm/ADT/SetVector.h"
 #include "llvm/Pass.h"
-#include <set>
 
 namespace llvm {
 
@@ -23,7 +23,7 @@ class Type;
 class Value;
 
 class FindUsedTypes : public ModulePass {
-  std::set<const Type *> UsedTypes;
+  SetVector<const Type *> UsedTypes;
 public:
   static char ID; // Pass identification, replacement for typeid
   FindUsedTypes() : ModulePass(ID) {
@@ -33,7 +33,7 @@ public:
   /// getTypes - After the pass has been run, return the set containing all of
   /// the types used in the module.
   ///
-  const std::set<const Type *> &getTypes() const { return UsedTypes; }
+  const SetVector<const Type *> &getTypes() const { return UsedTypes; }
 
   /// Print the types found in the module.  If the optional Module parameter is
   /// passed in, then the types are printed symbolically if possible, using the
diff --git a/include/llvm/Analysis/IVUsers.h b/include/llvm/Analysis/IVUsers.h
index e56d24d..1b78fe4 100644
--- a/include/llvm/Analysis/IVUsers.h
+++ b/include/llvm/Analysis/IVUsers.h
@@ -37,8 +37,8 @@ class TargetData;
 class IVStrideUse : public CallbackVH, public ilist_node<IVStrideUse> {
   friend class IVUsers;
 public:
-  IVStrideUse(IVUsers *P, Instruction* U, Value *O)
-    : CallbackVH(U), Parent(P), OperandValToReplace(O) {
+  IVStrideUse(IVUsers *P, Instruction* U, Value *O, Value *PN)
+    : CallbackVH(U), Parent(P), OperandValToReplace(O), Phi(PN) {
   }
 
   /// getUser - Return the user instruction for this use.
@@ -51,6 +51,11 @@ public:
     setValPtr(NewUser);
   }
 
+  /// getPhi - Return the phi node that represents this IV.
+  PHINode *getPhi() const {
+    return cast<PHINode>(Phi);
+  }
+
   /// getOperandValToReplace - Return the Value of the operand in the user
   /// instruction that this IVStrideUse is representing.
   Value *getOperandValToReplace() const {
@@ -81,6 +86,9 @@ private:
   /// that this IVStrideUse is representing.
   WeakVH OperandValToReplace;
 
+  /// Phi - The loop header phi that represents this IV.
+  WeakVH Phi;
+
   /// PostIncLoops - The set of loops for which Expr has been adjusted to
   /// use post-inc mode. This corresponds with SCEVExpander's post-inc concept.
   PostIncLoopSet PostIncLoops;
@@ -143,9 +151,9 @@ public:
   /// AddUsersIfInteresting - Inspect the specified Instruction.  If it is a
   /// reducible SCEV, recursively add its users to the IVUsesByStride set and
   /// return true.  Otherwise, return false.
-  bool AddUsersIfInteresting(Instruction *I);
+  bool AddUsersIfInteresting(Instruction *I, PHINode *Phi);
 
-  IVStrideUse &AddUser(Instruction *User, Value *Operand);
+  IVStrideUse &AddUser(Instruction *User, Value *Operand, PHINode *Phi);
 
   /// getReplacementExpr - Return a SCEV expression which computes the
   /// value of the OperandValToReplace of the given IVStrideUse.
diff --git a/include/llvm/Analysis/InlineCost.h b/include/llvm/Analysis/InlineCost.h
index b08bf57..a0cce51 100644
--- a/include/llvm/Analysis/InlineCost.h
+++ b/include/llvm/Analysis/InlineCost.h
@@ -43,7 +43,7 @@ namespace llvm {
   /// InlineCost - Represent the cost of inlining a function. This
   /// supports special values for functions which should "always" or
   /// "never" be inlined. Otherwise, the cost represents a unitless
-  /// amount; smaller values increase the likelyhood of the function
+  /// amount; smaller values increase the likelihood of the function
   /// being inlined.
   class InlineCost {
     enum Kind {
diff --git a/include/llvm/Analysis/InstructionSimplify.h b/include/llvm/Analysis/InstructionSimplify.h
index dff1ba2..bc6e55f 100644
--- a/include/llvm/Analysis/InstructionSimplify.h
+++ b/include/llvm/Analysis/InstructionSimplify.h
@@ -55,6 +55,21 @@ namespace llvm {
   Value *SimplifyFDivInst(Value *LHS, Value *RHS, const TargetData *TD = 0,
                           const DominatorTree *DT = 0);
 
+  /// SimplifySRemInst - Given operands for an SRem, see if we can
+  /// fold the result.  If not, this returns null.
+  Value *SimplifySRemInst(Value *LHS, Value *RHS, const TargetData *TD = 0,
+                          const DominatorTree *DT = 0);
+
+  /// SimplifyURemInst - Given operands for a URem, see if we can
+  /// fold the result.  If not, this returns null.
+  Value *SimplifyURemInst(Value *LHS, Value *RHS, const TargetData *TD = 0,
+                          const DominatorTree *DT = 0);
+
+  /// SimplifyFRemInst - Given operands for an FRem, see if we can
+  /// fold the result.  If not, this returns null.
+  Value *SimplifyFRemInst(Value *LHS, Value *RHS, const TargetData *TD = 0,
+                          const DominatorTree *DT = 0);
+
   /// SimplifyShlInst - Given operands for a Shl, see if we can
   /// fold the result.  If not, this returns null.
   Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
diff --git a/include/llvm/Analysis/Lint.h b/include/llvm/Analysis/Lint.h
index eb65d22..7c88b13 100644
--- a/include/llvm/Analysis/Lint.h
+++ b/include/llvm/Analysis/Lint.h
@@ -20,8 +20,6 @@
 #ifndef LLVM_ANALYSIS_LINT_H
 #define LLVM_ANALYSIS_LINT_H
 
-#include <string>
-
 namespace llvm {
 
 class FunctionPass;
diff --git a/include/llvm/Analysis/MemoryDependenceAnalysis.h b/include/llvm/Analysis/MemoryDependenceAnalysis.h
index 4d5dd19..34860e7 100644
--- a/include/llvm/Analysis/MemoryDependenceAnalysis.h
+++ b/include/llvm/Analysis/MemoryDependenceAnalysis.h
@@ -48,6 +48,11 @@ namespace llvm {
       /// this occurs when we see a may-aliased store to the memory location we
       /// care about.
       ///
+      /// There are several cases that may be interesting here:
+      ///   1. Loads are clobbered by may-alias stores.
+      ///   2. Loads are considered clobbered by partially-aliased loads.  The
+      ///      client may choose to analyze deeper into these cases.
+      ///
       /// A dependence query on the first instruction of the entry block will
       /// return a clobber(self) result.
       Clobber,
@@ -85,18 +90,27 @@ namespace llvm {
     /// get methods: These are static ctor methods for creating various
     /// MemDepResult kinds.
     static MemDepResult getDef(Instruction *Inst) {
+      assert(Inst && "Def requires inst");
       return MemDepResult(PairTy(Inst, Def));
     }
     static MemDepResult getClobber(Instruction *Inst) {
+      assert(Inst && "Clobber requires inst");
       return MemDepResult(PairTy(Inst, Clobber));
     }
     static MemDepResult getNonLocal() {
       return MemDepResult(PairTy(0, NonLocal));
     }
+    static MemDepResult getUnknown() {
+      return MemDepResult(PairTy(0, Clobber));
+    }
 
     /// isClobber - Return true if this MemDepResult represents a query that is
     /// a instruction clobber dependency.
-    bool isClobber() const { return Value.getInt() == Clobber; }
+    bool isClobber() const { return Value.getInt() == Clobber && getInst(); }
+
+    /// isUnknown - Return true if this MemDepResult represents a query which
+    /// cannot and/or will not be computed.
+    bool isUnknown() const { return Value.getInt() == Clobber && !getInst(); }
 
     /// isDef - Return true if this MemDepResult represents a query that is
     /// a instruction definition dependency.
@@ -350,6 +364,20 @@ namespace llvm {
                                           BasicBlock::iterator ScanIt,
                                           BasicBlock *BB);
     
+    
+    /// getLoadLoadClobberFullWidthSize - This is a little bit of analysis that
+    /// looks at a memory location for a load (specified by MemLocBase, Offs,
+    /// and Size) and compares it against a load.  If the specified load could
+    /// be safely widened to a larger integer load that is 1) still efficient,
+    /// 2) safe for the target, and 3) would provide the specified memory
+    /// location value, then this function returns the size in bytes of the
+    /// load width to use.  If not, this returns zero.
+    static unsigned getLoadLoadClobberFullWidthSize(const Value *MemLocBase,
+                                                    int64_t MemLocOffs,
+                                                    unsigned MemLocSize,
+                                                    const LoadInst *LI,
+                                                    const TargetData &TD);
+    
   private:
     MemDepResult getCallSiteDependencyFrom(CallSite C, bool isReadOnlyCall,
                                            BasicBlock::iterator ScanIt,
diff --git a/include/llvm/Analysis/Passes.h b/include/llvm/Analysis/Passes.h
index 0eff75f..a22bd12 100644
--- a/include/llvm/Analysis/Passes.h
+++ b/include/llvm/Analysis/Passes.h
@@ -88,6 +88,13 @@ namespace llvm {
 
   //===--------------------------------------------------------------------===//
   //
+  // createObjCARCAliasAnalysisPass - This pass implements ObjC-ARC-based
+  // alias analysis.
+  //
+  ImmutablePass *createObjCARCAliasAnalysisPass();
+
+  //===--------------------------------------------------------------------===//
+  //
   // createProfileLoaderPass - This pass loads information from a profile dump
   // file.
   //
diff --git a/include/llvm/Analysis/PathProfileInfo.h b/include/llvm/Analysis/PathProfileInfo.h
index 263763f..cef6d2d 100644
--- a/include/llvm/Analysis/PathProfileInfo.h
+++ b/include/llvm/Analysis/PathProfileInfo.h
@@ -16,7 +16,6 @@
 
 #include "llvm/BasicBlock.h"
 #include "llvm/Analysis/PathNumbering.h"
-#include <stack>
 
 namespace llvm {
 
diff --git a/include/llvm/Analysis/RegionInfo.h b/include/llvm/Analysis/RegionInfo.h
index 81b71f9..9d89545 100644
--- a/include/llvm/Analysis/RegionInfo.h
+++ b/include/llvm/Analysis/RegionInfo.h
@@ -146,7 +146,7 @@ inline Region* RegionNode::getNodeAs<Region>() const {
 /// two connections to the remaining graph. It can be used to analyze or
 /// optimize parts of the control flow graph.
 ///
-/// A <em> simple Region </em> is connected to the remaing graph by just two
+/// A <em> simple Region </em> is connected to the remaining graph by just two
 /// edges. One edge entering the Region and another one leaving the Region.
 ///
 /// An <em> extended Region </em> (or just Region) is a subgraph that can be
@@ -443,7 +443,7 @@ public:
 
   /// @brief Move all direct child nodes of this Region to another Region.
   ///
-  /// @param To The Region the child nodes will be transfered to.
+  /// @param To The Region the child nodes will be transferred to.
   void transferChildrenTo(Region *To);
 
   /// @brief Verify if the region is a correct region.
diff --git a/include/llvm/Analysis/RegionIterator.h b/include/llvm/Analysis/RegionIterator.h
index ced5b52..7adc71c 100644
--- a/include/llvm/Analysis/RegionIterator.h
+++ b/include/llvm/Analysis/RegionIterator.h
@@ -20,7 +20,7 @@
 
 namespace llvm {
 //===----------------------------------------------------------------------===//
-/// @brief Hierachical RegionNode successor iterator.
+/// @brief Hierarchical RegionNode successor iterator.
 ///
 /// This iterator iterates over all successors of a RegionNode.
 ///
diff --git a/include/llvm/Analysis/RegionPass.h b/include/llvm/Analysis/RegionPass.h
index aedc06a..1a93859 100644
--- a/include/llvm/Analysis/RegionPass.h
+++ b/include/llvm/Analysis/RegionPass.h
@@ -54,7 +54,7 @@ public:
   /// @brief Get a pass to print the LLVM IR in the region.
   ///
   /// @param O      The ouput stream to print the Region.
-  /// @param Banner The banner to seperate different printed passes.
+  /// @param Banner The banner to separate different printed passes.
   ///
   /// @return The pass to print the LLVM IR in the region.
   Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const;
@@ -109,7 +109,7 @@ public:
   /// @brief Print passes managed by this manager.
   void dumpPassStructure(unsigned Offset);
 
-  /// @brief Print passes contained by this manager.
+  /// @brief Get passes contained by this manager.
   Pass *getContainedPass(unsigned N) {
     assert(N < PassVector.size() && "Pass number out of range!");
     Pass *FP = static_cast<Pass *>(PassVector[N]);
diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index 6df5433..554524a 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -24,6 +24,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Instructions.h"
 #include "llvm/Function.h"
+#include "llvm/Operator.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/Allocator.h"
@@ -269,30 +270,30 @@ namespace llvm {
 
     /// BackedgeTakenCounts - Cache the backedge-taken count of the loops for
     /// this function as they are computed.
-    std::map<const Loop*, BackedgeTakenInfo> BackedgeTakenCounts;
+    DenseMap<const Loop*, BackedgeTakenInfo> BackedgeTakenCounts;
 
     /// ConstantEvolutionLoopExitValue - This map contains entries for all of
     /// the PHI instructions that we attempt to compute constant evolutions for.
     /// This allows us to avoid potentially expensive recomputation of these
     /// properties.  An instruction maps to null if we are unable to compute its
     /// exit value.
-    std::map<PHINode*, Constant*> ConstantEvolutionLoopExitValue;
+    DenseMap<PHINode*, Constant*> ConstantEvolutionLoopExitValue;
 
     /// ValuesAtScopes - This map contains entries for all the expressions
     /// that we attempt to compute getSCEVAtScope information for, which can
     /// be expensive in extreme cases.
-    std::map<const SCEV *,
+    DenseMap<const SCEV *,
              std::map<const Loop *, const SCEV *> > ValuesAtScopes;
 
     /// LoopDispositions - Memoized computeLoopDisposition results.
-    std::map<const SCEV *,
+    DenseMap<const SCEV *,
              std::map<const Loop *, LoopDisposition> > LoopDispositions;
 
     /// computeLoopDisposition - Compute a LoopDisposition value.
     LoopDisposition computeLoopDisposition(const SCEV *S, const Loop *L);
 
     /// BlockDispositions - Memoized computeBlockDisposition results.
-    std::map<const SCEV *,
+    DenseMap<const SCEV *,
              std::map<const BasicBlock *, BlockDisposition> > BlockDispositions;
 
     /// computeBlockDisposition - Compute a BlockDisposition value.
diff --git a/include/llvm/Argument.h b/include/llvm/Argument.h
index 71c001f..ff86378 100644
--- a/include/llvm/Argument.h
+++ b/include/llvm/Argument.h
@@ -51,6 +51,9 @@ public:
   /// hasByValAttr - Return true if this argument has the byval attribute on it
   /// in its containing function.
   bool hasByValAttr() const;
+  
+  /// getParamAlignment - If this is a byval argument, return its alignment.
+  unsigned getParamAlignment() const;
 
   /// hasNestAttr - Return true if this argument has the nest attribute on
   /// it in its containing function.
diff --git a/include/llvm/Attributes.h b/include/llvm/Attributes.h
index da6188b..233eab8 100644
--- a/include/llvm/Attributes.h
+++ b/include/llvm/Attributes.h
@@ -67,6 +67,23 @@ const Attributes StackAlignment  = 7<<26; ///< Alignment of stack for
                                           ///alignstack(1))
 const Attributes Hotpatch    = 1<<29;     ///< Function should have special
                                           ///'hotpatch' sequence in prologue
+const Attributes UWTable     = 1<<30;     ///< Function must be in a unwind
+                                          ///table
+const Attributes NonLazyBind = 1U<<31;    ///< Function is called early and/or
+                                          ///  often, so lazy binding isn't
+                                          ///  worthwhile.
+
+/// Note that uwtable is about the ABI or the user mandating an entry in the
+/// unwind table. The nounwind attribute is about an exception passing by the
+/// function.
+/// In a theoretical system that uses tables for profiling and sjlj for
+/// exceptions, they would be fully independent. In a normal system that
+/// uses tables for both, the semantics are:
+/// nil                = Needs an entry because an exception might pass by.
+/// nounwind           = No need for an entry
+/// uwtable            = Needs an entry because the ABI says so and because
+///                      an exception might pass by.
+/// uwtable + nounwind = Needs an entry because the ABI says so.
 
 /// @brief Attributes that only apply to function parameters.
 const Attributes ParameterOnly = ByVal | Nest | StructRet | NoCapture;
@@ -76,7 +93,7 @@ const Attributes ParameterOnly = ByVal | Nest | StructRet | NoCapture;
 const Attributes FunctionOnly = NoReturn | NoUnwind | ReadNone | ReadOnly |
   NoInline | AlwaysInline | OptimizeForSize | StackProtect | StackProtectReq |
   NoRedZone | NoImplicitFloat | Naked | InlineHint | StackAlignment |
-  Hotpatch;
+  Hotpatch | UWTable | NonLazyBind;
 
 /// @brief Parameter attributes that do not apply to vararg call arguments.
 const Attributes VarArgsIncompatible = StructRet;
diff --git a/include/llvm/Bitcode/Archive.h b/include/llvm/Bitcode/Archive.h
index 4abfa6e..f89a86c 100644
--- a/include/llvm/Bitcode/Archive.h
+++ b/include/llvm/Bitcode/Archive.h
@@ -435,7 +435,7 @@ class Archive {
     /// to determine just enough information to create an ArchiveMember object
     /// which is then inserted into the Archive object's ilist at the location
     /// given by \p where.
-    /// @returns true if an error occured, false otherwise
+    /// @returns true if an error occurred, false otherwise
     /// @brief Add a file to the archive.
     bool addFileBefore(
       const sys::Path& filename, ///< The file to be added
diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h
index a071feb..5eea099 100644
--- a/include/llvm/CodeGen/AsmPrinter.h
+++ b/include/llvm/CodeGen/AsmPrinter.h
@@ -183,6 +183,17 @@ namespace llvm {
     /// function.
     void EmitFunctionBody();
 
+    void emitPrologLabel(const MachineInstr &MI);
+
+    enum CFIMoveType {
+      CFI_M_None,
+      CFI_M_EH,
+      CFI_M_Debug
+    };
+    CFIMoveType needsCFIMoves();
+
+    bool needsSEHMoves();
+
     /// EmitConstantPool - Print to the current output stream assembly
     /// representations of the constants in the constant pool MCP. This is
     /// used to print out constants which have been "spilled to memory" by
@@ -381,15 +392,16 @@ namespace llvm {
     /// encoding specified.
     virtual unsigned getISAEncoding() { return 0; }
 
+    /// EmitDwarfRegOp - Emit dwarf register operation.
+    virtual void EmitDwarfRegOp(const MachineLocation &MLoc) const;
+
     //===------------------------------------------------------------------===//
     // Dwarf Lowering Routines
     //===------------------------------------------------------------------===//
 
-    /// EmitFrameMoves - Emit frame instructions to describe the layout of the
+    /// EmitCFIFrameMove - Emit frame instruction to describe the layout of the
     /// frame.
-    void EmitFrameMoves(const std::vector<MachineMove> &Moves,
-                        MCSymbol *BaseLabel, bool isEH) const;
-    void EmitCFIFrameMoves(const std::vector<MachineMove> &Moves) const;
+    void EmitCFIFrameMove(const MachineMove &Move) const;
 
     //===------------------------------------------------------------------===//
     // Inline Asm Support
diff --git a/include/llvm/CodeGen/CalcSpillWeights.h b/include/llvm/CodeGen/CalcSpillWeights.h
index 1f5f088..60edcc5 100644
--- a/include/llvm/CodeGen/CalcSpillWeights.h
+++ b/include/llvm/CodeGen/CalcSpillWeights.h
@@ -40,14 +40,14 @@ namespace llvm {
   /// VirtRegAuxInfo - Calculate auxiliary information for a virtual
   /// register such as its spill weight and allocation hint.
   class VirtRegAuxInfo {
-    MachineFunction &mf_;
-    LiveIntervals &lis_;
-    const MachineLoopInfo &loops_;
-    DenseMap<unsigned, float> hint_;
+    MachineFunction &MF;
+    LiveIntervals &LIS;
+    const MachineLoopInfo &Loops;
+    DenseMap<unsigned, float> Hint;
   public:
     VirtRegAuxInfo(MachineFunction &mf, LiveIntervals &lis,
                    const MachineLoopInfo &loops) :
-      mf_(mf), lis_(lis), loops_(loops) {}
+      MF(mf), LIS(lis), Loops(loops) {}
 
     /// CalculateRegClass - recompute the register class for reg from its uses.
     /// Since the register class can affect the allocation hint, this function
diff --git a/include/llvm/CodeGen/CallingConvLower.h b/include/llvm/CodeGen/CallingConvLower.h
index 2a9bbdf..77dc644 100644
--- a/include/llvm/CodeGen/CallingConvLower.h
+++ b/include/llvm/CodeGen/CallingConvLower.h
@@ -16,6 +16,7 @@
 #define LLVM_CODEGEN_CALLINGCONVLOWER_H
 
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/Target/TargetCallingConv.h"
 #include "llvm/CallingConv.h"
@@ -141,12 +142,19 @@ typedef bool CCCustomFn(unsigned &ValNo, MVT &ValVT,
                         MVT &LocVT, CCValAssign::LocInfo &LocInfo,
                         ISD::ArgFlagsTy &ArgFlags, CCState &State);
 
+/// ParmContext - This enum tracks whether calling convention lowering is in
+/// the context of prologue or call generation. Not all backends make use of
+/// this information.
+typedef enum { Unknown, Prologue, Call } ParmContext;
+
 /// CCState - This class holds information needed while lowering arguments and
 /// return values.  It captures which registers are already assigned and which
 /// stack slots are used.  It provides accessors to allocate these values.
 class CCState {
+private:
   CallingConv::ID CallingConv;
   bool IsVarArg;
+  MachineFunction &MF;
   const TargetMachine &TM;
   const TargetRegisterInfo &TRI;
   SmallVector<CCValAssign, 16> &Locs;
@@ -154,9 +162,16 @@ class CCState {
 
   unsigned StackOffset;
   SmallVector<uint32_t, 16> UsedRegs;
+  unsigned FirstByValReg;
+  bool FirstByValRegValid;
+
+protected:
+  ParmContext CallOrPrologue;
+
 public:
-  CCState(CallingConv::ID CC, bool isVarArg, const TargetMachine &TM,
-          SmallVector<CCValAssign, 16> &locs, LLVMContext &C);
+  CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+          const TargetMachine &TM, SmallVector<CCValAssign, 16> &locs,
+          LLVMContext &C);
 
   void addLoc(const CCValAssign &V) {
     Locs.push_back(V);
@@ -164,6 +179,7 @@ public:
 
   LLVMContext &getContext() const { return Context; }
   const TargetMachine &getTarget() const { return TM; }
+  MachineFunction &getMachineFunction() const { return MF; }
   CallingConv::ID getCallingConv() const { return CallingConv; }
   bool isVarArg() const { return IsVarArg; }
 
@@ -288,6 +304,15 @@ public:
                    MVT LocVT, CCValAssign::LocInfo LocInfo,
                    int MinSize, int MinAlign, ISD::ArgFlagsTy ArgFlags);
 
+  // First GPR that carries part of a byval aggregate that's split
+  // between registers and memory.
+  unsigned getFirstByValReg() { return FirstByValRegValid ? FirstByValReg : 0; }
+  void setFirstByValReg(unsigned r) { FirstByValReg = r; FirstByValRegValid = true; }
+  void clearFirstByValReg() { FirstByValReg = 0; FirstByValRegValid = false; }
+  bool isFirstByValRegValid() { return FirstByValRegValid; }
+
+  ParmContext getCallOrPrologue() { return CallOrPrologue; }
+
 private:
   /// MarkAllocated - Mark a register and all of its aliases as allocated.
   void MarkAllocated(unsigned Reg);
diff --git a/include/llvm/CodeGen/EdgeBundles.h b/include/llvm/CodeGen/EdgeBundles.h
index 2c5215a..8aab3c6 100644
--- a/include/llvm/CodeGen/EdgeBundles.h
+++ b/include/llvm/CodeGen/EdgeBundles.h
@@ -16,6 +16,7 @@
 #ifndef LLVM_CODEGEN_EDGEBUNDLES_H
 #define LLVM_CODEGEN_EDGEBUNDLES_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/IntEqClasses.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 
@@ -29,6 +30,9 @@ class EdgeBundles : public MachineFunctionPass {
   ///   2*BB->getNumber()+1 -> Outgoing bundle.
   IntEqClasses EC;
 
+  /// Blocks - Map each bundle to a list of basic block numbers.
+  SmallVector<SmallVector<unsigned, 8>, 4> Blocks;
+
 public:
   static char ID;
   EdgeBundles() : MachineFunctionPass(ID) {}
@@ -40,6 +44,9 @@ public:
   /// getNumBundles - Return the total number of bundles in the CFG.
   unsigned getNumBundles() const { return EC.getNumClasses(); }
 
+  /// getBlocks - Return an array of blocks that are connected to Bundle.
+  ArrayRef<unsigned> getBlocks(unsigned Bundle) { return Blocks[Bundle]; }
+
   /// getMachineFunction - Return the last machine function computed.
   const MachineFunction *getMachineFunction() const { return MF; }
 
diff --git a/include/llvm/CodeGen/FastISel.h b/include/llvm/CodeGen/FastISel.h
index 6b237f8..962a4e2 100644
--- a/include/llvm/CodeGen/FastISel.h
+++ b/include/llvm/CodeGen/FastISel.h
@@ -204,15 +204,6 @@ protected:
                         unsigned Op0, bool Op0IsKill,
                         uint64_t Imm, MVT ImmType);
 
-  /// FastEmit_rf_ - This method is a wrapper of FastEmit_rf. It first tries
-  /// to emit an instruction with an immediate operand using FastEmit_rf.
-  /// If that fails, it materializes the immediate into a register and try
-  /// FastEmit_rr instead.
-  unsigned FastEmit_rf_(MVT VT,
-                        unsigned Opcode,
-                        unsigned Op0, bool Op0IsKill,
-                        const ConstantFP *FPImm, MVT ImmType);
-
   /// FastEmit_i - This method is called by target-independent code
   /// to request that an instruction with the given type, opcode, and
   /// immediate operand be emitted.
@@ -250,9 +241,18 @@ protected:
                            unsigned Op0, bool Op0IsKill,
                            unsigned Op1, bool Op1IsKill);
 
-  /// FastEmitInst_ri - Emit a MachineInstr with two register operands
+  /// FastEmitInst_rrr - Emit a MachineInstr with three register operands
   /// and a result register in the given register class.
   ///
+  unsigned FastEmitInst_rrr(unsigned MachineInstOpcode,
+                           const TargetRegisterClass *RC,
+                           unsigned Op0, bool Op0IsKill,
+                           unsigned Op1, bool Op1IsKill,
+                           unsigned Op2, bool Op2IsKill);
+
+  /// FastEmitInst_ri - Emit a MachineInstr with a register operand,
+  /// an immediate, and a result register in the given register class.
+  ///
   unsigned FastEmitInst_ri(unsigned MachineInstOpcode,
                            const TargetRegisterClass *RC,
                            unsigned Op0, bool Op0IsKill,
@@ -289,6 +289,11 @@ protected:
                           const TargetRegisterClass *RC,
                           uint64_t Imm);
 
+  /// FastEmitInst_ii - Emit a MachineInstr with a two immediate operands.
+  unsigned FastEmitInst_ii(unsigned MachineInstrOpcode,
+                          const TargetRegisterClass *RC,
+                          uint64_t Imm1, uint64_t Imm2);
+
   /// FastEmitInst_extractsubreg - Emit a MachineInstr for an extract_subreg
   /// from a specified index of a superregister to a specified type.
   unsigned FastEmitInst_extractsubreg(MVT RetVT,
@@ -305,7 +310,7 @@ protected:
   /// the CFG.
   void FastEmitBranch(MachineBasicBlock *MBB, DebugLoc DL);
 
-  unsigned UpdateValueMap(const Value* I, unsigned Reg);
+  void UpdateValueMap(const Value* I, unsigned Reg, unsigned NumRegs = 1);
 
   unsigned createResultReg(const TargetRegisterClass *RC);
 
@@ -321,6 +326,10 @@ protected:
     return 0;
   }
 
+  virtual unsigned TargetMaterializeFloatZero(const ConstantFP* CF) {
+    return 0;
+  }
+
 private:
   bool SelectBinaryOp(const User *I, unsigned ISDOpcode);
 
@@ -334,6 +343,8 @@ private:
 
   bool SelectCast(const User *I, unsigned Opcode);
 
+  bool SelectExtractValue(const User *I);
+
   /// HandlePHINodesInSuccessorBlocks - Handle PHI nodes in successor blocks.
   /// Emit code to ensure constants are copied into registers when needed.
   /// Remember the virtual registers that need to be added to the Machine PHI
diff --git a/include/llvm/CodeGen/FunctionLoweringInfo.h b/include/llvm/CodeGen/FunctionLoweringInfo.h
index 4421cc0..84bbf48 100644
--- a/include/llvm/CodeGen/FunctionLoweringInfo.h
+++ b/include/llvm/CodeGen/FunctionLoweringInfo.h
@@ -24,6 +24,7 @@
 #ifndef NDEBUG
 #include "llvm/ADT/SmallSet.h"
 #endif
+#include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -57,7 +58,7 @@ public:
   const Function *Fn;
   MachineFunction *MF;
   MachineRegisterInfo *RegInfo;
-
+  BranchProbabilityInfo *BPI;
   /// CanLowerReturn - true iff the function's return value can be lowered to
   /// registers.
   bool CanLowerReturn;
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index 3da11c4..498614e 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -107,11 +107,11 @@ namespace ISD {
     // and returns an outchain.
     EH_SJLJ_LONGJMP,
 
-    // OUTCHAIN = EH_SJLJ_DISPATCHSETUP(INCHAIN, context)
+    // OUTCHAIN = EH_SJLJ_DISPATCHSETUP(INCHAIN, setjmpval)
     // This corresponds to the eh.sjlj.dispatchsetup intrinsic. It takes an
-    // input chain and a pointer to the sjlj function context as inputs and
-    // returns an outchain. By default, this does nothing. Targets can lower
-    // this to unwind setup code if needed.
+    // input chain and the value returning from setjmp as inputs and returns an
+    // outchain. By default, this does nothing. Targets can lower this to unwind
+    // setup code if needed.
     EH_SJLJ_DISPATCHSETUP,
 
     // TargetConstant* - Like Constant*, but the DAG does not do any folding,
@@ -219,7 +219,7 @@ namespace ISD {
     // RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
     // These nodes take two operands: the normal LHS and RHS to the add. They
     // produce two results: the normal result of the add, and a boolean that
-    // indicates if an overflow occured (*not* a flag, because it may be stored
+    // indicates if an overflow occurred (*not* a flag, because it may be stored
     // to memory, etc.).  If the type of the boolean is not i1 then the high
     // bits conform to getBooleanContents.
     // These nodes are generated from the llvm.[su]add.with.overflow intrinsics.
@@ -580,7 +580,8 @@ namespace ISD {
 
     // PREFETCH - This corresponds to a prefetch intrinsic. It takes chains are
     // their first operand. The other operands are the address to prefetch,
-    // read / write specifier, and locality specifier.
+    // read / write specifier, locality specifier and instruction / data cache
+    // specifier.
     PREFETCH,
 
     // OUTCHAIN = MEMBARRIER(INCHAIN, load-load, load-store, store-load,
diff --git a/include/llvm/CodeGen/JITCodeEmitter.h b/include/llvm/CodeGen/JITCodeEmitter.h
index fea8523..88e22d6 100644
--- a/include/llvm/CodeGen/JITCodeEmitter.h
+++ b/include/llvm/CodeGen/JITCodeEmitter.h
@@ -23,8 +23,6 @@
 #include "llvm/CodeGen/MachineCodeEmitter.h"
 #include "llvm/ADT/DenseMap.h"
 
-using namespace std;
-
 namespace llvm {
 
 class MachineBasicBlock;
@@ -38,7 +36,7 @@ class GlobalValue;
 class Function;
   
 /// JITCodeEmitter - This class defines two sorts of methods: those for
-/// emitting the actual bytes of machine code, and those for emitting auxillary
+/// emitting the actual bytes of machine code, and those for emitting auxiliary
 /// structures, such as jump tables, relocations, etc.
 ///
 /// Emission of machine code is complicated by the fact that we don't (in
diff --git a/include/llvm/CodeGen/LiveInterval.h b/include/llvm/CodeGen/LiveInterval.h
index 785f31b..5fd4d3d 100644
--- a/include/llvm/CodeGen/LiveInterval.h
+++ b/include/llvm/CodeGen/LiveInterval.h
@@ -286,6 +286,11 @@ namespace llvm {
       return valnos[ValNo];
     }
 
+    /// containsValue - Returns true if VNI belongs to this interval.
+    bool containsValue(const VNInfo *VNI) const {
+      return VNI && VNI->id < getNumValNums() && VNI == getValNumInfo(VNI->id);
+    }
+
     /// getNextValue - Create a new value number and return it.  MIIdx specifies
     /// the instruction that defines the value number.
     VNInfo *getNextValue(SlotIndex def, MachineInstr *CopyMI,
@@ -487,9 +492,10 @@ namespace llvm {
 
     /// Returns true if the live interval is zero length, i.e. no live ranges
     /// span instructions. It doesn't pay to spill such an interval.
-    bool isZeroLength() const {
+    bool isZeroLength(SlotIndexes *Indexes) const {
       for (const_iterator i = begin(), e = end(); i != e; ++i)
-        if (i->end.getPrevIndex() > i->start)
+        if (Indexes->getNextNonNullIndex(i->start).getBaseIndex() <
+            i->end.getBaseIndex())
           return false;
       return true;
     }
diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h
index ad12157..397e59e 100644
--- a/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/include/llvm/CodeGen/MachineBasicBlock.h
@@ -16,6 +16,7 @@
 
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/ADT/GraphTraits.h"
+#include "llvm/Support/DataTypes.h"
 #include <functional>
 
 namespace llvm {
@@ -27,6 +28,7 @@ class MCSymbol;
 class SlotIndexes;
 class StringRef;
 class raw_ostream;
+class MachineBranchProbabilityInfo;
 
 template <>
 struct ilist_traits<MachineInstr> : public ilist_default_traits<MachineInstr> {
@@ -63,12 +65,19 @@ class MachineBasicBlock : public ilist_node<MachineBasicBlock> {
   const BasicBlock *BB;
   int Number;
   MachineFunction *xParent;
-  
+
   /// Predecessors/Successors - Keep track of the predecessor / successor
   /// basicblocks.
   std::vector<MachineBasicBlock *> Predecessors;
   std::vector<MachineBasicBlock *> Successors;
 
+
+  /// Weights - Keep track of the weights to the successors. This vector
+  /// has the same order as Successors, or it is empty if we don't use it
+  /// (disable optimization).
+  std::vector<uint32_t> Weights;
+  typedef std::vector<uint32_t>::iterator weight_iterator;
+
   /// LiveIns - Keep track of the physical registers that are livein of
   /// the basicblock.
   std::vector<unsigned> LiveIns;
@@ -244,11 +253,13 @@ public:
   void updateTerminator();
 
   // Machine-CFG mutators
-  
+
   /// addSuccessor - Add succ as a successor of this MachineBasicBlock.
-  /// The Predecessors list of succ is automatically updated.
+  /// The Predecessors list of succ is automatically updated. WEIGHT
+  /// parameter is stored in Weights list and it may be used by
+  /// MachineBranchProbabilityInfo analysis to calculate branch probability.
   ///
-  void addSuccessor(MachineBasicBlock *succ);
+  void addSuccessor(MachineBasicBlock *succ, uint32_t weight = 0);
 
   /// removeSuccessor - Remove successor from the successors list of this
   /// MachineBasicBlock. The Predecessors list of succ is automatically updated.
@@ -260,7 +271,12 @@ public:
   /// updated.  Return the iterator to the element after the one removed.
   ///
   succ_iterator removeSuccessor(succ_iterator I);
-  
+
+  /// replaceSuccessor - Replace successor OLD with NEW and update weight info.
+  ///
+  void replaceSuccessor(MachineBasicBlock *Old, MachineBasicBlock *New);
+
+
   /// transferSuccessors - Transfers all the successors from MBB to this
   /// machine basic block (i.e., copies all the successors fromMBB and
   /// remove all the successors from fromMBB).
@@ -396,8 +412,22 @@ public:
   /// getSymbol - Return the MCSymbol for this basic block.
   ///
   MCSymbol *getSymbol() const;
-  
-private:   // Methods used to maintain doubly linked list of blocks...
+
+
+private:
+  /// getWeightIterator - Return weight iterator corresponding to the I
+  /// successor iterator.
+  weight_iterator getWeightIterator(succ_iterator I);
+
+  friend class MachineBranchProbabilityInfo;
+
+  /// getSuccWeight - Return weight of the edge from this block to MBB. This
+  /// method should NOT be called directly, but by using getEdgeWeight method
+  /// from MachineBranchProbabilityInfo class.
+  uint32_t getSuccWeight(MachineBasicBlock *succ);
+
+
+  // Methods used to maintain doubly linked list of blocks...
   friend struct ilist_traits<MachineBasicBlock>;
 
   // Machine-CFG mutators
diff --git a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h
new file mode 100644
index 0000000..f3b3e0e
--- /dev/null
+++ b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h
@@ -0,0 +1,77 @@
+
+//==- MachineBranchProbabilityInfo.h - Machine Branch Probability Analysis -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is used to evaluate branch probabilties on machine basic blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_MACHINEBRANCHPROBABILITYINFO_H
+#define LLVM_CODEGEN_MACHINEBRANCHPROBABILITYINFO_H
+
+#include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
+#include <climits>
+
+namespace llvm {
+
+class raw_ostream;
+
+class MachineBranchProbabilityInfo : public ImmutablePass {
+
+  // Default weight value. Used when we don't have information about the edge.
+  // TODO: DEFAULT_WEIGHT makes sense during static predication, when none of
+  // the successors have a weight yet. But it doesn't make sense when providing
+  // weight to an edge that may have siblings with non-zero weights. This can
+  // be handled various ways, but it's probably fine for an edge with unknown
+  // weight to just "inherit" the non-zero weight of an adjacent successor.
+  static const uint32_t DEFAULT_WEIGHT = 16;
+
+  // Get sum of the block successors' weights.
+  uint32_t getSumForBlock(MachineBasicBlock *MBB) const;
+
+public:
+  static char ID;
+
+  MachineBranchProbabilityInfo() : ImmutablePass(ID) {
+    PassRegistry &Registry = *PassRegistry::getPassRegistry();
+    initializeMachineBranchProbabilityInfoPass(Registry);
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesAll();
+  }
+
+  // Return edge weight. If we don't have any informations about it - return
+  // DEFAULT_WEIGHT.
+  uint32_t getEdgeWeight(MachineBasicBlock *Src, MachineBasicBlock *Dst) const;
+
+  // A 'Hot' edge is an edge which probability is >= 80%.
+  bool isEdgeHot(MachineBasicBlock *Src, MachineBasicBlock *Dst) const;
+
+  // Return a hot successor for the block BB or null if there isn't one.
+  MachineBasicBlock *getHotSucc(MachineBasicBlock *MBB) const;
+
+  // Return a probability as a fraction between 0 (0% probability) and
+  // 1 (100% probability), however the value is never equal to 0, and can be 1
+  // only iff SRC block has only one successor.
+  BranchProbability getEdgeProbability(MachineBasicBlock *Src,
+                                       MachineBasicBlock *Dst) const;
+
+  // Print value between 0 (0% probability) and 1 (100% probability),
+  // however the value is never equal to 0, and can be 1 only iff SRC block
+  // has only one successor.
+  raw_ostream &printEdgeProbability(raw_ostream &OS, MachineBasicBlock *Src,
+                                    MachineBasicBlock *Dst) const;
+};
+
+}
+
+
+#endif
diff --git a/include/llvm/CodeGen/MachineCodeEmitter.h b/include/llvm/CodeGen/MachineCodeEmitter.h
index 8fc80ad..428aada 100644
--- a/include/llvm/CodeGen/MachineCodeEmitter.h
+++ b/include/llvm/CodeGen/MachineCodeEmitter.h
@@ -34,7 +34,7 @@ class Function;
 class MCSymbol;
 
 /// MachineCodeEmitter - This class defines two sorts of methods: those for
-/// emitting the actual bytes of machine code, and those for emitting auxillary
+/// emitting the actual bytes of machine code, and those for emitting auxiliary
 /// structures, such as jump tables, relocations, etc.
 ///
 /// Emission of machine code is complicated by the fact that we don't (in
@@ -54,7 +54,7 @@ protected:
   /// allocated for this code buffer.
   uint8_t *BufferBegin, *BufferEnd;
   /// CurBufferPtr - Pointer to the next byte of memory to fill when emitting
-  /// code.  This is guranteed to be in the range [BufferBegin,BufferEnd].  If
+  /// code.  This is guaranteed to be in the range [BufferBegin,BufferEnd].  If
   /// this pointer is at BufferEnd, it will never move due to code emission, and
   /// all code emission requests will be ignored (this is the buffer overflow
   /// condition).
diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h
index 2724689..c36dd69 100644
--- a/include/llvm/CodeGen/MachineInstr.h
+++ b/include/llvm/CodeGen/MachineInstr.h
@@ -229,6 +229,7 @@ public:
 
   enum MICheckType {
     CheckDefs,      // Check all operands for equality
+    CheckKillDead,  // Check all operands including kill / dead markers
     IgnoreDefs,     // Ignore all definitions
     IgnoreVRegDefs  // Ignore virtual register definitions
   };
diff --git a/include/llvm/CodeGen/MachineInstrBuilder.h b/include/llvm/CodeGen/MachineInstrBuilder.h
index f04dee2..c8183a3 100644
--- a/include/llvm/CodeGen/MachineInstrBuilder.h
+++ b/include/llvm/CodeGen/MachineInstrBuilder.h
@@ -48,6 +48,7 @@ public:
   /// Allow automatic conversion to the machine instruction we are working on.
   ///
   operator MachineInstr*() const { return MI; }
+  MachineInstr *operator->() const { return MI; }
   operator MachineBasicBlock::iterator() const { return MI; }
 
   /// addReg - Add a new virtual register operand...
@@ -87,7 +88,7 @@ public:
     return *this;
   }
 
-  const MachineInstrBuilder &addFrameIndex(unsigned Idx) const {
+  const MachineInstrBuilder &addFrameIndex(int Idx) const {
     MI->addOperand(MachineOperand::CreateFI(Idx));
     return *this;
   }
diff --git a/include/llvm/CodeGen/MachineModuleInfo.h b/include/llvm/CodeGen/MachineModuleInfo.h
index 6bc80b0..fa185c4 100644
--- a/include/llvm/CodeGen/MachineModuleInfo.h
+++ b/include/llvm/CodeGen/MachineModuleInfo.h
@@ -52,27 +52,13 @@ namespace llvm {
 class Constant;
 class GlobalVariable;
 class MDNode;
+class MMIAddrLabelMap;
 class MachineBasicBlock;
 class MachineFunction;
 class Module;
 class PointerType;
 class StructType;
 
-/// MachineModuleInfoImpl - This class can be derived from and used by targets
-/// to hold private target-specific information for each Module.  Objects of
-/// type are accessed/created with MMI::getInfo and destroyed when the
-/// MachineModuleInfo is destroyed.
-class MachineModuleInfoImpl {
-public:
-  typedef PointerIntPair<MCSymbol*, 1, bool> StubValueTy;
-  virtual ~MachineModuleInfoImpl();
-  typedef std::vector<std::pair<MCSymbol*, StubValueTy> > SymbolListTy;
-protected:
-  static SymbolListTy GetSortedStubs(const DenseMap<MCSymbol*, StubValueTy>&);
-};
-
-
-
 //===----------------------------------------------------------------------===//
 /// LandingPadInfo - This structure is used to retain landing pad info for
 /// the current function.
@@ -89,7 +75,20 @@ struct LandingPadInfo {
     : LandingPadBlock(MBB), LandingPadLabel(0), Personality(0) {}
 };
 
-class MMIAddrLabelMap;
+//===----------------------------------------------------------------------===//
+/// MachineModuleInfoImpl - This class can be derived from and used by targets
+/// to hold private target-specific information for each Module.  Objects of
+/// type are accessed/created with MMI::getInfo and destroyed when the
+/// MachineModuleInfo is destroyed.
+/// 
+class MachineModuleInfoImpl {
+public:
+  typedef PointerIntPair<MCSymbol*, 1, bool> StubValueTy;
+  virtual ~MachineModuleInfoImpl();
+  typedef std::vector<std::pair<MCSymbol*, StubValueTy> > SymbolListTy;
+protected:
+  static SymbolListTy GetSortedStubs(const DenseMap<MCSymbol*, StubValueTy>&);
+};
 
 //===----------------------------------------------------------------------===//
 /// MachineModuleInfo - This class contains meta information specific to a
diff --git a/include/llvm/CodeGen/MachineOperand.h b/include/llvm/CodeGen/MachineOperand.h
index 8acc949..140c6e8 100644
--- a/include/llvm/CodeGen/MachineOperand.h
+++ b/include/llvm/CodeGen/MachineOperand.h
@@ -94,8 +94,8 @@ private:
   /// not a real instruction.  Such uses should be ignored during codegen.
   bool IsDebug : 1;
 
-  /// SmallContents - Thisreally should be part of the Contents union, but lives
-  /// out here so we can get a better packed struct.
+  /// SmallContents - This really should be part of the Contents union, but
+  /// lives out here so we can get a better packed struct.
   /// MO_Register: Register number.
   /// OffsetedInfo: Low bits of offset.
   union {
@@ -473,7 +473,7 @@ public:
     Op.setTargetFlags(TargetFlags);
     return Op;
   }
-  static MachineOperand CreateFI(unsigned Idx) {
+  static MachineOperand CreateFI(int Idx) {
     MachineOperand Op(MachineOperand::MO_FrameIndex);
     Op.setIndex(Idx);
     return Op;
diff --git a/include/llvm/CodeGen/PBQP/Graph.h b/include/llvm/CodeGen/PBQP/Graph.h
index b2224cb..5240729 100644
--- a/include/llvm/CodeGen/PBQP/Graph.h
+++ b/include/llvm/CodeGen/PBQP/Graph.h
@@ -18,7 +18,6 @@
 #include "Math.h"
 
 #include <list>
-#include <vector>
 #include <map>
 
 namespace PBQP {
diff --git a/include/llvm/CodeGen/PBQP/Heuristics/Briggs.h b/include/llvm/CodeGen/PBQP/Heuristics/Briggs.h
index 47a287c..e96c4cb 100644
--- a/include/llvm/CodeGen/PBQP/Heuristics/Briggs.h
+++ b/include/llvm/CodeGen/PBQP/Heuristics/Briggs.h
@@ -21,7 +21,6 @@
 #include "../HeuristicSolver.h"
 #include "../HeuristicBase.h"
 
-#include <set>
 #include <limits>
 
 namespace PBQP {
diff --git a/include/llvm/CodeGen/PseudoSourceValue.h b/include/llvm/CodeGen/PseudoSourceValue.h
index bace631..7dab4f9 100644
--- a/include/llvm/CodeGen/PseudoSourceValue.h
+++ b/include/llvm/CodeGen/PseudoSourceValue.h
@@ -21,7 +21,7 @@ namespace llvm {
   class raw_ostream;
 
   /// PseudoSourceValue - Special value supplied for machine level alias
-  /// analysis. It indicates that the a memory access references the functions
+  /// analysis. It indicates that a memory access references the functions
   /// stack frame (e.g., a spill slot), below the stack frame (e.g., argument
   /// space), or constant pool.
   class PseudoSourceValue : public Value {
diff --git a/include/llvm/CodeGen/RegAllocPBQP.h b/include/llvm/CodeGen/RegAllocPBQP.h
index 7e8745e..8139c65 100644
--- a/include/llvm/CodeGen/RegAllocPBQP.h
+++ b/include/llvm/CodeGen/RegAllocPBQP.h
@@ -94,7 +94,7 @@ namespace llvm {
     typedef std::map<PBQP::Graph::ConstNodeItr, unsigned,
                      PBQP::NodeItrComparator>  Node2VReg;
     typedef DenseMap<unsigned, PBQP::Graph::NodeItr> VReg2Node;
-    typedef std::map<unsigned, AllowedSet> AllowedSetMap;
+    typedef DenseMap<unsigned, AllowedSet> AllowedSetMap;
 
     PBQP::Graph graph;
     Node2VReg node2VReg;
diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h
index 9a2345b..0657664 100644
--- a/include/llvm/CodeGen/ScheduleDAG.h
+++ b/include/llvm/CodeGen/ScheduleDAG.h
@@ -252,6 +252,7 @@ namespace llvm {
     unsigned short Latency;             // Node latency.
     bool isVRegCycle      : 1;          // May use and def the same vreg.
     bool isCall           : 1;          // Is a function call.
+    bool isCallOp         : 1;          // Is a function call operand.
     bool isTwoAddress     : 1;          // Is a two-address instruction.
     bool isCommutable     : 1;          // Is a commutable instruction.
     bool hasPhysRegDefs   : 1;          // Has physreg defs that are being used.
@@ -260,10 +261,10 @@ namespace llvm {
     bool isAvailable      : 1;          // True once available.
     bool isScheduled      : 1;          // True once scheduled.
     bool isScheduleHigh   : 1;          // True if preferable to schedule high.
+    bool isScheduleLow    : 1;          // True if preferable to schedule low.
     bool isCloned         : 1;          // True if this node has been cloned.
     Sched::Preference SchedulingPref;   // Scheduling preference.
 
-    SmallVector<MachineInstr*, 4> DbgInstrList; // dbg_values referencing this.
   private:
     bool isDepthCurrent   : 1;          // True if Depth is current.
     bool isHeightCurrent  : 1;          // True if Height is current.
@@ -279,10 +280,10 @@ namespace llvm {
       : Node(node), Instr(0), OrigNode(0), NodeNum(nodenum),
         NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
         NumSuccsLeft(0), NumRegDefsLeft(0), Latency(0),
-        isVRegCycle(false), isCall(false), isTwoAddress(false),
+        isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false),
         isCommutable(false), hasPhysRegDefs(false), hasPhysRegClobbers(false),
         isPending(false), isAvailable(false), isScheduled(false),
-        isScheduleHigh(false), isCloned(false),
+        isScheduleHigh(false), isScheduleLow(false), isCloned(false),
         SchedulingPref(Sched::None),
         isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
         CopyDstRC(NULL), CopySrcRC(NULL) {}
@@ -293,10 +294,10 @@ namespace llvm {
       : Node(0), Instr(instr), OrigNode(0), NodeNum(nodenum),
         NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
         NumSuccsLeft(0), NumRegDefsLeft(0), Latency(0),
-        isVRegCycle(false), isCall(false), isTwoAddress(false),
+        isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false),
         isCommutable(false), hasPhysRegDefs(false), hasPhysRegClobbers(false),
         isPending(false), isAvailable(false), isScheduled(false),
-        isScheduleHigh(false), isCloned(false),
+        isScheduleHigh(false), isScheduleLow(false), isCloned(false),
         SchedulingPref(Sched::None),
         isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
         CopyDstRC(NULL), CopySrcRC(NULL) {}
@@ -306,10 +307,10 @@ namespace llvm {
       : Node(0), Instr(0), OrigNode(0), NodeNum(~0u),
         NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
         NumSuccsLeft(0), NumRegDefsLeft(0), Latency(0),
-        isVRegCycle(false), isCall(false), isTwoAddress(false),
+        isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false),
         isCommutable(false), hasPhysRegDefs(false), hasPhysRegClobbers(false),
         isPending(false), isAvailable(false), isScheduled(false),
-        isScheduleHigh(false), isCloned(false),
+        isScheduleHigh(false), isScheduleLow(false), isCloned(false),
         SchedulingPref(Sched::None),
         isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
         CopyDstRC(NULL), CopySrcRC(NULL) {}
@@ -496,6 +497,12 @@ namespace llvm {
     SUnit EntrySU;                        // Special node for the region entry.
     SUnit ExitSU;                         // Special node for the region exit.
 
+#ifdef NDEBUG
+    static const bool StressSched = false;
+#else
+    bool StressSched;
+#endif
+
     explicit ScheduleDAG(MachineFunction &mf);
 
     virtual ~ScheduleDAG();
@@ -691,11 +698,11 @@ namespace llvm {
     /// will create a cycle.
     bool WillCreateCycle(SUnit *SU, SUnit *TargetSU);
 
-    /// AddPred - Updates the topological ordering to accomodate an edge
+    /// AddPred - Updates the topological ordering to accommodate an edge
     /// to be added from SUnit X to SUnit Y.
     void AddPred(SUnit *Y, SUnit *X);
 
-    /// RemovePred - Updates the topological ordering to accomodate an
+    /// RemovePred - Updates the topological ordering to accommodate an
     /// an edge to be removed from the specified node N from the predecessors
     /// of the current node M.
     void RemovePred(SUnit *M, SUnit *N);
diff --git a/include/llvm/CodeGen/ScoreboardHazardRecognizer.h b/include/llvm/CodeGen/ScoreboardHazardRecognizer.h
index 8850006..118df28 100644
--- a/include/llvm/CodeGen/ScoreboardHazardRecognizer.h
+++ b/include/llvm/CodeGen/ScoreboardHazardRecognizer.h
@@ -21,7 +21,6 @@
 
 #include <cassert>
 #include <cstring>
-#include <string>
 
 namespace llvm {
 
diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index b537a77..1c42bef 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -284,7 +284,7 @@ public:
   ///
   /// Note that this is an involved process that may invalidate pointers into
   /// the graph.
-  void Legalize(CodeGenOpt::Level OptLevel);
+  void Legalize();
 
   /// LegalizeVectors - This transforms the SelectionDAG into a SelectionDAG
   /// that only uses vector math operations supported by the target.  This is
@@ -829,7 +829,7 @@ public:
   /// These functions only replace all existing uses. It's possible that as
   /// these replacements are being performed, CSE may cause the From node
   /// to be given new uses. These new uses of From are left in place, and
-  /// not automatically transfered to To.
+  /// not automatically transferred to To.
   ///
   void ReplaceAllUsesWith(SDValue From, SDValue Op,
                           DAGUpdateListener *UpdateListener = 0);
@@ -985,10 +985,6 @@ public:
   /// other positive zero.
   bool isEqualTo(SDValue A, SDValue B) const;
 
-  /// isVerifiedDebugInfoDesc - Returns true if the specified SDValue has
-  /// been verified as a debug information descriptor.
-  bool isVerifiedDebugInfoDesc(SDValue Op) const;
-
   /// UnrollVectorOp - Utility function used by legalize and lowering to
   /// "unroll" a vector operation by splitting out the scalars and operating
   /// on each element individually.  If the ResNE is 0, fully unroll the vector
diff --git a/include/llvm/CodeGen/SelectionDAGISel.h b/include/llvm/CodeGen/SelectionDAGISel.h
index 5457679..ecf3947 100644
--- a/include/llvm/CodeGen/SelectionDAGISel.h
+++ b/include/llvm/CodeGen/SelectionDAGISel.h
@@ -258,7 +258,7 @@ public:
   }
 
   virtual SDValue RunSDNodeXForm(SDValue V, unsigned XFormNo) {
-    assert(0 && "Tblgen shoudl generate this!");
+    assert(0 && "Tblgen should generate this!");
     return SDValue();
   }
 
@@ -280,7 +280,8 @@ private:
 
   void PrepareEHLandingPad();
   void SelectAllBasicBlocks(const Function &Fn);
-  bool TryToFoldFastISelLoad(const LoadInst *LI, FastISel *FastIS);
+  bool TryToFoldFastISelLoad(const LoadInst *LI, const Instruction *FoldInst,
+                             FastISel *FastIS);
   void FinishBasicBlock();
 
   void SelectBasicBlock(BasicBlock::const_iterator Begin,
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index 6454639..9d265f1 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -838,7 +838,7 @@ public:
 
 
 /// HandleSDNode - This class is used to form a handle around another node that
-/// is persistant and is updated across invocations of replaceAllUsesWith on its
+/// is persistent and is updated across invocations of replaceAllUsesWith on its
 /// operand.  This node should be directly created by end-users and not added to
 /// the AllNodes list.
 class HandleSDNode : public SDNode {
diff --git a/include/llvm/CodeGen/SlotIndexes.h b/include/llvm/CodeGen/SlotIndexes.h
index dc24713..33ce675 100644
--- a/include/llvm/CodeGen/SlotIndexes.h
+++ b/include/llvm/CodeGen/SlotIndexes.h
@@ -511,6 +511,40 @@ namespace llvm {
       return nextNonNull;
     }
 
+    /// getIndexBefore - Returns the index of the last indexed instruction
+    /// before MI, or the the start index of its basic block.
+    /// MI is not required to have an index.
+    SlotIndex getIndexBefore(const MachineInstr *MI) const {
+      const MachineBasicBlock *MBB = MI->getParent();
+      assert(MBB && "MI must be inserted inna basic block");
+      MachineBasicBlock::const_iterator I = MI, B = MBB->begin();
+      for (;;) {
+        if (I == B)
+          return getMBBStartIdx(MBB);
+        --I;
+        Mi2IndexMap::const_iterator MapItr = mi2iMap.find(I);
+        if (MapItr != mi2iMap.end())
+          return MapItr->second;
+      }
+    }
+
+    /// getIndexAfter - Returns the index of the first indexed instruction
+    /// after MI, or the end index of its basic block.
+    /// MI is not required to have an index.
+    SlotIndex getIndexAfter(const MachineInstr *MI) const {
+      const MachineBasicBlock *MBB = MI->getParent();
+      assert(MBB && "MI must be inserted inna basic block");
+      MachineBasicBlock::const_iterator I = MI, E = MBB->end();
+      for (;;) {
+        ++I;
+        if (I == E)
+          return getMBBEndIdx(MBB);
+        Mi2IndexMap::const_iterator MapItr = mi2iMap.find(I);
+        if (MapItr != mi2iMap.end())
+          return MapItr->second;
+      }
+    }
+
     /// Return the (start,end) range of the given basic block number.
     const std::pair<SlotIndex, SlotIndex> &
     getMBBRange(unsigned Num) const {
@@ -545,6 +579,8 @@ namespace llvm {
 
     /// Returns the basic block which the given index falls in.
     MachineBasicBlock* getMBBFromIndex(SlotIndex index) const {
+      if (MachineInstr *MI = getInstructionFromIndex(index))
+        return MI->getParent();
       SmallVectorImpl<IdxMBBPair>::const_iterator I =
         std::lower_bound(idx2MBBMap.begin(), idx2MBBMap.end(), index);
       // Take the pair containing the index
@@ -602,44 +638,36 @@ namespace llvm {
 
     /// Insert the given machine instruction into the mapping. Returns the
     /// assigned index.
-    SlotIndex insertMachineInstrInMaps(MachineInstr *mi) {
+    /// If Late is set and there are null indexes between mi's neighboring
+    /// instructions, create the new index after the null indexes instead of
+    /// before them.
+    SlotIndex insertMachineInstrInMaps(MachineInstr *mi, bool Late = false) {
       assert(mi2iMap.find(mi) == mi2iMap.end() && "Instr already indexed.");
       // Numbering DBG_VALUE instructions could cause code generation to be
       // affected by debug information.
       assert(!mi->isDebugValue() && "Cannot number DBG_VALUE instructions.");
 
-      MachineBasicBlock *mbb = mi->getParent();
-
-      assert(mbb != 0 && "Instr must be added to function.");
+      assert(mi->getParent() != 0 && "Instr must be added to function.");
 
-      MachineBasicBlock::iterator miItr(mi);
-      IndexListEntry *newEntry;
-      // Get previous index, considering that not all instructions are indexed.
-      IndexListEntry *prevEntry;
-      for (;;) {
-        // If mi is at the mbb beginning, get the prev index from the mbb.
-        if (miItr == mbb->begin()) {
-          prevEntry = &getMBBStartIdx(mbb).entry();
-          break;
-        }
-        // Otherwise rewind until we find a mapped instruction.
-        Mi2IndexMap::const_iterator itr = mi2iMap.find(--miItr);
-        if (itr != mi2iMap.end()) {
-          prevEntry = &itr->second.entry();
-          break;
-        }
+      // Get the entries where mi should be inserted.
+      IndexListEntry *prevEntry, *nextEntry;
+      if (Late) {
+        // Insert mi's index immediately before the following instruction.
+        nextEntry = &getIndexAfter(mi).entry();
+        prevEntry = nextEntry->getPrev();
+      } else {
+        // Insert mi's index immediately after the preceeding instruction.
+        prevEntry = &getIndexBefore(mi).entry();
+        nextEntry = prevEntry->getNext();
       }
 
-      // Get next entry from previous entry.
-      IndexListEntry *nextEntry = prevEntry->getNext();
-
       // Get a number for the new instr, or 0 if there's no room currently.
       // In the latter case we'll force a renumber later.
       unsigned dist = ((nextEntry->getIndex() - prevEntry->getIndex())/2) & ~3u;
       unsigned newNumber = prevEntry->getIndex() + dist;
 
       // Insert a new list entry for mi.
-      newEntry = createEntry(mi, newNumber);
+      IndexListEntry *newEntry = createEntry(mi, newNumber);
       insert(nextEntry, newEntry);
 
       // Renumber locally if we need to.
diff --git a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
index fba3e48..54e5751 100644
--- a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
+++ b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
@@ -58,6 +58,14 @@ public:
   virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
 
   virtual const MCSection *getEHFrameSection() const;
+  virtual const MCSection *getWin64EHFuncTableSection(StringRef) const {
+    return NULL;
+  }
+  virtual const MCSection *getWin64EHTableSection(StringRef) const{return NULL;}
+
+  virtual void emitPersonalityValue(MCStreamer &Streamer,
+                                    const TargetMachine &TM,
+                                    const MCSymbol *Sym) const;
 
   const MCSection *getDataRelSection() const { return DataRelSection; }
 
@@ -81,6 +89,11 @@ public:
   getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
                                  MachineModuleInfo *MMI, unsigned Encoding,
                                  MCStreamer &Streamer) const;
+
+  // getCFIPersonalitySymbol - The symbol that gets passed to .cfi_personality.
+  virtual MCSymbol *
+  getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
+                          MachineModuleInfo *MMI) const;
 };
 
 
@@ -94,7 +107,7 @@ class TargetLoweringObjectFileMachO : public TargetLoweringObjectFile {
   ///
   const MCSection *TLSBSSSection;         // Defaults to ".tbss".
   
-  /// TLSTLVSection - Section for thread local structure infomation.
+  /// TLSTLVSection - Section for thread local structure information.
   /// Contains the source code name of the variable, visibility and a pointer
   /// to the initial value (.tdata or .tbss).
   const MCSection *TLSTLVSection;         // Defaults to ".tlv".
@@ -124,6 +137,10 @@ public:
   virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
 
   virtual const MCSection *getEHFrameSection() const;
+  virtual const MCSection *getWin64EHFuncTableSection(StringRef) const {
+    return NULL;
+  }
+  virtual const MCSection *getWin64EHTableSection(StringRef) const{return NULL;}
 
   virtual const MCSection *
   SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
@@ -172,9 +189,14 @@ public:
                                  MachineModuleInfo *MMI, unsigned Encoding,
                                  MCStreamer &Streamer) const;
 
+  // getCFIPersonalitySymbol - The symbol that gets passed to .cfi_personality.
+  virtual MCSymbol *
+  getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
+                          MachineModuleInfo *MMI) const;
+
   virtual unsigned getPersonalityEncoding() const;
   virtual unsigned getLSDAEncoding() const;
-  virtual unsigned getFDEEncoding() const;
+  virtual unsigned getFDEEncoding(bool CFI) const;
   virtual unsigned getTTypeEncoding() const;
 };
 
@@ -182,6 +204,8 @@ public:
 
 class TargetLoweringObjectFileCOFF : public TargetLoweringObjectFile {
   const MCSection *DrectveSection;
+  const MCSection *PDataSection;
+  const MCSection *XDataSection;
 public:
   TargetLoweringObjectFileCOFF() {}
   ~TargetLoweringObjectFileCOFF() {}
@@ -189,6 +213,8 @@ public:
   virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
 
   virtual const MCSection *getEHFrameSection() const;
+  virtual const MCSection *getWin64EHFuncTableSection(StringRef) const;
+  virtual const MCSection *getWin64EHTableSection(StringRef) const;
 
   virtual const MCSection *getDrectveSection() const { return DrectveSection; }
 
diff --git a/include/llvm/CodeGen/ValueTypes.h b/include/llvm/CodeGen/ValueTypes.h
index 22d1622..424721b 100644
--- a/include/llvm/CodeGen/ValueTypes.h
+++ b/include/llvm/CodeGen/ValueTypes.h
@@ -83,7 +83,11 @@ namespace llvm {
 
       isVoid         =  35,   // This has no value
 
-      LAST_VALUETYPE =  36,   // This always remains at the end of the list.
+      untyped        =  36,   // This value takes a register, but has
+                              // unspecified type.  The register class
+                              // will be determined by the opcode.
+
+      LAST_VALUETYPE =  37,   // This always remains at the end of the list.
 
       // This is the current maximum for LAST_VALUETYPE.
       // MVT::MAX_ALLOWED_VALUETYPE is used for asserts and to size bit vectors
diff --git a/include/llvm/CodeGen/ValueTypes.td b/include/llvm/CodeGen/ValueTypes.td
index a1163f7..0cfb634 100644
--- a/include/llvm/CodeGen/ValueTypes.td
+++ b/include/llvm/CodeGen/ValueTypes.td
@@ -1,10 +1,10 @@
 //===- ValueTypes.td - ValueType definitions ---------------*- tablegen -*-===//
-// 
+//
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
-// 
+//
 //===----------------------------------------------------------------------===//
 //
 // Value types - These values correspond to the register types defined in the
@@ -58,6 +58,7 @@ def v4f64  : ValueType<256, 32>;   //  4 x f64 vector value
 def x86mmx : ValueType<64 , 33>;   // X86 MMX value
 def FlagVT : ValueType<0  , 34>;   // Pre-RA sched glue
 def isVoid : ValueType<0  , 35>;   // Produces no value
+def untyped: ValueType<8  , 36>;   // Produces an untyped value
 
 def MetadataVT: ValueType<0, 250>; // Metadata
 
diff --git a/include/llvm/CompilerDriver/Common.td b/include/llvm/CompilerDriver/Common.td
index 84e8783..6ba30aa 100644
--- a/include/llvm/CompilerDriver/Common.td
+++ b/include/llvm/CompilerDriver/Common.td
@@ -56,8 +56,11 @@ def forward_not_split;
 def case;
 
 // Boolean constants.
-def true;
-def false;
+class Bool<bit val> {
+      bit Value = val;
+}
+def true : Bool<1>;
+def false : Bool<0>;
 
 // Boolean operators.
 def and;
diff --git a/include/llvm/Config/config.h.cmake b/include/llvm/Config/config.h.cmake
index b2deb1d..755daa6 100644
--- a/include/llvm/Config/config.h.cmake
+++ b/include/llvm/Config/config.h.cmake
@@ -196,6 +196,9 @@
 /* Define to 1 if you have the `udis86' library (-ludis86). */
 #undef HAVE_LIBUDIS86
 
+/* Type of 1st arg on ELM Callback */
+#cmakedefine WIN32_ELMCB_PCSTR ${WIN32_ELMCB_PCSTR}
+
 /* Define to 1 if you have the <limits.h> header file. */
 #cmakedefine HAVE_LIMITS_H ${HAVE_LIMITS_H}
 
diff --git a/include/llvm/Config/config.h.in b/include/llvm/Config/config.h.in
index 29f5c93..10a8935 100644
--- a/include/llvm/Config/config.h.in
+++ b/include/llvm/Config/config.h.in
@@ -675,6 +675,9 @@
 /* Define if use udis86 library */
 #undef USE_UDIS86
 
+/* Type of 1st arg on ELM Callback */
+#undef WIN32_ELMCB_PCSTR
+
 /* Define to empty if `const' does not conform to ANSI C. */
 #undef const
 
diff --git a/include/llvm/Config/llvm-config.h.cmake b/include/llvm/Config/llvm-config.h.cmake
index 9a9cb3b..ee81f7a 100644
--- a/include/llvm/Config/llvm-config.h.cmake
+++ b/include/llvm/Config/llvm-config.h.cmake
@@ -95,7 +95,7 @@
 #cmakedefine LLVM_PATH_TWOPI "${LLVM_PATH_TWOPI}"
 
 /* Define to path to xdot.py program if found or 'echo xdot.py' otherwise */
-#cmakedefine LLVM_PATH_XDOT_PY "${LLVM_PATH_XDOT.PY}"
+#cmakedefine LLVM_PATH_XDOT_PY "${LLVM_PATH_XDOT_PY}"
 
 /* Installation prefix directory */
 #cmakedefine LLVM_PREFIX "${LLVM_PREFIX}"
diff --git a/include/llvm/Constant.h b/include/llvm/Constant.h
index 38045fc..5f32ce0 100644
--- a/include/llvm/Constant.h
+++ b/include/llvm/Constant.h
@@ -47,10 +47,6 @@ protected:
     : User(ty, vty, Ops, NumOps) {}
 
   void destroyConstantImpl();
-  
-  void setOperand(unsigned i, Value *V) {
-    User::setOperand(i, V);
-  }
 public:
   /// isNullValue - Return true if this is the value that would be returned by
   /// getNullValue.
@@ -90,15 +86,6 @@ public:
   /// FIXME: This really should not be in VMCore.
   PossibleRelocationsTy getRelocationInfo() const;
   
-  // Specialize get/setOperand for Users as their operands are always
-  // constants or BasicBlocks as well.
-  User *getOperand(unsigned i) {
-    return static_cast<User*>(User::getOperand(i));
-  }
-  const User *getOperand(unsigned i) const {
-    return static_cast<const User*>(User::getOperand(i));
-  }
-  
   /// getVectorElements - This method, which is only valid on constant of vector
   /// type, returns the elements of the vector in the specified smallvector.
   /// This handles breaking down a vector undef into undef elements, etc.  For
diff --git a/include/llvm/Constants.h b/include/llvm/Constants.h
index c12b33f..eabc3a5 100644
--- a/include/llvm/Constants.h
+++ b/include/llvm/Constants.h
@@ -841,7 +841,7 @@ public:
   static Constant *getICmp(unsigned short pred, Constant *LHS, Constant *RHS);
   static Constant *getFCmp(unsigned short pred, Constant *LHS, Constant *RHS);
 
-  /// Getelementptr form.  std::vector<Value*> is only accepted for convenience:
+  /// Getelementptr form.  Value* is only accepted for convenience;
   /// all elements must be Constant's.
   ///
   static Constant *getGetElementPtr(Constant *C,
@@ -885,7 +885,7 @@ public:
 
   /// getIndices - Assert that this is an insertvalue or exactvalue
   /// expression and return the list of indices.
-  const SmallVector<unsigned, 4> &getIndices() const;
+  ArrayRef<unsigned> getIndices() const;
 
   /// getOpcodeName - Return a string representation for an opcode.
   const char *getOpcodeName() const;
@@ -897,10 +897,7 @@ public:
   /// getWithOperands - This returns the current constant expression with the
   /// operands replaced with the specified values.  The specified operands must
   /// match count and type with the existing ones.
-  Constant *getWithOperands(const std::vector<Constant*> &Ops) const {
-    return getWithOperands(&Ops[0], (unsigned)Ops.size());
-  }
-  Constant *getWithOperands(Constant *const *Ops, unsigned NumOps) const;
+  Constant *getWithOperands(ArrayRef<Constant*> Ops) const;
   
   virtual void destroyConstant();
   virtual void replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U);
diff --git a/include/llvm/DefaultPasses.h b/include/llvm/DefaultPasses.h
new file mode 100644
index 0000000..e2e58a5b
--- /dev/null
+++ b/include/llvm/DefaultPasses.h
@@ -0,0 +1,167 @@
+//===- llvm/DefaultPasses.h - Default Pass Support code --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file defines the infrastructure for registering the standard pass list.
+// This defines sets of standard optimizations that plugins can modify and
+// front ends can use.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEFAULT_PASS_SUPPORT_H
+#define LLVM_DEFAULT_PASS_SUPPORT_H
+
+namespace llvm {
+
+class PassManagerBase;
+
+/// Unique identifiers for the default standard passes.  The addresses of
+/// these symbols are used to uniquely identify passes from the default list.
+namespace DefaultStandardPasses {
+extern unsigned char AggressiveDCEID;
+extern unsigned char ArgumentPromotionID;
+extern unsigned char BasicAliasAnalysisID;
+extern unsigned char CFGSimplificationID;
+extern unsigned char ConstantMergeID;
+extern unsigned char CorrelatedValuePropagationID;
+extern unsigned char DeadArgEliminationID;
+extern unsigned char DeadStoreEliminationID;
+extern unsigned char DeadTypeEliminationID;
+extern unsigned char EarlyCSEID;
+extern unsigned char FunctionAttrsID;
+extern unsigned char FunctionInliningID;
+extern unsigned char GVNID;
+extern unsigned char GlobalDCEID;
+extern unsigned char GlobalOptimizerID;
+extern unsigned char GlobalsModRefID;
+extern unsigned char IPSCCPID;
+extern unsigned char IndVarSimplifyID;
+extern unsigned char InlinerPlaceholderID;
+extern unsigned char InstructionCombiningID;
+extern unsigned char JumpThreadingID;
+extern unsigned char LICMID;
+extern unsigned char LoopDeletionID;
+extern unsigned char LoopIdiomID;
+extern unsigned char LoopRotateID;
+extern unsigned char LoopUnrollID;
+extern unsigned char LoopUnswitchID;
+extern unsigned char MemCpyOptID;
+extern unsigned char PruneEHID;
+extern unsigned char ReassociateID;
+extern unsigned char SCCPID;
+extern unsigned char ScalarReplAggregatesID;
+extern unsigned char SimplifyLibCallsID;
+extern unsigned char StripDeadPrototypesID;
+extern unsigned char TailCallEliminationID;
+extern unsigned char TypeBasedAliasAnalysisID;
+}
+
+/// StandardPass - The class responsible for maintaining the lists of standard 
+class StandardPass {
+  friend class RegisterStandardPassLists;
+  public:
+  /// Predefined standard sets of passes
+  enum StandardSet {
+    AliasAnalysis,
+    Function,
+    Module,
+    LTO
+  };
+  /// Flags to specify whether a pass should be enabled.  Passes registered
+  /// with the standard sets may specify a minimum optimization level and one
+  /// or more flags that must be set when constructing the set for the pass to
+  /// be used.
+  enum OptimizationFlags {
+    /// Optimize for size was requested.
+    OptimizeSize = 1<<0,
+    /// Allow passes which may make global module changes.
+    UnitAtATime = 1<<1,
+    /// UnrollLoops - Allow loop unrolling.
+    UnrollLoops = 1<<2,
+    /// Allow library calls to be simplified.
+    SimplifyLibCalls = 1<<3,
+    /// Whether the module may have code using exceptions.
+    HaveExceptions = 1<<4,
+    // Run an inliner pass as part of this set.
+    RunInliner = 1<<5
+  };
+  enum OptimizationFlagComponents {
+    /// The low bits are used to store the optimization level.  When requesting
+    /// passes, this should store the requested optimisation level.  When
+    /// setting passes, this should set the minimum optimization level at which
+    /// the pass will run.
+    OptimizationLevelMask=0xf,
+    /// The maximum optimisation level at which the pass is run.
+    MaxOptimizationLevelMask=0xf0,
+    // Flags that must be set
+    RequiredFlagMask=0xff00,
+    // Flags that may not be set.
+    DisallowedFlagMask=0xff0000,
+    MaxOptimizationLevelShift=4,
+    RequiredFlagShift=8,
+    DisallowedFlagShift=16
+  };
+  /// Returns the optimisation level from a set of flags.
+  static unsigned OptimizationLevel(unsigned flags) {
+      return flags & OptimizationLevelMask;
+  }
+  /// Returns the maximum optimization level for this set of flags
+  static unsigned MaxOptimizationLevel(unsigned flags) {
+      return (flags & MaxOptimizationLevelMask) >> 4;
+  }
+  /// Constructs a set of flags from the specified minimum and maximum
+  /// optimisation level
+  static unsigned OptimzationFlags(unsigned minLevel=0, unsigned maxLevel=0xf,
+      unsigned requiredFlags=0, unsigned disallowedFlags=0) {
+    return ((minLevel & OptimizationLevelMask) |
+            ((maxLevel<<MaxOptimizationLevelShift) & MaxOptimizationLevelMask)
+            | ((requiredFlags<<RequiredFlagShift) & RequiredFlagMask)
+            | ((disallowedFlags<<DisallowedFlagShift) & DisallowedFlagMask));
+  }
+  /// Returns the flags that must be set for this to match
+  static unsigned RequiredFlags(unsigned flags) {
+      return (flags & RequiredFlagMask) >> RequiredFlagShift;
+  }
+  /// Returns the flags that must not be set for this to match
+  static unsigned DisallowedFlags(unsigned flags) {
+      return (flags & DisallowedFlagMask) >> DisallowedFlagShift;
+  }
+  /// Register a standard pass in the specified set.  If flags is non-zero,
+  /// then the pass will only be returned when the specified flags are set.
+  template<typename passName>
+  class RegisterStandardPass {
+    public:
+    RegisterStandardPass(StandardSet set, unsigned char *runBefore=0,
+        unsigned flags=0, unsigned char *ID=0) {
+      // Use the pass's ID if one is not specified
+      RegisterDefaultPass(PassInfo::NormalCtor_t(callDefaultCtor<passName>),
+               ID ? ID : (unsigned char*)&passName::ID, runBefore, set, flags);
+    }
+  };
+  /// Adds the passes from the specified set to the provided pass manager
+  static void AddPassesFromSet(PassManagerBase *PM,
+                               StandardSet set,
+                               unsigned flags=0,
+                               bool VerifyEach=false,
+                               Pass *inliner=0);
+  private:
+  /// Registers the default passes.  This is set by RegisterStandardPassLists
+  /// and is called lazily.
+  static void (*RegisterDefaultPasses)(void);
+  /// Creates the verifier pass that is inserted when a VerifyEach is passed to
+  /// AddPassesFromSet()
+  static Pass* (*CreateVerifierPass)(void);
+  /// Registers the pass
+  static void RegisterDefaultPass(PassInfo::NormalCtor_t constructor,
+                                  unsigned char *newPass,
+                                  unsigned char *oldPass,
+                                  StandardSet set,
+                                  unsigned flags=0);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/DerivedTypes.h b/include/llvm/DerivedTypes.h
index cef7ec1..9e79339 100644
--- a/include/llvm/DerivedTypes.h
+++ b/include/llvm/DerivedTypes.h
@@ -19,7 +19,6 @@
 #define LLVM_DERIVED_TYPES_H
 
 #include "llvm/Type.h"
-#include "llvm/ADT/ArrayRef.h"
 #include "llvm/Support/DataTypes.h"
 
 namespace llvm {
@@ -34,6 +33,7 @@ class VectorValType;
 class IntegerValType;
 class APInt;
 class LLVMContext;
+template<typename T> class ArrayRef;
 
 class DerivedType : public Type {
   friend class Type;
@@ -69,7 +69,7 @@ public:
 
   void dump() const { Type::dump(); }
 
-  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  // Methods for support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const DerivedType *) { return true; }
   static inline bool classof(const Type *T) {
     return T->isDerivedType();
@@ -103,7 +103,7 @@ public:
   /// that instance will be returned. Otherwise a new one will be created. Only
   /// one instance with a given NumBits value is ever created.
   /// @brief Get or create an IntegerType instance.
-  static const IntegerType* get(LLVMContext &C, unsigned NumBits);
+  static const IntegerType *get(LLVMContext &C, unsigned NumBits);
 
   /// @brief Get the number of bits in this IntegerType
   unsigned getBitWidth() const { return getSubclassData(); }
@@ -132,7 +132,7 @@ public:
   /// @brief Is this a power-of-2 byte-width IntegerType ?
   bool isPowerOf2ByteWidth() const;
 
-  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  // Methods for support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const IntegerType *) { return true; }
   static inline bool classof(const Type *T) {
     return T->getTypeID() == IntegerTyID;
@@ -144,8 +144,6 @@ public:
 ///
 class FunctionType : public DerivedType {
   friend class TypeMap<FunctionValType, FunctionType>;
-  bool isVarArgs;
-
   FunctionType(const FunctionType &);                   // Do not implement
   const FunctionType &operator=(const FunctionType &);  // Do not implement
   FunctionType(const Type *Result, ArrayRef<const Type*> Params,
@@ -155,21 +153,13 @@ public:
   /// FunctionType::get - This static method is the primary way of constructing
   /// a FunctionType.
   ///
-  static FunctionType *get(
-    const Type *Result, ///< The result type
-    ArrayRef<const Type*> Params, ///< The types of the parameters
-    bool isVarArg  ///< Whether this is a variable argument length function
-  );
+  static FunctionType *get(const Type *Result,
+                           ArrayRef<const Type*> Params, bool isVarArg);
 
   /// FunctionType::get - Create a FunctionType taking no parameters.
   ///
-  static FunctionType *get(
-    const Type *Result, ///< The result type
-    bool isVarArg  ///< Whether this is a variable argument length function
-  ) {
-    return get(Result, std::vector<const Type *>(), isVarArg);
-  }
-
+  static FunctionType *get(const Type *Result, bool isVarArg);
+  
   /// isValidReturnType - Return true if the specified type is valid as a return
   /// type.
   static bool isValidReturnType(const Type *RetTy);
@@ -178,14 +168,14 @@ public:
   /// argument type.
   static bool isValidArgumentType(const Type *ArgTy);
 
-  inline bool isVarArg() const { return isVarArgs; }
-  inline const Type *getReturnType() const { return ContainedTys[0]; }
+  bool isVarArg() const { return getSubclassData(); }
+  const Type *getReturnType() const { return ContainedTys[0]; }
 
   typedef Type::subtype_iterator param_iterator;
   param_iterator param_begin() const { return ContainedTys + 1; }
   param_iterator param_end() const { return &ContainedTys[NumContainedTys]; }
 
-  // Parameter type accessors...
+  // Parameter type accessors.
   const Type *getParamType(unsigned i) const { return ContainedTys[i+1]; }
 
   /// getNumParams - Return the number of fixed parameters this function type
@@ -197,7 +187,7 @@ public:
   virtual void refineAbstractType(const DerivedType *OldTy, const Type *NewTy);
   virtual void typeBecameConcrete(const DerivedType *AbsTy);
 
-  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  // Methods for support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const FunctionType *) { return true; }
   static inline bool classof(const Type *T) {
     return T->getTypeID() == FunctionTyID;
@@ -206,11 +196,10 @@ public:
 
 
 /// CompositeType - Common super class of ArrayType, StructType, PointerType
-/// and VectorType
+/// and VectorType.
 class CompositeType : public DerivedType {
 protected:
-  inline explicit CompositeType(LLVMContext &C, TypeID id) :
-    DerivedType(C, id) { }
+  explicit CompositeType(LLVMContext &C, TypeID tid) : DerivedType(C, tid) { }
 public:
 
   /// getTypeAtIndex - Given an index value into the type, return the type of
@@ -221,7 +210,7 @@ public:
   virtual bool indexValid(const Value *V) const = 0;
   virtual bool indexValid(unsigned Idx) const = 0;
 
-  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  // Methods for support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const CompositeType *) { return true; }
   static inline bool classof(const Type *T) {
     return T->getTypeID() == ArrayTyID ||
@@ -232,7 +221,7 @@ public:
 };
 
 
-/// StructType - Class to represent struct types
+/// StructType - Class to represent struct types, both normal and packed.
 ///
 class StructType : public CompositeType {
   friend class TypeMap<StructValType, StructType>;
@@ -243,16 +232,13 @@ public:
   /// StructType::get - This static method is the primary way to create a
   /// StructType.
   ///
-  static StructType *get(LLVMContext &Context, 
-                         ArrayRef<const Type*> Params,
-                         bool isPacked=false);
+  static StructType *get(LLVMContext &Context, ArrayRef<const Type*> Params,
+                         bool isPacked = false);
 
   /// StructType::get - Create an empty structure type.
   ///
-  static StructType *get(LLVMContext &Context, bool isPacked=false) {
-    return get(Context, std::vector<const Type*>(), isPacked);
-  }
-
+  static StructType *get(LLVMContext &Context, bool isPacked=false);
+  
   /// StructType::get - This static method is a convenience method for
   /// creating structure types by specifying the elements as arguments.
   /// Note that this method always returns a non-packed struct.  To get
@@ -264,7 +250,9 @@ public:
   /// element type.
   static bool isValidElementType(const Type *ElemTy);
 
-  // Iterator access to the elements
+  bool isPacked() const { return getSubclassData() != 0 ? true : false; }
+
+  // Iterator access to the elements.
   typedef Type::subtype_iterator element_iterator;
   element_iterator element_begin() const { return ContainedTys; }
   element_iterator element_end() const { return &ContainedTys[NumContainedTys];}
@@ -288,13 +276,11 @@ public:
   virtual void refineAbstractType(const DerivedType *OldTy, const Type *NewTy);
   virtual void typeBecameConcrete(const DerivedType *AbsTy);
 
-  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  // Methods for support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const StructType *) { return true; }
   static inline bool classof(const Type *T) {
     return T->getTypeID() == StructTyID;
   }
-
-  bool isPacked() const { return (0 != getSubclassData()) ? true : false; }
 };
 
 /// SequentialType - This is the superclass of the array, pointer and vector
@@ -306,12 +292,12 @@ public:
 /// components out in memory identically.
 ///
 class SequentialType : public CompositeType {
-  PATypeHandle ContainedType; ///< Storage for the single contained type
+  PATypeHandle ContainedType;       ///< Storage for the single contained type.
   SequentialType(const SequentialType &);                  // Do not implement!
   const SequentialType &operator=(const SequentialType &); // Do not implement!
 
   // avoiding warning: 'this' : used in base member initializer list
-  SequentialType* this_() { return this; }
+  SequentialType *this_() { return this; }
 protected:
   SequentialType(TypeID TID, const Type *ElType)
     : CompositeType(ElType->getContext(), TID), ContainedType(ElType, this_()) {
@@ -337,7 +323,7 @@ public:
     return ContainedTys[0];
   }
 
-  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  // Methods for support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const SequentialType *) { return true; }
   static inline bool classof(const Type *T) {
     return T->getTypeID() == ArrayTyID ||
@@ -347,7 +333,7 @@ public:
 };
 
 
-/// ArrayType - Class to represent array types
+/// ArrayType - Class to represent array types.
 ///
 class ArrayType : public SequentialType {
   friend class TypeMap<ArrayValType, ArrayType>;
@@ -366,20 +352,20 @@ public:
   /// element type.
   static bool isValidElementType(const Type *ElemTy);
 
-  inline uint64_t getNumElements() const { return NumElements; }
+  uint64_t getNumElements() const { return NumElements; }
 
   // Implement the AbstractTypeUser interface.
   virtual void refineAbstractType(const DerivedType *OldTy, const Type *NewTy);
   virtual void typeBecameConcrete(const DerivedType *AbsTy);
 
-  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  // Methods for support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const ArrayType *) { return true; }
   static inline bool classof(const Type *T) {
     return T->getTypeID() == ArrayTyID;
   }
 };
 
-/// VectorType - Class to represent vector types
+/// VectorType - Class to represent vector types.
 ///
 class VectorType : public SequentialType {
   friend class TypeMap<VectorValType, VectorType>;
@@ -390,7 +376,7 @@ class VectorType : public SequentialType {
   VectorType(const Type *ElType, unsigned NumEl);
 public:
   /// VectorType::get - This static method is the primary way to construct an
-  /// VectorType
+  /// VectorType.
   ///
   static VectorType *get(const Type *ElementType, unsigned NumElements);
 
@@ -431,10 +417,10 @@ public:
   static bool isValidElementType(const Type *ElemTy);
 
   /// @brief Return the number of elements in the Vector type.
-  inline unsigned getNumElements() const { return NumElements; }
+  unsigned getNumElements() const { return NumElements; }
 
   /// @brief Return the number of bits in the Vector type.
-  inline unsigned getBitWidth() const {
+  unsigned getBitWidth() const {
     return NumElements * getElementType()->getPrimitiveSizeInBits();
   }
 
@@ -442,7 +428,7 @@ public:
   virtual void refineAbstractType(const DerivedType *OldTy, const Type *NewTy);
   virtual void typeBecameConcrete(const DerivedType *AbsTy);
 
-  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  // Methods for support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VectorType *) { return true; }
   static inline bool classof(const Type *T) {
     return T->getTypeID() == VectorTyID;
@@ -450,11 +436,10 @@ public:
 };
 
 
-/// PointerType - Class to represent pointers
+/// PointerType - Class to represent pointers.
 ///
 class PointerType : public SequentialType {
   friend class TypeMap<PointerValType, PointerType>;
-  unsigned AddressSpace;
 
   PointerType(const PointerType &);                   // Do not implement
   const PointerType &operator=(const PointerType &);  // Do not implement
@@ -475,13 +460,13 @@ public:
   static bool isValidElementType(const Type *ElemTy);
 
   /// @brief Return the address space of the Pointer type.
-  inline unsigned getAddressSpace() const { return AddressSpace; }
+  inline unsigned getAddressSpace() const { return getSubclassData(); }
 
   // Implement the AbstractTypeUser interface.
   virtual void refineAbstractType(const DerivedType *OldTy, const Type *NewTy);
   virtual void typeBecameConcrete(const DerivedType *AbsTy);
 
-  // Implement support type inquiry through isa, cast, and dyn_cast:
+  // Implement support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const PointerType *) { return true; }
   static inline bool classof(const Type *T) {
     return T->getTypeID() == PointerTyID;
@@ -489,7 +474,7 @@ public:
 };
 
 
-/// OpaqueType - Class to represent abstract types
+/// OpaqueType - Class to represent opaque types.
 ///
 class OpaqueType : public DerivedType {
   friend class LLVMContextImpl;
@@ -497,11 +482,11 @@ class OpaqueType : public DerivedType {
   const OpaqueType &operator=(const OpaqueType &);  // DO NOT IMPLEMENT
   OpaqueType(LLVMContext &C);
 public:
-  /// OpaqueType::get - Static factory method for the OpaqueType class...
+  /// OpaqueType::get - Static factory method for the OpaqueType class.
   ///
   static OpaqueType *get(LLVMContext &C);
 
-  // Implement support for type inquiry through isa, cast, and dyn_cast:
+  // Implement support for type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const OpaqueType *) { return true; }
   static inline bool classof(const Type *T) {
     return T->getTypeID() == OpaqueTyID;
diff --git a/include/llvm/ExecutionEngine/ExecutionEngine.h b/include/llvm/ExecutionEngine/ExecutionEngine.h
index ef5e9ec..88b21cd 100644
--- a/include/llvm/ExecutionEngine/ExecutionEngine.h
+++ b/include/llvm/ExecutionEngine/ExecutionEngine.h
@@ -135,20 +135,14 @@ protected:
     JITMemoryManager *JMM,
     CodeGenOpt::Level OptLevel,
     bool GVsWithCode,
-    CodeModel::Model CMM,
-    StringRef MArch,
-    StringRef MCPU,
-    const SmallVectorImpl<std::string>& MAttrs);
+    TargetMachine *TM);
   static ExecutionEngine *(*MCJITCtor)(
     Module *M,
     std::string *ErrorStr,
     JITMemoryManager *JMM,
     CodeGenOpt::Level OptLevel,
     bool GVsWithCode,
-    CodeModel::Model CMM,
-    StringRef MArch,
-    StringRef MCPU,
-    const SmallVectorImpl<std::string>& MAttrs);
+    TargetMachine *TM);
   static ExecutionEngine *(*InterpCtor)(Module *M,
                                         std::string *ErrorStr);
 
@@ -185,7 +179,7 @@ public:
   /// \param GVsWithCode - Allocating globals with code breaks
   /// freeMachineCodeForFunction and is probably unsafe and bad for performance.
   /// However, we have clients who depend on this behavior, so we must support
-  /// it.  Eventually, when we're willing to break some backwards compatability,
+  /// it.  Eventually, when we're willing to break some backwards compatibility,
   /// this flag should be flipped to false, so that by default
   /// freeMachineCodeForFunction works.
   static ExecutionEngine *create(Module *M,
@@ -569,6 +563,14 @@ public:
     return *this;
   }
 
+  /// selectTarget - Pick a target either via -march or by guessing the native
+  /// arch.  Add any CPU features specified via -mcpu or -mattr.
+  static TargetMachine *selectTarget(Module *M,
+                                     StringRef MArch,
+                                     StringRef MCPU,
+                                     const SmallVectorImpl<std::string>& MAttrs,
+                                     std::string *Err);
+
   ExecutionEngine *create();
 };
 
diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h
index 416acce..3dc65e3 100644
--- a/include/llvm/ExecutionEngine/RuntimeDyld.h
+++ b/include/llvm/ExecutionEngine/RuntimeDyld.h
@@ -58,11 +58,15 @@ public:
   ~RuntimeDyld();
 
   bool loadObject(MemoryBuffer *InputBuffer);
-  uint64_t getSymbolAddress(StringRef Name);
-  void reassignSymbolAddress(StringRef Name, uint64_t Addr);
-  // FIXME: Should be parameterized to get the memory block associated with
-  // a particular loaded object.
-  sys::MemoryBlock getMemoryBlock();
+  // Get the address of our local copy of the symbol. This may or may not
+  // be the address used for relocation (clients can copy the data around
+  // and resolve relocatons based on where they put it).
+  void *getSymbolAddress(StringRef Name);
+  // Resolve the relocations for all symbols we currently know about.
+  void resolveRelocations();
+  // Change the address associated with a symbol when resolving relocations.
+  // Any relocations already associated with the symbol will be re-resolved.
+  void reassignSymbolAddress(StringRef Name, uint8_t *Addr);
   StringRef getErrorString();
 };
 
diff --git a/include/llvm/Function.h b/include/llvm/Function.h
index 9a0825a..1edc176 100644
--- a/include/llvm/Function.h
+++ b/include/llvm/Function.h
@@ -253,6 +253,23 @@ public:
     else removeFnAttr(Attribute::NoUnwind);
   }
 
+  /// @brief True if the ABI mandates (or the user requested) that this
+  /// function be in a unwind table.
+  bool hasUWTable() const {
+    return hasFnAttr(Attribute::UWTable);
+  }
+  void setHasUWTable(bool HasUWTable = true) {
+    if (HasUWTable)
+      addFnAttr(Attribute::UWTable);
+    else
+      removeFnAttr(Attribute::UWTable);
+  }
+
+  /// @brief True if this function needs an unwind table.
+  bool needsUnwindTableEntry() const {
+    return hasUWTable() || !doesNotThrow();
+  }
+
   /// @brief Determine if the function returns a structure through first 
   /// pointer argument.
   bool hasStructRetAttr() const {
@@ -414,6 +431,10 @@ public:
   ///
   bool hasAddressTaken(const User** = 0) const;
 
+  /// callsFunctionThatReturnsTwice - Return true if the function has a call to
+  /// setjmp or other function that gcc recognizes as "returning twice".
+  bool callsFunctionThatReturnsTwice() const;
+
 private:
   // Shadow Value::setValueSubclassData with a private forwarding method so that
   // subclasses cannot accidentally use it.
diff --git a/include/llvm/GlobalVariable.h b/include/llvm/GlobalVariable.h
index 1769c66..442e0c0 100644
--- a/include/llvm/GlobalVariable.h
+++ b/include/llvm/GlobalVariable.h
@@ -12,7 +12,7 @@
 //
 // Global variables are constant pointers that refer to hunks of space that are
 // allocated by either the VM, or by the linker in a static compiler.  A global
-// variable may have an intial value, which is copied into the executables .data
+// variable may have an initial value, which is copied into the executables .data
 // area.  Global Constants are required to have initializers.
 //
 //===----------------------------------------------------------------------===//
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index 634ffde..5efdcc9 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -66,6 +66,7 @@ void initializeBasicAliasAnalysisPass(PassRegistry&);
 void initializeBasicCallGraphPass(PassRegistry&);
 void initializeBlockExtractorPassPass(PassRegistry&);
 void initializeBlockPlacementPass(PassRegistry&);
+void initializeBranchProbabilityInfoPass(PassRegistry&);
 void initializeBreakCriticalEdgesPass(PassRegistry&);
 void initializeCFGOnlyPrinterPass(PassRegistry&);
 void initializeCFGOnlyViewerPass(PassRegistry&);
@@ -94,6 +95,7 @@ void initializeDominatorTreePass(PassRegistry&);
 void initializeEdgeBundlesPass(PassRegistry&);
 void initializeEdgeProfilerPass(PassRegistry&);
 void initializePathProfilerPass(PassRegistry&);
+void initializeGCOVProfilerPass(PassRegistry&);
 void initializeEarlyCSEPass(PassRegistry&);
 void initializeExpandISelPseudosPass(PassRegistry&);
 void initializeFindUsedTypesPass(PassRegistry&);
@@ -142,6 +144,7 @@ void initializeLowerIntrinsicsPass(PassRegistry&);
 void initializeLowerInvokePass(PassRegistry&);
 void initializeLowerSetJmpPass(PassRegistry&);
 void initializeLowerSwitchPass(PassRegistry&);
+void initializeMachineBranchProbabilityInfoPass(PassRegistry&);
 void initializeMachineCSEPass(PassRegistry&);
 void initializeMachineDominatorTreePass(PassRegistry&);
 void initializeMachineLICMPass(PassRegistry&);
@@ -158,6 +161,10 @@ void initializeModuleDebugInfoPrinterPass(PassRegistry&);
 void initializeNoAAPass(PassRegistry&);
 void initializeNoProfileInfoPass(PassRegistry&);
 void initializeNoPathProfileInfoPass(PassRegistry&);
+void initializeObjCARCAliasAnalysisPass(PassRegistry&);
+void initializeObjCARCExpandPass(PassRegistry&);
+void initializeObjCARCContractPass(PassRegistry&);
+void initializeObjCARCOptPass(PassRegistry&);
 void initializeOptimalEdgeProfilerPass(PassRegistry&);
 void initializeOptimizePHIsPass(PassRegistry&);
 void initializePEIPass(PassRegistry&);
@@ -193,7 +200,6 @@ void initializeRegionViewerPass(PassRegistry&);
 void initializeRegisterCoalescerAnalysisGroup(PassRegistry&);
 void initializeRenderMachineFunctionPass(PassRegistry&);
 void initializeSCCPPass(PassRegistry&);
-void initializeSRETPromotionPass(PassRegistry&);
 void initializeSROA_DTPass(PassRegistry&);
 void initializeSROA_SSAUpPass(PassRegistry&);
 void initializeScalarEvolutionAliasAnalysisPass(PassRegistry&);
diff --git a/include/llvm/InstrTypes.h b/include/llvm/InstrTypes.h
index a166956..cc9ec3a 100644
--- a/include/llvm/InstrTypes.h
+++ b/include/llvm/InstrTypes.h
@@ -18,7 +18,6 @@
 
 #include "llvm/Instruction.h"
 #include "llvm/OperandTraits.h"
-#include "llvm/Operator.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/ADT/Twine.h"
 
diff --git a/include/llvm/Instructions.h b/include/llvm/Instructions.h
index f14893a..54dfe39 100644
--- a/include/llvm/Instructions.h
+++ b/include/llvm/Instructions.h
@@ -584,7 +584,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(GetElementPtrInst, Value)
 /// @brief Represent an integer comparison operator.
 class ICmpInst: public CmpInst {
 protected:
-  /// @brief Clone an indentical ICmpInst
+  /// @brief Clone an identical ICmpInst
   virtual ICmpInst *clone_impl() const;
 public:
   /// @brief Constructor with insert-before-instruction semantics.
@@ -735,7 +735,7 @@ public:
 /// @brief Represents a floating point comparison operator.
 class FCmpInst: public CmpInst {
 protected:
-  /// @brief Clone an indentical FCmpInst
+  /// @brief Clone an identical FCmpInst
   virtual FCmpInst *clone_impl() const;
 public:
   /// @brief Constructor with insert-before-instruction semantics.
diff --git a/include/llvm/IntrinsicInst.h b/include/llvm/IntrinsicInst.h
index 74c30fb..24e5fe7 100644
--- a/include/llvm/IntrinsicInst.h
+++ b/include/llvm/IntrinsicInst.h
@@ -139,7 +139,7 @@ namespace llvm {
       return !getVolatileCst()->isZero();
     }
 
-    unsigned getAddressSpace() const {
+    unsigned getDestAddressSpace() const {
       return cast<PointerType>(getRawDest()->getType())->getAddressSpace();
     }
 
@@ -227,6 +227,10 @@ namespace llvm {
     /// value is guaranteed to be a pointer.
     Value *getSource() const { return getRawSource()->stripPointerCasts(); }
 
+    unsigned getSourceAddressSpace() const {
+      return cast<PointerType>(getRawSource()->getType())->getAddressSpace();
+    }
+
     void setSource(Value *Ptr) {
       assert(getRawSource()->getType() == Ptr->getType() &&
              "setSource called with pointer of wrong type!");
diff --git a/include/llvm/Intrinsics.td b/include/llvm/Intrinsics.td
index d92ab4c..c1fbce4 100644
--- a/include/llvm/Intrinsics.td
+++ b/include/llvm/Intrinsics.td
@@ -30,7 +30,7 @@ class IntrinsicProperty;
 def IntrNoMem : IntrinsicProperty;
 
 // IntrReadArgMem - This intrinsic reads only from memory that one of its
-// arguments points to, but may read an unspecified amount.
+// pointer-typed arguments points to, but may read an unspecified amount.
 def IntrReadArgMem : IntrinsicProperty;
 
 // IntrReadMem - This intrinsic reads from unspecified memory, so it cannot be
@@ -47,6 +47,9 @@ def IntrReadWriteArgMem : IntrinsicProperty;
 // Commutative - This intrinsic is commutative: X op Y == Y op X.
 def Commutative : IntrinsicProperty;
 
+// Throws - This intrinsic can throw.
+def Throws : IntrinsicProperty;
+
 // NoCapture - The specified argument pointer is not captured by the intrinsic.
 class NoCapture<int argNo> : IntrinsicProperty {
   int ArgNo = argNo;
@@ -208,7 +211,8 @@ def int_stackrestore  : Intrinsic<[], [llvm_ptr_ty]>,
 // however it does conveniently prevent the prefetch from being reordered
 // with respect to nearby accesses to the same memory.
 def int_prefetch      : Intrinsic<[],
-                                  [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty],
+                                  [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty,
+                                   llvm_i32_ty],
                                   [IntrReadWriteArgMem, NoCapture<0>]>;
 def int_pcmarker      : Intrinsic<[], [llvm_i32_ty]>;
 
@@ -292,6 +296,7 @@ let Properties = [IntrNoMem] in {
 def int_eh_exception : Intrinsic<[llvm_ptr_ty], [], [IntrReadMem]>;
 def int_eh_selector  : Intrinsic<[llvm_i32_ty],
                                  [llvm_ptr_ty, llvm_ptr_ty, llvm_vararg_ty]>;
+def int_eh_resume    : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [Throws]>;
 
 def int_eh_typeid_for : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty]>;
 
@@ -307,7 +312,7 @@ let Properties = [IntrNoMem] in {
   def int_eh_sjlj_lsda    : Intrinsic<[llvm_ptr_ty]>;
   def int_eh_sjlj_callsite: Intrinsic<[], [llvm_i32_ty]>;
 }
-def int_eh_sjlj_dispatch_setup : Intrinsic<[], []>;
+def int_eh_sjlj_dispatch_setup : Intrinsic<[], [llvm_i32_ty], [IntrReadMem]>;
 def int_eh_sjlj_setjmp  : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty]>;
 def int_eh_sjlj_longjmp : Intrinsic<[], [llvm_ptr_ty]>;
 
diff --git a/include/llvm/IntrinsicsARM.td b/include/llvm/IntrinsicsARM.td
index 03e9261..fa8034e 100644
--- a/include/llvm/IntrinsicsARM.td
+++ b/include/llvm/IntrinsicsARM.td
@@ -36,6 +36,16 @@ let TargetPrefix = "arm" in {  // All intrinsics start with "llvm.arm.".
 }
 
 //===----------------------------------------------------------------------===//
+// Load and Store exclusive doubleword
+
+let TargetPrefix = "arm" in {  // All intrinsics start with "llvm.arm.".
+  def int_arm_strexd : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty,
+                                  llvm_ptr_ty], [IntrReadWriteArgMem]>;
+  def int_arm_ldrexd : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_ptr_ty],
+                                 [IntrReadArgMem]>;
+}
+
+//===----------------------------------------------------------------------===//
 // VFP
 
 let TargetPrefix = "arm" in {  // All intrinsics start with "llvm.arm.".
@@ -50,6 +60,43 @@ let TargetPrefix = "arm" in {  // All intrinsics start with "llvm.arm.".
 }
 
 //===----------------------------------------------------------------------===//
+// Coprocessor
+
+let TargetPrefix = "arm" in {  // All intrinsics start with "llvm.arm.".
+  // Move to coprocessor
+  def int_arm_mcr : GCCBuiltin<"__builtin_arm_mcr">,
+     Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_arm_mcr2 : GCCBuiltin<"__builtin_arm_mcr2">,
+     Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+
+  // Move from coprocessor
+  def int_arm_mrc : GCCBuiltin<"__builtin_arm_mrc">,
+     Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                               llvm_i32_ty, llvm_i32_ty], []>;
+  def int_arm_mrc2 : GCCBuiltin<"__builtin_arm_mrc2">,
+     Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                               llvm_i32_ty, llvm_i32_ty], []>;
+
+  // Coprocessor data processing
+  def int_arm_cdp : GCCBuiltin<"__builtin_arm_cdp">,
+     Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_arm_cdp2 : GCCBuiltin<"__builtin_arm_cdp2">,
+     Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+
+  // Move from two registers to coprocessor
+  def int_arm_mcrr : GCCBuiltin<"__builtin_arm_mcrr">,
+     Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                    llvm_i32_ty, llvm_i32_ty], []>;
+  def int_arm_mcrr2 : GCCBuiltin<"__builtin_arm_mcrr2">,
+     Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                    llvm_i32_ty, llvm_i32_ty], []>;
+}
+
+//===----------------------------------------------------------------------===//
 // Advanced SIMD (NEON)
 
 let TargetPrefix = "arm" in {  // All intrinsics start with "llvm.arm.".
diff --git a/include/llvm/IntrinsicsPTX.td b/include/llvm/IntrinsicsPTX.td
index 01241fe..28379c9 100644
--- a/include/llvm/IntrinsicsPTX.td
+++ b/include/llvm/IntrinsicsPTX.td
@@ -12,53 +12,81 @@
 //===----------------------------------------------------------------------===//
 
 let TargetPrefix = "ptx" in {
-  multiclass PTXReadSpecialRegisterIntrinsic_v4i32 {
+  multiclass PTXReadSpecialRegisterIntrinsic_v4i32<string prefix> {
 // FIXME: Do we need the 128-bit integer type version?
 //    def _r64   : Intrinsic<[llvm_i128_ty],   [], [IntrNoMem]>;
 
 // FIXME: Enable this once v4i32 support is enabled in back-end.
 //    def _v4i16 : Intrinsic<[llvm_v4i32_ty], [], [IntrNoMem]>;
 
-    def _x     : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>;
-    def _y     : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>;
-    def _z     : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>;
-    def _w     : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>;
+    def _x     : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+                 GCCBuiltin<!strconcat(prefix, "_x")>;
+    def _y     : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+                 GCCBuiltin<!strconcat(prefix, "_y")>;
+    def _z     : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+                 GCCBuiltin<!strconcat(prefix, "_z")>;
+    def _w     : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+                 GCCBuiltin<!strconcat(prefix, "_w")>;
   }
 
-  class PTXReadSpecialRegisterIntrinsic_r32
-    : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>;
+  class PTXReadSpecialRegisterIntrinsic_r32<string name>
+    : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+      GCCBuiltin<name>;
 
-  class PTXReadSpecialRegisterIntrinsic_r64
-    : Intrinsic<[llvm_i64_ty], [], [IntrNoMem]>;
+  class PTXReadSpecialRegisterIntrinsic_r64<string name>
+    : Intrinsic<[llvm_i64_ty], [], [IntrNoMem]>,
+      GCCBuiltin<name>;
 }
 
-defm int_ptx_read_tid        : PTXReadSpecialRegisterIntrinsic_v4i32;
-defm int_ptx_read_ntid       : PTXReadSpecialRegisterIntrinsic_v4i32;
+defm int_ptx_read_tid        : PTXReadSpecialRegisterIntrinsic_v4i32
+                               <"__builtin_ptx_read_tid">;
+defm int_ptx_read_ntid       : PTXReadSpecialRegisterIntrinsic_v4i32
+                               <"__builtin_ptx_read_ntid">;
 
-def int_ptx_read_laneid      : PTXReadSpecialRegisterIntrinsic_r32;
-def int_ptx_read_warpid      : PTXReadSpecialRegisterIntrinsic_r32;
-def int_ptx_read_nwarpid     : PTXReadSpecialRegisterIntrinsic_r32;
+def int_ptx_read_laneid      : PTXReadSpecialRegisterIntrinsic_r32
+                               <"__builtin_ptx_read_laneid">;
+def int_ptx_read_warpid      : PTXReadSpecialRegisterIntrinsic_r32
+                               <"__builtin_ptx_read_warpid">;
+def int_ptx_read_nwarpid     : PTXReadSpecialRegisterIntrinsic_r32
+                               <"__builtin_ptx_read_nwarpid">;
 
-defm int_ptx_read_ctaid      : PTXReadSpecialRegisterIntrinsic_v4i32;
-defm int_ptx_read_nctaid     : PTXReadSpecialRegisterIntrinsic_v4i32;
+defm int_ptx_read_ctaid      : PTXReadSpecialRegisterIntrinsic_v4i32
+                               <"__builtin_ptx_read_ctaid">;
+defm int_ptx_read_nctaid     : PTXReadSpecialRegisterIntrinsic_v4i32
+                               <"__builtin_ptx_read_nctaid">;
 
-def int_ptx_read_smid        : PTXReadSpecialRegisterIntrinsic_r32;
-def int_ptx_read_nsmid       : PTXReadSpecialRegisterIntrinsic_r32;
-def int_ptx_read_gridid      : PTXReadSpecialRegisterIntrinsic_r32;
+def int_ptx_read_smid        : PTXReadSpecialRegisterIntrinsic_r32
+                               <"__builtin_ptx_read_smid">;
+def int_ptx_read_nsmid       : PTXReadSpecialRegisterIntrinsic_r32
+                               <"__builtin_ptx_read_nsmid">;
+def int_ptx_read_gridid      : PTXReadSpecialRegisterIntrinsic_r32
+                               <"__builtin_ptx_read_gridid">;
 
-def int_ptx_read_lanemask_eq : PTXReadSpecialRegisterIntrinsic_r32;
-def int_ptx_read_lanemask_le : PTXReadSpecialRegisterIntrinsic_r32;
-def int_ptx_read_lanemask_lt : PTXReadSpecialRegisterIntrinsic_r32;
-def int_ptx_read_lanemask_ge : PTXReadSpecialRegisterIntrinsic_r32;
-def int_ptx_read_lanemask_gt : PTXReadSpecialRegisterIntrinsic_r32;
+def int_ptx_read_lanemask_eq : PTXReadSpecialRegisterIntrinsic_r32
+                               <"__builtin_ptx_read_lanemask_eq">;
+def int_ptx_read_lanemask_le : PTXReadSpecialRegisterIntrinsic_r32
+                               <"__builtin_ptx_read_lanemask_le">;
+def int_ptx_read_lanemask_lt : PTXReadSpecialRegisterIntrinsic_r32
+                               <"__builtin_ptx_read_lanemask_lt">;
+def int_ptx_read_lanemask_ge : PTXReadSpecialRegisterIntrinsic_r32
+                               <"__builtin_ptx_read_lanemask_ge">;
+def int_ptx_read_lanemask_gt : PTXReadSpecialRegisterIntrinsic_r32
+                               <"__builtin_ptx_read_lanemask_gt">;
 
-def int_ptx_read_clock       : PTXReadSpecialRegisterIntrinsic_r32;
-def int_ptx_read_clock64     : PTXReadSpecialRegisterIntrinsic_r64;
+def int_ptx_read_clock       : PTXReadSpecialRegisterIntrinsic_r32
+                               <"__builtin_ptx_read_clock">;
+def int_ptx_read_clock64     : PTXReadSpecialRegisterIntrinsic_r64
+                               <"__builtin_ptx_read_clock64">;
 
-def int_ptx_read_pm0         : PTXReadSpecialRegisterIntrinsic_r32;
-def int_ptx_read_pm1         : PTXReadSpecialRegisterIntrinsic_r32;
-def int_ptx_read_pm2         : PTXReadSpecialRegisterIntrinsic_r32;
-def int_ptx_read_pm3         : PTXReadSpecialRegisterIntrinsic_r32;
+def int_ptx_read_pm0         : PTXReadSpecialRegisterIntrinsic_r32
+                               <"__builtin_ptx_read_pm0">;
+def int_ptx_read_pm1         : PTXReadSpecialRegisterIntrinsic_r32
+                               <"__builtin_ptx_read_pm1">;
+def int_ptx_read_pm2         : PTXReadSpecialRegisterIntrinsic_r32
+                               <"__builtin_ptx_read_pm2">;
+def int_ptx_read_pm3         : PTXReadSpecialRegisterIntrinsic_r32
+                               <"__builtin_ptx_read_pm3">;
 
 let TargetPrefix = "ptx" in
-  def int_ptx_bar_sync : Intrinsic<[], [llvm_i32_ty], []>;
+  def int_ptx_bar_sync : Intrinsic<[], [llvm_i32_ty], []>,
+                         GCCBuiltin<"__builtin_ptx_bar_sync">;
diff --git a/include/llvm/IntrinsicsX86.td b/include/llvm/IntrinsicsX86.td
index 4946220..d445a01 100644
--- a/include/llvm/IntrinsicsX86.td
+++ b/include/llvm/IntrinsicsX86.td
@@ -18,6 +18,83 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 }
 
 //===----------------------------------------------------------------------===//
+// 3DNow!
+
+let TargetPrefix = "x86" in {
+  def int_x86_3dnow_pavgusb : GCCBuiltin<"__builtin_ia32_pavgusb">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pf2id : GCCBuiltin<"__builtin_ia32_pf2id">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
+  def int_x86_3dnow_pfacc : GCCBuiltin<"__builtin_ia32_pfacc">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfadd : GCCBuiltin<"__builtin_ia32_pfadd">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfcmpeq : GCCBuiltin<"__builtin_ia32_pfcmpeq">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfcmpge : GCCBuiltin<"__builtin_ia32_pfcmpge">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfcmpgt : GCCBuiltin<"__builtin_ia32_pfcmpgt">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfmax : GCCBuiltin<"__builtin_ia32_pfmax">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfmin : GCCBuiltin<"__builtin_ia32_pfmin">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfmul : GCCBuiltin<"__builtin_ia32_pfmul">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfrcp : GCCBuiltin<"__builtin_ia32_pfrcp">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
+  def int_x86_3dnow_pfrcpit1 : GCCBuiltin<"__builtin_ia32_pfrcpit1">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfrcpit2 : GCCBuiltin<"__builtin_ia32_pfrcpit2">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfrsqrt : GCCBuiltin<"__builtin_ia32_pfrsqrt">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
+  def int_x86_3dnow_pfrsqit1 : GCCBuiltin<"__builtin_ia32_pfrsqit1">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfsub : GCCBuiltin<"__builtin_ia32_pfsub">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfsubr : GCCBuiltin<"__builtin_ia32_pfsubr">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pi2fd : GCCBuiltin<"__builtin_ia32_pi2fd">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
+  def int_x86_3dnow_pmulhrw : GCCBuiltin<"__builtin_ia32_pmulhrw">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+}
+
+//===----------------------------------------------------------------------===//
+// 3DNow! extensions
+
+let TargetPrefix = "x86" in {
+  def int_x86_3dnowa_pf2iw : GCCBuiltin<"__builtin_ia32_pf2iw">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
+  def int_x86_3dnowa_pfnacc : GCCBuiltin<"__builtin_ia32_pfnacc">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnowa_pfpnacc : GCCBuiltin<"__builtin_ia32_pfpnacc">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnowa_pi2fw : GCCBuiltin<"__builtin_ia32_pi2fw">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
+  def int_x86_3dnowa_pswapd :
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
+}
+
+//===----------------------------------------------------------------------===//
 // SSE1
 
 // Arithmetic ops
@@ -138,12 +215,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
                          llvm_x86mmx_ty], [IntrNoMem]>;
 }
 
-// SIMD load ops
-let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse_loadu_ps : GCCBuiltin<"__builtin_ia32_loadups">,
-              Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty], [IntrReadMem]>;
-}
-
 // SIMD store ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse_storeu_ps : GCCBuiltin<"__builtin_ia32_storeups">,
@@ -153,9 +224,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
 // Cacheability support ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse_movnt_ps : GCCBuiltin<"__builtin_ia32_movntps">,
-              Intrinsic<[], [llvm_ptr_ty,
-                         llvm_v4f32_ty], []>;
   def int_x86_sse_sfence : GCCBuiltin<"__builtin_ia32_sfence">,
               Intrinsic<[], [], []>;
 }
@@ -452,14 +520,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
               Intrinsic<[llvm_v2f64_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
 }
 
-// SIMD load ops
-let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse2_loadu_pd : GCCBuiltin<"__builtin_ia32_loadupd">,
-              Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty], [IntrReadMem]>;
-  def int_x86_sse2_loadu_dq : GCCBuiltin<"__builtin_ia32_loaddqu">,
-              Intrinsic<[llvm_v16i8_ty], [llvm_ptr_ty], [IntrReadMem]>;
-}
-
 // SIMD store ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse2_storeu_pd : GCCBuiltin<"__builtin_ia32_storeupd">,
@@ -473,19 +533,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
                          llvm_v4i32_ty], []>;
 }
 
-// Cacheability support ops
-let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse2_movnt_dq : GCCBuiltin<"__builtin_ia32_movntdq">,
-              Intrinsic<[], [llvm_ptr_ty,
-                         llvm_v2i64_ty], []>;
-  def int_x86_sse2_movnt_pd : GCCBuiltin<"__builtin_ia32_movntpd">,
-              Intrinsic<[], [llvm_ptr_ty,
-                         llvm_v2f64_ty], []>;
-  def int_x86_sse2_movnt_i : GCCBuiltin<"__builtin_ia32_movnti">,
-              Intrinsic<[], [llvm_ptr_ty,
-                         llvm_i32_ty], []>;
-}
-
 // Misc.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse2_packsswb_128 : GCCBuiltin<"__builtin_ia32_packsswb128">,
@@ -901,19 +948,19 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // Miscellaneous
 // CRC Instruction
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_sse42_crc32_8         : GCCBuiltin<"__builtin_ia32_crc32qi">,
+  def int_x86_sse42_crc32_32_8       : GCCBuiltin<"__builtin_ia32_crc32qi">,
           Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i8_ty],
                     [IntrNoMem]>;
-  def int_x86_sse42_crc32_16         : GCCBuiltin<"__builtin_ia32_crc32hi">,
+  def int_x86_sse42_crc32_32_16      : GCCBuiltin<"__builtin_ia32_crc32hi">,
           Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i16_ty],
                     [IntrNoMem]>;
-  def int_x86_sse42_crc32_32         : GCCBuiltin<"__builtin_ia32_crc32si">,
+  def int_x86_sse42_crc32_32_32      : GCCBuiltin<"__builtin_ia32_crc32si">,
           Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
                     [IntrNoMem]>;
-  def int_x86_sse42_crc64_8         :
+  def int_x86_sse42_crc32_64_8       :
           Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i8_ty],
                     [IntrNoMem]>;
-  def int_x86_sse42_crc64_64         : GCCBuiltin<"__builtin_ia32_crc32di">,
+  def int_x86_sse42_crc32_64_64      : GCCBuiltin<"__builtin_ia32_crc32di">,
           Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
                     [IntrNoMem]>;
 }
@@ -921,68 +968,68 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // String/text processing ops.
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_sse42_pcmpistrm128  : GCCBuiltin<"__builtin_ia32_pcmpistrm128">,
-	  Intrinsic<[llvm_v16i8_ty],
-		    [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
-		    [IntrNoMem]>;
+    Intrinsic<[llvm_v16i8_ty],
+        [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
+        [IntrNoMem]>;
   def int_x86_sse42_pcmpistri128  : GCCBuiltin<"__builtin_ia32_pcmpistri128">,
-	  Intrinsic<[llvm_i32_ty],
-		    [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
-		    [IntrNoMem]>;
+    Intrinsic<[llvm_i32_ty],
+        [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
+        [IntrNoMem]>;
   def int_x86_sse42_pcmpistria128 : GCCBuiltin<"__builtin_ia32_pcmpistria128">,
-	  Intrinsic<[llvm_i32_ty],
-		    [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
-		    [IntrNoMem]>;
+    Intrinsic<[llvm_i32_ty],
+        [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
+        [IntrNoMem]>;
   def int_x86_sse42_pcmpistric128 : GCCBuiltin<"__builtin_ia32_pcmpistric128">,
-	  Intrinsic<[llvm_i32_ty],
-		    [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
-		    [IntrNoMem]>;
+    Intrinsic<[llvm_i32_ty],
+        [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
+        [IntrNoMem]>;
   def int_x86_sse42_pcmpistrio128 : GCCBuiltin<"__builtin_ia32_pcmpistrio128">,
-	  Intrinsic<[llvm_i32_ty],
-		    [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
-		    [IntrNoMem]>;
+    Intrinsic<[llvm_i32_ty],
+        [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
+        [IntrNoMem]>;
   def int_x86_sse42_pcmpistris128 : GCCBuiltin<"__builtin_ia32_pcmpistris128">,
-	  Intrinsic<[llvm_i32_ty],
-		    [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
-		    [IntrNoMem]>;
+    Intrinsic<[llvm_i32_ty],
+        [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
+        [IntrNoMem]>;
   def int_x86_sse42_pcmpistriz128 : GCCBuiltin<"__builtin_ia32_pcmpistriz128">,
-	  Intrinsic<[llvm_i32_ty],
-		    [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
-		    [IntrNoMem]>;
+    Intrinsic<[llvm_i32_ty],
+        [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
+        [IntrNoMem]>;
   def int_x86_sse42_pcmpestrm128  : GCCBuiltin<"__builtin_ia32_pcmpestrm128">,
-	  Intrinsic<[llvm_v16i8_ty],
-		    [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
-		     llvm_i8_ty],
-		    [IntrNoMem]>;
+    Intrinsic<[llvm_v16i8_ty],
+        [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
+         llvm_i8_ty],
+        [IntrNoMem]>;
   def int_x86_sse42_pcmpestri128  : GCCBuiltin<"__builtin_ia32_pcmpestri128">,
-	  Intrinsic<[llvm_i32_ty],
-		    [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
-		     llvm_i8_ty],
-		    [IntrNoMem]>;
+    Intrinsic<[llvm_i32_ty],
+        [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
+         llvm_i8_ty],
+        [IntrNoMem]>;
   def int_x86_sse42_pcmpestria128 : GCCBuiltin<"__builtin_ia32_pcmpestria128">,
-	  Intrinsic<[llvm_i32_ty],
-		    [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
-		     llvm_i8_ty],
-		    [IntrNoMem]>;
+    Intrinsic<[llvm_i32_ty],
+        [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
+         llvm_i8_ty],
+        [IntrNoMem]>;
   def int_x86_sse42_pcmpestric128 : GCCBuiltin<"__builtin_ia32_pcmpestric128">,
-	  Intrinsic<[llvm_i32_ty],
-		    [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
-		     llvm_i8_ty],
-		    [IntrNoMem]>;
+    Intrinsic<[llvm_i32_ty],
+        [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
+         llvm_i8_ty],
+        [IntrNoMem]>;
   def int_x86_sse42_pcmpestrio128 : GCCBuiltin<"__builtin_ia32_pcmpestrio128">,
-	  Intrinsic<[llvm_i32_ty],
-		    [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
-		     llvm_i8_ty],
-		    [IntrNoMem]>;
+    Intrinsic<[llvm_i32_ty],
+        [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
+         llvm_i8_ty],
+        [IntrNoMem]>;
   def int_x86_sse42_pcmpestris128 : GCCBuiltin<"__builtin_ia32_pcmpestris128">,
-	  Intrinsic<[llvm_i32_ty],
-		    [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
-		     llvm_i8_ty],
-		    [IntrNoMem]>;
+    Intrinsic<[llvm_i32_ty],
+        [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
+         llvm_i8_ty],
+        [IntrNoMem]>;
   def int_x86_sse42_pcmpestriz128 : GCCBuiltin<"__builtin_ia32_pcmpestriz128">,
-	  Intrinsic<[llvm_i32_ty],
-		    [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
-		     llvm_i8_ty],
-		    [IntrNoMem]>;
+    Intrinsic<[llvm_i32_ty],
+        [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
+         llvm_i8_ty],
+        [IntrNoMem]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1571,14 +1618,14 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
               Intrinsic<[], [llvm_ptrx86mmx_ty, llvm_x86mmx_ty], []>;
 
   def int_x86_mmx_palignr_b : GCCBuiltin<"__builtin_ia32_palignr">,
-              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, 
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                         llvm_x86mmx_ty, llvm_i8_ty], [IntrNoMem]>;
 
   def int_x86_mmx_pextr_w : GCCBuiltin<"__builtin_ia32_vec_ext_v4hi">,
-              Intrinsic<[llvm_i32_ty], [llvm_x86mmx_ty, llvm_i32_ty], 
+              Intrinsic<[llvm_i32_ty], [llvm_x86mmx_ty, llvm_i32_ty],
                         [IntrNoMem]>;
 
   def int_x86_mmx_pinsr_w : GCCBuiltin<"__builtin_ia32_vec_set_v4hi">,
-              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, 
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
                         llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
 }
diff --git a/include/llvm/IntrinsicsXCore.td b/include/llvm/IntrinsicsXCore.td
index e633af0..a062fc4 100644
--- a/include/llvm/IntrinsicsXCore.td
+++ b/include/llvm/IntrinsicsXCore.td
@@ -11,6 +11,12 @@
 let TargetPrefix = "xcore" in {  // All intrinsics start with "llvm.xcore.".
   // Miscellaneous instructions.
   def int_xcore_bitrev : Intrinsic<[llvm_i32_ty],[llvm_i32_ty],[IntrNoMem]>;
+  def int_xcore_crc8 : Intrinsic<[llvm_i32_ty, llvm_i32_ty],
+                                 [llvm_i32_ty,llvm_i32_ty,llvm_i32_ty],
+                                 [IntrNoMem]>;
+  def int_xcore_crc32 : Intrinsic<[llvm_i32_ty],
+                                  [llvm_i32_ty,llvm_i32_ty,llvm_i32_ty],
+                                  [IntrNoMem]>;
   def int_xcore_getid : Intrinsic<[llvm_i32_ty],[],[IntrNoMem]>;
   def int_xcore_getps : Intrinsic<[llvm_i32_ty],[llvm_i32_ty]>;
   def int_xcore_setps : Intrinsic<[],[llvm_i32_ty, llvm_i32_ty]>;
diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index 0e33f14..c2ea8ef 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -49,7 +49,6 @@ namespace {
       (void) llvm::createAliasAnalysisCounterPass();
       (void) llvm::createAliasDebugger();
       (void) llvm::createArgumentPromotionPass();
-      (void) llvm::createStructRetPromotionPass();
       (void) llvm::createBasicAliasAnalysisPass();
       (void) llvm::createLibCallAliasAnalysisPass(0);
       (void) llvm::createScalarEvolutionAliasAnalysisPass();
@@ -71,6 +70,7 @@ namespace {
       (void) llvm::createEdgeProfilerPass();
       (void) llvm::createOptimalEdgeProfilerPass();
       (void) llvm::createPathProfilerPass();
+      (void) llvm::createGCOVProfilerPass(true, true, false);
       (void) llvm::createFunctionInliningPass();
       (void) llvm::createAlwaysInlinerPass();
       (void) llvm::createGlobalDCEPass();
@@ -97,6 +97,10 @@ namespace {
       (void) llvm::createLowerSwitchPass();
       (void) llvm::createNoAAPass();
       (void) llvm::createNoProfileInfoPass();
+      (void) llvm::createObjCARCAliasAnalysisPass();
+      (void) llvm::createObjCARCExpandPass();
+      (void) llvm::createObjCARCContractPass();
+      (void) llvm::createObjCARCOptPass();
       (void) llvm::createProfileEstimatorPass();
       (void) llvm::createProfileVerifierPass();
       (void) llvm::createPathProfileVerifierPass();
diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h
index 70ad231..775d22b 100644
--- a/include/llvm/MC/MCAsmInfo.h
+++ b/include/llvm/MC/MCAsmInfo.h
@@ -20,15 +20,18 @@
 #include <cassert>
 
 namespace llvm {
+  class MCExpr;
   class MCSection;
+  class MCStreamer;
+  class MCSymbol;
   class MCContext;
 
-  /// MCAsmInfo - This class is intended to be used as a base class for asm
-  /// properties and features specific to the target.
   namespace ExceptionHandling {
-    enum ExceptionsType { None, DwarfTable, DwarfCFI, SjLj, ARM };
+    enum ExceptionsType { None, DwarfCFI, SjLj, ARM, Win64 };
   }
 
+  /// MCAsmInfo - This class is intended to be used as a base class for asm
+  /// properties and features specific to the target.
   class MCAsmInfo {
   protected:
     //===------------------------------------------------------------------===//
@@ -266,9 +269,6 @@ namespace llvm {
     /// SupportsExceptionHandling - True if target supports exception handling.
     ExceptionHandling::ExceptionsType ExceptionsType; // Defaults to None
 
-    /// RequiresFrameSection - true if the Dwarf2 output needs a frame section
-    bool DwarfRequiresFrameSection;          // Defaults to true.
-
     /// DwarfUsesInlineInfoSection - True if DwarfDebugInlineSection is used to
     /// encode inline subroutine information.
     bool DwarfUsesInlineInfoSection;         // Defaults to false.
@@ -276,9 +276,9 @@ namespace llvm {
     /// DwarfSectionOffsetDirective - Special section offset directive.
     const char* DwarfSectionOffsetDirective; // Defaults to NULL
 
-    /// DwarfUsesAbsoluteLabelForStmtList - True if DW_AT_stmt_list needs
-    /// absolute label instead of offset.
-    bool DwarfUsesAbsoluteLabelForStmtList;  // Defaults to true;
+    /// DwarfRequiresRelocationForSectionOffset - True if we need to produce a
+    // relocation when we want a section offset in dwarf.
+    bool DwarfRequiresRelocationForSectionOffset;  // Defaults to true;
 
     // DwarfUsesLabelOffsetDifference - True if Dwarf2 output can
     // use EmitLabelOffsetDifference.
@@ -321,6 +321,16 @@ namespace llvm {
       return 0;
     }
 
+    virtual const MCExpr *
+    getExprForPersonalitySymbol(const MCSymbol *Sym,
+                                unsigned Encoding,
+                                MCStreamer &Streamer) const;
+
+    const MCExpr *
+    getExprForFDESymbol(const MCSymbol *Sym,
+                        unsigned Encoding,
+                        MCStreamer &Streamer) const;
+
     bool usesSunStyleELFSectionSwitchSyntax() const {
       return SunStyleELFSectionSwitchSyntax;
     }
@@ -449,13 +459,9 @@ namespace llvm {
     }
     bool isExceptionHandlingDwarf() const {
       return
-        (ExceptionsType == ExceptionHandling::DwarfTable ||
-         ExceptionsType == ExceptionHandling::DwarfCFI ||
-         ExceptionsType == ExceptionHandling::ARM);
-    }
-
-    bool doesDwarfRequireFrameSection() const {
-      return DwarfRequiresFrameSection;
+        (ExceptionsType == ExceptionHandling::DwarfCFI ||
+         ExceptionsType == ExceptionHandling::ARM ||
+         ExceptionsType == ExceptionHandling::Win64);
     }
     bool doesDwarfUsesInlineInfoSection() const {
       return DwarfUsesInlineInfoSection;
@@ -463,8 +469,8 @@ namespace llvm {
     const char *getDwarfSectionOffsetDirective() const {
       return DwarfSectionOffsetDirective;
     }
-    bool doesDwarfUsesAbsoluteLabelForStmtList() const {
-      return DwarfUsesAbsoluteLabelForStmtList;
+    bool doesDwarfRequireRelocationForSectionOffset() const {
+      return DwarfRequiresRelocationForSectionOffset;
     }
     bool doesDwarfUsesLabelOffsetForRanges() const {
       return DwarfUsesLabelOffsetForRanges;
diff --git a/include/llvm/MC/MCAsmLayout.h b/include/llvm/MC/MCAsmLayout.h
index 01cb000..a4585d1 100644
--- a/include/llvm/MC/MCAsmLayout.h
+++ b/include/llvm/MC/MCAsmLayout.h
@@ -36,8 +36,8 @@ private:
   /// List of sections in layout order.
   llvm::SmallVector<MCSectionData*, 16> SectionOrder;
 
-  /// The last fragment which was layed out, or 0 if nothing has been layed
-  /// out. Fragments are always layed out in order, so all fragments with a
+  /// The last fragment which was laid out, or 0 if nothing has been laid
+  /// out. Fragments are always laid out in order, so all fragments with a
   /// lower ordinal will be up to date.
   mutable DenseMap<const MCSectionData*, MCFragment *> LastValidFragment;
 
@@ -58,7 +58,7 @@ public:
   void Invalidate(MCFragment *F);
 
   /// \brief Perform layout for a single fragment, assuming that the previous
-  /// fragment has already been layed out correctly, and the parent section has
+  /// fragment has already been laid out correctly, and the parent section has
   /// been initialized.
   void LayoutFragment(MCFragment *Fragment);
 
diff --git a/include/llvm/MC/MCAssembler.h b/include/llvm/MC/MCAssembler.h
index 30971c6..fc91966 100644
--- a/include/llvm/MC/MCAssembler.h
+++ b/include/llvm/MC/MCAssembler.h
@@ -706,7 +706,7 @@ private:
   /// \param DF The fragment the fixup is inside.
   /// \param Target [out] On return, the relocatable expression the fixup
   /// evaluates to.
-  /// \param Value [out] On return, the value of the fixup as currently layed
+  /// \param Value [out] On return, the value of the fixup as currently laid
   /// out.
   /// \return Whether the fixup value was fully resolved. This is true if the
   /// \arg Value result is fixed, otherwise the value may change due to
@@ -745,7 +745,7 @@ private:
                        MCFragment &F, const MCFixup &Fixup);
 
 public:
-  /// Compute the effective fragment size assuming it is layed out at the given
+  /// Compute the effective fragment size assuming it is laid out at the given
   /// \arg SectionAddress and \arg FragmentOffset.
   uint64_t ComputeFragmentSize(const MCAsmLayout &Layout, const MCFragment &F) const;
 
diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h
index f627976..43a9ce6 100644
--- a/include/llvm/MC/MCContext.h
+++ b/include/llvm/MC/MCContext.h
@@ -39,18 +39,27 @@ namespace llvm {
   class MCContext {
     MCContext(const MCContext&); // DO NOT IMPLEMENT
     MCContext &operator=(const MCContext&); // DO NOT IMPLEMENT
+  public:
+    typedef StringMap<MCSymbol*, BumpPtrAllocator&> SymbolTable;
+  private:
 
     /// The MCAsmInfo for this target.
     const MCAsmInfo &MAI;
 
     const TargetAsmInfo *TAI;
 
+    /// Allocator - Allocator object used for creating machine code objects.
+    ///
+    /// We use a bump pointer allocator to avoid the need to track all allocated
+    /// objects.
+    BumpPtrAllocator Allocator;
+
     /// Symbols - Bindings of names to symbols.
-    StringMap<MCSymbol*> Symbols;
+    SymbolTable Symbols;
 
     /// UsedNames - Keeps tracks of names that were used both for used declared
     /// and artificial symbols.
-    StringMap<bool> UsedNames;
+    StringMap<bool, BumpPtrAllocator&> UsedNames;
 
     /// NextUniqueID - The next ID to dole out to an unnamed assembler temporary
     /// symbol.
@@ -96,12 +105,6 @@ namespace llvm {
     /// the elements were added.
     std::vector<const MCSection *> MCLineSectionOrder;
 
-    /// Allocator - Allocator object used for creating machine code objects.
-    ///
-    /// We use a bump pointer allocator to avoid the need to track all allocated
-    /// objects.
-    BumpPtrAllocator Allocator;
-
     void *MachOUniquingMap, *ELFUniquingMap, *COFFUniquingMap;
 
     MCSymbol *CreateSymbol(StringRef Name);
@@ -142,6 +145,14 @@ namespace llvm {
     /// LookupSymbol - Get the symbol for \p Name, or null.
     MCSymbol *LookupSymbol(StringRef Name) const;
 
+    /// getSymbols - Get a reference for the symbol table for clients that
+    /// want to, for example, iterate over all symbols. 'const' because we
+    /// still want any modifications to the table itself to use the MCContext
+    /// APIs.
+    const SymbolTable &getSymbols() const {
+      return Symbols;
+    }
+
     /// @}
 
     /// @name Section Management
diff --git a/include/llvm/MC/MCDisassembler.h b/include/llvm/MC/MCDisassembler.h
index 0fdb51b..ce8759a 100644
--- a/include/llvm/MC/MCDisassembler.h
+++ b/include/llvm/MC/MCDisassembler.h
@@ -10,12 +10,14 @@
 #define MCDISASSEMBLER_H
 
 #include "llvm/Support/DataTypes.h"
+#include "llvm-c/Disassembler.h"
 
 namespace llvm {
   
 class MCInst;
 class MemoryObject;
 class raw_ostream;
+class MCContext;
   
 struct EDInstInfo;
 
@@ -24,7 +26,7 @@ struct EDInstInfo;
 class MCDisassembler {
 public:
   /// Constructor     - Performs initial setup for the disassembler.
-  MCDisassembler() {}
+  MCDisassembler() : GetOpInfo(0), DisInfo(0), Ctx(0) {}
   
   virtual ~MCDisassembler();
   
@@ -53,6 +55,30 @@ public:
   ///                   each MCInst opcode this disassembler returns.
   ///                   NULL if there is no info for this target.
   virtual EDInstInfo   *getEDInfo() const { return (EDInstInfo*)0; }
+
+private:
+  //
+  // Hooks for symbolic disassembly via the public 'C' interface.
+  //
+  // The function to get the symbolic information for operands.
+  LLVMOpInfoCallback GetOpInfo;
+  // The pointer to the block of symbolic information for above call back.
+  void *DisInfo;
+  // The assembly context for creating symbols and MCExprs in place of
+  // immediate operands when there is symbolic information.
+  MCContext *Ctx;
+
+public:
+  void setupForSymbolicDisassembly(LLVMOpInfoCallback getOpInfo,
+                                   void *disInfo,
+                                   MCContext *ctx) {
+    GetOpInfo = getOpInfo;
+    DisInfo = disInfo;
+    Ctx = ctx;
+  }
+  LLVMOpInfoCallback getLLVMOpInfoCallback() const { return GetOpInfo; }
+  void *getDisInfoBlock() const { return DisInfo; }
+  MCContext *getMCContext() const { return Ctx; }
 };
 
 } // namespace llvm
diff --git a/include/llvm/MC/MCDwarf.h b/include/llvm/MC/MCDwarf.h
index 07a7bad..90c3728 100644
--- a/include/llvm/MC/MCDwarf.h
+++ b/include/llvm/MC/MCDwarf.h
@@ -23,6 +23,7 @@
 #include <vector>
 
 namespace llvm {
+  class TargetAsmInfo;
   class MachineMove;
   class MCContext;
   class MCExpr;
@@ -230,7 +231,7 @@ namespace llvm {
 
   class MCCFIInstruction {
   public:
-    enum OpType { Remember, Restore, Move };
+    enum OpType { SameValue, Remember, Restore, Move, RelMove };
   private:
     OpType Operation;
     MCSymbol *Label;
@@ -242,10 +243,19 @@ namespace llvm {
       : Operation(Op), Label(L) {
       assert(Op == Remember || Op == Restore);
     }
+    MCCFIInstruction(OpType Op, MCSymbol *L, unsigned Register)
+      : Operation(Op), Label(L), Destination(Register) {
+      assert(Op == SameValue);
+    }
     MCCFIInstruction(MCSymbol *L, const MachineLocation &D,
                      const MachineLocation &S)
       : Operation(Move), Label(L), Destination(D), Source(S) {
     }
+    MCCFIInstruction(OpType Op, MCSymbol *L, const MachineLocation &D,
+                     const MachineLocation &S)
+      : Operation(Op), Label(L), Destination(D), Source(S) {
+      assert(Op == RelMove);
+    }
     OpType getOperation() const { return Operation; }
     MCSymbol *getLabel() const { return Label; }
     const MachineLocation &getDestination() const { return Destination; }
@@ -254,12 +264,13 @@ namespace llvm {
 
   struct MCDwarfFrameInfo {
     MCDwarfFrameInfo() : Begin(0), End(0), Personality(0), Lsda(0),
-                         Instructions(), PersonalityEncoding(0),
+                         Function(0), Instructions(), PersonalityEncoding(),
                          LsdaEncoding(0) {}
     MCSymbol *Begin;
     MCSymbol *End;
     const MCSymbol *Personality;
     const MCSymbol *Lsda;
+    const MCSymbol *Function;
     std::vector<MCCFIInstruction> Instructions;
     unsigned PersonalityEncoding;
     unsigned LsdaEncoding;
@@ -270,7 +281,8 @@ namespace llvm {
     //
     // This emits the frame info section.
     //
-    static void Emit(MCStreamer &streamer);
+    static void Emit(MCStreamer &streamer, bool usingCFI,
+                     bool isEH);
     static void EmitAdvanceLoc(MCStreamer &Streamer, uint64_t AddrDelta);
     static void EncodeAdvanceLoc(uint64_t AddrDelta, raw_ostream &OS);
   };
diff --git a/include/llvm/MC/MCELFSymbolFlags.h b/include/llvm/MC/MCELFSymbolFlags.h
index d798fb0..2225ea0 100644
--- a/include/llvm/MC/MCELFSymbolFlags.h
+++ b/include/llvm/MC/MCELFSymbolFlags.h
@@ -49,7 +49,8 @@ namespace llvm {
       ELF_STV_Hidden    = (ELF::STV_HIDDEN    << ELF_STV_Shift),
       ELF_STV_Protected = (ELF::STV_PROTECTED << ELF_STV_Shift),
 
-      ELF_Other_Weakref = (1                  << ELF_Other_Shift)
+      ELF_Other_Weakref = (1                  << ELF_Other_Shift),
+      ELF_Other_ThumbFunc = (2                << ELF_Other_Shift)
   };
 
 } // end namespace llvm
diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h
index fea5249..0f28599 100644
--- a/include/llvm/MC/MCExpr.h
+++ b/include/llvm/MC/MCExpr.h
@@ -19,6 +19,7 @@ class MCAsmInfo;
 class MCAsmLayout;
 class MCAssembler;
 class MCContext;
+class MCSection;
 class MCSectionData;
 class MCSymbol;
 class MCValue;
@@ -92,6 +93,12 @@ public:
   /// @result - True on success.
   bool EvaluateAsRelocatable(MCValue &Res, const MCAsmLayout &Layout) const;
 
+  /// FindAssociatedSection - Find the "associated section" for this expression,
+  /// which is currently defined as the absolute section for constants, or
+  /// otherwise the section associated with the first defined symbol in the
+  /// expression.
+  const MCSection *FindAssociatedSection() const;
+
   /// @}
 
   static bool classof(const MCExpr *) { return true; }
@@ -164,8 +171,10 @@ public:
     VK_ARM_GOTTPOFF,
 
     VK_PPC_TOC,
-    VK_PPC_HA16,  // ha16(symbol)
-    VK_PPC_LO16   // lo16(symbol)
+    VK_PPC_DARWIN_HA16,  // ha16(symbol)
+    VK_PPC_DARWIN_LO16,  // lo16(symbol)
+    VK_PPC_GAS_HA16,     // symbol@ha
+    VK_PPC_GAS_LO16      // symbol@l
   };
 
 private:
@@ -420,6 +429,7 @@ public:
   virtual bool EvaluateAsRelocatableImpl(MCValue &Res,
                                          const MCAsmLayout *Layout) const = 0;
   virtual void AddValueSymbols(MCAssembler *) const = 0;
+  virtual const MCSection *FindAssociatedSection() const = 0;
 
   static bool classof(const MCExpr *E) {
     return E->getKind() == MCExpr::Target;
diff --git a/include/llvm/MC/MCInstPrinter.h b/include/llvm/MC/MCInstPrinter.h
index 0669558..39002da 100644
--- a/include/llvm/MC/MCInstPrinter.h
+++ b/include/llvm/MC/MCInstPrinter.h
@@ -45,8 +45,8 @@ public:
   /// "MOV32ri") or empty if we can't resolve it.
   virtual StringRef getOpcodeName(unsigned Opcode) const;
 
-  /// getRegName - Return the assembler register name.
-  virtual StringRef getRegName(unsigned RegNo) const;
+  /// printRegName - Print the assembler register name.
+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
 
   unsigned getAvailableFeatures() const { return AvailableFeatures; }
   void setAvailableFeatures(unsigned Value) { AvailableFeatures = Value; }
diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h
index 221a684..8b0d87a 100644
--- a/include/llvm/MC/MCObjectStreamer.h
+++ b/include/llvm/MC/MCObjectStreamer.h
@@ -63,9 +63,9 @@ public:
 
   virtual void EmitLabel(MCSymbol *Symbol);
   virtual void EmitValueImpl(const MCExpr *Value, unsigned Size,
-                             bool isPCRel, unsigned AddrSpace);
-  virtual void EmitULEB128Value(const MCExpr *Value, unsigned AddrSpace = 0);
-  virtual void EmitSLEB128Value(const MCExpr *Value, unsigned AddrSpace = 0);
+                             unsigned AddrSpace);
+  virtual void EmitULEB128Value(const MCExpr *Value);
+  virtual void EmitSLEB128Value(const MCExpr *Value);
   virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol);
   virtual void ChangeSection(const MCSection *Section);
   virtual void EmitInstruction(const MCInst &Inst);
diff --git a/include/llvm/MC/MCParser/MCAsmLexer.h b/include/llvm/MC/MCParser/MCAsmLexer.h
index 606725a..47c580f 100644
--- a/include/llvm/MC/MCParser/MCAsmLexer.h
+++ b/include/llvm/MC/MCParser/MCAsmLexer.h
@@ -44,6 +44,7 @@ public:
     Colon,
     Plus, Minus, Tilde,
     Slash,    // '/'
+    BackSlash, // '\'
     LParen, RParen, LBrac, RBrac, LCurly, RCurly,
     Star, Dot, Comma, Dollar, Equal, EqualEqual,
 
diff --git a/include/llvm/MC/MCParser/MCAsmParser.h b/include/llvm/MC/MCParser/MCAsmParser.h
index 54979d9..7376693 100644
--- a/include/llvm/MC/MCParser/MCAsmParser.h
+++ b/include/llvm/MC/MCParser/MCAsmParser.h
@@ -71,7 +71,9 @@ public:
 
   /// Warning - Emit a warning at the location \arg L, with the message \arg
   /// Msg.
-  virtual void Warning(SMLoc L, const Twine &Msg) = 0;
+  ///
+  /// \return The return value is true, if warnings are fatal.
+  virtual bool Warning(SMLoc L, const Twine &Msg) = 0;
 
   /// Error - Emit an error at the location \arg L, with the message \arg
   /// Msg.
diff --git a/include/llvm/MC/MCParser/MCAsmParserExtension.h b/include/llvm/MC/MCParser/MCAsmParserExtension.h
index ceb57f5..4e2aee9 100644
--- a/include/llvm/MC/MCParser/MCAsmParserExtension.h
+++ b/include/llvm/MC/MCParser/MCAsmParserExtension.h
@@ -56,7 +56,7 @@ public:
   MCAsmParser &getParser() { return *Parser; }
   SourceMgr &getSourceManager() { return getParser().getSourceManager(); }
   MCStreamer &getStreamer() { return getParser().getStreamer(); }
-  void Warning(SMLoc L, const Twine &Msg) {
+  bool Warning(SMLoc L, const Twine &Msg) {
     return getParser().Warning(L, Msg);
   }
   bool Error(SMLoc L, const Twine &Msg) {
diff --git a/include/llvm/MC/MCSection.h b/include/llvm/MC/MCSection.h
index 1c01b2f..5700817 100644
--- a/include/llvm/MC/MCSection.h
+++ b/include/llvm/MC/MCSection.h
@@ -14,7 +14,6 @@
 #ifndef LLVM_MC_MCSECTION_H
 #define LLVM_MC_MCSECTION_H
 
-#include <string>
 #include "llvm/ADT/StringRef.h"
 #include "llvm/MC/SectionKind.h"
 #include "llvm/Support/Casting.h"
diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h
index 4fdbc44..c05a925 100644
--- a/include/llvm/MC/MCStreamer.h
+++ b/include/llvm/MC/MCStreamer.h
@@ -18,6 +18,7 @@
 #include "llvm/Support/DataTypes.h"
 #include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCWin64EH.h"
 
 namespace llvm {
   class MCAsmInfo;
@@ -50,13 +51,20 @@ namespace llvm {
     MCStreamer(const MCStreamer&); // DO NOT IMPLEMENT
     MCStreamer &operator=(const MCStreamer&); // DO NOT IMPLEMENT
 
-    void EmitSymbolValue(const MCSymbol *Sym, unsigned Size,
-                         bool isPCRel, unsigned AddrSpace);
+    bool EmitEHFrame;
+    bool EmitDebugFrame;
 
     std::vector<MCDwarfFrameInfo> FrameInfos;
     MCDwarfFrameInfo *getCurrentFrameInfo();
     void EnsureValidFrame();
 
+    std::vector<MCWin64EHUnwindInfo *> W64UnwindInfos;
+    MCWin64EHUnwindInfo *CurrentW64UnwindInfo;
+    void setCurrentW64UnwindInfo(MCWin64EHUnwindInfo *Frame);
+    void EnsureValidW64UnwindInfo();
+
+    const MCSymbol* LastNonPrivate;
+
     /// SectionStack - This is stack of current and previous section
     /// values saved by PushSection.
     SmallVector<std::pair<const MCSection *,
@@ -65,6 +73,16 @@ namespace llvm {
   protected:
     MCStreamer(MCContext &Ctx);
 
+    const MCExpr *BuildSymbolDiff(MCContext &Context, const MCSymbol *A,
+                                  const MCSymbol *B);
+
+    const MCExpr *ForceExpAbs(const MCExpr* Expr);
+
+    void EmitFrames(bool usingCFI);
+
+    MCWin64EHUnwindInfo *getCurrentW64UnwindInfo(){return CurrentW64UnwindInfo;}
+    void EmitW64Tables();
+
   public:
     virtual ~MCStreamer();
 
@@ -78,6 +96,14 @@ namespace llvm {
       return FrameInfos[i];
     }
 
+    unsigned getNumW64UnwindInfos() {
+      return W64UnwindInfos.size();
+    }
+
+    MCWin64EHUnwindInfo &getW64UnwindInfo(unsigned i) {
+      return *W64UnwindInfos[i];
+    }
+
     /// @name Assembly File Formatting.
     /// @{
 
@@ -169,6 +195,17 @@ namespace llvm {
       }
     }
 
+    /// SwitchSectionNoChange - Set the current section where code is being
+    /// emitted to @p Section.  This is required to update CurSection. This
+    /// version does not call ChangeSection.
+    void SwitchSectionNoChange(const MCSection *Section) {
+      assert(Section && "Cannot switch to a null section!");
+      const MCSection *curSection = SectionStack.back().first;
+      SectionStack.back().second = curSection;
+      if (Section != curSection)
+        SectionStack.back().first = Section;
+    }
+
     /// InitSections - Create the default sections and set the initial one.
     virtual void InitSections() = 0;
 
@@ -180,7 +217,10 @@ namespace llvm {
     /// @param Symbol - The symbol to emit. A given symbol should only be
     /// emitted as a label once, and symbols emitted as a label should never be
     /// used in an assignment.
-    virtual void EmitLabel(MCSymbol *Symbol) = 0;
+    virtual void EmitLabel(MCSymbol *Symbol);
+
+    virtual void EmitEHSymAttributes(const MCSymbol *Symbol,
+                                     MCSymbol *EHSymbol);
 
     /// EmitAssemblerFlag - Note in the output the specified @p Flag
     virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) = 0;
@@ -280,6 +320,7 @@ namespace llvm {
     /// if non-zero.  This must be a power of 2 on some targets.
     virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
                                 uint64_t Size, unsigned ByteAlignment = 0) = 0;
+
     /// @}
     /// @name Generating Data
     /// @{
@@ -300,13 +341,10 @@ namespace llvm {
     /// @param Size - The size of the integer (in bytes) to emit. This must
     /// match a native machine width.
     virtual void EmitValueImpl(const MCExpr *Value, unsigned Size,
-                               bool isPCRel, unsigned AddrSpace) = 0;
+                               unsigned AddrSpace) = 0;
 
     void EmitValue(const MCExpr *Value, unsigned Size, unsigned AddrSpace = 0);
 
-    void EmitPCRelValue(const MCExpr *Value, unsigned Size,
-                        unsigned AddrSpace = 0);
-
     /// EmitIntValue - Special case of EmitValue that avoids the client having
     /// to pass in a MCExpr for constant integers.
     virtual void EmitIntValue(uint64_t Value, unsigned Size,
@@ -319,11 +357,9 @@ namespace llvm {
     void EmitAbsValue(const MCExpr *Value, unsigned Size,
                       unsigned AddrSpace = 0);
 
-    virtual void EmitULEB128Value(const MCExpr *Value,
-                                  unsigned AddrSpace = 0) = 0;
+    virtual void EmitULEB128Value(const MCExpr *Value) = 0;
 
-    virtual void EmitSLEB128Value(const MCExpr *Value,
-                                  unsigned AddrSpace = 0) = 0;
+    virtual void EmitSLEB128Value(const MCExpr *Value) = 0;
 
     /// EmitULEB128Value - Special case of EmitULEB128Value that avoids the
     /// client having to pass in a MCExpr for constant integers.
@@ -338,9 +374,6 @@ namespace llvm {
     void EmitSymbolValue(const MCSymbol *Sym, unsigned Size,
                          unsigned AddrSpace = 0);
 
-    void EmitPCRelSymbolValue(const MCSymbol *Sym, unsigned Size,
-                              unsigned AddrSpace = 0);
-
     /// EmitGPRel32Value - Emit the expression @p Value into the output as a
     /// gprel32 (32-bit GP relative) value.
     ///
@@ -422,7 +455,8 @@ namespace llvm {
     virtual void EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
                                        unsigned Column, unsigned Flags,
                                        unsigned Isa,
-                                       unsigned Discriminator);
+                                       unsigned Discriminator,
+                                       StringRef FileName);
 
     virtual void EmitDwarfAdvanceLineAddr(int64_t LineDelta,
                                           const MCSymbol *LastLabel,
@@ -435,17 +469,35 @@ namespace llvm {
     void EmitDwarfSetLineAddr(int64_t LineDelta, const MCSymbol *Label,
                               int PointerSize);
 
-    virtual bool EmitCFIStartProc();
-    virtual bool EmitCFIEndProc();
-    virtual bool EmitCFIDefCfa(int64_t Register, int64_t Offset);
-    virtual bool EmitCFIDefCfaOffset(int64_t Offset);
-    virtual bool EmitCFIDefCfaRegister(int64_t Register);
-    virtual bool EmitCFIOffset(int64_t Register, int64_t Offset);
-    virtual bool EmitCFIPersonality(const MCSymbol *Sym,
-                                    unsigned Encoding);
-    virtual bool EmitCFILsda(const MCSymbol *Sym, unsigned Encoding);
-    virtual bool EmitCFIRememberState();
-    virtual bool EmitCFIRestoreState();
+    virtual void EmitCFISections(bool EH, bool Debug);
+    virtual void EmitCFIStartProc();
+    virtual void EmitCFIEndProc();
+    virtual void EmitCFIDefCfa(int64_t Register, int64_t Offset);
+    virtual void EmitCFIDefCfaOffset(int64_t Offset);
+    virtual void EmitCFIDefCfaRegister(int64_t Register);
+    virtual void EmitCFIOffset(int64_t Register, int64_t Offset);
+    virtual void EmitCFIPersonality(const MCSymbol *Sym, unsigned Encoding);
+    virtual void EmitCFILsda(const MCSymbol *Sym, unsigned Encoding);
+    virtual void EmitCFIRememberState();
+    virtual void EmitCFIRestoreState();
+    virtual void EmitCFISameValue(int64_t Register);
+    virtual void EmitCFIRelOffset(int64_t Register, int64_t Offset);
+    virtual void EmitCFIAdjustCfaOffset(int64_t Adjustment);
+
+    virtual void EmitWin64EHStartProc(const MCSymbol *Symbol);
+    virtual void EmitWin64EHEndProc();
+    virtual void EmitWin64EHStartChained();
+    virtual void EmitWin64EHEndChained();
+    virtual void EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind,
+                                    bool Except);
+    virtual void EmitWin64EHHandlerData();
+    virtual void EmitWin64EHPushReg(unsigned Register);
+    virtual void EmitWin64EHSetFrame(unsigned Register, unsigned Offset);
+    virtual void EmitWin64EHAllocStack(unsigned Size);
+    virtual void EmitWin64EHSaveReg(unsigned Register, unsigned Offset);
+    virtual void EmitWin64EHSaveXMM(unsigned Register, unsigned Offset);
+    virtual void EmitWin64EHPushFrame(bool Code);
+    virtual void EmitWin64EHEndProlog();
 
     /// EmitInstruction - Emit the given @p Instruction into the current
     /// section.
@@ -498,6 +550,7 @@ namespace llvm {
   MCStreamer *createAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
                                 bool isVerboseAsm,
                                 bool useLoc,
+                                bool useCFI,
                                 MCInstPrinter *InstPrint = 0,
                                 MCCodeEmitter *CE = 0,
                                 TargetAsmBackend *TAB = 0,
diff --git a/include/llvm/MC/MCSymbol.h b/include/llvm/MC/MCSymbol.h
index 7da4d7c..0583ce5 100644
--- a/include/llvm/MC/MCSymbol.h
+++ b/include/llvm/MC/MCSymbol.h
@@ -56,6 +56,7 @@ namespace llvm {
     mutable unsigned IsUsed : 1;
 
   private:  // MCContext creates and uniques these.
+    friend class MCExpr;
     friend class MCContext;
     MCSymbol(StringRef name, bool isTemporary)
       : Name(name), Section(0), Value(0),
diff --git a/include/llvm/MC/MCWin64EH.h b/include/llvm/MC/MCWin64EH.h
new file mode 100644
index 0000000..eb4665a
--- /dev/null
+++ b/include/llvm/MC/MCWin64EH.h
@@ -0,0 +1,93 @@
+//===- MCWin64EH.h - Machine Code Win64 EH support --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains declarations to support the Win64 Exception Handling
+// scheme in MC.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MC_MCWIN64EH_H
+#define LLVM_MC_MCWIN64EH_H
+
+#include "llvm/Support/Win64EH.h"
+#include <cassert>
+#include <vector>
+
+namespace llvm {
+  class StringRef;
+  class MCStreamer;
+  class MCSymbol;
+
+  class MCWin64EHInstruction {
+  public:
+    typedef Win64EH::UnwindOpcodes OpType;
+  private:
+    OpType Operation;
+    MCSymbol *Label;
+    unsigned Offset;
+    unsigned Register;
+  public:
+    MCWin64EHInstruction(OpType Op, MCSymbol *L, unsigned Reg)
+      : Operation(Op), Label(L), Offset(0), Register(Reg) {
+     assert(Op == Win64EH::UOP_PushNonVol);
+    }
+    MCWin64EHInstruction(MCSymbol *L, unsigned Size)
+      : Operation(Size>128 ? Win64EH::UOP_AllocLarge : Win64EH::UOP_AllocSmall),
+        Label(L), Offset(Size) { }
+    MCWin64EHInstruction(OpType Op, MCSymbol *L, unsigned Reg, unsigned Off)
+      : Operation(Op), Label(L), Offset(Off), Register(Reg) {
+      assert(Op == Win64EH::UOP_SetFPReg ||
+             Op == Win64EH::UOP_SaveNonVol ||
+             Op == Win64EH::UOP_SaveNonVolBig ||
+             Op == Win64EH::UOP_SaveXMM128 ||
+             Op == Win64EH::UOP_SaveXMM128Big);
+    }
+    MCWin64EHInstruction(OpType Op, MCSymbol *L, bool Code)
+      : Operation(Op), Label(L), Offset(Code ? 1 : 0) {
+      assert(Op == Win64EH::UOP_PushMachFrame);
+    }
+    OpType getOperation() const { return Operation; }
+    MCSymbol *getLabel() const { return Label; }
+    unsigned getOffset() const { return Offset; }
+    unsigned getSize() const { return Offset; }
+    unsigned getRegister() const { return Register; }
+    bool isPushCodeFrame() const { return Offset == 1; }
+  };
+
+  struct MCWin64EHUnwindInfo {
+    MCWin64EHUnwindInfo() : Begin(0), End(0), ExceptionHandler(0),
+                            Function(0), PrologEnd(0), Symbol(0),
+                            HandlesUnwind(false), HandlesExceptions(false),
+                            LastFrameInst(-1), ChainedParent(0),
+                            Instructions() {}
+    MCSymbol *Begin;
+    MCSymbol *End;
+    const MCSymbol *ExceptionHandler;
+    const MCSymbol *Function;
+    MCSymbol *PrologEnd;
+    MCSymbol *Symbol;
+    bool HandlesUnwind;
+    bool HandlesExceptions;
+    int LastFrameInst;
+    MCWin64EHUnwindInfo *ChainedParent;
+    std::vector<MCWin64EHInstruction> Instructions;
+  };
+
+  class MCWin64EHUnwindEmitter {
+  public:
+    static StringRef GetSectionSuffix(const MCSymbol *func);
+    //
+    // This emits the unwind info sections (.pdata and .xdata in PE/COFF).
+    //
+    static void Emit(MCStreamer &streamer);
+    static void EmitUnwindInfo(MCStreamer &streamer, MCWin64EHUnwindInfo *info);
+  };
+} // end namespace llvm
+
+#endif
diff --git a/include/llvm/Metadata.h b/include/llvm/Metadata.h
index 6a61996..887e33c 100644
--- a/include/llvm/Metadata.h
+++ b/include/llvm/Metadata.h
@@ -34,7 +34,7 @@ template<typename ValueSubClass, typename ItemParentClass>
 //===----------------------------------------------------------------------===//
 /// MDString - a single uniqued string.
 /// These are used to efficiently contain a byte sequence for metadata.
-/// MDString is always unnamd.
+/// MDString is always unnamed.
 class MDString : public Value {
   MDString(const MDString &);            // DO NOT IMPLEMENT
 
@@ -111,30 +111,25 @@ class MDNode : public Value, public FoldingSetNode {
   void replaceOperand(MDNodeOperand *Op, Value *NewVal);
   ~MDNode();
 
-  MDNode(LLVMContext &C, Value *const *Vals, unsigned NumVals,
-         bool isFunctionLocal);
+  MDNode(LLVMContext &C, ArrayRef<Value*> Vals, bool isFunctionLocal);
   
-  static MDNode *getMDNode(LLVMContext &C, Value *const *Vals, unsigned NumVals,
+  static MDNode *getMDNode(LLVMContext &C, ArrayRef<Value*> Vals,
                            FunctionLocalness FL, bool Insert = true);
 public:
   // Constructors and destructors.
-  static MDNode *get(LLVMContext &Context, ArrayRef<Value*> V);
-  // FIXME: Eliminate this constructor form.
-  static MDNode *get(LLVMContext &Context, Value *const *Vals,
-                     unsigned NumVals);
+  static MDNode *get(LLVMContext &Context, ArrayRef<Value*> Vals);
   // getWhenValsUnresolved - Construct MDNode determining function-localness
   // from isFunctionLocal argument, not by analyzing Vals.
-  static MDNode *getWhenValsUnresolved(LLVMContext &Context, Value *const *Vals,
-                                       unsigned NumVals, bool isFunctionLocal);
+  static MDNode *getWhenValsUnresolved(LLVMContext &Context,
+                                       ArrayRef<Value*> Vals,
+                                       bool isFunctionLocal);
                                        
-  static MDNode *getIfExists(LLVMContext &Context, Value *const *Vals,
-                             unsigned NumVals);
+  static MDNode *getIfExists(LLVMContext &Context, ArrayRef<Value*> Vals);
 
   /// getTemporary - Return a temporary MDNode, for use in constructing
   /// cyclic MDNode structures. A temporary MDNode is not uniqued,
   /// may be RAUW'd, and must be manually deleted with deleteTemporary.
-  static MDNode *getTemporary(LLVMContext &Context, Value *const *Vals,
-                              unsigned NumVals);
+  static MDNode *getTemporary(LLVMContext &Context, ArrayRef<Value*> Vals);
 
   /// deleteTemporary - Deallocate a node created by getTemporary. The
   /// node must not have any users.
diff --git a/include/llvm/Operator.h b/include/llvm/Operator.h
index ff2a0ad..e9aa499 100644
--- a/include/llvm/Operator.h
+++ b/include/llvm/Operator.h
@@ -186,28 +186,46 @@ public:
 };
 
 class AddOperator
-  : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Add> {};
+  : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Add> {
+  ~AddOperator(); // DO NOT IMPLEMENT
+};
 class SubOperator
-  : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Sub> {};
+  : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Sub> {
+  ~SubOperator(); // DO NOT IMPLEMENT
+};
 class MulOperator
-  : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Mul> {};
+  : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Mul> {
+  ~MulOperator(); // DO NOT IMPLEMENT
+};
 class ShlOperator
-  : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Shl> {};
+  : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Shl> {
+  ~ShlOperator(); // DO NOT IMPLEMENT
+};
 
   
 class SDivOperator
-  : public ConcreteOperator<PossiblyExactOperator, Instruction::SDiv> {};
+  : public ConcreteOperator<PossiblyExactOperator, Instruction::SDiv> {
+  ~SDivOperator(); // DO NOT IMPLEMENT
+};
 class UDivOperator
-  : public ConcreteOperator<PossiblyExactOperator, Instruction::UDiv> {};
+  : public ConcreteOperator<PossiblyExactOperator, Instruction::UDiv> {
+  ~UDivOperator(); // DO NOT IMPLEMENT
+};
 class AShrOperator
-  : public ConcreteOperator<PossiblyExactOperator, Instruction::AShr> {};
+  : public ConcreteOperator<PossiblyExactOperator, Instruction::AShr> {
+  ~AShrOperator(); // DO NOT IMPLEMENT
+};
 class LShrOperator
-  : public ConcreteOperator<PossiblyExactOperator, Instruction::LShr> {};
+  : public ConcreteOperator<PossiblyExactOperator, Instruction::LShr> {
+  ~LShrOperator(); // DO NOT IMPLEMENT
+};
   
   
   
 class GEPOperator
   : public ConcreteOperator<Operator, Instruction::GetElementPtr> {
+  ~GEPOperator(); // DO NOT IMPLEMENT
+
   enum {
     IsInBounds = (1 << 0)
   };
diff --git a/include/llvm/Pass.h b/include/llvm/Pass.h
index ed0fb39..04dd8b6 100644
--- a/include/llvm/Pass.h
+++ b/include/llvm/Pass.h
@@ -13,7 +13,7 @@
 // Passes are designed this way so that it is possible to run passes in a cache
 // and organizationally optimal order without having to specify it at the front
 // end.  This allows arbitrary passes to be strung together and have them
-// executed as effeciently as possible.
+// executed as efficiently as possible.
 //
 // Passes should extend one of the classes below, depending on the guarantees
 // that it can make about what will be modified as it is run.  For example, most
@@ -114,7 +114,7 @@ public:
   void dump() const; // dump - Print to stderr.
 
   /// createPrinterPass - Get a Pass appropriate to print the IR this
-  /// pass operates one (Module, Function or MachineFunction).
+  /// pass operates on (Module, Function or MachineFunction).
   virtual Pass *createPrinterPass(raw_ostream &O,
                                   const std::string &Banner) const = 0;
 
@@ -320,7 +320,7 @@ class BasicBlockPass : public Pass {
 public:
   explicit BasicBlockPass(char &pid) : Pass(PT_BasicBlock, pid) {}
 
-  /// createPrinterPass - Get a function printer pass.
+  /// createPrinterPass - Get a basic block printer pass.
   Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const;
 
   /// doInitialization - Virtual method overridden by subclasses to do
diff --git a/include/llvm/Support/Allocator.h b/include/llvm/Support/Allocator.h
index c680709..a2ad24f 100644
--- a/include/llvm/Support/Allocator.h
+++ b/include/llvm/Support/Allocator.h
@@ -177,6 +177,9 @@ public:
   unsigned GetNumSlabs() const;
 
   void PrintStats() const;
+  
+  /// Compute the total physical memory allocated by this allocator.
+  size_t getTotalMemory() const;
 };
 
 /// SpecificBumpPtrAllocator - Same as BumpPtrAllocator but allows only
diff --git a/include/llvm/Support/BranchProbability.h b/include/llvm/Support/BranchProbability.h
new file mode 100644
index 0000000..7ba6491
--- /dev/null
+++ b/include/llvm/Support/BranchProbability.h
@@ -0,0 +1,50 @@
+//===- BranchProbability.h - Branch Probability Analysis --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Definition of BranchProbability shared by IR and Machine Instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_BRANCHPROBABILITY_H
+#define LLVM_SUPPORT_BRANCHPROBABILITY_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+class BranchProbabilityInfo;
+class MachineBranchProbabilityInfo;
+class MachineBasicBlock;
+
+// This class represents Branch Probability as a non-negative fraction.
+class BranchProbability {
+  friend class BranchProbabilityInfo;
+  friend class MachineBranchProbabilityInfo;
+  friend class MachineBasicBlock;
+
+  // Numerator
+  uint32_t N;
+
+  // Denominator
+  uint32_t D;
+
+  BranchProbability(uint32_t n, uint32_t d);
+
+public:
+  raw_ostream &print(raw_ostream &OS) const;
+
+  void dump() const;
+};
+
+raw_ostream &operator<<(raw_ostream &OS, const BranchProbability &Prob);
+
+}
+
+#endif
diff --git a/include/llvm/Support/Casting.h b/include/llvm/Support/Casting.h
index 6bb9806..3aab436 100644
--- a/include/llvm/Support/Casting.h
+++ b/include/llvm/Support/Casting.h
@@ -23,8 +23,6 @@ namespace llvm {
 //                          isa<x> Support Templates
 //===----------------------------------------------------------------------===//
 
-template<typename FromCl> struct isa_impl_cl;
-
 // Define a template that can be specialized by smart pointers to reflect the
 // fact that they are automatically dereferenced, and are not involved with the
 // template selection process...  the default implementation is a noop.
@@ -43,12 +41,9 @@ template<typename From> struct simplify_type<const From> {
   }
 };
 
-
-// isa<X> - Return true if the parameter to the template is an instance of the
-// template type argument.  Used like this:
-//
-//  if (isa<Type*>(myVal)) { ... }
-//
+// The core of the implementation of isa<X> is here; To and From should be
+// the names of classes.  This template can be specialized to customize the
+// implementation of isa<> without rewriting it from scratch.
 template <typename To, typename From>
 struct isa_impl {
   static inline bool doit(const From &Val) {
@@ -56,66 +51,63 @@ struct isa_impl {
   }
 };
 
-template<typename To, typename From, typename SimpleType>
-struct isa_impl_wrap {
-  // When From != SimplifiedType, we can simplify the type some more by using
-  // the simplify_type template.
-  static bool doit(const From &Val) {
-    return isa_impl_cl<const SimpleType>::template
-                    isa<To>(simplify_type<const From>::getSimplifiedValue(Val));
+template <typename To, typename From> struct isa_impl_cl {
+  static inline bool doit(const From &Val) {
+    return isa_impl<To, From>::doit(Val);
   }
 };
 
-template<typename To, typename FromTy>
-struct isa_impl_wrap<To, const FromTy, const FromTy> {
-  // When From == SimpleType, we are as simple as we are going to get.
-  static bool doit(const FromTy &Val) {
-    return isa_impl<To,FromTy>::doit(Val);
+template <typename To, typename From> struct isa_impl_cl<To, const From> {
+  static inline bool doit(const From &Val) {
+    return isa_impl<To, From>::doit(Val);
   }
 };
 
-// isa_impl_cl - Use class partial specialization to transform types to a single
-// canonical form for isa_impl.
-//
-template<typename FromCl>
-struct isa_impl_cl {
-  template<class ToCl>
-  static bool isa(const FromCl &Val) {
-    return isa_impl_wrap<ToCl,const FromCl,
-                   typename simplify_type<const FromCl>::SimpleType>::doit(Val);
+template <typename To, typename From> struct isa_impl_cl<To, From*> {
+  static inline bool doit(const From *Val) {
+    return isa_impl<To, From>::doit(*Val);
   }
 };
 
-// Specialization used to strip const qualifiers off of the FromCl type...
-template<typename FromCl>
-struct isa_impl_cl<const FromCl> {
-  template<class ToCl>
-  static bool isa(const FromCl &Val) {
-    return isa_impl_cl<FromCl>::template isa<ToCl>(Val);
+template <typename To, typename From> struct isa_impl_cl<To, const From*> {
+  static inline bool doit(const From *Val) {
+    return isa_impl<To, From>::doit(*Val);
   }
 };
 
-// Define pointer traits in terms of base traits...
-template<class FromCl>
-struct isa_impl_cl<FromCl*> {
-  template<class ToCl>
-  static bool isa(FromCl *Val) {
-    return isa_impl_cl<FromCl>::template isa<ToCl>(*Val);
+template <typename To, typename From> struct isa_impl_cl<To, const From*const> {
+  static inline bool doit(const From *Val) {
+    return isa_impl<To, From>::doit(*Val);
   }
 };
 
-// Define reference traits in terms of base traits...
-template<class FromCl>
-struct isa_impl_cl<FromCl&> {
-  template<class ToCl>
-  static bool isa(FromCl &Val) {
-    return isa_impl_cl<FromCl>::template isa<ToCl>(&Val);
+template<typename To, typename From, typename SimpleFrom>
+struct isa_impl_wrap {
+  // When From != SimplifiedType, we can simplify the type some more by using
+  // the simplify_type template.
+  static bool doit(const From &Val) {
+    return isa_impl_wrap<To, SimpleFrom,
+      typename simplify_type<SimpleFrom>::SimpleType>::doit(
+                          simplify_type<From>::getSimplifiedValue(Val));
+  }
+};
+
+template<typename To, typename FromTy>
+struct isa_impl_wrap<To, FromTy, FromTy> {
+  // When From == SimpleType, we are as simple as we are going to get.
+  static bool doit(const FromTy &Val) {
+    return isa_impl_cl<To,FromTy>::doit(Val);
   }
 };
 
+// isa<X> - Return true if the parameter to the template is an instance of the
+// template type argument.  Used like this:
+//
+//  if (isa<Type>(myVal)) { ... }
+//
 template <class X, class Y>
 inline bool isa(const Y &Val) {
-  return isa_impl_cl<Y>::template isa<X>(Val);
+  return isa_impl_wrap<X, Y, typename simplify_type<Y>::SimpleType>::doit(Val);
 }
 
 //===----------------------------------------------------------------------===//
@@ -192,8 +184,8 @@ template<class To, class FromTy> struct cast_convert_val<To,FromTy,FromTy> {
 
 // cast<X> - Return the argument parameter cast to the specified type.  This
 // casting operator asserts that the type is correct, so it does not return null
-// on failure.  But it will correctly return NULL when the input is NULL.
-// Used Like this:
+// on failure.  It does not allow a null argument (use cast_or_null for that).
+// It is typically used like this:
 //
 //  cast<Instruction>(myVal)->getParent()
 //
diff --git a/include/llvm/Support/CommandLine.h b/include/llvm/Support/CommandLine.h
index c57d7dd..d609871 100644
--- a/include/llvm/Support/CommandLine.h
+++ b/include/llvm/Support/CommandLine.h
@@ -334,11 +334,11 @@ struct OptionValueBase : public GenericOptionValue {
 
   // Some options may take their value from a different data type.
   template<class DT>
-  void setValue(const DT& V) {}
+  void setValue(const DT& /*V*/) {}
 
-  bool compare(const DataType &V) const { return false; }
+  bool compare(const DataType &/*V*/) const { return false; }
 
-  virtual bool compare(const GenericOptionValue& V) const { return false; }
+  virtual bool compare(const GenericOptionValue& /*V*/) const { return false; }
 };
 
 // Simple copy of the option value.
@@ -904,8 +904,8 @@ void printOptionDiff(const Option &O, const generic_parser_base &P, const DT &V,
 // type than the option value. e.g. HelpPrinter.
 template<class ParserDT, class ValDT>
 struct OptionDiffPrinter {
-  void print(const Option &O, const parser<ParserDT> P, const ValDT &V,
-             const OptionValue<ValDT> &Default, size_t GlobalWidth) {
+  void print(const Option &O, const parser<ParserDT> P, const ValDT &/*V*/,
+             const OptionValue<ValDT> &/*Default*/, size_t GlobalWidth) {
     P.printOptionNoValue(O, GlobalWidth);
   }
 };
@@ -1287,7 +1287,7 @@ class list : public Option, public list_storage<DataType, Storage> {
   }
 
   // Unimplemented: list options don't currently store their default value.
-  virtual void printOptionValue(size_t GlobalWidth, bool Force) const {}
+  virtual void printOptionValue(size_t /*GlobalWidth*/, bool /*Force*/) const {}
 
   void done() {
     addArgument();
@@ -1489,7 +1489,7 @@ class bits : public Option, public bits_storage<DataType, Storage> {
   }
 
   // Unimplemented: bits options don't currently store their default values.
-  virtual void printOptionValue(size_t GlobalWidth, bool Force) const {}
+  virtual void printOptionValue(size_t /*GlobalWidth*/, bool /*Force*/) const {}
 
   void done() {
     addArgument();
@@ -1583,7 +1583,7 @@ class alias : public Option {
   virtual void printOptionInfo(size_t GlobalWidth) const;
 
   // Aliases do not need to print their values.
-  virtual void printOptionValue(size_t GlobalWidth, bool Force) const {}
+  virtual void printOptionValue(size_t /*GlobalWidth*/, bool /*Force*/) const {}
 
   void done() {
     if (!hasArgStr())
diff --git a/include/llvm/Support/CrashRecoveryContext.h b/include/llvm/Support/CrashRecoveryContext.h
index db835e8..4c0a5e2 100644
--- a/include/llvm/Support/CrashRecoveryContext.h
+++ b/include/llvm/Support/CrashRecoveryContext.h
@@ -186,8 +186,13 @@ public:
   }
 
   ~CrashRecoveryContextCleanupRegistrar() {
+    unregister();
+  }
+  
+  void unregister() {
     if (cleanup && !cleanup->cleanupFired)
-        cleanup->getContext()->unregisterCleanup(cleanup);
+      cleanup->getContext()->unregisterCleanup(cleanup);
+    cleanup = 0;
   }
 };
 }
diff --git a/include/llvm/Support/DebugLoc.h b/include/llvm/Support/DebugLoc.h
index 8d19e30..98a05a4 100644
--- a/include/llvm/Support/DebugLoc.h
+++ b/include/llvm/Support/DebugLoc.h
@@ -31,7 +31,7 @@ namespace llvm {
     /// not equal to the tombstone key or DebugLoc().
     static DebugLoc getEmptyKey() {
       DebugLoc DL;
-      DL.LineCol = -1;
+      DL.LineCol = 1;
       return DL;
     }
 
@@ -39,7 +39,7 @@ namespace llvm {
     /// is not equal to the empty key or DebugLoc().
     static DebugLoc getTombstoneKey() {
       DebugLoc DL;
-      DL.LineCol = -2;
+      DL.LineCol = 2;
       return DL;
     }
 
diff --git a/include/llvm/Support/Dwarf.h b/include/llvm/Support/Dwarf.h
index 5d0b5a9..70bac0c 100644
--- a/include/llvm/Support/Dwarf.h
+++ b/include/llvm/Support/Dwarf.h
@@ -231,6 +231,11 @@ enum dwarf_constants {
   DW_AT_APPLE_major_runtime_vers = 0x3fe5,
   DW_AT_APPLE_runtime_class = 0x3fe6,
   DW_AT_APPLE_omit_frame_ptr = 0x3fe7,
+  DW_AT_APPLE_property_name = 0x3fe8,
+  DW_AT_APPLE_property_getter = 0x3fe9,
+  DW_AT_APPLE_property_setter = 0x3fea,
+  DW_AT_APPLE_property_attribute = 0x3feb,
+  DW_AT_APPLE_objc_complete_type = 0x3fec,
 
   // Attribute form encodings
   DW_FORM_addr = 0x01,
@@ -407,6 +412,7 @@ enum dwarf_constants {
   DW_OP_call_ref = 0x9a,
   DW_OP_form_tls_address = 0x9b,
   DW_OP_call_frame_cfa = 0x9c,
+  DW_OP_bit_piece = 0x9d,
   DW_OP_lo_user = 0xe0,
   DW_OP_hi_user = 0xff,
 
@@ -584,7 +590,15 @@ enum dwarf_constants {
   DW_EH_PE_datarel = 0x30,
   DW_EH_PE_funcrel = 0x40,
   DW_EH_PE_aligned = 0x50,
-  DW_EH_PE_indirect = 0x80
+  DW_EH_PE_indirect = 0x80,
+
+  // Apple Objective-C Property Attributes
+  DW_APPLE_PROPERTY_readonly = 0x01,
+  DW_APPLE_PROPERTY_readwrite = 0x02,
+  DW_APPLE_PROPERTY_assign = 0x04,
+  DW_APPLE_PROPERTY_retain = 0x08,
+  DW_APPLE_PROPERTY_copy = 0x10,
+  DW_APPLE_PROPERTY_nonatomic = 0x20
 };
 
 /// TagString - Return the string for the specified tag.
diff --git a/include/llvm/Support/ELF.h b/include/llvm/Support/ELF.h
index cc72bd5..be48112 100644
--- a/include/llvm/Support/ELF.h
+++ b/include/llvm/Support/ELF.h
@@ -487,11 +487,11 @@ enum {
   SHT_REL           = 9,  // Relocation entries; no explicit addends.
   SHT_SHLIB         = 10, // Reserved.
   SHT_DYNSYM        = 11, // Symbol table.
-  SHT_INIT_ARRAY    = 14, // Pointers to initialisation functions.
+  SHT_INIT_ARRAY    = 14, // Pointers to initialization functions.
   SHT_FINI_ARRAY    = 15, // Pointers to termination functions.
   SHT_PREINIT_ARRAY = 16, // Pointers to pre-init functions.
   SHT_GROUP         = 17, // Section group.
-  SHT_SYMTAB_SHNDX  = 18, // Indicies for SHN_XINDEX entries.
+  SHT_SYMTAB_SHNDX  = 18, // Indices for SHN_XINDEX entries.
   SHT_LOOS          = 0x60000000, // Lowest operating system-specific type.
   SHT_HIOS          = 0x6fffffff, // Highest operating system-specific type.
   SHT_LOPROC        = 0x70000000, // Lowest processor architecture-specific type.
@@ -630,7 +630,7 @@ enum {
   STT_FUNC    = 2,   // Symbol is executable code (function, etc.)
   STT_SECTION = 3,   // Symbol refers to a section
   STT_FILE    = 4,   // Local, absolute symbol that refers to a file
-  STT_COMMON  = 5,   // An uninitialised common block
+  STT_COMMON  = 5,   // An uninitialized common block
   STT_TLS     = 6,   // Thread local data object
   STT_LOPROC  = 13,  // Lowest processor-specific symbol type
   STT_HIPROC  = 15   // Highest processor-specific symbol type
@@ -804,7 +804,7 @@ enum {
   DT_RELENT       = 19,       // Size of a Rel relocation entry.
   DT_PLTREL       = 20,       // Type of relocation entry used for linking.
   DT_DEBUG        = 21,       // Reserved for debugger.
-  DT_TEXTREL      = 22,       // Relocations exist for non-writable segements.
+  DT_TEXTREL      = 22,       // Relocations exist for non-writable segments.
   DT_JMPREL       = 23,       // Address of relocations associated with PLT.
   DT_BIND_NOW     = 24,       // Process all relocations before execution.
   DT_INIT_ARRAY   = 25,       // Pointer to array of initialization functions.
diff --git a/include/llvm/Support/GraphWriter.h b/include/llvm/Support/GraphWriter.h
index a5165f4..eab0c9d 100644
--- a/include/llvm/Support/GraphWriter.h
+++ b/include/llvm/Support/GraphWriter.h
@@ -272,7 +272,7 @@ public:
                 const void *DestNodeID, int DestNodePort,
                 const std::string &Attrs) {
     if (SrcNodePort  > 64) return;             // Eminating from truncated part?
-    if (DestNodePort > 64) DestNodePort = 64;  // Targetting the truncated part?
+    if (DestNodePort > 64) DestNodePort = 64;  // Targeting the truncated part?
 
     O << "\tNode" << SrcNodeID;
     if (SrcNodePort >= 0)
diff --git a/include/llvm/Support/IRBuilder.h b/include/llvm/Support/IRBuilder.h
index 626d22c..6a7c277 100644
--- a/include/llvm/Support/IRBuilder.h
+++ b/include/llvm/Support/IRBuilder.h
@@ -18,6 +18,7 @@
 #include "llvm/Instructions.h"
 #include "llvm/BasicBlock.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/ConstantFolder.h"
 
@@ -79,6 +80,7 @@ public:
   void SetInsertPoint(Instruction *I) {
     BB = I->getParent();
     InsertPt = I;
+    SetCurrentDebugLocation(I->getDebugLoc());
   }
   
   /// SetInsertPoint - This specifies that created instructions should be
@@ -105,6 +107,10 @@ public:
       I->setDebugLoc(CurDbgLocation);
   }
 
+  /// getCurrentFunctionReturnType - Get the return type of the current function
+  /// that we're emitting into.
+  const Type *getCurrentFunctionReturnType() const;
+  
   /// InsertPoint - A saved insertion point.
   class InsertPoint {
     BasicBlock *Block;
@@ -153,9 +159,10 @@ public:
 
   /// CreateGlobalString - Make a new global variable with an initializer that
   /// has array of i8 type filled in with the nul terminated string value
-  /// specified.  If Name is specified, it is the name of the global variable
-  /// created.
-  Value *CreateGlobalString(const char *Str = "", const Twine &Name = "");
+  /// specified.  The new global variable will be marked mergable with any
+  /// others of the same contents.  If Name is specified, it is the name of the
+  /// global variable created.
+  Value *CreateGlobalString(StringRef Str, const Twine &Name = "");
 
   /// getInt1 - Get a constant value representing either true or false.
   ConstantInt *getInt1(bool V) {
@@ -191,6 +198,11 @@ public:
   ConstantInt *getInt64(uint64_t C) {
     return ConstantInt::get(getInt64Ty(), C);
   }
+  
+  /// getInt - Get a constant integer value.
+  ConstantInt *getInt(const APInt &AI) {
+    return ConstantInt::get(Context, AI);
+  }
 
   //===--------------------------------------------------------------------===//
   // Type creation methods
@@ -240,10 +252,10 @@ public:
     return Type::getInt8PtrTy(Context, AddrSpace);
   }
 
-  /// getCurrentFunctionReturnType - Get the return type of the current function
-  /// that we're emitting into.
-  const Type *getCurrentFunctionReturnType() const;
-  
+  //===--------------------------------------------------------------------===//
+  // Intrinsic creation methods
+  //===--------------------------------------------------------------------===//
+
   /// CreateMemSet - Create and insert a memset to the specified pointer and the
   /// specified value.  If the pointer isn't an i8*, it will be converted.  If a
   /// TBAA tag is specified, it will be added to the instruction.
@@ -276,6 +288,15 @@ public:
   
   CallInst *CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align,
                           bool isVolatile = false, MDNode *TBAATag = 0);  
+
+  /// CreateLifetimeStart - Create a lifetime.start intrinsic.  If the pointer
+  /// isn't i8* it will be converted.
+  CallInst *CreateLifetimeStart(Value *Ptr, ConstantInt *Size = 0);
+
+  /// CreateLifetimeEnd - Create a lifetime.end intrinsic.  If the pointer isn't
+  /// i8* it will be converted.
+  CallInst *CreateLifetimeEnd(Value *Ptr, ConstantInt *Size = 0);
+
 private:
   Value *getCastedInt8PtrValue(Value *Ptr);
 };
@@ -318,6 +339,7 @@ public:
   explicit IRBuilder(Instruction *IP)
     : IRBuilderBase(IP->getContext()), Folder() {
     SetInsertPoint(IP);
+    SetCurrentDebugLocation(IP->getDebugLoc());
   }
   
   IRBuilder(BasicBlock *TheBB, BasicBlock::iterator IP, const T& F)
@@ -862,7 +884,7 @@ public:
 
   /// CreateGlobalStringPtr - Same as CreateGlobalString, but return a pointer
   /// with "i8*" type instead of a pointer to array of i8.
-  Value *CreateGlobalStringPtr(const char *Str = "", const Twine &Name = "") {
+  Value *CreateGlobalStringPtr(StringRef Str, const Twine &Name = "") {
     Value *gv = CreateGlobalString(Str, Name);
     Value *zero = ConstantInt::get(Type::getInt32Ty(Context), 0);
     Value *Args[] = { zero, zero };
diff --git a/include/llvm/Support/MemoryBuffer.h b/include/llvm/Support/MemoryBuffer.h
index 9a2aff0..5e55bd9 100644
--- a/include/llvm/Support/MemoryBuffer.h
+++ b/include/llvm/Support/MemoryBuffer.h
@@ -81,7 +81,7 @@ public:
                                 bool RequiresNullTerminator = true);
 
   /// getMemBuffer - Open the specified memory range as a MemoryBuffer.  Note
-  /// that InputData must be null terminated.
+  /// that InputData must be null terminated if RequiresNullTerminator is true.
   static MemoryBuffer *getMemBuffer(StringRef InputData,
                                     StringRef BufferName = "",
                                     bool RequiresNullTerminator = true);
@@ -119,6 +119,21 @@ public:
   static error_code getFileOrSTDIN(const char *Filename,
                                    OwningPtr<MemoryBuffer> &result,
                                    int64_t FileSize = -1);
+  
+  
+  //===--------------------------------------------------------------------===//
+  // Provided for performance analysis.
+  //===--------------------------------------------------------------------===//
+
+  /// The kind of memory backing used to support the MemoryBuffer.
+  enum BufferKind {
+    MemoryBuffer_Malloc,
+    MemoryBuffer_MMap
+  };
+
+  /// Return information on the memory mechanism used to support the
+  /// MemoryBuffer.
+  virtual BufferKind getBufferKind() const = 0;  
 };
 
 } // end namespace llvm
diff --git a/include/llvm/Support/PassManagerBuilder.h b/include/llvm/Support/PassManagerBuilder.h
new file mode 100644
index 0000000..31624db
--- /dev/null
+++ b/include/llvm/Support/PassManagerBuilder.h
@@ -0,0 +1,324 @@
+//===-- llvm/Support/PassManagerBuilder.h - Build Standard Pass -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PassManagerBuilder class, which is used to set up a
+// "standard" optimization sequence suitable for languages like C and C++.
+//
+// These are implemented as inline functions so that we do not have to worry
+// about link issues.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_PASSMANAGERBUILDER_H
+#define LLVM_SUPPORT_PASSMANAGERBUILDER_H
+
+#include "llvm/PassManager.h"
+#include "llvm/DefaultPasses.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/IPO.h"
+
+namespace llvm {
+  
+/// PassManagerBuilder - This class is used to set up a standard optimization
+/// sequence for languages like C and C++, allowing some APIs to customize the
+/// pass sequence in various ways. A simple example of using it would be:
+///
+///  PassManagerBuilder Builder;
+///  Builder.OptLevel = 2;
+///  Builder.populateFunctionPassManager(FPM);
+///  Builder.populateModulePassManager(MPM);
+///
+/// In addition to setting up the basic passes, PassManagerBuilder allows
+/// frontends to vend a plugin API, where plugins are allowed to add extensions
+/// to the default pass manager.  They do this by specifying where in the pass
+/// pipeline they want to be added, along with a callback function that adds
+/// the pass(es).  For example, a plugin that wanted to add a loop optimization
+/// could do something like this:
+///
+/// static void addMyLoopPass(const PMBuilder &Builder, PassManagerBase &PM) {
+///   if (Builder.getOptLevel() > 2 && Builder.getOptSizeLevel() == 0)
+///     PM.add(createMyAwesomePass());
+/// }
+///   ...
+///   Builder.addExtension(PassManagerBuilder::EP_LoopOptimizerEnd,
+///                        addMyLoopPass);
+///   ...
+class PassManagerBuilder {
+public:
+  
+  /// Extensions are passed the builder itself (so they can see how it is
+  /// configured) as well as the pass manager to add stuff to.
+  typedef void (*ExtensionFn)(const PassManagerBuilder &Builder,
+                              PassManagerBase &PM);
+  enum ExtensionPointTy {
+    /// EP_EarlyAsPossible - This extension point allows adding passes before
+    /// any other transformations, allowing them to see the code as it is coming
+    /// out of the frontend.
+    EP_EarlyAsPossible,
+    
+    /// EP_LoopOptimizerEnd - This extension point allows adding loop passes to
+    /// the end of the loop optimizer.
+    EP_LoopOptimizerEnd
+  };
+  
+  /// The Optimization Level - Specify the basic optimization level.
+  ///    0 = -O0, 1 = -O1, 2 = -O2, 3 = -O3
+  unsigned OptLevel;
+  
+  /// SizeLevel - How much we're optimizing for size.
+  ///    0 = none, 1 = -Os, 2 = -Oz
+  unsigned SizeLevel;
+  
+  /// LibraryInfo - Specifies information about the runtime library for the
+  /// optimizer.  If this is non-null, it is added to both the function and
+  /// per-module pass pipeline.
+  TargetLibraryInfo *LibraryInfo;
+  
+  /// Inliner - Specifies the inliner to use.  If this is non-null, it is
+  /// added to the per-module passes.
+  Pass *Inliner;
+  
+  bool DisableSimplifyLibCalls;
+  bool DisableUnitAtATime;
+  bool DisableUnrollLoops;
+  
+private:
+  /// ExtensionList - This is list of all of the extensions that are registered.
+  std::vector<std::pair<ExtensionPointTy, ExtensionFn> > Extensions;
+  
+public:
+  PassManagerBuilder() {
+    OptLevel = 2;
+    SizeLevel = 0;
+    LibraryInfo = 0;
+    Inliner = 0;
+    DisableSimplifyLibCalls = false;
+    DisableUnitAtATime = false;
+    DisableUnrollLoops = false;
+  }
+  
+  ~PassManagerBuilder() {
+    delete LibraryInfo;
+    delete Inliner;
+  }
+  
+  void addExtension(ExtensionPointTy Ty, ExtensionFn Fn) {
+    Extensions.push_back(std::make_pair(Ty, Fn));
+  }
+  
+private:
+  void addExtensionsToPM(ExtensionPointTy ETy, PassManagerBase &PM) const {
+    for (unsigned i = 0, e = Extensions.size(); i != e; ++i)
+      if (Extensions[i].first == ETy)
+        Extensions[i].second(*this, PM);
+  }
+  
+  void addInitialAliasAnalysisPasses(PassManagerBase &PM) const {
+    // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
+    // BasicAliasAnalysis wins if they disagree. This is intended to help
+    // support "obvious" type-punning idioms.
+    PM.add(createTypeBasedAliasAnalysisPass());
+    PM.add(createBasicAliasAnalysisPass());
+  }
+public:
+  
+  /// populateFunctionPassManager - This fills in the function pass manager,
+  /// which is expected to be run on each function immediately as it is
+  /// generated.  The idea is to reduce the size of the IR in memory.
+  void populateFunctionPassManager(FunctionPassManager &FPM) {
+    addExtensionsToPM(EP_EarlyAsPossible, FPM);
+    
+    // Add LibraryInfo if we have some.
+    if (LibraryInfo) FPM.add(new TargetLibraryInfo(*LibraryInfo));
+
+    if (OptLevel == 0) return;
+
+    addInitialAliasAnalysisPasses(FPM);
+    
+    FPM.add(createCFGSimplificationPass());
+    FPM.add(createScalarReplAggregatesPass());
+    FPM.add(createEarlyCSEPass());
+  }
+  
+  /// populateModulePassManager - This sets up the primary pass manager.
+  void populateModulePassManager(PassManagerBase &MPM) {
+    // If all optimizations are disabled, just run the always-inline pass.
+    if (OptLevel == 0) {
+      if (Inliner) {
+        MPM.add(Inliner);
+        Inliner = 0;
+      }
+      return;
+    }
+      
+    // Add LibraryInfo if we have some.
+    if (LibraryInfo) MPM.add(new TargetLibraryInfo(*LibraryInfo));
+
+    addInitialAliasAnalysisPasses(MPM);
+    
+    if (!DisableUnitAtATime) {
+      MPM.add(createGlobalOptimizerPass());     // Optimize out global vars
+      
+      MPM.add(createIPSCCPPass());              // IP SCCP
+      MPM.add(createDeadArgEliminationPass());  // Dead argument elimination
+      
+      MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE
+      MPM.add(createCFGSimplificationPass());   // Clean up after IPCP & DAE
+    }
+    
+    // Start of CallGraph SCC passes.
+    if (!DisableUnitAtATime)
+      MPM.add(createPruneEHPass());             // Remove dead EH info
+    if (Inliner) {
+      MPM.add(Inliner);
+      Inliner = 0;
+    }
+    if (!DisableUnitAtATime)
+      MPM.add(createFunctionAttrsPass());       // Set readonly/readnone attrs
+    if (OptLevel > 2)
+      MPM.add(createArgumentPromotionPass());   // Scalarize uninlined fn args
+    
+    // Start of function pass.
+    MPM.add(createObjCARCExpandPass());         // Canonicalize ObjC ARC code.
+    // Break up aggregate allocas, using SSAUpdater.
+    MPM.add(createScalarReplAggregatesPass(-1, false));
+    MPM.add(createEarlyCSEPass());              // Catch trivial redundancies
+    if (!DisableSimplifyLibCalls)
+      MPM.add(createSimplifyLibCallsPass());    // Library Call Optimizations
+    MPM.add(createJumpThreadingPass());         // Thread jumps.
+    MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
+    MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
+    MPM.add(createInstructionCombiningPass());  // Combine silly seq's
+    
+    MPM.add(createTailCallEliminationPass());   // Eliminate tail calls
+    MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
+    MPM.add(createReassociatePass());           // Reassociate expressions
+    MPM.add(createLoopRotatePass());            // Rotate Loop
+    MPM.add(createLICMPass());                  // Hoist loop invariants
+    MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3));
+    MPM.add(createInstructionCombiningPass());  
+    MPM.add(createIndVarSimplifyPass());        // Canonicalize indvars
+    MPM.add(createLoopIdiomPass());             // Recognize idioms like memset.
+    MPM.add(createLoopDeletionPass());          // Delete dead loops
+    if (!DisableUnrollLoops)
+      MPM.add(createLoopUnrollPass());          // Unroll small loops
+    addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
+
+    if (OptLevel > 1)
+      MPM.add(createGVNPass());                 // Remove redundancies
+    MPM.add(createMemCpyOptPass());             // Remove memcpy / form memset
+    MPM.add(createSCCPPass());                  // Constant prop with SCCP
+    
+    // Run instcombine after redundancy elimination to exploit opportunities
+    // opened up by them.
+    MPM.add(createInstructionCombiningPass());
+    MPM.add(createJumpThreadingPass());         // Thread jumps
+    MPM.add(createCorrelatedValuePropagationPass());
+    MPM.add(createDeadStoreEliminationPass());  // Delete dead stores
+    MPM.add(createObjCARCOptPass());            // Objective-C ARC optimizations.
+    MPM.add(createAggressiveDCEPass());         // Delete dead instructions
+    MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
+    MPM.add(createInstructionCombiningPass());  // Clean up after everything.
+    
+    if (!DisableUnitAtATime) {
+      MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes
+      MPM.add(createDeadTypeEliminationPass()); // Eliminate dead types
+      
+      // GlobalOpt already deletes dead functions and globals, at -O3 try a
+      // late pass of GlobalDCE.  It is capable of deleting dead cycles.
+      if (OptLevel > 2)
+        MPM.add(createGlobalDCEPass());         // Remove dead fns and globals.
+      
+      if (OptLevel > 1)
+        MPM.add(createConstantMergePass());     // Merge dup global constants
+    }
+  }
+  
+  void populateLTOPassManager(PassManagerBase &PM, bool Internalize,
+                              bool RunInliner) {
+    // Provide AliasAnalysis services for optimizations.
+    addInitialAliasAnalysisPasses(PM);
+    
+    // Now that composite has been compiled, scan through the module, looking
+    // for a main function.  If main is defined, mark all other functions
+    // internal.
+    if (Internalize)
+      PM.add(createInternalizePass(true));
+    
+    // Propagate constants at call sites into the functions they call.  This
+    // opens opportunities for globalopt (and inlining) by substituting function
+    // pointers passed as arguments to direct uses of functions.  
+    PM.add(createIPSCCPPass());
+    
+    // Now that we internalized some globals, see if we can hack on them!
+    PM.add(createGlobalOptimizerPass());
+    
+    // Linking modules together can lead to duplicated global constants, only
+    // keep one copy of each constant.
+    PM.add(createConstantMergePass());
+    
+    // Remove unused arguments from functions.
+    PM.add(createDeadArgEliminationPass());
+    
+    // Reduce the code after globalopt and ipsccp.  Both can open up significant
+    // simplification opportunities, and both can propagate functions through
+    // function pointers.  When this happens, we often have to resolve varargs
+    // calls, etc, so let instcombine do this.
+    PM.add(createInstructionCombiningPass());
+
+    // Inline small functions
+    if (RunInliner)
+      PM.add(createFunctionInliningPass());
+    
+    PM.add(createPruneEHPass());   // Remove dead EH info.
+    
+    // Optimize globals again if we ran the inliner.
+    if (RunInliner)
+      PM.add(createGlobalOptimizerPass());
+    PM.add(createGlobalDCEPass()); // Remove dead functions.
+    
+    // If we didn't decide to inline a function, check to see if we can
+    // transform it to pass arguments by value instead of by reference.
+    PM.add(createArgumentPromotionPass());
+    
+    // The IPO passes may leave cruft around.  Clean up after them.
+    PM.add(createInstructionCombiningPass());
+    PM.add(createJumpThreadingPass());
+    // Break up allocas
+    PM.add(createScalarReplAggregatesPass());
+    
+    // Run a few AA driven optimizations here and now, to cleanup the code.
+    PM.add(createFunctionAttrsPass()); // Add nocapture.
+    PM.add(createGlobalsModRefPass()); // IP alias analysis.
+    
+    PM.add(createLICMPass());      // Hoist loop invariants.
+    PM.add(createGVNPass());       // Remove redundancies.
+    PM.add(createMemCpyOptPass()); // Remove dead memcpys.
+    // Nuke dead stores.
+    PM.add(createDeadStoreEliminationPass());
+    
+    // Cleanup and simplify the code after the scalar optimizations.
+    PM.add(createInstructionCombiningPass());
+    
+    PM.add(createJumpThreadingPass());
+    
+    // Delete basic blocks, which optimization passes may have killed.
+    PM.add(createCFGSimplificationPass());
+   
+    // Now that we have optimized the program, discard unreachable functions.
+    PM.add(createGlobalDCEPass());
+  }
+};
+
+  
+} // end namespace llvm
+#endif
diff --git a/include/llvm/Support/PatternMatch.h b/include/llvm/Support/PatternMatch.h
index 948ae51..f0fb516 100644
--- a/include/llvm/Support/PatternMatch.h
+++ b/include/llvm/Support/PatternMatch.h
@@ -40,6 +40,23 @@ bool match(Val *V, const Pattern &P) {
   return const_cast<Pattern&>(P).match(V);
 }
 
+  
+template<typename SubPattern_t>
+struct OneUse_match {
+  SubPattern_t SubPattern;
+  
+  OneUse_match(const SubPattern_t &SP) : SubPattern(SP) {}
+  
+  template<typename OpTy>
+  bool match(OpTy *V) {
+    return V->hasOneUse() && SubPattern.match(V);
+  }
+};
+
+template<typename T>
+inline OneUse_match<T> m_OneUse(const T &SubPattern) { return SubPattern; }
+  
+  
 template<typename Class>
 struct class_match {
   template<typename ITy>
@@ -227,7 +244,25 @@ struct specificval_ty {
 /// m_Specific - Match if we have a specific specified value.
 inline specificval_ty m_Specific(const Value *V) { return V; }
 
+struct bind_const_intval_ty {
+  uint64_t &VR;
+  bind_const_intval_ty(uint64_t &V) : VR(V) {}
+  
+  template<typename ITy>
+  bool match(ITy *V) {
+    if (ConstantInt *CV = dyn_cast<ConstantInt>(V))
+      if (CV->getBitWidth() <= 64) {
+        VR = CV->getZExtValue();
+        return true;
+      }
+    return false;
+  }
+};
 
+/// m_ConstantInt - Match a ConstantInt and bind to its value.  This does not
+/// match ConstantInts wider than 64-bits.
+inline bind_const_intval_ty m_ConstantInt(uint64_t &V) { return V; }
+  
 //===----------------------------------------------------------------------===//
 // Matchers for specific binary operators.
 //
@@ -659,6 +694,99 @@ inline brc_match<Cond_t> m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F) {
   return brc_match<Cond_t>(C, T, F);
 }
 
+
+//===----------------------------------------------------------------------===//
+// Matchers for max/min idioms, eg: "select (sgt x, y), x, y" -> smax(x,y).
+//
+
+template<typename LHS_t, typename RHS_t, typename Pred_t>
+struct MaxMin_match {
+  LHS_t L;
+  RHS_t R;
+
+  MaxMin_match(const LHS_t &LHS, const RHS_t &RHS)
+    : L(LHS), R(RHS) {}
+
+  template<typename OpTy>
+  bool match(OpTy *V) {
+    // Look for "(x pred y) ? x : y" or "(x pred y) ? y : x".
+    SelectInst *SI = dyn_cast<SelectInst>(V);
+    if (!SI)
+      return false;
+    ICmpInst *Cmp = dyn_cast<ICmpInst>(SI->getCondition());
+    if (!Cmp)
+      return false;
+    // At this point we have a select conditioned on a comparison.  Check that
+    // it is the values returned by the select that are being compared.
+    Value *TrueVal = SI->getTrueValue();
+    Value *FalseVal = SI->getFalseValue();
+    Value *LHS = Cmp->getOperand(0);
+    Value *RHS = Cmp->getOperand(1);
+    if ((TrueVal != LHS || FalseVal != RHS) &&
+        (TrueVal != RHS || FalseVal != LHS))
+      return false;
+    ICmpInst::Predicate Pred = LHS == TrueVal ?
+      Cmp->getPredicate() : Cmp->getSwappedPredicate();
+    // Does "(x pred y) ? x : y" represent the desired max/min operation?
+    if (!Pred_t::match(Pred))
+      return false;
+    // It does!  Bind the operands.
+    return L.match(LHS) && R.match(RHS);
+  }
+};
+
+/// smax_pred_ty - Helper class for identifying signed max predicates.
+struct smax_pred_ty {
+  static bool match(ICmpInst::Predicate Pred) {
+    return Pred == CmpInst::ICMP_SGT || Pred == CmpInst::ICMP_SGE;
+  }
+};
+
+/// smin_pred_ty - Helper class for identifying signed min predicates.
+struct smin_pred_ty {
+  static bool match(ICmpInst::Predicate Pred) {
+    return Pred == CmpInst::ICMP_SLT || Pred == CmpInst::ICMP_SLE;
+  }
+};
+
+/// umax_pred_ty - Helper class for identifying unsigned max predicates.
+struct umax_pred_ty {
+  static bool match(ICmpInst::Predicate Pred) {
+    return Pred == CmpInst::ICMP_UGT || Pred == CmpInst::ICMP_UGE;
+  }
+};
+
+/// umin_pred_ty - Helper class for identifying unsigned min predicates.
+struct umin_pred_ty {
+  static bool match(ICmpInst::Predicate Pred) {
+    return Pred == CmpInst::ICMP_ULT || Pred == CmpInst::ICMP_ULE;
+  }
+};
+
+template<typename LHS, typename RHS>
+inline MaxMin_match<LHS, RHS, smax_pred_ty>
+m_SMax(const LHS &L, const RHS &R) {
+  return MaxMin_match<LHS, RHS, smax_pred_ty>(L, R);
+}
+
+template<typename LHS, typename RHS>
+inline MaxMin_match<LHS, RHS, smin_pred_ty>
+m_SMin(const LHS &L, const RHS &R) {
+  return MaxMin_match<LHS, RHS, smin_pred_ty>(L, R);
+}
+
+template<typename LHS, typename RHS>
+inline MaxMin_match<LHS, RHS, umax_pred_ty>
+m_UMax(const LHS &L, const RHS &R) {
+  return MaxMin_match<LHS, RHS, umax_pred_ty>(L, R);
+}
+
+template<typename LHS, typename RHS>
+inline MaxMin_match<LHS, RHS, umin_pred_ty>
+m_UMin(const LHS &L, const RHS &R) {
+  return MaxMin_match<LHS, RHS, umin_pred_ty>(L, R);
+}
+
 } // end namespace PatternMatch
 } // end namespace llvm
 
diff --git a/include/llvm/Support/PrettyStackTrace.h b/include/llvm/Support/PrettyStackTrace.h
index 6dbce39..9b3ecda 100644
--- a/include/llvm/Support/PrettyStackTrace.h
+++ b/include/llvm/Support/PrettyStackTrace.h
@@ -20,7 +20,7 @@ namespace llvm {
   class raw_ostream;
 
   /// DisablePrettyStackTrace - Set this to true to disable this module. This
-  /// might be neccessary if the host application installs its own signal
+  /// might be necessary if the host application installs its own signal
   /// handlers which conflict with the ones installed by this module.
   /// Defaults to false.
   extern bool DisablePrettyStackTrace;
diff --git a/include/llvm/Support/Program.h b/include/llvm/Support/Program.h
index 78a495e..a502657 100644
--- a/include/llvm/Support/Program.h
+++ b/include/llvm/Support/Program.h
@@ -85,8 +85,9 @@ namespace sys {
     /// This function waits for the program to exit. This function will block
     /// the current program until the invoked program exits.
     /// @returns an integer result code indicating the status of the program.
-    /// A zero or positive value indicates the result code of the program. A
-    /// negative value is the signal number on which it terminated.
+    /// A zero or positive value indicates the result code of the program.
+    /// -1 indicates failure to execute
+    /// -2 indicates a crash during execution or timeout
     /// @see Execute
     /// @brief Waits for the program to exit.
     int Wait
@@ -102,7 +103,7 @@ namespace sys {
       );
 
     /// This function terminates the program.
-    /// @returns true if an error occured.
+    /// @returns true if an error occurred.
     /// @see Execute
     /// @brief Terminates the program.
     bool Kill
diff --git a/include/llvm/Support/Regex.h b/include/llvm/Support/Regex.h
index b46a668..7648e77 100644
--- a/include/llvm/Support/Regex.h
+++ b/include/llvm/Support/Regex.h
@@ -53,7 +53,7 @@ namespace llvm {
 
     /// matches - Match the regex against a given \arg String.
     ///
-    /// \param Matches - If given, on a succesful match this will be filled in
+    /// \param Matches - If given, on a successful match this will be filled in
     /// with references to the matched group expressions (inside \arg String),
     /// the first group is always the entire pattern.
     ///
diff --git a/include/llvm/Support/Signals.h b/include/llvm/Support/Signals.h
index 9a84df6..634f4cf 100644
--- a/include/llvm/Support/Signals.h
+++ b/include/llvm/Support/Signals.h
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 //
 // This file defines some helpful functions for dealing with the possibility of
-// unix signals occuring while your program is running.
+// unix signals occurring while your program is running.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/include/llvm/Support/SourceMgr.h b/include/llvm/Support/SourceMgr.h
index 2a712e4..030db8f 100644
--- a/include/llvm/Support/SourceMgr.h
+++ b/include/llvm/Support/SourceMgr.h
@@ -106,7 +106,9 @@ public:
   /// AddIncludeFile - Search for a file with the specified name in the current
   /// directory or in one of the IncludeDirs.  If no file is found, this returns
   /// ~0, otherwise it returns the buffer ID of the stacked file.
-  unsigned AddIncludeFile(const std::string &Filename, SMLoc IncludeLoc);
+  /// The full path to the included file can be found in IncludedFile.
+  unsigned AddIncludeFile(const std::string &Filename, SMLoc IncludeLoc,
+                          std::string &IncludedFile);
 
   /// FindBufferContainingLoc - Return the ID of the buffer containing the
   /// specified location, returning -1 if not found.
diff --git a/include/llvm/Support/StandardPasses.h b/include/llvm/Support/StandardPasses.h
deleted file mode 100644
index d774faf..0000000
--- a/include/llvm/Support/StandardPasses.h
+++ /dev/null
@@ -1,242 +0,0 @@
-//===-- llvm/Support/StandardPasses.h - Standard pass lists -----*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines utility functions for creating a "standard" set of
-// optimization passes, so that compilers and tools which use optimization
-// passes use the same set of standard passes.
-//
-// These are implemented as inline functions so that we do not have to worry
-// about link issues.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_SUPPORT_STANDARDPASSES_H
-#define LLVM_SUPPORT_STANDARDPASSES_H
-
-#include "llvm/PassManager.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/Verifier.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/IPO.h"
-
-namespace llvm {
-
-  static inline void createStandardAliasAnalysisPasses(PassManagerBase *PM) {
-    // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
-    // BasicAliasAnalysis wins if they disagree. This is intended to help
-    // support "obvious" type-punning idioms.
-    PM->add(createTypeBasedAliasAnalysisPass());
-    PM->add(createBasicAliasAnalysisPass());
-  }
-
-  /// createStandardFunctionPasses - Add the standard list of function passes to
-  /// the provided pass manager.
-  ///
-  /// \arg OptimizationLevel - The optimization level, corresponding to -O0,
-  /// -O1, etc.
-  static inline void createStandardFunctionPasses(PassManagerBase *PM,
-                                                  unsigned OptimizationLevel) {
-    if (OptimizationLevel > 0) {
-      createStandardAliasAnalysisPasses(PM);
-      PM->add(createCFGSimplificationPass());
-      PM->add(createScalarReplAggregatesPass());
-      PM->add(createEarlyCSEPass());
-    }
-  }
-
-  /// createStandardModulePasses - Add the standard list of module passes to the
-  /// provided pass manager.
-  ///
-  /// \arg OptimizationLevel - The optimization level, corresponding to -O0,
-  /// -O1, etc.
-  /// \arg OptimizeSize - Whether the transformations should optimize for size.
-  /// \arg UnitAtATime - Allow passes which may make global module changes.
-  /// \arg UnrollLoops - Allow loop unrolling.
-  /// \arg SimplifyLibCalls - Allow library calls to be simplified.
-  /// \arg HaveExceptions - Whether the module may have code using exceptions.
-  /// \arg InliningPass - The inlining pass to use, if any, or null. This will
-  /// always be added, even at -O0.a
-  static inline void createStandardModulePasses(PassManagerBase *PM,
-                                                unsigned OptimizationLevel,
-                                                bool OptimizeSize,
-                                                bool UnitAtATime,
-                                                bool UnrollLoops,
-                                                bool SimplifyLibCalls,
-                                                bool HaveExceptions,
-                                                Pass *InliningPass) {
-    createStandardAliasAnalysisPasses(PM);
-
-    if (OptimizationLevel == 0) {
-      if (InliningPass)
-        PM->add(InliningPass);
-      return;
-    }
-    
-    if (UnitAtATime) {
-      PM->add(createGlobalOptimizerPass());     // Optimize out global vars
-      
-      PM->add(createIPSCCPPass());              // IP SCCP
-      PM->add(createDeadArgEliminationPass());  // Dead argument elimination
-    }
-    PM->add(createInstructionCombiningPass());  // Clean up after IPCP & DAE
-    PM->add(createCFGSimplificationPass());     // Clean up after IPCP & DAE
-    
-    // Start of CallGraph SCC passes.
-    if (UnitAtATime && HaveExceptions)
-      PM->add(createPruneEHPass());             // Remove dead EH info
-    if (InliningPass)
-      PM->add(InliningPass);
-    if (UnitAtATime)
-      PM->add(createFunctionAttrsPass());       // Set readonly/readnone attrs
-    if (OptimizationLevel > 2)
-      PM->add(createArgumentPromotionPass());   // Scalarize uninlined fn args
-    
-    // Start of function pass.
-    // Break up aggregate allocas, using SSAUpdater.
-    PM->add(createScalarReplAggregatesPass(-1, false));
-    PM->add(createEarlyCSEPass());              // Catch trivial redundancies
-    if (SimplifyLibCalls)
-      PM->add(createSimplifyLibCallsPass());    // Library Call Optimizations
-    PM->add(createJumpThreadingPass());         // Thread jumps.
-    PM->add(createCorrelatedValuePropagationPass()); // Propagate conditionals
-    PM->add(createCFGSimplificationPass());     // Merge & remove BBs
-    PM->add(createInstructionCombiningPass());  // Combine silly seq's
-    
-    PM->add(createTailCallEliminationPass());   // Eliminate tail calls
-    PM->add(createCFGSimplificationPass());     // Merge & remove BBs
-    PM->add(createReassociatePass());           // Reassociate expressions
-    PM->add(createLoopRotatePass());            // Rotate Loop
-    PM->add(createLICMPass());                  // Hoist loop invariants
-    PM->add(createLoopUnswitchPass(OptimizeSize || OptimizationLevel < 3));
-    PM->add(createInstructionCombiningPass());  
-    PM->add(createIndVarSimplifyPass());        // Canonicalize indvars
-    PM->add(createLoopIdiomPass());             // Recognize idioms like memset.
-    PM->add(createLoopDeletionPass());          // Delete dead loops
-    if (UnrollLoops)
-      PM->add(createLoopUnrollPass());          // Unroll small loops
-    PM->add(createInstructionCombiningPass());  // Clean up after the unroller
-    if (OptimizationLevel > 1)
-      PM->add(createGVNPass());                 // Remove redundancies
-    PM->add(createMemCpyOptPass());             // Remove memcpy / form memset
-    PM->add(createSCCPPass());                  // Constant prop with SCCP
-  
-    // Run instcombine after redundancy elimination to exploit opportunities
-    // opened up by them.
-    PM->add(createInstructionCombiningPass());
-    PM->add(createJumpThreadingPass());         // Thread jumps
-    PM->add(createCorrelatedValuePropagationPass());
-    PM->add(createDeadStoreEliminationPass());  // Delete dead stores
-    PM->add(createAggressiveDCEPass());         // Delete dead instructions
-    PM->add(createCFGSimplificationPass());     // Merge & remove BBs
-
-    if (UnitAtATime) {
-      PM->add(createStripDeadPrototypesPass()); // Get rid of dead prototypes
-      PM->add(createDeadTypeEliminationPass()); // Eliminate dead types
-
-      // GlobalOpt already deletes dead functions and globals, at -O3 try a
-      // late pass of GlobalDCE.  It is capable of deleting dead cycles.
-      if (OptimizationLevel > 2)
-        PM->add(createGlobalDCEPass());         // Remove dead fns and globals.
-    
-      if (OptimizationLevel > 1)
-        PM->add(createConstantMergePass());       // Merge dup global constants
-    }
-  }
-
-  static inline void addOnePass(PassManagerBase *PM, Pass *P, bool AndVerify) {
-    PM->add(P);
-
-    if (AndVerify)
-      PM->add(createVerifierPass());
-  }
-
-  /// createStandardLTOPasses - Add the standard list of module passes suitable
-  /// for link time optimization.
-  ///
-  /// Internalize - Run the internalize pass.
-  /// RunInliner - Use a function inlining pass.
-  /// VerifyEach - Run the verifier after each pass.
-  static inline void createStandardLTOPasses(PassManagerBase *PM,
-                                             bool Internalize,
-                                             bool RunInliner,
-                                             bool VerifyEach) {
-    // Provide AliasAnalysis services for optimizations.
-    createStandardAliasAnalysisPasses(PM);
-
-    // Now that composite has been compiled, scan through the module, looking
-    // for a main function.  If main is defined, mark all other functions
-    // internal.
-    if (Internalize)
-      addOnePass(PM, createInternalizePass(true), VerifyEach);
-
-    // Propagate constants at call sites into the functions they call.  This
-    // opens opportunities for globalopt (and inlining) by substituting function
-    // pointers passed as arguments to direct uses of functions.  
-    addOnePass(PM, createIPSCCPPass(), VerifyEach);
-
-    // Now that we internalized some globals, see if we can hack on them!
-    addOnePass(PM, createGlobalOptimizerPass(), VerifyEach);
-    
-    // Linking modules together can lead to duplicated global constants, only
-    // keep one copy of each constant...
-    addOnePass(PM, createConstantMergePass(), VerifyEach);
-    
-    // Remove unused arguments from functions...
-    addOnePass(PM, createDeadArgEliminationPass(), VerifyEach);
-
-    // Reduce the code after globalopt and ipsccp.  Both can open up significant
-    // simplification opportunities, and both can propagate functions through
-    // function pointers.  When this happens, we often have to resolve varargs
-    // calls, etc, so let instcombine do this.
-    addOnePass(PM, createInstructionCombiningPass(), VerifyEach);
-
-    // Inline small functions
-    if (RunInliner)
-      addOnePass(PM, createFunctionInliningPass(), VerifyEach);
-
-    addOnePass(PM, createPruneEHPass(), VerifyEach);   // Remove dead EH info.
-    // Optimize globals again if we ran the inliner.
-    if (RunInliner)
-      addOnePass(PM, createGlobalOptimizerPass(), VerifyEach);
-    addOnePass(PM, createGlobalDCEPass(), VerifyEach); // Remove dead functions.
-
-    // If we didn't decide to inline a function, check to see if we can
-    // transform it to pass arguments by value instead of by reference.
-    addOnePass(PM, createArgumentPromotionPass(), VerifyEach);
-
-    // The IPO passes may leave cruft around.  Clean up after them.
-    addOnePass(PM, createInstructionCombiningPass(), VerifyEach);
-    addOnePass(PM, createJumpThreadingPass(), VerifyEach);
-    // Break up allocas
-    addOnePass(PM, createScalarReplAggregatesPass(), VerifyEach);
-
-    // Run a few AA driven optimizations here and now, to cleanup the code.
-    addOnePass(PM, createFunctionAttrsPass(), VerifyEach); // Add nocapture.
-    addOnePass(PM, createGlobalsModRefPass(), VerifyEach); // IP alias analysis.
-
-    addOnePass(PM, createLICMPass(), VerifyEach);      // Hoist loop invariants.
-    addOnePass(PM, createGVNPass(), VerifyEach);       // Remove redundancies.
-    addOnePass(PM, createMemCpyOptPass(), VerifyEach); // Remove dead memcpys.
-    // Nuke dead stores.
-    addOnePass(PM, createDeadStoreEliminationPass(), VerifyEach);
-
-    // Cleanup and simplify the code after the scalar optimizations.
-    addOnePass(PM, createInstructionCombiningPass(), VerifyEach);
-
-    addOnePass(PM, createJumpThreadingPass(), VerifyEach);
-    
-    // Delete basic blocks, which optimization passes may have killed.
-    addOnePass(PM, createCFGSimplificationPass(), VerifyEach);
-
-    // Now that we have optimized the program, discard unreachable functions.
-    addOnePass(PM, createGlobalDCEPass(), VerifyEach);
-  }
-}
-
-#endif
diff --git a/include/llvm/Support/TimeValue.h b/include/llvm/Support/TimeValue.h
index e122711..94f132a 100644
--- a/include/llvm/Support/TimeValue.h
+++ b/include/llvm/Support/TimeValue.h
@@ -35,13 +35,13 @@ namespace sys {
   public:
 
     /// A constant TimeValue representing the smallest time
-    /// value permissable by the class. MinTime is some point
+    /// value permissible by the class. MinTime is some point
     /// in the distant past, about 300 billion years BCE.
     /// @brief The smallest possible time value.
     static const TimeValue MinTime;
 
     /// A constant TimeValue representing the largest time
-    /// value permissable by the class. MaxTime is some point
+    /// value permissible by the class. MaxTime is some point
     /// in the distant future, about 300 billion years AD.
     /// @brief The largest possible time value.
     static const TimeValue MaxTime;
diff --git a/include/llvm/Support/Win64EH.h b/include/llvm/Support/Win64EH.h
new file mode 100644
index 0000000..8d74e10
--- /dev/null
+++ b/include/llvm/Support/Win64EH.h
@@ -0,0 +1,100 @@
+//===-- llvm/Support/Win64EH.h ---Win64 EH Constants-------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains constants and structures used for implementing
+// exception handling on Win64 platforms. For more information, see
+// http://msdn.microsoft.com/en-us/library/1eyas8tf.aspx
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_WIN64EH_H
+#define LLVM_SUPPORT_WIN64EH_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+namespace Win64EH {
+
+/// UnwindOpcodes - Enumeration whose values specify a single operation in
+/// the prolog of a function.
+enum UnwindOpcodes {
+  UOP_PushNonVol = 0,
+  UOP_AllocLarge,
+  UOP_AllocSmall,
+  UOP_SetFPReg,
+  UOP_SaveNonVol,
+  UOP_SaveNonVolBig,
+  UOP_SaveXMM128 = 8,
+  UOP_SaveXMM128Big,
+  UOP_PushMachFrame
+};
+
+/// UnwindCode - This union describes a single operation in a function prolog,
+/// or part thereof.
+union UnwindCode {
+  struct {
+    uint8_t codeOffset;
+    uint8_t unwindOp:4,
+            opInfo:4;
+  } u;
+  uint16_t frameOffset;
+};
+
+enum {
+  /// UNW_ExceptionHandler - Specifies that this function has an exception
+  /// handler.
+  UNW_ExceptionHandler = 0x01,
+  /// UNW_TerminateHandler - Specifies that this function has a termination
+  /// handler.
+  UNW_TerminateHandler = 0x02,
+  /// UNW_ChainInfo - Specifies that this UnwindInfo structure is chained to
+  /// another one.
+  UNW_ChainInfo = 0x04
+};
+
+/// RuntimeFunction - An entry in the table of functions with unwind info.
+struct RuntimeFunction {
+  uint64_t startAddress;
+  uint64_t endAddress;
+  uint64_t unwindInfoOffset;
+};
+
+/// UnwindInfo - An entry in the exception table.
+struct UnwindInfo {
+  uint8_t version:3,
+          flags:5;
+  uint8_t prologSize;
+  uint8_t numCodes;
+  uint8_t frameRegister:4,
+          frameOffset:4;
+  UnwindCode unwindCodes[1];
+
+  void *getLanguageSpecificData() {
+    return reinterpret_cast<void *>(&unwindCodes[(numCodes+1) & ~1]);
+  }
+  uint64_t getLanguageSpecificHandlerOffset() {
+    return *reinterpret_cast<uint64_t *>(getLanguageSpecificData());
+  }
+  void setLanguageSpecificHandlerOffset(uint64_t offset) {
+    *reinterpret_cast<uint64_t *>(getLanguageSpecificData()) = offset;
+  }
+  RuntimeFunction *getChainedFunctionEntry() {
+    return reinterpret_cast<RuntimeFunction *>(getLanguageSpecificData());
+  }
+  void *getExceptionData() {
+    return reinterpret_cast<void *>(reinterpret_cast<uint64_t *>(
+                                                  getLanguageSpecificData())+1);
+  }
+};
+
+
+} // End of namespace Win64EH
+} // End of namespace llvm
+
+#endif
diff --git a/include/llvm/Support/system_error.h b/include/llvm/Support/system_error.h
index 47759b9..b77030d 100644
--- a/include/llvm/Support/system_error.h
+++ b/include/llvm/Support/system_error.h
@@ -669,7 +669,7 @@ const error_category& generic_category();
 const error_category& system_category();
 
 /// Get the error_category used for errno values from POSIX functions. This is
-/// the same as the system_category on POISIX systems, but is the same as the
+/// the same as the system_category on POSIX systems, but is the same as the
 /// generic_category on Windows.
 const error_category& posix_category();
 
diff --git a/include/llvm/Target/SubtargetFeature.h b/include/llvm/Target/SubtargetFeature.h
index 6c21ae9..4213d9b 100644
--- a/include/llvm/Target/SubtargetFeature.h
+++ b/include/llvm/Target/SubtargetFeature.h
@@ -35,8 +35,8 @@ namespace llvm {
 struct SubtargetFeatureKV {
   const char *Key;                      // K-V key string
   const char *Desc;                     // Help descriptor
-  uint32_t Value;                       // K-V integer value
-  uint32_t Implies;                     // K-V bit mask
+  uint64_t Value;                       // K-V integer value
+  uint64_t Implies;                     // K-V bit mask
   
   // Compare routine for std binary search
   bool operator<(const SubtargetFeatureKV &S) const {
@@ -94,7 +94,7 @@ public:
   void AddFeature(const std::string &String, bool IsEnabled = true);
            
   /// Get feature bits.
-  uint32_t getBits(const SubtargetFeatureKV *CPUTable,
+  uint64_t getBits(const SubtargetFeatureKV *CPUTable,
                          size_t CPUTableSize,
                    const SubtargetFeatureKV *FeatureTable,
                          size_t FeatureTableSize);
diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index d015815..616087c 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@@ -32,17 +32,6 @@ class Register<string n> {
   string Namespace = "";
   string AsmName = n;
 
-  // SpillSize - If this value is set to a non-zero value, it is the size in
-  // bits of the spill slot required to hold this register.  If this value is
-  // set to zero, the information is inferred from any register classes the
-  // register belongs to.
-  int SpillSize = 0;
-
-  // SpillAlignment - This value is used to specify the alignment required for
-  // spilling the register.  Like SpillSize, this should only be explicitly
-  // specified if the register is not in a register class.
-  int SpillAlignment = 0;
-
   // Aliases - A list of registers that this register overlaps with.  A read or
   // modification of this register can potentially read or modify the aliased
   // registers.
@@ -78,6 +67,13 @@ class Register<string n> {
   // -1 indicates that the gcc number is undefined and -2 that register number
   // is invalid for this mode/flavour.
   list<int> DwarfNumbers = [];
+
+  // CostPerUse - Additional cost of instructions using this register compared
+  // to other registers in its class. The register allocator will try to
+  // minimize the number of instructions using a register with a CostPerUse.
+  // This is used by the x86-64 and ARM Thumb targets where some registers 
+  // require larger instruction encodings.
+  int CostPerUse = 0;
 }
 
 // RegisterWithSubRegs - This can be used to define instances of Register which
@@ -96,7 +92,7 @@ class RegisterWithSubRegs<string n, list<Register> subregs> : Register<n> {
 // registers by register allocators.
 //
 class RegisterClass<string namespace, list<ValueType> regTypes, int alignment,
-                    list<Register> regList> {
+                    dag regList> {
   string Namespace = namespace;
 
   // RegType - Specify the list ValueType of the registers in this register
@@ -126,12 +122,17 @@ class RegisterClass<string namespace, list<ValueType> regTypes, int alignment,
   // allocation_order_* method are not specified, this also defines the order of
   // allocation used by the register allocator.
   //
-  list<Register> MemberList = regList;
+  dag MemberList = regList;
 
   // SubRegClasses - Specify the register class of subregisters as a list of
   // dags: (RegClass SubRegIndex, SubRegindex, ...)
   list<dag> SubRegClasses = [];
 
+  // isAllocatable - Specify that the register class can be used for virtual
+  // registers and register allocation.  Some register classes are only used to
+  // model instruction operand constraints, and should have isAllocatable = 0.
+  bit isAllocatable = 1;
+
   // MethodProtos/MethodBodies - These members can be used to insert arbitrary
   // code into a generated register class.   The normal usage of this is to
   // overload virtual methods.
@@ -139,6 +140,39 @@ class RegisterClass<string namespace, list<ValueType> regTypes, int alignment,
   code MethodBodies = [{}];
 }
 
+// The memberList in a RegisterClass is a dag of set operations. TableGen
+// evaluates these set operations and expand them into register lists. These
+// are the most common operation, see test/TableGen/SetTheory.td for more
+// examples of what is possible:
+//
+// (add R0, R1, R2) - Set Union. Each argument can be an individual register, a
+// register class, or a sub-expression. This is also the way to simply list
+// registers.
+//
+// (sub GPR, SP) - Set difference. Subtract the last arguments from the first.
+//
+// (and GPR, CSR) - Set intersection. All registers from the first set that are
+// also in the second set.
+//
+// (sequence "R%u", 0, 15) -> [R0, R1, ..., R15]. Generate a sequence of
+// numbered registers.
+//
+// (shl GPR, 4) - Remove the first N elements.
+//
+// (trunc GPR, 4) - Truncate after the first N elements.
+//
+// (rotl GPR, 1) - Rotate N places to the left.
+//
+// (rotr GPR, 1) - Rotate N places to the right.
+//
+// (decimate GPR, 2) - Pick every N'th element, starting with the first.
+//
+// All of these operators work on ordered sets, not lists. That means
+// duplicates are removed from sub-expressions.
+
+// Set operators. The rest is defined in TargetSelectionDAG.td.
+def sequence;
+def decimate;
 
 //===----------------------------------------------------------------------===//
 // DwarfRegNum - This class provides a mapping of the llvm register enumeration
@@ -155,6 +189,14 @@ class DwarfRegNum<list<int> Numbers> {
   list<int> DwarfNumbers = Numbers;
 }
 
+// DwarfRegAlias - This class declares that a given register uses the same dwarf
+// numbers as another one. This is useful for making it clear that the two
+// registers do have the same number. It also lets us build a mapping
+// from dwarf register number to llvm register.
+class DwarfRegAlias<Register reg> {
+      Register DwarfAlias = reg;
+}
+
 //===----------------------------------------------------------------------===//
 // Pull in the common support for scheduling
 //
@@ -591,9 +633,10 @@ class MnemonicAlias<string From, string To> {
 /// InstAlias - This defines an alternate assembly syntax that is allowed to
 /// match an instruction that has a different (more canonical) assembly
 /// representation.
-class InstAlias<string Asm, dag Result> {
+class InstAlias<string Asm, dag Result, bit Emit = 0b1> {
   string AsmString = Asm;      // The .s format to match the instruction with.
   dag ResultInst = Result;     // The MCInst to generate.
+  bit EmitAlias = Emit;        // Emit the alias instead of what's aliased.
 
   // Predicates - Predicates that must be true for this to match.
   list<Predicate> Predicates = [];
diff --git a/include/llvm/Target/TargetAsmInfo.h b/include/llvm/Target/TargetAsmInfo.h
index 98aab14..743a2d4 100644
--- a/include/llvm/Target/TargetAsmInfo.h
+++ b/include/llvm/Target/TargetAsmInfo.h
@@ -22,6 +22,7 @@
 namespace llvm {
   class MCSection;
   class MCContext;
+  class MachineFunction;
   class TargetMachine;
   class TargetLoweringObjectFile;
 
@@ -58,6 +59,30 @@ public:
     return TLOF->getEHFrameSection();
   }
 
+  const MCSection *getDwarfFrameSection() const {
+    return TLOF->getDwarfFrameSection();
+  }
+
+  const MCSection *getWin64EHFuncTableSection(StringRef Suffix) const {
+    return TLOF->getWin64EHFuncTableSection(Suffix);
+  }
+
+  const MCSection *getWin64EHTableSection(StringRef Suffix) const {
+    return TLOF->getWin64EHTableSection(Suffix);
+  }
+
+  unsigned getFDEEncoding(bool CFI) const {
+    return TLOF->getFDEEncoding(CFI);
+  }
+
+  bool isFunctionEHFrameSymbolPrivate() const {
+    return TLOF->isFunctionEHFrameSymbolPrivate();
+  }
+
+  const unsigned *getCalleeSavedRegs(MachineFunction *MF = 0) const {
+    return TRI->getCalleeSavedRegs(MF);
+  }
+
   unsigned getDwarfRARegNum(bool isEH) const {
     return TRI->getDwarfRegNum(TRI->getRARegister(), isEH);
   }
@@ -69,6 +94,14 @@ public:
   int getDwarfRegNum(unsigned RegNum, bool isEH) const {
     return TRI->getDwarfRegNum(RegNum, isEH);
   }
+
+  int getLLVMRegNum(unsigned DwarfRegNum, bool isEH) const {
+    return TRI->getLLVMRegNum(DwarfRegNum, isEH);
+  }
+
+  int getSEHRegNum(unsigned RegNum) const {
+    return TRI->getSEHRegNum(RegNum);
+  }
 };
 
 }
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index c903f31..418f3fe 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -477,7 +477,7 @@ public:
   }
 
   /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to
-  /// determine (in conjuction with areLoadsFromSameBasePtr) if two loads should
+  /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
   /// be scheduled togther. On some targets if two loads are loading from
   /// addresses in the same cache line, it's better if they are scheduled
   /// together. This function takes two integers that represent the load offsets
diff --git a/include/llvm/Target/TargetInstrItineraries.h b/include/llvm/Target/TargetInstrItineraries.h
index a95b70f..6011402 100644
--- a/include/llvm/Target/TargetInstrItineraries.h
+++ b/include/llvm/Target/TargetInstrItineraries.h
@@ -122,7 +122,8 @@ public:
 
   InstrItineraryData(const InstrStage *S, const unsigned *OS,
                      const unsigned *F, const InstrItinerary *I)
-    : Stages(S), OperandCycles(OS), Forwardings(F), Itineraries(I) {}
+    : Stages(S), OperandCycles(OS), Forwardings(F), Itineraries(I),
+      IssueWidth(0) {}
 
   /// isEmpty - Returns true if there are no itineraries.
   ///
@@ -155,9 +156,13 @@ public:
   /// in the itinerary.
   ///
   unsigned getStageLatency(unsigned ItinClassIndx) const {
-    // If the target doesn't provide itinerary information, use a
-    // simple non-zero default value for all instructions.
-    if (isEmpty())
+    // If the target doesn't provide itinerary information, use a simple
+    // non-zero default value for all instructions.  Some target's provide a
+    // dummy (Generic) itinerary which should be handled as if it's itinerary is
+    // empty. We identify this by looking for a reference to stage zero (invalid
+    // stage). This is different from beginStage == endState != 0, which could
+    // be used for zero-latency pseudo ops.
+    if (isEmpty() || Itineraries[ItinClassIndx].FirstStage == 0)
       return 1;
 
     // Calculate the maximum completion time for any stage.
diff --git a/include/llvm/Target/TargetLibraryInfo.h b/include/llvm/Target/TargetLibraryInfo.h
index 1847a37..02a1a3c 100644
--- a/include/llvm/Target/TargetLibraryInfo.h
+++ b/include/llvm/Target/TargetLibraryInfo.h
@@ -23,6 +23,9 @@ namespace llvm {
       // void *memcpy(void *s1, const void *s2, size_t n);
       memcpy,
       
+      // void *memmove(void *s1, const void *s2, size_t n);
+      memmove,
+      
       /// void memset_pattern16(void *b, const void *pattern16, size_t len);
       memset_pattern16,
       
@@ -48,6 +51,7 @@ public:
   static char ID;
   TargetLibraryInfo();
   TargetLibraryInfo(const Triple &T);
+  explicit TargetLibraryInfo(const TargetLibraryInfo &TLI);
   
   /// has - This function is used by optimizations that want to match on or form
   /// a given library function.
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index ac5b1f6..093e79b 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -94,6 +94,19 @@ public:
     Custom      // Use the LowerOperation hook to implement custom lowering.
   };
 
+  /// LegalizeAction - This enum indicates whether a types are legal for a
+  /// target, and if not, what action should be used to make them valid.
+  enum LegalizeTypeAction {
+    TypeLegal,           // The target natively supports this type.
+    TypePromoteInteger,  // Replace this integer with a larger one.
+    TypeExpandInteger,   // Split this integer into two of half the size.
+    TypeSoftenFloat,     // Convert this float to a same size integer type.
+    TypeExpandFloat,     // Split this float into two of half the size.
+    TypeScalarizeVector, // Replace this one-element vector with its element.
+    TypeSplitVector,     // Split this vector into two of half the size.
+    TypeWidenVector      // This vector should be widened into a larger vector.
+  };
+
   enum BooleanContent { // How the target represents true/false values.
     UndefinedBooleanContent,    // Only bit 0 counts, the rest can hold garbage.
     ZeroOrOneBooleanContent,        // All bits zero except for bit 0.
@@ -200,71 +213,20 @@ public:
   }
 
   class ValueTypeActionImpl {
-    /// ValueTypeActions - For each value type, keep a LegalizeAction enum
+    /// ValueTypeActions - For each value type, keep a LegalizeTypeAction enum
     /// that indicates how instruction selection should deal with the type.
     uint8_t ValueTypeActions[MVT::LAST_VALUETYPE];
 
-    LegalizeAction getExtendedTypeAction(EVT VT) const {
-      // Handle non-vector integers.
-      if (!VT.isVector()) {
-        assert(VT.isInteger() && "Unsupported extended type!");
-        unsigned BitSize = VT.getSizeInBits();
-        // First promote to a power-of-two size, then expand if necessary.
-        if (BitSize < 8 || !isPowerOf2_32(BitSize))
-          return Promote;
-        return Expand;
-      }
-
-      // Vectors with only one element are always scalarized.
-      if (VT.getVectorNumElements() == 1)
-        return Expand;
-
-      // Vectors with a number of elements that is not a power of two are always
-      // widened, for example <3 x float> -> <4 x float>.
-      if (!VT.isPow2VectorType())
-        return Promote;
-
-      // Vectors with a crazy element type are always expanded, for example
-      // <4 x i2> is expanded into two vectors of type <2 x i2>.
-      if (!VT.getVectorElementType().isSimple())
-        return Expand;
-
-      // If this type is smaller than a legal vector type then widen it,
-      // otherwise expand it.  E.g. <2 x float> -> <4 x float>.
-      MVT EltType = VT.getVectorElementType().getSimpleVT();
-      unsigned NumElts = VT.getVectorNumElements();
-      while (1) {
-        // Round up to the next power of 2.
-        NumElts = (unsigned)NextPowerOf2(NumElts);
-
-        // If there is no simple vector type with this many elements then there
-        // cannot be a larger legal vector type.  Note that this assumes that
-        // there are no skipped intermediate vector types in the simple types.
-        MVT LargerVector = MVT::getVectorVT(EltType, NumElts);
-        if (LargerVector == MVT())
-          return Expand;
-
-        // If this type is legal then widen the vector.
-        if (getTypeAction(LargerVector) == Legal)
-          return Promote;
-      }
-    }
   public:
     ValueTypeActionImpl() {
       std::fill(ValueTypeActions, array_endof(ValueTypeActions), 0);
     }
 
-    LegalizeAction getTypeAction(EVT VT) const {
-      if (!VT.isExtended())
-        return getTypeAction(VT.getSimpleVT());
-      return getExtendedTypeAction(VT);
-    }
-
-    LegalizeAction getTypeAction(MVT VT) const {
-      return (LegalizeAction)ValueTypeActions[VT.SimpleTy];
+    LegalizeTypeAction getTypeAction(MVT VT) const {
+      return (LegalizeTypeAction)ValueTypeActions[VT.SimpleTy];
     }
 
-    void setTypeAction(EVT VT, LegalizeAction Action) {
+    void setTypeAction(EVT VT, LegalizeTypeAction Action) {
       unsigned I = VT.getSimpleVT().SimpleTy;
       ValueTypeActions[I] = Action;
     }
@@ -278,10 +240,10 @@ public:
   /// it is already legal (return 'Legal') or we need to promote it to a larger
   /// type (return 'Promote'), or we need to expand it into multiple registers
   /// of smaller integer type (return 'Expand').  'Custom' is not an option.
-  LegalizeAction getTypeAction(EVT VT) const {
-    return ValueTypeActions.getTypeAction(VT);
+  LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const {
+    return getTypeConversion(Context, VT).first;
   }
-  LegalizeAction getTypeAction(MVT VT) const {
+  LegalizeTypeAction getTypeAction(MVT VT) const {
     return ValueTypeActions.getTypeAction(VT);
   }
 
@@ -292,38 +254,7 @@ public:
   /// to get to the smaller register. For illegal floating point types, this
   /// returns the integer type to transform to.
   EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const {
-    if (VT.isSimple()) {
-      assert((unsigned)VT.getSimpleVT().SimpleTy <
-             array_lengthof(TransformToType));
-      EVT NVT = TransformToType[VT.getSimpleVT().SimpleTy];
-      assert(getTypeAction(NVT) != Promote &&
-             "Promote may not follow Expand or Promote");
-      return NVT;
-    }
-
-    if (VT.isVector()) {
-      EVT NVT = VT.getPow2VectorType(Context);
-      if (NVT == VT) {
-        // Vector length is a power of 2 - split to half the size.
-        unsigned NumElts = VT.getVectorNumElements();
-        EVT EltVT = VT.getVectorElementType();
-        return (NumElts == 1) ?
-          EltVT : EVT::getVectorVT(Context, EltVT, NumElts / 2);
-      }
-      // Promote to a power of two size, avoiding multi-step promotion.
-      return getTypeAction(NVT) == Promote ?
-        getTypeToTransformTo(Context, NVT) : NVT;
-    } else if (VT.isInteger()) {
-      EVT NVT = VT.getRoundIntegerType(Context);
-      if (NVT == VT)      // Size is a power of two - expand to half the size.
-        return EVT::getIntegerVT(Context, VT.getSizeInBits() / 2);
-
-      // Promote to a power of two size, avoiding multi-step promotion.
-      return getTypeAction(NVT) == Promote ?
-        getTypeToTransformTo(Context, NVT) : NVT;
-    }
-    assert(0 && "Unsupported extended type!");
-    return MVT(MVT::Other); // Not reached
+    return getTypeConversion(Context, VT).second;
   }
 
   /// getTypeToExpandTo - For types supported by the target, this is an
@@ -333,7 +264,7 @@ public:
   EVT getTypeToExpandTo(LLVMContext &Context, EVT VT) const {
     assert(!VT.isVector());
     while (true) {
-      switch (getTypeAction(VT)) {
+      switch (getTypeAction(Context, VT)) {
       case Legal:
         return VT;
       case Expand:
@@ -761,6 +692,18 @@ public:
     return MinStackArgumentAlignment;
   }
 
+  /// getMinFunctionAlignment - return the minimum function alignment.
+  ///
+  unsigned getMinFunctionAlignment() const {
+    return MinFunctionAlignment;
+  }
+
+  /// getPrefFunctionAlignment - return the preferred function alignment.
+  ///
+  unsigned getPrefFunctionAlignment() const {
+    return PrefFunctionAlignment;
+  }
+
   /// getPrefLoopAlignment - return the preferred loop alignment.
   ///
   unsigned getPrefLoopAlignment() const {
@@ -824,9 +767,6 @@ public:
   /// PIC relocation models.
   virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
 
-  /// getFunctionAlignment - Return the Log2 alignment of this function.
-  virtual unsigned getFunctionAlignment(const Function *) const = 0;
-
   /// getStackCookieLocation - Return true if the target stores stack
   /// protector cookies at a fixed offset in some non-standard address
   /// space, and populates the address space and offset as
@@ -1042,7 +982,7 @@ protected:
   }
 
   /// JumpIsExpensive - Tells the code generator not to expand sequence of
-  /// operations into a seperate sequences that increases the amount of
+  /// operations into a separate sequences that increases the amount of
   /// flow control.
   void setJumpIsExpensive(bool isExpensive = true) {
     JumpIsExpensive = isExpensive;
@@ -1167,6 +1107,18 @@ protected:
     JumpBufAlignment = Align;
   }
 
+  /// setMinFunctionAlignment - Set the target's minimum function alignment.
+  void setMinFunctionAlignment(unsigned Align) {
+    MinFunctionAlignment = Align;
+  }
+
+  /// setPrefFunctionAlignment - Set the target's preferred function alignment.
+  /// This should be set if there is a performance benefit to
+  /// higher-than-minimum alignment
+  void setPrefFunctionAlignment(unsigned Align) {
+    PrefFunctionAlignment = Align;
+  }
+
   /// setPrefLoopAlignment - Set the target's preferred loop alignment. Default
   /// alignment is zero, it means the target does not care about loop alignment.
   void setPrefLoopAlignment(unsigned Align) {
@@ -1253,13 +1205,14 @@ public:
   }
 
   /// HandleByVal - Target-specific cleanup for formal ByVal parameters.
-  virtual void HandleByVal(CCState *) const {}
+  virtual void HandleByVal(CCState *, unsigned &) const {}
 
   /// CanLowerReturn - This hook should be implemented to check whether the
   /// return values described by the Outs array can fit into the return
   /// registers.  If false is returned, an sret-demotion is performed.
   ///
-  virtual bool CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
+  virtual bool CanLowerReturn(CallingConv::ID CallConv,
+			      MachineFunction &MF, bool isVarArg,
                const SmallVectorImpl<ISD::OutputArg> &Outs,
                LLVMContext &Context) const
   {
@@ -1497,7 +1450,7 @@ public:
 
   /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
   /// vector.  If it is invalid, don't add anything to Ops.
-  virtual void LowerAsmOperandForConstraint(SDValue Op, char ConstraintLetter,
+  virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
                                             std::vector<SDValue> &Ops,
                                             SelectionDAG &DAG) const;
 
@@ -1583,6 +1536,14 @@ public:
     return true;
   }
 
+  /// isLegalAddImmediate - Return true if the specified immediate is legal
+  /// add immediate, that is the target has add instructions which can add
+  /// a register with the immediate without having to materialize the
+  /// immediate into a register.
+  virtual bool isLegalAddImmediate(int64_t Imm) const {
+    return true;
+  }
+
   //===--------------------------------------------------------------------===//
   // Div utility functions
   //
@@ -1637,6 +1598,13 @@ private:
   const TargetData *TD;
   const TargetLoweringObjectFile &TLOF;
 
+  /// We are in the process of implementing a new TypeLegalization action
+  /// which is the promotion of vector elements. This feature is under
+  /// development. Until this feature is complete, it is only enabled using a
+  /// flag. We pass this flag using a member because of circular dep issues.
+  /// This member will be removed with the flag once we complete the transition.
+  bool mayPromoteElements;
+
   /// PointerTy - The type to use for pointers, usually i32 or i64.
   ///
   MVT PointerTy;
@@ -1693,7 +1661,18 @@ private:
   ///
   unsigned MinStackArgumentAlignment;
 
-  /// PrefLoopAlignment - The perferred loop alignment.
+  /// MinFunctionAlignment - The minimum function alignment (used when
+  /// optimizing for size, and to prevent explicitly provided alignment
+  /// from leading to incorrect code).
+  ///
+  unsigned MinFunctionAlignment;
+
+  /// PrefFunctionAlignment - The preferred function alignment (used when
+  /// alignment unspecified and optimizing for speed).
+  ///
+  unsigned PrefFunctionAlignment;
+
+  /// PrefLoopAlignment - The preferred loop alignment.
   ///
   unsigned PrefLoopAlignment;
 
@@ -1774,6 +1753,129 @@ private:
 
   ValueTypeActionImpl ValueTypeActions;
 
+  typedef std::pair<LegalizeTypeAction, EVT> LegalizeKind;
+
+  LegalizeKind
+  getTypeConversion(LLVMContext &Context, EVT VT) const {
+    // If this is a simple type, use the ComputeRegisterProp mechanism.
+    if (VT.isSimple()) {
+      assert((unsigned)VT.getSimpleVT().SimpleTy <
+             array_lengthof(TransformToType));
+      EVT NVT = TransformToType[VT.getSimpleVT().SimpleTy];
+      LegalizeTypeAction LA = ValueTypeActions.getTypeAction(VT.getSimpleVT());
+
+      assert(
+        (!(NVT.isSimple() && LA != TypeLegal) ||
+         ValueTypeActions.getTypeAction(NVT.getSimpleVT()) != TypePromoteInteger)
+         && "Promote may not follow Expand or Promote");
+
+      return LegalizeKind(LA, NVT);
+    }
+
+    // Handle Extended Scalar Types.
+    if (!VT.isVector()) {
+      assert(VT.isInteger() && "Float types must be simple");
+      unsigned BitSize = VT.getSizeInBits();
+      // First promote to a power-of-two size, then expand if necessary.
+      if (BitSize < 8 || !isPowerOf2_32(BitSize)) {
+        EVT NVT = VT.getRoundIntegerType(Context);
+        assert(NVT != VT && "Unable to round integer VT");
+        LegalizeKind NextStep = getTypeConversion(Context, NVT);
+        // Avoid multi-step promotion.
+        if (NextStep.first == TypePromoteInteger) return NextStep;
+        // Return rounded integer type.
+        return LegalizeKind(TypePromoteInteger, NVT);
+      }
+
+      return LegalizeKind(TypeExpandInteger,
+                          EVT::getIntegerVT(Context, VT.getSizeInBits()/2));
+    }
+
+    // Handle vector types.
+    unsigned NumElts = VT.getVectorNumElements();
+    EVT EltVT = VT.getVectorElementType();
+
+    // Vectors with only one element are always scalarized.
+    if (NumElts == 1)
+      return LegalizeKind(TypeScalarizeVector, EltVT);
+
+    // If we allow the promotion of vector elements using a flag,
+    // then try to widen vector elements until a legal type is found.
+    if (mayPromoteElements && EltVT.isInteger()) {
+      // Vectors with a number of elements that is not a power of two are always
+      // widened, for example <3 x float> -> <4 x float>.
+      if (!VT.isPow2VectorType()) {
+        NumElts = (unsigned)NextPowerOf2(NumElts);
+        EVT NVT = EVT::getVectorVT(Context, EltVT, NumElts);
+        return LegalizeKind(TypeWidenVector, NVT);
+      }
+
+      // Examine the element type.
+      LegalizeKind LK = getTypeConversion(Context, EltVT);
+
+      // If type is to be expanded, split the vector.
+      //  <4 x i140> -> <2 x i140>
+      if (LK.first == TypeExpandInteger)
+        return LegalizeKind(TypeSplitVector,
+                            EVT::getVectorVT(Context, EltVT, NumElts / 2));
+
+      // Promote the integer element types until a legal vector type is found
+      // or until the element integer type is too big. If a legal type was not
+      // found, fallback to the usual mechanism of widening/splitting the
+      // vector.
+      while (1) {
+        // Increase the bitwidth of the element to the next pow-of-two
+        // (which is greater than 8 bits).
+        EltVT = EVT::getIntegerVT(Context, 1 + EltVT.getSizeInBits()
+                                 ).getRoundIntegerType(Context);
+
+        // Stop trying when getting a non-simple element type.
+        // Note that vector elements may be greater than legal vector element
+        // types. Example: X86 XMM registers hold 64bit element on 32bit systems.
+        if (!EltVT.isSimple()) break;
+
+        // Build a new vector type and check if it is legal.
+        MVT NVT = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts);
+
+        // Found a legal promoted vector type.
+        if (ValueTypeActions.getTypeAction(NVT) == TypeLegal)
+          return LegalizeKind(TypePromoteInteger,
+                              EVT::getVectorVT(Context, EltVT, NumElts));
+      }
+    }
+
+    // Try to widen the vector until a legal type is found.
+    // If there is no wider legal type, split the vector.
+    while (1) {
+      // Round up to the next power of 2.
+      NumElts = (unsigned)NextPowerOf2(NumElts);
+
+      // If there is no simple vector type with this many elements then there
+      // cannot be a larger legal vector type.  Note that this assumes that
+      // there are no skipped intermediate vector types in the simple types.
+      if (!EltVT.isSimple()) break;
+      MVT LargerVector = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts);
+      if (LargerVector == MVT()) break;
+
+      // If this type is legal then widen the vector.
+      if (ValueTypeActions.getTypeAction(LargerVector) == TypeLegal)
+        return LegalizeKind(TypeWidenVector, LargerVector);
+    }
+
+    // Widen odd vectors to next power of two.
+    if (!VT.isPow2VectorType()) {
+      EVT NVT = VT.getPow2VectorType(Context);
+      return LegalizeKind(TypeWidenVector, NVT);
+    }
+
+    // Vectors with illegal element types are expanded.
+    EVT NVT = EVT::getVectorVT(Context, EltVT, VT.getVectorNumElements() / 2);
+    return LegalizeKind(TypeSplitVector, NVT);
+
+    assert(false && "Unable to handle this kind of vector type");
+    return LegalizeKind(TypeLegal, VT);
+  }
+
   std::vector<std::pair<EVT, TargetRegisterClass*> > AvailableRegClasses;
 
   /// TargetDAGCombineArray - Targets can specify ISD nodes that they would
diff --git a/include/llvm/Target/TargetLoweringObjectFile.h b/include/llvm/Target/TargetLoweringObjectFile.h
index 34bf271..3991035 100644
--- a/include/llvm/Target/TargetLoweringObjectFile.h
+++ b/include/llvm/Target/TargetLoweringObjectFile.h
@@ -97,10 +97,6 @@ protected:
   /// weak_definition of constant 0 for an omitted EH frame.
   bool SupportsWeakOmittedEHFrame;
   
-  /// IsFunctionEHSymbolGlobal - This flag is set to true if the ".eh" symbol
-  /// for a function should be marked .globl.
-  bool IsFunctionEHSymbolGlobal;
-  
   /// IsFunctionEHFrameSymbolPrivate - This flag is set to true if the
   /// "EH_frame" symbol for EH information should be an assembler temporary (aka
   /// private linkage, aka an L or .L label) or false if it should be a normal
@@ -119,9 +115,6 @@ public:
     Ctx = &ctx;
   }
   
-  bool isFunctionEHSymbolGlobal() const {
-    return IsFunctionEHSymbolGlobal;
-  }
   bool isFunctionEHFrameSymbolPrivate() const {
     return IsFunctionEHFrameSymbolPrivate;
   }
@@ -140,6 +133,9 @@ public:
   const MCSection *getStaticDtorSection() const { return StaticDtorSection; }
   const MCSection *getLSDASection() const { return LSDASection; }
   virtual const MCSection *getEHFrameSection() const = 0;
+  virtual void emitPersonalityValue(MCStreamer &Streamer,
+                                    const TargetMachine &TM,
+                                    const MCSymbol *Sym) const;
   const MCSection *getDwarfAbbrevSection() const { return DwarfAbbrevSection; }
   const MCSection *getDwarfInfoSection() const { return DwarfInfoSection; }
   const MCSection *getDwarfLineSection() const { return DwarfLineSection; }
@@ -159,6 +155,8 @@ public:
   const MCSection *getTLSExtraDataSection() const {
     return TLSExtraDataSection;
   }
+  virtual const MCSection *getWin64EHFuncTableSection(StringRef suffix)const=0;
+  virtual const MCSection *getWin64EHTableSection(StringRef suffix) const = 0;
   
   /// shouldEmitUsedDirectiveFor - This hook allows targets to selectively
   /// decide not to emit the UsedDirective for some symbols in llvm.used.
@@ -218,15 +216,19 @@ public:
                                  MachineModuleInfo *MMI, unsigned Encoding,
                                  MCStreamer &Streamer) const;
 
+  // getCFIPersonalitySymbol - The symbol that gets passed to .cfi_personality.
+  virtual MCSymbol *
+  getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
+                          MachineModuleInfo *MMI) const;
+
   /// 
   const MCExpr *
-  getExprForDwarfReference(const MCSymbol *Sym, Mangler *Mang,
-                           MachineModuleInfo *MMI, unsigned Encoding,
+  getExprForDwarfReference(const MCSymbol *Sym, unsigned Encoding,
                            MCStreamer &Streamer) const;
   
   virtual unsigned getPersonalityEncoding() const;
   virtual unsigned getLSDAEncoding() const;
-  virtual unsigned getFDEEncoding() const;
+  virtual unsigned getFDEEncoding(bool CFI) const;
   virtual unsigned getTTypeEncoding() const;
 
 protected:
diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h
index 627ab42..78f770c 100644
--- a/include/llvm/Target/TargetMachine.h
+++ b/include/llvm/Target/TargetMachine.h
@@ -108,6 +108,7 @@ protected: // Can only create subclasses.
   unsigned MCNoExecStack : 1;
   unsigned MCSaveTempLabels : 1;
   unsigned MCUseLoc : 1;
+  unsigned MCUseCFI : 1;
 
 public:
   virtual ~TargetMachine();
@@ -193,6 +194,12 @@ public:
   /// setMCUseLoc - Set whether all we should use dwarf's .loc directive.
   void setMCUseLoc(bool Value) { MCUseLoc = Value; }
 
+  /// hasMCUseCFI - Check whether we should use dwarf's .cfi_* directives.
+  bool hasMCUseCFI() const { return MCUseCFI; }
+
+  /// setMCUseCFI - Set whether all we should use dwarf's .cfi_* directives.
+  void setMCUseCFI(bool Value) { MCUseCFI = Value; }
+
   /// getRelocationModel - Returns the code generation relocation model. The
   /// choices are static, PIC, and dynamic-no-pic, and target default.
   static Reloc::Model getRelocationModel();
diff --git a/include/llvm/Target/TargetOpcodes.h b/include/llvm/Target/TargetOpcodes.h
index 01fba66..37f7b2f 100644
--- a/include/llvm/Target/TargetOpcodes.h
+++ b/include/llvm/Target/TargetOpcodes.h
@@ -71,6 +71,10 @@ namespace TargetOpcode {
     /// REG_SEQUENCE - This variadic instruction is used to form a register that
     /// represent a consecutive sequence of sub-registers. It's used as register
     /// coalescing / allocation aid and must be eliminated before code emission.
+    // In SDNode form, the first operand encodes the register class created by
+    // the REG_SEQUENCE, while each subsequent pair names a vreg + subreg index
+    // pair.  Once it has been lowered to a MachineInstr, the regclass operand
+    // is no longer present.
     /// e.g. v1027 = REG_SEQUENCE v1024, 3, v1025, 4, v1026, 5
     /// After register coalescing references of v1024 should be replace with
     /// v1027:3, v1025 with v1027:4, etc.
diff --git a/include/llvm/Target/TargetOptions.h b/include/llvm/Target/TargetOptions.h
index fc80a06..beed039 100644
--- a/include/llvm/Target/TargetOptions.h
+++ b/include/llvm/Target/TargetOptions.h
@@ -125,10 +125,6 @@ namespace llvm {
   /// flag is hidden and is only for debugging the debug info.
   extern bool JITEmitDebugInfoToDisk;
 
-  /// UnwindTablesMandatory - This flag indicates that unwind tables should
-  /// be emitted for all functions.
-  extern bool UnwindTablesMandatory;
-
   /// GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is
   /// specified on the commandline. When the flag is on, participating targets
   /// will perform tail call optimization on all calls which use the fastcc
@@ -157,9 +153,10 @@ namespace llvm {
   /// wth earlier copy coalescing.
   extern bool StrongPHIElim;
 
-  /// HasDivModLibcall - This flag indicates whether the target compiler
-  /// runtime library has integer divmod libcalls.
-  extern bool HasDivModLibcall;
+  /// getTrapFunctionName - If this returns a non-empty string, this means isel
+  /// should lower Intrinsic::trap to a call to the specified function name
+  /// instead of an ISD::TRAP node.
+  extern StringRef getTrapFunctionName();
 
 } // End llvm namespace
 
diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h
index d06cdc4..feb0929 100644
--- a/include/llvm/Target/TargetRegisterInfo.h
+++ b/include/llvm/Target/TargetRegisterInfo.h
@@ -18,6 +18,7 @@
 
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseSet.h"
 #include <cassert>
 #include <functional>
@@ -46,6 +47,8 @@ struct TargetRegisterDesc {
   const unsigned *Overlaps;     // Overlapping registers, described above
   const unsigned *SubRegs;      // Sub-register set, described above
   const unsigned *SuperRegs;    // Super-register set, described above
+  unsigned CostPerUse;          // Extra cost of instructions using register.
+  bool inAllocatableClass;      // Register belongs to an allocatable regclass.
 };
 
 class TargetRegisterClass {
@@ -65,6 +68,7 @@ private:
   const sc_iterator SuperRegClasses;
   const unsigned RegSize, Alignment;    // Size & Alignment of register in bytes
   const int CopyCost;
+  const bool Allocatable;
   const iterator RegsBegin, RegsEnd;
   DenseSet<unsigned> RegSet;
 public:
@@ -75,11 +79,12 @@ public:
                       const TargetRegisterClass * const *supcs,
                       const TargetRegisterClass * const *subregcs,
                       const TargetRegisterClass * const *superregcs,
-                      unsigned RS, unsigned Al, int CC,
+                      unsigned RS, unsigned Al, int CC, bool Allocable,
                       iterator RB, iterator RE)
     : ID(id), Name(name), VTs(vts), SubClasses(subcs), SuperClasses(supcs),
     SubRegClasses(subregcs), SuperRegClasses(superregcs),
-    RegSize(RS), Alignment(Al), CopyCost(CC), RegsBegin(RB), RegsEnd(RE) {
+    RegSize(RS), Alignment(Al), CopyCost(CC), Allocatable(Allocable),
+    RegsBegin(RB), RegsEnd(RE) {
       for (iterator I = RegsBegin, E = RegsEnd; I != E; ++I)
         RegSet.insert(*I);
     }
@@ -181,6 +186,12 @@ public:
     return false;
   }
 
+  /// hasSubClassEq - Returns true if RC is a subclass of or equal to this
+  /// class.
+  bool hasSubClassEq(const TargetRegisterClass *RC) const {
+    return RC == this || hasSubClass(RC);
+  }
+
   /// subclasses_begin / subclasses_end - Loop over all of the classes
   /// that are proper subsets of this register class.
   sc_iterator subclasses_begin() const {
@@ -202,6 +213,12 @@ public:
     return false;
   }
 
+  /// hasSuperClassEq - Returns true if RC is a superclass of or equal to this
+  /// class.
+  bool hasSuperClassEq(const TargetRegisterClass *RC) const {
+    return RC == this || hasSuperClass(RC);
+  }
+
   /// superclasses_begin / superclasses_end - Loop over all of the classes
   /// that are proper supersets of this register class.
   sc_iterator superclasses_begin() const {
@@ -243,6 +260,27 @@ public:
     return end();
   }
 
+  /// getRawAllocationOrder - Returns the preferred order for allocating
+  /// registers from this register class in MF. The raw order comes directly
+  /// from the .td file and may include reserved registers that are not
+  /// allocatable. Register allocators should also make sure to allocate
+  /// callee-saved registers only after all the volatiles are used. The
+  /// RegisterClassInfo class provides filtered allocation orders with
+  /// callee-saved registers moved to the end.
+  ///
+  /// The MachineFunction argument can be used to tune the allocatable
+  /// registers based on the characteristics of the function, subtarget, or
+  /// other criteria.
+  ///
+  /// By default, this method returns all registers in the class.
+  ///
+  virtual
+  ArrayRef<unsigned> getRawAllocationOrder(const MachineFunction &MF) const {
+    iterator B = allocation_order_begin(MF);
+    iterator E = allocation_order_end(MF);
+    return ArrayRef<unsigned>(B, E - B);
+  }
+
   /// getSize - Return the size of the register in bytes, which is also the size
   /// of a stack slot allocated to hold a spilled copy of this register.
   unsigned getSize() const { return RegSize; }
@@ -255,6 +293,10 @@ public:
   /// this class. A negative number means the register class is very expensive
   /// to copy e.g. status flag register classes.
   int getCopyCost() const { return CopyCost; }
+
+  /// isAllocatable - Return true if this register class may be used to create
+  /// virtual registers.
+  bool isAllocatable() const { return Allocatable; }
 };
 
 
@@ -265,11 +307,6 @@ public:
 /// descriptor.
 ///
 class TargetRegisterInfo {
-protected:
-  const unsigned* SubregHash;
-  const unsigned SubregHashSize;
-  const unsigned* AliasesHash;
-  const unsigned AliasesHashSize;
 public:
   typedef const TargetRegisterClass * const * regclass_iterator;
 private:
@@ -287,11 +324,7 @@ protected:
                      regclass_iterator RegClassEnd,
                      const char *const *subregindexnames,
                      int CallFrameSetupOpcode = -1,
-                     int CallFrameDestroyOpcode = -1,
-                     const unsigned* subregs = 0,
-                     const unsigned subregsize = 0,
-                     const unsigned* aliases = 0,
-                     const unsigned aliasessize = 0);
+                     int CallFrameDestroyOpcode = -1);
   virtual ~TargetRegisterInfo();
 public:
 
@@ -350,13 +383,13 @@ public:
   /// The first virtual register in a function will get the index 0.
   static unsigned virtReg2Index(unsigned Reg) {
     assert(isVirtualRegister(Reg) && "Not a virtual register");
-    return Reg - (1u << 31);
+    return Reg & ~(1u << 31);
   }
 
   /// index2VirtReg - Convert a 0-based index to a virtual register number.
   /// This is the inverse operation of VirtReg2IndexFunctor below.
   static unsigned index2VirtReg(unsigned Index) {
-    return Index + (1u << 31);
+    return Index | (1u << 31);
   }
 
   /// getMinimalPhysRegClass - Returns the Register Class of a physical
@@ -414,7 +447,7 @@ public:
   /// getSuperRegisters - Return the list of registers that are super-registers
   /// of the specified register, or a null list of there are none. The list
   /// returned is zero terminated and sorted according to super-sub register
-  /// relations. e.g. X86::AL's super-register list is RAX, EAX, AX.
+  /// relations. e.g. X86::AL's super-register list is AX, EAX, RAX.
   ///
   const unsigned *getSuperRegisters(unsigned RegNo) const {
     return get(RegNo).SuperRegs;
@@ -426,6 +459,12 @@ public:
     return get(RegNo).Name;
   }
 
+  /// getCostPerUse - Return the additional cost of using this register instead
+  /// of other registers in its class.
+  unsigned getCostPerUse(unsigned RegNo) const {
+    return get(RegNo).CostPerUse;
+  }
+
   /// getNumRegs - Return the number of registers this target has (useful for
   /// sizing arrays holding per register information)
   unsigned getNumRegs() const {
@@ -442,49 +481,28 @@ public:
   /// regsOverlap - Returns true if the two registers are equal or alias each
   /// other. The registers may be virtual register.
   bool regsOverlap(unsigned regA, unsigned regB) const {
-    if (regA == regB)
-      return true;
-
+    if (regA == regB) return true;
     if (isVirtualRegister(regA) || isVirtualRegister(regB))
       return false;
-
-    // regA and regB are distinct physical registers. Do they alias?
-    size_t index = (regA + regB * 37) & (AliasesHashSize-1);
-    unsigned ProbeAmt = 0;
-    while (AliasesHash[index*2] != 0 &&
-           AliasesHash[index*2+1] != 0) {
-      if (AliasesHash[index*2] == regA && AliasesHash[index*2+1] == regB)
-        return true;
-
-      index = (index + ProbeAmt) & (AliasesHashSize-1);
-      ProbeAmt += 2;
+    for (const unsigned *regList = getOverlaps(regA)+1; *regList; ++regList) {
+      if (*regList == regB) return true;
     }
-
     return false;
   }
 
   /// isSubRegister - Returns true if regB is a sub-register of regA.
   ///
   bool isSubRegister(unsigned regA, unsigned regB) const {
-    // SubregHash is a simple quadratically probed hash table.
-    size_t index = (regA + regB * 37) & (SubregHashSize-1);
-    unsigned ProbeAmt = 2;
-    while (SubregHash[index*2] != 0 &&
-           SubregHash[index*2+1] != 0) {
-      if (SubregHash[index*2] == regA && SubregHash[index*2+1] == regB)
-        return true;
-
-      index = (index + ProbeAmt) & (SubregHashSize-1);
-      ProbeAmt += 2;
-    }
-
-    return false;
+    return isSuperRegister(regB, regA);
   }
 
   /// isSuperRegister - Returns true if regB is a super-register of regA.
   ///
   bool isSuperRegister(unsigned regA, unsigned regB) const {
-    return isSubRegister(regB, regA);
+    for (const unsigned *regList = getSuperRegisters(regA); *regList;++regList){
+      if (*regList == regB) return true;
+    }
+    return false;
   }
 
   /// getCalleeSavedRegs - Return a null-terminated list of all of the
@@ -597,6 +615,17 @@ public:
     return RC;
   }
 
+  /// getLargestLegalSuperClass - Returns the largest super class of RC that is
+  /// legal to use in the current sub-target and has the same spill size.
+  /// The returned register class can be used to create virtual registers which
+  /// means that all its registers can be copied and spilled.
+  virtual const TargetRegisterClass*
+  getLargestLegalSuperClass(const TargetRegisterClass *RC) const {
+    /// The default implementation is very conservative and doesn't allow the
+    /// register allocator to inflate register classes.
+    return RC;
+  }
+
   /// getRegPressureLimit - Return the register pressure "high water mark" for
   /// the specific register class. The scheduler is in high register pressure
   /// mode (for the specific register class) if it goes over the limit.
@@ -605,14 +634,17 @@ public:
     return 0;
   }
 
-  /// getAllocationOrder - Returns the register allocation order for a specified
-  /// register class in the form of a pair of TargetRegisterClass iterators.
-  virtual std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator>
-  getAllocationOrder(const TargetRegisterClass *RC,
-                     unsigned HintType, unsigned HintReg,
-                     const MachineFunction &MF) const {
-    return std::make_pair(RC->allocation_order_begin(MF),
-                          RC->allocation_order_end(MF));
+  /// getRawAllocationOrder - Returns the register allocation order for a
+  /// specified register class with a target-dependent hint. The returned list
+  /// may contain reserved registers that cannot be allocated.
+  ///
+  /// Register allocators need only call this function to resolve
+  /// target-dependent hints, but it should work without hinting as well.
+  virtual ArrayRef<unsigned>
+  getRawAllocationOrder(const TargetRegisterClass *RC,
+                        unsigned HintType, unsigned HintReg,
+                        const MachineFunction &MF) const {
+    return RC->getRawAllocationOrder(MF);
   }
 
   /// ResolveRegAllocHint - Resolves the specified register allocation hint
@@ -624,6 +656,14 @@ public:
     return 0;
   }
 
+  /// avoidWriteAfterWrite - Return true if the register allocator should avoid
+  /// writing a register from RC in two consecutive instructions.
+  /// This can avoid pipeline stalls on certain architectures.
+  /// It does cause increased register pressure, though.
+  virtual bool avoidWriteAfterWrite(const TargetRegisterClass *RC) const {
+    return false;
+  }
+
   /// UpdateRegAllocHint - A callback to allow target a chance to update
   /// register allocation hints when a register is "changed" (e.g. coalesced)
   /// to another register. e.g. On ARM, some virtual registers should target
@@ -776,6 +816,8 @@ public:
   /// debugging info.
   virtual int getDwarfRegNum(unsigned RegNum, bool isEH) const = 0;
 
+  virtual int getLLVMRegNum(unsigned RegNum, bool isEH) const = 0;
+
   /// getFrameRegister - This method should return the register used as a base
   /// for values allocated in the current stack frame.
   virtual unsigned getFrameRegister(const MachineFunction &MF) const = 0;
@@ -783,6 +825,12 @@ public:
   /// getRARegister - This method should return the register where the return
   /// address can be found.
   virtual unsigned getRARegister() const = 0;
+
+  /// getSEHRegNum - Map a target register to an equivalent SEH register
+  /// number.  Returns -1 if there is no equivalent value.
+  virtual int getSEHRegNum(unsigned i) const {
+    return i;
+  }
 };
 
 
diff --git a/include/llvm/Target/TargetRegistry.h b/include/llvm/Target/TargetRegistry.h
index 3739a0f..a464822 100644
--- a/include/llvm/Target/TargetRegistry.h
+++ b/include/llvm/Target/TargetRegistry.h
@@ -43,7 +43,7 @@ namespace llvm {
 
   MCStreamer *createAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
                                 bool isVerboseAsm,
-                                bool useLoc,
+                                bool useLoc, bool useCFI,
                                 MCInstPrinter *InstPrint,
                                 MCCodeEmitter *CE,
                                 TargetAsmBackend *TAB,
@@ -96,6 +96,7 @@ namespace llvm {
                                              formatted_raw_ostream &OS,
                                              bool isVerboseAsm,
                                              bool useLoc,
+                                             bool useCFI,
                                              MCInstPrinter *InstPrint,
                                              MCCodeEmitter *CE,
                                              TargetAsmBackend *TAB,
@@ -329,12 +330,13 @@ namespace llvm {
                                   formatted_raw_ostream &OS,
                                   bool isVerboseAsm,
                                   bool useLoc,
+                                  bool useCFI,
                                   MCInstPrinter *InstPrint,
                                   MCCodeEmitter *CE,
                                   TargetAsmBackend *TAB,
                                   bool ShowInst) const {
       // AsmStreamerCtorFn is default to llvm::createAsmStreamer
-      return AsmStreamerCtorFn(Ctx, OS, isVerboseAsm, useLoc,
+      return AsmStreamerCtorFn(Ctx, OS, isVerboseAsm, useLoc, useCFI,
                                InstPrint, CE, TAB, ShowInst);
     }
 
diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td
index c9be40d..285b8b1 100644
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td
@@ -197,8 +197,8 @@ def SDTSubVecInsert : SDTypeProfile<1, 3, [ // subvector insert
   SDTCisSubVecOfVec<2, 1>, SDTCisSameAs<0,1>, SDTCisInt<3>
 ]>;
 
-def SDTPrefetch : SDTypeProfile<0, 3, [     // prefetch
-  SDTCisPtrTy<0>, SDTCisSameAs<1, 2>, SDTCisInt<1>
+def SDTPrefetch : SDTypeProfile<0, 4, [     // prefetch
+  SDTCisPtrTy<0>, SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>, SDTCisInt<1>
 ]>;
 
 def SDTMemBarrier : SDTypeProfile<0, 5, [   // memory barier
@@ -354,6 +354,7 @@ def fmul       : SDNode<"ISD::FMUL"       , SDTFPBinOp, [SDNPCommutative]>;
 def fdiv       : SDNode<"ISD::FDIV"       , SDTFPBinOp>;
 def frem       : SDNode<"ISD::FREM"       , SDTFPBinOp>;
 def fabs       : SDNode<"ISD::FABS"       , SDTFPUnaryOp>;
+def fgetsign   : SDNode<"ISD::FGETSIGN"   , SDTFPToIntOp>;
 def fneg       : SDNode<"ISD::FNEG"       , SDTFPUnaryOp>;
 def fsqrt      : SDNode<"ISD::FSQRT"      , SDTFPUnaryOp>;
 def fsin       : SDNode<"ISD::FSIN"       , SDTFPUnaryOp>;
@@ -490,6 +491,18 @@ class SDNodeXForm<SDNode opc, code xformFunction> {
 
 def NOOP_SDNodeXForm : SDNodeXForm<imm, [{}]>;
 
+//===----------------------------------------------------------------------===//
+// PatPred Subclasses.
+//
+// These allow specifying different sorts of predicates that control whether a
+// node is matched.
+//
+class PatPred;
+
+class CodePatPred<code predicate> : PatPred {
+  code PredicateCode = predicate;
+}
+
 
 //===----------------------------------------------------------------------===//
 // Selection DAG Pattern Fragments.
@@ -507,7 +520,8 @@ class PatFrag<dag ops, dag frag, code pred = [{}],
               SDNodeXForm xform = NOOP_SDNodeXForm> : SDPatternOperator {
   dag Operands = ops;
   dag Fragment = frag;
-  code Predicate = pred;
+  code PredicateCode = pred;
+  code ImmediateCode = [{}];
   SDNodeXForm OperandTransform = xform;
 }
 
@@ -516,6 +530,27 @@ class PatFrag<dag ops, dag frag, code pred = [{}],
 class PatLeaf<dag frag, code pred = [{}], SDNodeXForm xform = NOOP_SDNodeXForm>
  : PatFrag<(ops), frag, pred, xform>;
 
+
+// ImmLeaf is a pattern fragment with a constraint on the immediate.  The
+// constraint is a function that is run on the immediate (always with the value
+// sign extended out to an int64_t) as Imm.  For example:
+//
+//  def immSExt8 : ImmLeaf<i16, [{ return (char)Imm == Imm; }]>;
+//
+// this is a more convenient form to match 'imm' nodes in than PatLeaf and also
+// is preferred over using PatLeaf because it allows the code generator to
+// reason more about the constraint.
+//
+// If FastIsel should ignore all instructions that have an operand of this type,
+// the FastIselShouldIgnore flag can be set.  This is an optimization to reduce
+// the code size of the generated fast instruction selector.
+class ImmLeaf<ValueType vt, code pred, SDNodeXForm xform = NOOP_SDNodeXForm>
+  : PatFrag<(ops), (vt imm), [{}], xform> {
+  let ImmediateCode = pred;
+  bit FastIselShouldIgnore = 0;
+}
+
+
 // Leaf fragments.
 
 def vtInt      : PatLeaf<(vt),  [{ return N->getVT().isInteger(); }]>;
diff --git a/include/llvm/Transforms/IPO.h b/include/llvm/Transforms/IPO.h
index 1239881..d12fd1d 100644
--- a/include/llvm/Transforms/IPO.h
+++ b/include/llvm/Transforms/IPO.h
@@ -152,7 +152,6 @@ ModulePass *createDeadArgHackingPass();
 /// equal to maxElements (maxElements == 0 means always promote).
 ///
 Pass *createArgumentPromotionPass(unsigned maxElements = 3);
-Pass *createStructRetPromotionPass();
 
 //===----------------------------------------------------------------------===//
 /// createIPConstantPropagationPass - This pass propagates constants from call
diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h
index aa9873f..8d55231 100644
--- a/include/llvm/Transforms/Instrumentation.h
+++ b/include/llvm/Transforms/Instrumentation.h
@@ -17,7 +17,6 @@
 namespace llvm {
 
 class ModulePass;
-class FunctionPass;
 
 // Insert edge profiling instrumentation
 ModulePass *createEdgeProfilerPass();
@@ -28,6 +27,10 @@ ModulePass *createOptimalEdgeProfilerPass();
 // Insert path profiling instrumentation
 ModulePass *createPathProfilerPass();
 
+// Insert GCOV profiling instrumentation
+ModulePass *createGCOVProfilerPass(bool EmitNotes = true, bool EmitData = true,
+                                   bool Use402Format = false);
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index 8d5ed44..e830435 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -128,7 +128,7 @@ Pass *createLoopInstSimplifyPass();
 //
 // LoopUnroll - This pass is a simple loop unrolling pass.
 //
-Pass *createLoopUnrollPass();
+Pass *createLoopUnrollPass(int Threshold = -1, int Count = -1, int AllowPartial = -1);
 
 //===----------------------------------------------------------------------===//
 //
@@ -338,6 +338,24 @@ Pass *createCorrelatedValuePropagationPass();
 
 //===----------------------------------------------------------------------===//
 //
+// ObjCARCExpand - ObjC ARC preliminary simplifications.
+//
+Pass *createObjCARCExpandPass();
+
+//===----------------------------------------------------------------------===//
+//
+// ObjCARCContract - Late ObjC ARC cleanups.
+//
+Pass *createObjCARCContractPass();
+
+//===----------------------------------------------------------------------===//
+//
+// ObjCARCOpt - ObjC ARC optimization.
+//
+Pass *createObjCARCOptPass();
+
+//===----------------------------------------------------------------------===//
+//
 // InstructionSimplifier - Remove redundant instructions.
 //
 FunctionPass *createInstructionSimplifierPass();
diff --git a/include/llvm/Transforms/Utils/BasicBlockUtils.h b/include/llvm/Transforms/Utils/BasicBlockUtils.h
index 5335860..90eabef 100644
--- a/include/llvm/Transforms/Utils/BasicBlockUtils.h
+++ b/include/llvm/Transforms/Utils/BasicBlockUtils.h
@@ -19,6 +19,7 @@
 
 #include "llvm/BasicBlock.h"
 #include "llvm/Support/CFG.h"
+#include "llvm/Support/DebugLoc.h"
 
 namespace llvm {
 
@@ -181,6 +182,10 @@ BasicBlock *SplitBlockPredecessors(BasicBlock *BB, BasicBlock *const *Preds,
 ReturnInst *FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
                                        BasicBlock *Pred);
 
+/// GetFirstDebugLocInBasicBlock - Return first valid DebugLoc entry in a 
+/// given basic block.
+DebugLoc GetFirstDebugLocInBasicBlock(const BasicBlock *BB);
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/Transforms/Utils/Cloning.h b/include/llvm/Transforms/Utils/Cloning.h
index 24ebb10..674c2d0 100644
--- a/include/llvm/Transforms/Utils/Cloning.h
+++ b/include/llvm/Transforms/Utils/Cloning.h
@@ -107,12 +107,6 @@ BasicBlock *CloneBasicBlock(const BasicBlock *BB,
                             const Twine &NameSuffix = "", Function *F = 0,
                             ClonedCodeInfo *CodeInfo = 0);
 
-
-/// CloneLoop - Clone Loop. Clone dominator info for loop insiders. Populate
-/// VMap using old blocks to new blocks mapping.
-Loop *CloneLoop(Loop *L, LPPassManager *LPM, LoopInfo *LI, 
-                ValueToValueMapTy &VMap, Pass *P);
-
 /// CloneFunction - Return a copy of the specified function, but without
 /// embedding the function into another module.  Also, any references specified
 /// in the VMap are changed to refer to their mapped value instead of the
@@ -207,7 +201,7 @@ public:
 ///
 /// Note that this only does one level of inlining.  For example, if the
 /// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now
-/// exists in the instruction stream.  Similiarly this will inline a recursive
+/// exists in the instruction stream.  Similarly this will inline a recursive
 /// function by one level.
 ///
 bool InlineFunction(CallInst *C, InlineFunctionInfo &IFI);
diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h
index e61dcb3..7f99dbc 100644
--- a/include/llvm/Transforms/Utils/Local.h
+++ b/include/llvm/Transforms/Utils/Local.h
@@ -43,8 +43,10 @@ template<typename T> class SmallVectorImpl;
 /// constant value, convert it into an unconditional branch to the constant
 /// destination.  This is a nontrivial operation because the successors of this
 /// basic block must have their PHI nodes updated.
-///
-bool ConstantFoldTerminator(BasicBlock *BB);
+/// Also calls RecursivelyDeleteTriviallyDeadInstructions() on any branch/switch
+/// conditions and indirectbr addresses this might make dead if
+/// DeleteDeadConditions is true.
+bool ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions = false);
 
 //===----------------------------------------------------------------------===//
 //  Local dead code elimination.
@@ -176,6 +178,10 @@ bool ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
 /// of llvm.dbg.value intrinsics.
 bool LowerDbgDeclare(Function &F);
 
+/// FindAllocaDbgDeclare - Finds the llvm.dbg.declare intrinsic corresponding to
+/// an alloca, if any.
+DbgDeclareInst *FindAllocaDbgDeclare(Value *V);
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/Transforms/Utils/SSAUpdater.h b/include/llvm/Transforms/Utils/SSAUpdater.h
index b4048b9..51c8467 100644
--- a/include/llvm/Transforms/Utils/SSAUpdater.h
+++ b/include/llvm/Transforms/Utils/SSAUpdater.h
@@ -21,6 +21,8 @@ namespace llvm {
   class PHINode;
   template<typename T> class SmallVectorImpl;
   template<typename T> class SSAUpdaterTraits;
+  class DbgDeclareInst;
+  class DIBuilder;
   class BumpPtrAllocator;
 
 /// SSAUpdater - This class updates SSA form for a set of values defined in
@@ -120,9 +122,12 @@ private:
 class LoadAndStorePromoter {
 protected:
   SSAUpdater &SSA;
+  DbgDeclareInst *DDI;
+  DIBuilder *DIB;
 public:
   LoadAndStorePromoter(const SmallVectorImpl<Instruction*> &Insts,
-                       SSAUpdater &S, StringRef Name = StringRef());
+                       SSAUpdater &S, DbgDeclareInst *DDI, DIBuilder *DIB,
+                       StringRef Name = StringRef());
   virtual ~LoadAndStorePromoter() {}
   
   /// run - This does the promotion.  Insts is a list of loads and stores to
diff --git a/include/llvm/Type.h b/include/llvm/Type.h
index 0939d67..0bd4350 100644
--- a/include/llvm/Type.h
+++ b/include/llvm/Type.h
@@ -6,13 +6,17 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the Type class.  For more "Type"
+// stuff, look in DerivedTypes.h.
+//
+//===----------------------------------------------------------------------===//
 
 #ifndef LLVM_TYPE_H
 #define LLVM_TYPE_H
 
 #include "llvm/AbstractTypeUser.h"
 #include "llvm/Support/Casting.h"
-#include "llvm/ADT/GraphTraits.h"
 #include <string>
 #include <vector>
 
@@ -25,10 +29,8 @@ class TypeMapBase;
 class raw_ostream;
 class Module;
 class LLVMContext;
+template<class GraphType> struct GraphTraits;
 
-/// This file contains the declaration of the Type class.  For more "Type" type
-/// stuff, look in DerivedTypes.h.
-///
 /// The instances of the Type class are immutable: once they are created,
 /// they are never changed.  Also note that only one instance of a particular
 /// type is ever created.  Thus seeing if two types are equal is a matter of
@@ -72,7 +74,7 @@ public:
     DoubleTyID,      ///<  2: 64 bit floating point type
     X86_FP80TyID,    ///<  3: 80 bit floating point type (X87)
     FP128TyID,       ///<  4: 128 bit floating point type (112-bit mantissa)
-    PPC_FP128TyID,   ///<  5: 128 bit floating point type (two 64-bits)
+    PPC_FP128TyID,   ///<  5: 128 bit floating point type (two 64-bits, PowerPC)
     LabelTyID,       ///<  6: Labels
     MetadataTyID,    ///<  7: Metadata
     X86_MMXTyID,     ///<  8: MMX vectors (64 bits)
@@ -273,6 +275,9 @@ public:
   /// @brief Determine if this type could be losslessly bitcast to Ty
   bool canLosslesslyBitCastTo(const Type *Ty) const;
 
+  /// isEmptyTy - Return true if this type is empty, that is, it has no
+  /// elements or all its elements are empty.
+  bool isEmptyTy() const;
 
   /// Here are some useful little methods to query what type derived types are
   /// Note that all other types can just compare to see if this == Type::xxxTy;
@@ -356,11 +361,6 @@ public:
     return getForwardedTypeInternal();
   }
 
-  /// getVAArgsPromotedType - Return the type an argument of this type
-  /// will be promoted to if passed through a variable argument
-  /// function.
-  const Type *getVAArgsPromotedType(LLVMContext &C) const; 
-
   /// getScalarType - If this is a vector type, return the element type,
   /// otherwise return this.
   const Type *getScalarType() const;
@@ -503,7 +503,7 @@ inline void PATypeHandle::removeUser() {
 /// type we are pointing to is forwarding to a new type.  If so, we drop our
 /// reference to the type.
 ///
-inline Type* PATypeHolder::get() const {
+inline Type *PATypeHolder::get() const {
   if (Ty == 0) return 0;
   const Type *NewTy = Ty->getForwardedType();
   if (!NewTy) return const_cast<Type*>(Ty);
diff --git a/include/llvm/TypeSymbolTable.h b/include/llvm/TypeSymbolTable.h
index 9fdcb98..89ad534 100644
--- a/include/llvm/TypeSymbolTable.h
+++ b/include/llvm/TypeSymbolTable.h
@@ -133,7 +133,7 @@ private:
   /// is refined.
   virtual void refineAbstractType(const DerivedType *OldTy, const Type *NewTy);
 
-  /// This function markes a type as being concrete (defined).
+  /// This function marks a type as being concrete (defined).
   virtual void typeBecameConcrete(const DerivedType *AbsTy);
 
 /// @}
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp
index be02ddb..c189a00 100644
--- a/lib/Analysis/AliasAnalysis.cpp
+++ b/lib/Analysis/AliasAnalysis.cpp
@@ -86,14 +86,20 @@ AliasAnalysis::getModRefInfo(ImmutableCallSite CS,
 
   if (onlyAccessesArgPointees(MRB)) {
     bool doesAlias = false;
-    if (doesAccessArgPointees(MRB))
+    if (doesAccessArgPointees(MRB)) {
+      MDNode *CSTag = CS.getInstruction()->getMetadata(LLVMContext::MD_tbaa);
       for (ImmutableCallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
-           AI != AE; ++AI)
-        if (!isNoAlias(Location(*AI), Loc)) {
+           AI != AE; ++AI) {
+        const Value *Arg = *AI;
+        if (!Arg->getType()->isPointerTy())
+          continue;
+        Location CSLoc(Arg, UnknownSize, CSTag);
+        if (!isNoAlias(CSLoc, Loc)) {
           doesAlias = true;
           break;
         }
-
+      }
+    }
     if (!doesAlias)
       return NoModRef;
   }
@@ -138,13 +144,19 @@ AliasAnalysis::getModRefInfo(ImmutableCallSite CS1, ImmutableCallSite CS2) {
   // CS2's arguments.
   if (onlyAccessesArgPointees(CS2B)) {
     AliasAnalysis::ModRefResult R = NoModRef;
-    if (doesAccessArgPointees(CS2B))
+    if (doesAccessArgPointees(CS2B)) {
+      MDNode *CS2Tag = CS2.getInstruction()->getMetadata(LLVMContext::MD_tbaa);
       for (ImmutableCallSite::arg_iterator
            I = CS2.arg_begin(), E = CS2.arg_end(); I != E; ++I) {
-        R = ModRefResult((R | getModRefInfo(CS1, *I, UnknownSize)) & Mask);
+        const Value *Arg = *I;
+        if (!Arg->getType()->isPointerTy())
+          continue;
+        Location CS2Loc(Arg, UnknownSize, CS2Tag);
+        R = ModRefResult((R | getModRefInfo(CS1, CS2Loc)) & Mask);
         if (R == Mask)
           break;
       }
+    }
     return R;
   }
 
@@ -152,13 +164,20 @@ AliasAnalysis::getModRefInfo(ImmutableCallSite CS1, ImmutableCallSite CS2) {
   // any of the memory referenced by CS1's arguments. If not, return NoModRef.
   if (onlyAccessesArgPointees(CS1B)) {
     AliasAnalysis::ModRefResult R = NoModRef;
-    if (doesAccessArgPointees(CS1B))
+    if (doesAccessArgPointees(CS1B)) {
+      MDNode *CS1Tag = CS1.getInstruction()->getMetadata(LLVMContext::MD_tbaa);
       for (ImmutableCallSite::arg_iterator
-           I = CS1.arg_begin(), E = CS1.arg_end(); I != E; ++I)
-        if (getModRefInfo(CS2, *I, UnknownSize) != NoModRef) {
+           I = CS1.arg_begin(), E = CS1.arg_end(); I != E; ++I) {
+        const Value *Arg = *I;
+        if (!Arg->getType()->isPointerTy())
+          continue;
+        Location CS1Loc(Arg, UnknownSize, CS1Tag);
+        if (getModRefInfo(CS2, CS1Loc) != NoModRef) {
           R = Mask;
           break;
         }
+      }
+    }
     if (R == NoModRef)
       return R;
   }
diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp
index 3a46976..2ed6949 100644
--- a/lib/Analysis/AliasSetTracker.cpp
+++ b/lib/Analysis/AliasSetTracker.cpp
@@ -602,6 +602,10 @@ void AliasSetTracker::ASTCallbackVH::deleted() {
   // this now dangles!
 }
 
+void AliasSetTracker::ASTCallbackVH::allUsesReplacedWith(Value *V) {
+  AST->copyValue(getValPtr(), V);
+}
+
 AliasSetTracker::ASTCallbackVH::ASTCallbackVH(Value *V, AliasSetTracker *ast)
   : CallbackVH(V), AST(ast) {}
 
diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp
index 6ebe100..e57ba78 100644
--- a/lib/Analysis/Analysis.cpp
+++ b/lib/Analysis/Analysis.cpp
@@ -23,6 +23,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
   initializeAliasSetPrinterPass(Registry);
   initializeNoAAPass(Registry);
   initializeBasicAliasAnalysisPass(Registry);
+  initializeBranchProbabilityInfoPass(Registry);
   initializeCFGViewerPass(Registry);
   initializeCFGPrinterPass(Registry);
   initializeCFGOnlyViewerPass(Registry);
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index f7bcd9e..8330ea7 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -281,17 +281,20 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
       continue;
     }
 
-    if (const Instruction *I = dyn_cast<Instruction>(V))
-      // TODO: Get a DominatorTree and use it here.
-      if (const Value *Simplified =
-            SimplifyInstruction(const_cast<Instruction *>(I), TD)) {
-        V = Simplified;
-        continue;
-      }
-    
     const GEPOperator *GEPOp = dyn_cast<GEPOperator>(Op);
-    if (GEPOp == 0)
+    if (GEPOp == 0) {
+      // If it's not a GEP, hand it off to SimplifyInstruction to see if it
+      // can come up with something. This matches what GetUnderlyingObject does.
+      if (const Instruction *I = dyn_cast<Instruction>(V))
+        // TODO: Get a DominatorTree and use it here.
+        if (const Value *Simplified =
+              SimplifyInstruction(const_cast<Instruction *>(I), TD)) {
+          V = Simplified;
+          continue;
+        }
+    
       return V;
+    }
     
     // Don't attempt to analyze GEPs over unsized objects.
     if (!cast<PointerType>(GEPOp->getOperand(0)->getType())
@@ -350,7 +353,7 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
       Scale *= IndexScale.getSExtValue();
       
       
-      // If we already had an occurrance of this index variable, merge this
+      // If we already had an occurrence of this index variable, merge this
       // scale into it.  For example, we want to handle:
       //   A[x][x] -> x*16 + x*4 -> x*20
       // This also ensures that 'x' only appears in the index list once.
@@ -448,7 +451,13 @@ namespace {
   /// BasicAliasAnalysis - This is the primary alias analysis implementation.
   struct BasicAliasAnalysis : public ImmutablePass, public AliasAnalysis {
     static char ID; // Class identification, replacement for typeinfo
-    BasicAliasAnalysis() : ImmutablePass(ID) {
+    BasicAliasAnalysis() : ImmutablePass(ID),
+                           // AliasCache rarely has more than 1 or 2 elements,
+                           // so start it off fairly small so that clear()
+                           // doesn't have to tromp through 64 (the default)
+                           // elements on each alias query. This really wants
+                           // something like a SmallDenseMap.
+                           AliasCache(8) {
       initializeBasicAliasAnalysisPass(*PassRegistry::getPassRegistry());
     }
 
@@ -462,12 +471,12 @@ namespace {
 
     virtual AliasResult alias(const Location &LocA,
                               const Location &LocB) {
-      assert(Visited.empty() && "Visited must be cleared after use!");
+      assert(AliasCache.empty() && "AliasCache must be cleared after use!");
       assert(notDifferentParent(LocA.Ptr, LocB.Ptr) &&
              "BasicAliasAnalysis doesn't support interprocedural queries.");
       AliasResult Alias = aliasCheck(LocA.Ptr, LocA.Size, LocA.TBAATag,
                                      LocB.Ptr, LocB.Size, LocB.TBAATag);
-      Visited.clear();
+      AliasCache.clear();
       return Alias;
     }
 
@@ -503,7 +512,12 @@ namespace {
     }
     
   private:
-    // Visited - Track instructions visited by a aliasPHI, aliasSelect(), and aliasGEP().
+    // AliasCache - Track alias queries to guard against recursion.
+    typedef std::pair<Location, Location> LocPair;
+    typedef DenseMap<LocPair, AliasResult> AliasCacheTy;
+    AliasCacheTy AliasCache;
+
+    // Visited - Track instructions visited by pointsToConstantMemory.
     SmallPtrSet<const Value*, 16> Visited;
 
     // aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP
@@ -680,9 +694,12 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
     unsigned ArgNo = 0;
     for (ImmutableCallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end();
          CI != CE; ++CI, ++ArgNo) {
-      // Only look at the no-capture pointer arguments.
+      // Only look at the no-capture or byval pointer arguments.  If this
+      // pointer were passed to arguments that were neither of these, then it
+      // couldn't be no-capture.
       if (!(*CI)->getType()->isPointerTy() ||
-          !CS.paramHasAttr(ArgNo+1, Attribute::NoCapture))
+          (!CS.paramHasAttr(ArgNo+1, Attribute::NoCapture) &&
+           !CS.paramHasAttr(ArgNo+1, Attribute::ByVal)))
         continue;
       
       // If this is a no-capture pointer argument, see if we can tell that it
@@ -779,6 +796,26 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
         return NoModRef;
       break;
     }
+    case Intrinsic::arm_neon_vld1: {
+      // LLVM's vld1 and vst1 intrinsics currently only support a single
+      // vector register.
+      uint64_t Size =
+        TD ? TD->getTypeStoreSize(II->getType()) : UnknownSize;
+      if (isNoAlias(Location(II->getArgOperand(0), Size,
+                             II->getMetadata(LLVMContext::MD_tbaa)),
+                    Loc))
+        return NoModRef;
+      break;
+    }
+    case Intrinsic::arm_neon_vst1: {
+      uint64_t Size =
+        TD ? TD->getTypeStoreSize(II->getArgOperand(1)->getType()) : UnknownSize;
+      if (isNoAlias(Location(II->getArgOperand(0), Size,
+                             II->getMetadata(LLVMContext::MD_tbaa)),
+                    Loc))
+        return NoModRef;
+      break;
+    }
     }
 
   // The AliasAnalysis base class has some smarts, lets use them.
@@ -796,13 +833,6 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
                              const MDNode *V2TBAAInfo,
                              const Value *UnderlyingV1,
                              const Value *UnderlyingV2) {
-  // If this GEP has been visited before, we're on a use-def cycle.
-  // Such cycles are only valid when PHI nodes are involved or in unreachable
-  // code. The visitPHI function catches cycles containing PHIs, but there
-  // could still be a cycle without PHIs in unreachable code.
-  if (!Visited.insert(GEP1))
-    return MayAlias;
-
   int64_t GEP1BaseOffset;
   SmallVector<VariableGEPIndex, 4> GEP1VariableIndices;
 
@@ -883,7 +913,7 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
   if (GEP1BaseOffset == 0 && GEP1VariableIndices.empty())
     return MustAlias;
 
-  // If there is a difference betwen the pointers, but the difference is
+  // If there is a difference between the pointers, but the difference is
   // less than the size of the associated memory object, then we know
   // that the objects are partially overlapping.
   if (GEP1BaseOffset != 0 && GEP1VariableIndices.empty()) {
@@ -920,7 +950,30 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
       return NoAlias;
   }
   
-  return MayAlias;
+  // Statically, we can see that the base objects are the same, but the
+  // pointers have dynamic offsets which we can't resolve. And none of our
+  // little tricks above worked.
+  //
+  // TODO: Returning PartialAlias instead of MayAlias is a mild hack; the
+  // practical effect of this is protecting TBAA in the case of dynamic
+  // indices into arrays of unions. An alternative way to solve this would
+  // be to have clang emit extra metadata for unions and/or union accesses.
+  // A union-specific solution wouldn't handle the problem for malloc'd
+  // memory however.
+  return PartialAlias;
+}
+
+static AliasAnalysis::AliasResult
+MergeAliasResults(AliasAnalysis::AliasResult A, AliasAnalysis::AliasResult B) {
+  // If the results agree, take it.
+  if (A == B)
+    return A;
+  // A mix of PartialAlias and MustAlias is PartialAlias.
+  if ((A == AliasAnalysis::PartialAlias && B == AliasAnalysis::MustAlias) ||
+      (B == AliasAnalysis::PartialAlias && A == AliasAnalysis::MustAlias))
+    return AliasAnalysis::PartialAlias;
+  // Otherwise, we don't know anything.
+  return AliasAnalysis::MayAlias;
 }
 
 /// aliasSelect - Provide a bunch of ad-hoc rules to disambiguate a Select
@@ -930,13 +983,6 @@ BasicAliasAnalysis::aliasSelect(const SelectInst *SI, uint64_t SISize,
                                 const MDNode *SITBAAInfo,
                                 const Value *V2, uint64_t V2Size,
                                 const MDNode *V2TBAAInfo) {
-  // If this select has been visited before, we're on a use-def cycle.
-  // Such cycles are only valid when PHI nodes are involved or in unreachable
-  // code. The visitPHI function catches cycles containing PHIs, but there
-  // could still be a cycle without PHIs in unreachable code.
-  if (!Visited.insert(SI))
-    return MayAlias;
-
   // If the values are Selects with the same condition, we can do a more precise
   // check: just check for aliases between the values on corresponding arms.
   if (const SelectInst *SI2 = dyn_cast<SelectInst>(V2))
@@ -949,9 +995,7 @@ BasicAliasAnalysis::aliasSelect(const SelectInst *SI, uint64_t SISize,
       AliasResult ThisAlias =
         aliasCheck(SI->getFalseValue(), SISize, SITBAAInfo,
                    SI2->getFalseValue(), V2Size, V2TBAAInfo);
-      if (ThisAlias != Alias)
-        return MayAlias;
-      return Alias;
+      return MergeAliasResults(ThisAlias, Alias);
     }
 
   // If both arms of the Select node NoAlias or MustAlias V2, then returns
@@ -961,16 +1005,9 @@ BasicAliasAnalysis::aliasSelect(const SelectInst *SI, uint64_t SISize,
   if (Alias == MayAlias)
     return MayAlias;
 
-  // If V2 is visited, the recursive case will have been caught in the
-  // above aliasCheck call, so these subsequent calls to aliasCheck
-  // don't need to assume that V2 is being visited recursively.
-  Visited.erase(V2);
-
   AliasResult ThisAlias =
     aliasCheck(V2, V2Size, V2TBAAInfo, SI->getFalseValue(), SISize, SITBAAInfo);
-  if (ThisAlias != Alias)
-    return MayAlias;
-  return Alias;
+  return MergeAliasResults(ThisAlias, Alias);
 }
 
 // aliasPHI - Provide a bunch of ad-hoc rules to disambiguate a PHI instruction
@@ -980,10 +1017,6 @@ BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize,
                              const MDNode *PNTBAAInfo,
                              const Value *V2, uint64_t V2Size,
                              const MDNode *V2TBAAInfo) {
-  // The PHI node has already been visited, avoid recursion any further.
-  if (!Visited.insert(PN))
-    return MayAlias;
-
   // If the values are PHIs in the same block, we can do a more precise
   // as well as efficient check: just check for aliases between the values
   // on corresponding edges.
@@ -1000,8 +1033,9 @@ BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize,
           aliasCheck(PN->getIncomingValue(i), PNSize, PNTBAAInfo,
                      PN2->getIncomingValueForBlock(PN->getIncomingBlock(i)),
                      V2Size, V2TBAAInfo);
-        if (ThisAlias != Alias)
-          return MayAlias;
+        Alias = MergeAliasResults(ThisAlias, Alias);
+        if (Alias == MayAlias)
+          break;
       }
       return Alias;
     }
@@ -1032,15 +1066,11 @@ BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize,
   for (unsigned i = 1, e = V1Srcs.size(); i != e; ++i) {
     Value *V = V1Srcs[i];
 
-    // If V2 is visited, the recursive case will have been caught in the
-    // above aliasCheck call, so these subsequent calls to aliasCheck
-    // don't need to assume that V2 is being visited recursively.
-    Visited.erase(V2);
-
     AliasResult ThisAlias = aliasCheck(V2, V2Size, V2TBAAInfo,
                                        V, PNSize, PNTBAAInfo);
-    if (ThisAlias != Alias || ThisAlias == MayAlias)
-      return MayAlias;
+    Alias = MergeAliasResults(ThisAlias, Alias);
+    if (Alias == MayAlias)
+      break;
   }
 
   return Alias;
@@ -1125,6 +1155,17 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
         (V2Size != UnknownSize && isObjectSmallerThan(O1, V2Size, *TD)))
       return NoAlias;
   
+  // Check the cache before climbing up use-def chains. This also terminates
+  // otherwise infinitely recursive queries.
+  LocPair Locs(Location(V1, V1Size, V1TBAAInfo),
+               Location(V2, V2Size, V2TBAAInfo));
+  if (V1 > V2)
+    std::swap(Locs.first, Locs.second);
+  std::pair<AliasCacheTy::iterator, bool> Pair =
+    AliasCache.insert(std::make_pair(Locs, MayAlias));
+  if (!Pair.second)
+    return Pair.first->second;
+
   // FIXME: This isn't aggressively handling alias(GEP, PHI) for example: if the
   // GEP can't simplify, we don't even look at the PHI cases.
   if (!isa<GEPOperator>(V1) && isa<GEPOperator>(V2)) {
@@ -1134,7 +1175,7 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
   }
   if (const GEPOperator *GV1 = dyn_cast<GEPOperator>(V1)) {
     AliasResult Result = aliasGEP(GV1, V1Size, V2, V2Size, V2TBAAInfo, O1, O2);
-    if (Result != MayAlias) return Result;
+    if (Result != MayAlias) return AliasCache[Locs] = Result;
   }
 
   if (isa<PHINode>(V2) && !isa<PHINode>(V1)) {
@@ -1144,7 +1185,7 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
   if (const PHINode *PN = dyn_cast<PHINode>(V1)) {
     AliasResult Result = aliasPHI(PN, V1Size, V1TBAAInfo,
                                   V2, V2Size, V2TBAAInfo);
-    if (Result != MayAlias) return Result;
+    if (Result != MayAlias) return AliasCache[Locs] = Result;
   }
 
   if (isa<SelectInst>(V2) && !isa<SelectInst>(V1)) {
@@ -1154,7 +1195,7 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
   if (const SelectInst *S1 = dyn_cast<SelectInst>(V1)) {
     AliasResult Result = aliasSelect(S1, V1Size, V1TBAAInfo,
                                      V2, V2Size, V2TBAAInfo);
-    if (Result != MayAlias) return Result;
+    if (Result != MayAlias) return AliasCache[Locs] = Result;
   }
 
   // If both pointers are pointing into the same object and one of them
@@ -1163,8 +1204,10 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
   if (TD && O1 == O2)
     if ((V1Size != UnknownSize && isObjectSize(O1, V1Size, *TD)) ||
         (V2Size != UnknownSize && isObjectSize(O2, V2Size, *TD)))
-      return PartialAlias;
+      return AliasCache[Locs] = PartialAlias;
 
-  return AliasAnalysis::alias(Location(V1, V1Size, V1TBAAInfo),
-                              Location(V2, V2Size, V2TBAAInfo));
+  AliasResult Result =
+    AliasAnalysis::alias(Location(V1, V1Size, V1TBAAInfo),
+                         Location(V2, V2Size, V2TBAAInfo));
+  return AliasCache[Locs] = Result;
 }
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
new file mode 100644
index 0000000..15059c7
--- /dev/null
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -0,0 +1,358 @@
+//===-- BranchProbabilityInfo.cpp - Branch Probability Analysis -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Loops should be simplified before this analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Instructions.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+INITIALIZE_PASS_BEGIN(BranchProbabilityInfo, "branch-prob",
+                      "Branch Probability Analysis", false, true)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_END(BranchProbabilityInfo, "branch-prob",
+                    "Branch Probability Analysis", false, true)
+
+char BranchProbabilityInfo::ID = 0;
+
+namespace {
+// Please note that BranchProbabilityAnalysis is not a FunctionPass.
+// It is created by BranchProbabilityInfo (which is a FunctionPass), which
+// provides a clear interface. Thanks to that, all heuristics and other
+// private methods are hidden in the .cpp file.
+class BranchProbabilityAnalysis {
+
+  typedef std::pair<BasicBlock *, BasicBlock *> Edge;
+
+  DenseMap<Edge, uint32_t> *Weights;
+
+  BranchProbabilityInfo *BP;
+
+  LoopInfo *LI;
+
+
+  // Weights are for internal use only. They are used by heuristics to help to
+  // estimate edges' probability. Example:
+  //
+  // Using "Loop Branch Heuristics" we predict weights of edges for the
+  // block BB2.
+  //         ...
+  //          |
+  //          V
+  //         BB1<-+
+  //          |   |
+  //          |   | (Weight = 128)
+  //          V   |
+  //         BB2--+
+  //          |
+  //          | (Weight = 4)
+  //          V
+  //         BB3
+  //
+  // Probability of the edge BB2->BB1 = 128 / (128 + 4) = 0.9696..
+  // Probability of the edge BB2->BB3 = 4 / (128 + 4) = 0.0303..
+
+  static const uint32_t LBH_TAKEN_WEIGHT = 128;
+  static const uint32_t LBH_NONTAKEN_WEIGHT = 4;
+
+  // Standard weight value. Used when none of the heuristics set weight for
+  // the edge.
+  static const uint32_t NORMAL_WEIGHT = 16;
+
+  // Minimum weight of an edge. Please note, that weight is NEVER 0.
+  static const uint32_t MIN_WEIGHT = 1;
+
+  // Return TRUE if BB leads directly to a Return Instruction.
+  static bool isReturningBlock(BasicBlock *BB) {
+    SmallPtrSet<BasicBlock *, 8> Visited;
+
+    while (true) {
+      TerminatorInst *TI = BB->getTerminator();
+      if (isa<ReturnInst>(TI))
+        return true;
+
+      if (TI->getNumSuccessors() > 1)
+        break;
+
+      // It is unreachable block which we can consider as a return instruction.
+      if (TI->getNumSuccessors() == 0)
+        return true;
+
+      Visited.insert(BB);
+      BB = TI->getSuccessor(0);
+
+      // Stop if cycle is detected.
+      if (Visited.count(BB))
+        return false;
+    }
+
+    return false;
+  }
+
+  // Multiply Edge Weight by two.
+  void incEdgeWeight(BasicBlock *Src, BasicBlock *Dst) {
+    uint32_t Weight = BP->getEdgeWeight(Src, Dst);
+    uint32_t MaxWeight = getMaxWeightFor(Src);
+
+    if (Weight * 2 > MaxWeight)
+      BP->setEdgeWeight(Src, Dst, MaxWeight);
+    else
+      BP->setEdgeWeight(Src, Dst, Weight * 2);
+  }
+
+  // Divide Edge Weight by two.
+  void decEdgeWeight(BasicBlock *Src, BasicBlock *Dst) {
+    uint32_t Weight = BP->getEdgeWeight(Src, Dst);
+
+    assert(Weight > 0);
+    if (Weight / 2 < MIN_WEIGHT)
+      BP->setEdgeWeight(Src, Dst, MIN_WEIGHT);
+    else
+      BP->setEdgeWeight(Src, Dst, Weight / 2);
+  }
+
+
+  uint32_t getMaxWeightFor(BasicBlock *BB) const {
+    return UINT32_MAX / BB->getTerminator()->getNumSuccessors();
+  }
+
+public:
+  BranchProbabilityAnalysis(DenseMap<Edge, uint32_t> *W,
+                            BranchProbabilityInfo *BP, LoopInfo *LI)
+    : Weights(W), BP(BP), LI(LI) {
+  }
+
+  // Return Heuristics
+  void calcReturnHeuristics(BasicBlock *BB);
+
+  // Pointer Heuristics
+  void calcPointerHeuristics(BasicBlock *BB);
+
+  // Loop Branch Heuristics
+  void calcLoopBranchHeuristics(BasicBlock *BB);
+
+  bool runOnFunction(Function &F);
+};
+} // end anonymous namespace
+
+// Calculate Edge Weights using "Return Heuristics". Predict a successor which
+// leads directly to Return Instruction will not be taken.
+void BranchProbabilityAnalysis::calcReturnHeuristics(BasicBlock *BB){
+  if (BB->getTerminator()->getNumSuccessors() == 1)
+    return;
+
+  for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+    BasicBlock *Succ = *I;
+    if (isReturningBlock(Succ)) {
+      decEdgeWeight(BB, Succ);
+    }
+  }
+}
+
+// Calculate Edge Weights using "Pointer Heuristics". Predict a comparsion
+// between two pointer or pointer and NULL will fail.
+void BranchProbabilityAnalysis::calcPointerHeuristics(BasicBlock *BB) {
+  BranchInst * BI = dyn_cast<BranchInst>(BB->getTerminator());
+  if (!BI || !BI->isConditional())
+    return;
+
+  Value *Cond = BI->getCondition();
+  ICmpInst *CI = dyn_cast<ICmpInst>(Cond);
+  if (!CI)
+    return;
+
+  Value *LHS = CI->getOperand(0);
+
+  if (!LHS->getType()->isPointerTy())
+    return;
+
+  assert(CI->getOperand(1)->getType()->isPointerTy());
+
+  BasicBlock *Taken = BI->getSuccessor(0);
+  BasicBlock *NonTaken = BI->getSuccessor(1);
+
+  // p != 0   ->   isProb = true
+  // p == 0   ->   isProb = false
+  // p != q   ->   isProb = true
+  // p == q   ->   isProb = false;
+  bool isProb = !CI->isEquality();
+  if (!isProb)
+    std::swap(Taken, NonTaken);
+
+  incEdgeWeight(BB, Taken);
+  decEdgeWeight(BB, NonTaken);
+}
+
+// Calculate Edge Weights using "Loop Branch Heuristics". Predict backedges
+// as taken, exiting edges as not-taken.
+void BranchProbabilityAnalysis::calcLoopBranchHeuristics(BasicBlock *BB) {
+  uint32_t numSuccs = BB->getTerminator()->getNumSuccessors();
+
+  Loop *L = LI->getLoopFor(BB);
+  if (!L)
+    return;
+
+  SmallVector<BasicBlock *, 8> BackEdges;
+  SmallVector<BasicBlock *, 8> ExitingEdges;
+
+  for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+    BasicBlock *Succ = *I;
+    Loop *SuccL = LI->getLoopFor(Succ);
+    if (SuccL != L)
+      ExitingEdges.push_back(Succ);
+    else if (Succ == L->getHeader())
+      BackEdges.push_back(Succ);
+  }
+
+  if (uint32_t numBackEdges = BackEdges.size()) {
+    uint32_t backWeight = LBH_TAKEN_WEIGHT / numBackEdges;
+    if (backWeight < NORMAL_WEIGHT)
+      backWeight = NORMAL_WEIGHT;
+
+    for (SmallVector<BasicBlock *, 8>::iterator EI = BackEdges.begin(),
+         EE = BackEdges.end(); EI != EE; ++EI) {
+      BasicBlock *Back = *EI;
+      BP->setEdgeWeight(BB, Back, backWeight);
+    }
+  }
+
+  uint32_t numExitingEdges = ExitingEdges.size();
+  if (uint32_t numNonExitingEdges = numSuccs - numExitingEdges) {
+    uint32_t exitWeight = LBH_NONTAKEN_WEIGHT / numNonExitingEdges;
+    if (exitWeight < MIN_WEIGHT)
+      exitWeight = MIN_WEIGHT;
+
+    for (SmallVector<BasicBlock *, 8>::iterator EI = ExitingEdges.begin(),
+         EE = ExitingEdges.end(); EI != EE; ++EI) {
+      BasicBlock *Exiting = *EI;
+      BP->setEdgeWeight(BB, Exiting, exitWeight);
+    }
+  }
+}
+
+bool BranchProbabilityAnalysis::runOnFunction(Function &F) {
+
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ) {
+    BasicBlock *BB = I++;
+
+    // Only LBH uses setEdgeWeight method.
+    calcLoopBranchHeuristics(BB);
+
+    // PH and RH use only incEdgeWeight and decEwdgeWeight methods to
+    // not efface LBH results.
+    calcPointerHeuristics(BB);
+    calcReturnHeuristics(BB);
+  }
+
+  return false;
+}
+
+
+bool BranchProbabilityInfo::runOnFunction(Function &F) {
+  LoopInfo &LI = getAnalysis<LoopInfo>();
+  BranchProbabilityAnalysis BPA(&Weights, this, &LI);
+  return BPA.runOnFunction(F);
+}
+
+uint32_t BranchProbabilityInfo::getSumForBlock(BasicBlock *BB) const {
+  uint32_t Sum = 0;
+
+  for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+    BasicBlock *Succ = *I;
+    uint32_t Weight = getEdgeWeight(BB, Succ);
+    uint32_t PrevSum = Sum;
+
+    Sum += Weight;
+    assert(Sum > PrevSum); (void) PrevSum;
+  }
+
+  return Sum;
+}
+
+bool BranchProbabilityInfo::isEdgeHot(BasicBlock *Src, BasicBlock *Dst) const {
+  // Hot probability is at least 4/5 = 80%
+  uint32_t Weight = getEdgeWeight(Src, Dst);
+  uint32_t Sum = getSumForBlock(Src);
+
+  // FIXME: Implement BranchProbability::compare then change this code to
+  // compare this BranchProbability against a static "hot" BranchProbability.
+  return (uint64_t)Weight * 5 > (uint64_t)Sum * 4;
+}
+
+BasicBlock *BranchProbabilityInfo::getHotSucc(BasicBlock *BB) const {
+  uint32_t Sum = 0;
+  uint32_t MaxWeight = 0;
+  BasicBlock *MaxSucc = 0;
+
+  for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+    BasicBlock *Succ = *I;
+    uint32_t Weight = getEdgeWeight(BB, Succ);
+    uint32_t PrevSum = Sum;
+
+    Sum += Weight;
+    assert(Sum > PrevSum); (void) PrevSum;
+
+    if (Weight > MaxWeight) {
+      MaxWeight = Weight;
+      MaxSucc = Succ;
+    }
+  }
+
+  // FIXME: Use BranchProbability::compare.
+  if ((uint64_t)MaxWeight * 5 > (uint64_t)Sum * 4)
+    return MaxSucc;
+
+  return 0;
+}
+
+// Return edge's weight. If can't find it, return DEFAULT_WEIGHT value.
+uint32_t
+BranchProbabilityInfo::getEdgeWeight(BasicBlock *Src, BasicBlock *Dst) const {
+  Edge E(Src, Dst);
+  DenseMap<Edge, uint32_t>::const_iterator I = Weights.find(E);
+
+  if (I != Weights.end())
+    return I->second;
+
+  return DEFAULT_WEIGHT;
+}
+
+void BranchProbabilityInfo::setEdgeWeight(BasicBlock *Src, BasicBlock *Dst,
+                                     uint32_t Weight) {
+  Weights[std::make_pair(Src, Dst)] = Weight;
+  DEBUG(dbgs() << "set edge " << Src->getNameStr() << " -> "
+               << Dst->getNameStr() << " weight to " << Weight
+               << (isEdgeHot(Src, Dst) ? " [is HOT now]\n" : "\n"));
+}
+
+
+BranchProbability BranchProbabilityInfo::
+getEdgeProbability(BasicBlock *Src, BasicBlock *Dst) const {
+
+  uint32_t N = getEdgeWeight(Src, Dst);
+  uint32_t D = getSumForBlock(Src);
+
+  return BranchProbability(N, D);
+}
+
+raw_ostream &
+BranchProbabilityInfo::printEdgeProbability(raw_ostream &OS, BasicBlock *Src,
+                                            BasicBlock *Dst) const {
+
+  const BranchProbability Prob = getEdgeProbability(Src, Dst);
+  OS << "edge " << Src->getNameStr() << " -> " << Dst->getNameStr()
+     << " probability is " << Prob
+     << (isEdgeHot(Src, Dst) ? " [HOT edge]\n" : "\n");
+
+  return OS;
+}
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index 6be5617..1a975bf 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_library(LLVMAnalysis
   AliasSetTracker.cpp
   Analysis.cpp
   BasicAliasAnalysis.cpp
+  BranchProbabilityInfo.cpp
   CFGPrinter.cpp
   CaptureTracking.cpp
   ConstantFolding.cpp
diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp
index 42a54d9..b2c27d1 100644
--- a/lib/Analysis/CaptureTracking.cpp
+++ b/lib/Analysis/CaptureTracking.cpp
@@ -17,6 +17,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Constants.h"
 #include "llvm/Instructions.h"
 #include "llvm/Value.h"
 #include "llvm/Analysis/AliasAnalysis.h"
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index c6aad0c..08a6065 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -23,6 +23,7 @@
 #include "llvm/GlobalVariable.h"
 #include "llvm/Instructions.h"
 #include "llvm/Intrinsics.h"
+#include "llvm/Operator.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/ADT/SmallVector.h"
@@ -1084,7 +1085,7 @@ llvm::canConstantFoldCallTo(const Function *F) {
   case 'c':
     return Name == "cos" || Name == "ceil" || Name == "cosf" || Name == "cosh";
   case 'e':
-    return Name == "exp";
+    return Name == "exp" || Name == "exp2";
   case 'f':
     return Name == "fabs" || Name == "fmod" || Name == "floor";
   case 'l':
@@ -1220,6 +1221,12 @@ llvm::ConstantFoldCall(Function *F,
       case 'e':
         if (Name == "exp")
           return ConstantFoldFP(exp, V, Ty);
+  
+        if (Name == "exp2") {
+          // Constant fold exp2(x) as pow(2,x) in case the host doesn't have a
+          // C99 library.
+          return ConstantFoldBinaryFP(pow, 2.0, V, Ty);
+        }
         break;
       case 'f':
         if (Name == "fabs")
diff --git a/lib/Analysis/DIBuilder.cpp b/lib/Analysis/DIBuilder.cpp
index 108d2d2..ef5d03a 100644
--- a/lib/Analysis/DIBuilder.cpp
+++ b/lib/Analysis/DIBuilder.cpp
@@ -50,7 +50,11 @@ void DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename,
     MDString::get(VMContext, Flags),
     ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeVer)
   };
-  TheCU = DICompileUnit(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  TheCU = DICompileUnit(MDNode::get(VMContext, Elts));
+
+  // Create a named metadata so that it is easier to find cu in a module.
+  NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
+  NMD->addOperand(TheCU);
 }
 
 /// createFile - Create a file descriptor to hold debugging information
@@ -63,7 +67,7 @@ DIFile DIBuilder::createFile(StringRef Filename, StringRef Directory) {
     MDString::get(VMContext, Directory),
     TheCU
   };
-  return DIFile(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  return DIFile(MDNode::get(VMContext, Elts));
 }
 
 /// createEnumerator - Create a single enumerator value.
@@ -73,7 +77,7 @@ DIEnumerator DIBuilder::createEnumerator(StringRef Name, uint64_t Val) {
     MDString::get(VMContext, Name),
     ConstantInt::get(Type::getInt64Ty(VMContext), Val)
   };
-  return DIEnumerator(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  return DIEnumerator(MDNode::get(VMContext, Elts));
 }
 
 /// createBasicType - Create debugging information entry for a basic 
@@ -95,7 +99,7 @@ DIType DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits,
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags;
     ConstantInt::get(Type::getInt32Ty(VMContext), Encoding)
   };
-  return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  return DIType(MDNode::get(VMContext, Elts));
 }
 
 /// createQaulifiedType - Create debugging information entry for a qualified
@@ -114,7 +118,7 @@ DIType DIBuilder::createQualifiedType(unsigned Tag, DIType FromTy) {
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
     FromTy
   };
-  return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  return DIType(MDNode::get(VMContext, Elts));
 }
 
 /// createPointerType - Create debugging information entry for a pointer.
@@ -133,7 +137,7 @@ DIType DIBuilder::createPointerType(DIType PointeeTy, uint64_t SizeInBits,
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
     PointeeTy
   };
-  return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  return DIType(MDNode::get(VMContext, Elts));
 }
 
 /// createReferenceType - Create debugging information entry for a reference.
@@ -151,17 +155,17 @@ DIType DIBuilder::createReferenceType(DIType RTy) {
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
     RTy
   };
-  return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  return DIType(MDNode::get(VMContext, Elts));
 }
 
 /// createTypedef - Create debugging information entry for a typedef.
 DIType DIBuilder::createTypedef(DIType Ty, StringRef Name, DIFile File,
-                                unsigned LineNo) {
+                                unsigned LineNo, DIDescriptor Context) {
   // typedefs are encoded in DIDerivedType format.
   assert(Ty.Verify() && "Invalid typedef type!");
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_typedef),
-    Ty.getContext(),
+    Context,
     MDString::get(VMContext, Name),
     File,
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
@@ -171,7 +175,7 @@ DIType DIBuilder::createTypedef(DIType Ty, StringRef Name, DIFile File,
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
     Ty
   };
-  return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  return DIType(MDNode::get(VMContext, Elts));
 }
 
 /// createFriend - Create debugging information entry for a 'friend'.
@@ -191,7 +195,7 @@ DIType DIBuilder::createFriend(DIType Ty, DIType FriendTy) {
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
     FriendTy
   };
-  return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  return DIType(MDNode::get(VMContext, Elts));
 }
 
 /// createInheritance - Create debugging information entry to establish
@@ -211,7 +215,7 @@ DIType DIBuilder::createInheritance(DIType Ty, DIType BaseTy,
     ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
     BaseTy
   };
-  return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  return DIType(MDNode::get(VMContext, Elts));
 }
 
 /// createMemberType - Create debugging information entry for a member.
@@ -233,7 +237,36 @@ DIType DIBuilder::createMemberType(StringRef Name,
     ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
     Ty
   };
-  return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  return DIType(MDNode::get(VMContext, Elts));
+}
+
+/// createObjCIVar - Create debugging information entry for Objective-C
+/// instance variable.
+DIType DIBuilder::createObjCIVar(StringRef Name, 
+                                 DIFile File, unsigned LineNumber, 
+                                 uint64_t SizeInBits, uint64_t AlignInBits,
+                                 uint64_t OffsetInBits, unsigned Flags, 
+                                 DIType Ty, StringRef PropertyName,
+                                 StringRef GetterName, StringRef SetterName,
+                                 unsigned PropertyAttributes) {
+  // TAG_member is encoded in DIDerivedType format.
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_member),
+    File, // Or TheCU ? Ty ?
+    MDString::get(VMContext, Name),
+    File,
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
+    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
+    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
+    ConstantInt::get(Type::getInt64Ty(VMContext), OffsetInBits),
+    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
+    Ty,
+    MDString::get(VMContext, PropertyName),
+    MDString::get(VMContext, GetterName),
+    MDString::get(VMContext, SetterName),
+    ConstantInt::get(Type::getInt32Ty(VMContext), PropertyAttributes)
+  };
+  return DIType(MDNode::get(VMContext, Elts));
 }
 
 /// createClassType - Create debugging information entry for a class.
@@ -260,7 +293,7 @@ DIType DIBuilder::createClassType(DIDescriptor Context, StringRef Name,
     VTableHoder,
     TemplateParams
   };
-  return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  return DIType(MDNode::get(VMContext, Elts));
 }
 
 /// createTemplateTypeParameter - Create debugging information for template
@@ -278,8 +311,7 @@ DIBuilder::createTemplateTypeParameter(DIDescriptor Context, StringRef Name,
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
     ConstantInt::get(Type::getInt32Ty(VMContext), ColumnNo)
   };
-  return DITemplateTypeParameter(MDNode::get(VMContext, &Elts[0], 
-                                             array_lengthof(Elts)));
+  return DITemplateTypeParameter(MDNode::get(VMContext, Elts));
 }
 
 /// createTemplateValueParameter - Create debugging information for template
@@ -299,8 +331,7 @@ DIBuilder::createTemplateValueParameter(DIDescriptor Context, StringRef Name,
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
     ConstantInt::get(Type::getInt32Ty(VMContext), ColumnNo)
   };
-  return DITemplateValueParameter(MDNode::get(VMContext, &Elts[0], 
-                                              array_lengthof(Elts)));
+  return DITemplateValueParameter(MDNode::get(VMContext, Elts));
 }
 
 /// createStructType - Create debugging information entry for a struct.
@@ -325,7 +356,7 @@ DIType DIBuilder::createStructType(DIDescriptor Context, StringRef Name,
     ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang),
     llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
   };
-  return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  return DIType(MDNode::get(VMContext, Elts));
 }
 
 /// createUnionType - Create debugging information entry for an union.
@@ -350,7 +381,7 @@ DIType DIBuilder::createUnionType(DIDescriptor Scope, StringRef Name,
     ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang),
     llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
   };
-  return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  return DIType(MDNode::get(VMContext, Elts));
 }
 
 /// createSubroutineType - Create subroutine type.
@@ -371,7 +402,7 @@ DIType DIBuilder::createSubroutineType(DIFile File, DIArray ParameterTypes) {
     ConstantInt::get(Type::getInt32Ty(VMContext), 0),
     llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
   };
-  return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  return DIType(MDNode::get(VMContext, Elts));
 }
 
 /// createEnumerationType - Create debugging information entry for an 
@@ -396,7 +427,7 @@ DIType DIBuilder::createEnumerationType(DIDescriptor Scope, StringRef Name,
     ConstantInt::get(Type::getInt32Ty(VMContext), 0),
     llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
   };
-  MDNode *Node = MDNode::get(VMContext, &Elts[0], array_lengthof(Elts));
+  MDNode *Node = MDNode::get(VMContext, Elts);
   NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.enum");
   NMD->addOperand(Node);
   return DIType(Node);
@@ -421,7 +452,7 @@ DIType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits,
     ConstantInt::get(Type::getInt32Ty(VMContext), 0),
     llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
   };
-  return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  return DIType(MDNode::get(VMContext, Elts));
 }
 
 /// createVectorType - Create debugging information entry for a vector.
@@ -443,7 +474,7 @@ DIType DIBuilder::createVectorType(uint64_t Size, uint64_t AlignInBits,
     ConstantInt::get(Type::getInt32Ty(VMContext), 0),
     llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
   };
-  return DIType(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  return DIType(MDNode::get(VMContext, Elts));
 }
 
 /// createArtificialType - Create a new DIType with "artificial" flag set.
@@ -467,7 +498,7 @@ DIType DIBuilder::createArtificialType(DIType Ty) {
   // Flags are stored at this slot.
   Elts[8] =  ConstantInt::get(Type::getInt32Ty(VMContext), CurFlags);
 
-  return DIType(MDNode::get(VMContext, Elts.data(), Elts.size()));
+  return DIType(MDNode::get(VMContext, Elts));
 }
 
 /// retainType - Retain DIType in a module even if it is not referenced 
@@ -483,7 +514,7 @@ DIDescriptor DIBuilder::createUnspecifiedParameter() {
   Value *Elts[] = { 
     GetTagConstant(VMContext, dwarf::DW_TAG_unspecified_parameters) 
   };
-  return DIDescriptor(MDNode::get(VMContext, &Elts[0], 1));
+  return DIDescriptor(MDNode::get(VMContext, Elts));
 }
 
 /// createTemporaryType - Create a temporary forward-declared type.
@@ -491,7 +522,7 @@ DIType DIBuilder::createTemporaryType() {
   // Give the temporary MDNode a tag. It doesn't matter what tag we
   // use here as long as DIType accepts it.
   Value *Elts[] = { GetTagConstant(VMContext, DW_TAG_base_type) };
-  MDNode *Node = MDNode::getTemporary(VMContext, Elts, array_lengthof(Elts));
+  MDNode *Node = MDNode::getTemporary(VMContext, Elts);
   return DIType(Node);
 }
 
@@ -505,17 +536,17 @@ DIType DIBuilder::createTemporaryType(DIFile F) {
     NULL,
     F
   };
-  MDNode *Node = MDNode::getTemporary(VMContext, Elts, array_lengthof(Elts));
+  MDNode *Node = MDNode::getTemporary(VMContext, Elts);
   return DIType(Node);
 }
 
 /// getOrCreateArray - Get a DIArray, create one if required.
-DIArray DIBuilder::getOrCreateArray(Value *const *Elements, unsigned NumElements) {
-  if (NumElements == 0) {
+DIArray DIBuilder::getOrCreateArray(ArrayRef<Value *> Elements) {
+  if (Elements.empty()) {
     Value *Null = llvm::Constant::getNullValue(Type::getInt32Ty(VMContext));
-    return DIArray(MDNode::get(VMContext, &Null, 1));
+    return DIArray(MDNode::get(VMContext, Null));
   }
-  return DIArray(MDNode::get(VMContext, Elements, NumElements));
+  return DIArray(MDNode::get(VMContext, Elements));
 }
 
 /// getOrCreateSubrange - Create a descriptor for a value range.  This
@@ -527,7 +558,7 @@ DISubrange DIBuilder::getOrCreateSubrange(int64_t Lo, int64_t Hi) {
     ConstantInt::get(Type::getInt64Ty(VMContext), Hi)
   };
 
-  return DISubrange(MDNode::get(VMContext, &Elts[0], 3));
+  return DISubrange(MDNode::get(VMContext, Elts));
 }
 
 /// createGlobalVariable - Create a new descriptor for the specified global.
@@ -548,7 +579,7 @@ createGlobalVariable(StringRef Name, DIFile F, unsigned LineNumber,
     ConstantInt::get(Type::getInt32Ty(VMContext), 1), /* isDefinition*/
     Val
   };
-  MDNode *Node = MDNode::get(VMContext, &Elts[0], array_lengthof(Elts));
+  MDNode *Node = MDNode::get(VMContext, Elts);
   // Create a named metadata so that we do not lose this mdnode.
   NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.gv");
   NMD->addOperand(Node);
@@ -575,7 +606,7 @@ createStaticVariable(DIDescriptor Context, StringRef Name,
     ConstantInt::get(Type::getInt32Ty(VMContext), 1), /* isDefinition*/
     Val
   };
-  MDNode *Node = MDNode::get(VMContext, &Elts[0], array_lengthof(Elts));
+  MDNode *Node = MDNode::get(VMContext, Elts);
   // Create a named metadata so that we do not lose this mdnode.
   NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.gv");
   NMD->addOperand(Node);
@@ -597,7 +628,7 @@ DIVariable DIBuilder::createLocalVariable(unsigned Tag, DIDescriptor Scope,
     Ty,
     ConstantInt::get(Type::getInt32Ty(VMContext), Flags)
   };
-  MDNode *Node = MDNode::get(VMContext, &Elts[0], array_lengthof(Elts));
+  MDNode *Node = MDNode::get(VMContext, Elts);
   if (AlwaysPreserve) {
     // The optimizer may remove local variable. If there is an interest
     // to preserve variable info in such situation then stash it in a
@@ -620,8 +651,8 @@ DIVariable DIBuilder::createLocalVariable(unsigned Tag, DIDescriptor Scope,
 DIVariable DIBuilder::createComplexVariable(unsigned Tag, DIDescriptor Scope,
                                             StringRef Name, DIFile F,
                                             unsigned LineNo,
-                                            DIType Ty, Value *const *Addr,
-                                            unsigned NumAddr, unsigned ArgNo) {
+                                            DIType Ty, ArrayRef<Value *> Addr,
+                                            unsigned ArgNo) {
   SmallVector<Value *, 15> Elts;
   Elts.push_back(GetTagConstant(VMContext, Tag));
   Elts.push_back(Scope);
@@ -629,9 +660,10 @@ DIVariable DIBuilder::createComplexVariable(unsigned Tag, DIDescriptor Scope,
   Elts.push_back(F);
   Elts.push_back(ConstantInt::get(Type::getInt32Ty(VMContext), (LineNo | (ArgNo << 24))));
   Elts.push_back(Ty);
-  Elts.append(Addr, Addr+NumAddr);
+  Elts.push_back(llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)));
+  Elts.append(Addr.begin(), Addr.end());
 
-  return DIVariable(MDNode::get(VMContext, Elts.data(), Elts.size()));
+  return DIVariable(MDNode::get(VMContext, Elts));
 }
 
 /// createFunction - Create a new descriptor for the specified function.
@@ -643,7 +675,8 @@ DISubprogram DIBuilder::createFunction(DIDescriptor Context,
                                        bool isLocalToUnit, bool isDefinition,
                                        unsigned Flags, bool isOptimized,
                                        Function *Fn,
-                                       MDNode *TParams) {
+                                       MDNode *TParams,
+                                       MDNode *Decl) {
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_subprogram),
     llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
@@ -662,9 +695,10 @@ DISubprogram DIBuilder::createFunction(DIDescriptor Context,
     ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
     ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized),
     Fn,
-    TParams
+    TParams,
+    Decl
   };
-  MDNode *Node = MDNode::get(VMContext, &Elts[0], array_lengthof(Elts));
+  MDNode *Node = MDNode::get(VMContext, Elts);
 
   // Create a named metadata so that we do not lose this mdnode.
   NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.sp");
@@ -706,7 +740,7 @@ DISubprogram DIBuilder::createMethod(DIDescriptor Context,
     Fn,
     TParam,
   };
-  MDNode *Node = MDNode::get(VMContext, &Elts[0], array_lengthof(Elts));
+  MDNode *Node = MDNode::get(VMContext, Elts);
 
   // Create a named metadata so that we do not lose this mdnode.
   NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.sp");
@@ -725,7 +759,7 @@ DINameSpace DIBuilder::createNameSpace(DIDescriptor Scope, StringRef Name,
     File,
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNo)
   };
-  return DINameSpace(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  return DINameSpace(MDNode::get(VMContext, Elts));
 }
 
 DILexicalBlock DIBuilder::createLexicalBlock(DIDescriptor Scope, DIFile File,
@@ -740,7 +774,7 @@ DILexicalBlock DIBuilder::createLexicalBlock(DIDescriptor Scope, DIFile File,
     File,
     ConstantInt::get(Type::getInt32Ty(VMContext), unique_id++)
   };
-  return DILexicalBlock(MDNode::get(VMContext, &Elts[0], array_lengthof(Elts)));
+  return DILexicalBlock(MDNode::get(VMContext, Elts));
 }
 
 /// insertDeclare - Insert a new llvm.dbg.declare intrinsic call.
@@ -751,7 +785,7 @@ Instruction *DIBuilder::insertDeclare(Value *Storage, DIVariable VarInfo,
   if (!DeclareFn)
     DeclareFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_declare);
 
-  Value *Args[] = { MDNode::get(Storage->getContext(), &Storage, 1), VarInfo };
+  Value *Args[] = { MDNode::get(Storage->getContext(), Storage), VarInfo };
   return CallInst::Create(DeclareFn, Args, Args+2, "", InsertBefore);
 }
 
@@ -763,7 +797,7 @@ Instruction *DIBuilder::insertDeclare(Value *Storage, DIVariable VarInfo,
   if (!DeclareFn)
     DeclareFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_declare);
 
-  Value *Args[] = { MDNode::get(Storage->getContext(), &Storage, 1), VarInfo };
+  Value *Args[] = { MDNode::get(Storage->getContext(), Storage), VarInfo };
 
   // If this block already has a terminator then insert this intrinsic
   // before the terminator.
@@ -782,7 +816,7 @@ Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V, uint64_t Offset,
   if (!ValueFn)
     ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value);
 
-  Value *Args[] = { MDNode::get(V->getContext(), &V, 1),
+  Value *Args[] = { MDNode::get(V->getContext(), V),
                     ConstantInt::get(Type::getInt64Ty(V->getContext()), Offset),
                     VarInfo };
   return CallInst::Create(ValueFn, Args, Args+3, "", InsertBefore);
@@ -797,7 +831,7 @@ Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V, uint64_t Offset,
   if (!ValueFn)
     ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value);
 
-  Value *Args[] = { MDNode::get(V->getContext(), &V, 1),
+  Value *Args[] = { MDNode::get(V->getContext(), V),
                     ConstantInt::get(Type::getInt64Ty(V->getContext()), Offset),
                     VarInfo };
   return CallInst::Create(ValueFn, Args, Args+3, "", InsertAtEnd);
diff --git a/lib/Analysis/IPA/CallGraph.cpp b/lib/Analysis/IPA/CallGraph.cpp
index 690c4b4..2e79eab 100644
--- a/lib/Analysis/IPA/CallGraph.cpp
+++ b/lib/Analysis/IPA/CallGraph.cpp
@@ -148,7 +148,7 @@ private:
       for (BasicBlock::iterator II = BB->begin(), IE = BB->end();
            II != IE; ++II) {
         CallSite CS(cast<Value>(II));
-        if (CS && !isa<DbgInfoIntrinsic>(II)) {
+        if (CS && !isa<IntrinsicInst>(II)) {
           const Function *Callee = CS.getCalledFunction();
           if (Callee)
             Node->addCalledFunction(CS, getOrInsertFunction(Callee));
diff --git a/lib/Analysis/IPA/CallGraphSCCPass.cpp b/lib/Analysis/IPA/CallGraphSCCPass.cpp
index 725ab72..659ffab 100644
--- a/lib/Analysis/IPA/CallGraphSCCPass.cpp
+++ b/lib/Analysis/IPA/CallGraphSCCPass.cpp
@@ -245,8 +245,8 @@ bool CGPassManager::RefreshCallGraph(CallGraphSCC &CurSCC,
     
     for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
       for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
-          CallSite CS(cast<Value>(I));
-        if (!CS || isa<DbgInfoIntrinsic>(I)) continue;
+        CallSite CS(cast<Value>(I));
+        if (!CS || isa<IntrinsicInst>(I)) continue;
         
         // If this call site already existed in the callgraph, just verify it
         // matches up to expectations and remove it from CallSites.
diff --git a/lib/Analysis/IPA/FindUsedTypes.cpp b/lib/Analysis/IPA/FindUsedTypes.cpp
index 06ae34c..dde2556 100644
--- a/lib/Analysis/IPA/FindUsedTypes.cpp
+++ b/lib/Analysis/IPA/FindUsedTypes.cpp
@@ -32,7 +32,7 @@ INITIALIZE_PASS(FindUsedTypes, "print-used-types",
 void FindUsedTypes::IncorporateType(const Type *Ty) {
   // If ty doesn't already exist in the used types map, add it now, otherwise
   // return.
-  if (!UsedTypes.insert(Ty).second) return;  // Already contain Ty.
+  if (!UsedTypes.insert(Ty)) return;  // Already contain Ty.
 
   // Make sure to add any types this type references now.
   //
@@ -94,7 +94,7 @@ bool FindUsedTypes::runOnModule(Module &m) {
 //
 void FindUsedTypes::print(raw_ostream &OS, const Module *M) const {
   OS << "Types in use by this module:\n";
-  for (std::set<const Type *>::const_iterator I = UsedTypes.begin(),
+  for (SetVector<const Type *>::const_iterator I = UsedTypes.begin(),
        E = UsedTypes.end(); I != E; ++I) {
     OS << "   ";
     WriteTypeSymbolic(OS, *I, M);
diff --git a/lib/Analysis/IPA/GlobalsModRef.cpp b/lib/Analysis/IPA/GlobalsModRef.cpp
index 116aaf4..b226d66 100644
--- a/lib/Analysis/IPA/GlobalsModRef.cpp
+++ b/lib/Analysis/IPA/GlobalsModRef.cpp
@@ -602,7 +602,7 @@ void GlobalsModRef::addEscapingUse(Use &U) {
   // For the purposes of this analysis, it is conservatively correct to treat
   // a newly escaping value equivalently to a deleted one.  We could perhaps
   // be more precise by processing the new use and attempting to update our
-  // saved analysis results to accomodate it.
+  // saved analysis results to accommodate it.
   deleteValue(U);
   
   AliasAnalysis::addEscapingUse(U);
diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp
index 2cda791..a0c42f0 100644
--- a/lib/Analysis/IVUsers.cpp
+++ b/lib/Analysis/IVUsers.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Assembly/Writer.h"
 #include "llvm/ADT/STLExtras.h"
@@ -38,6 +39,15 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_END(IVUsers, "iv-users",
                       "Induction Variable Users", false, true)
 
+// IVUsers behavior currently depends on this temporary indvars mode. The
+// option must be defined upstream from its uses.
+namespace llvm {
+  bool DisableIVRewrite = false;
+}
+cl::opt<bool, true> DisableIVRewriteOpt(
+  "disable-iv-rewrite", cl::Hidden, cl::location(llvm::DisableIVRewrite),
+  cl::desc("Disable canonical induction variable rewriting"));
+
 Pass *llvm::createIVUsersPass() {
   return new IVUsers();
 }
@@ -79,7 +89,7 @@ static bool isInteresting(const SCEV *S, const Instruction *I, const Loop *L,
 /// AddUsersIfInteresting - Inspect the specified instruction.  If it is a
 /// reducible SCEV, recursively add its users to the IVUsesByStride set and
 /// return true.  Otherwise, return false.
-bool IVUsers::AddUsersIfInteresting(Instruction *I) {
+bool IVUsers::AddUsersIfInteresting(Instruction *I, PHINode *Phi) {
   if (!SE->isSCEVable(I->getType()))
     return false;   // Void and FP expressions cannot be reduced.
 
@@ -90,6 +100,11 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) {
   if (Width > 64 || (TD && !TD->isLegalInteger(Width)))
     return false;
 
+  // We expect Sign/Zero extension to be eliminated from the IR before analyzing
+  // any downstream uses.
+  if (DisableIVRewrite && (isa<SExtInst>(I) || isa<ZExtInst>(I)))
+    return false;
+
   if (!Processed.insert(I))
     return true;    // Instruction already handled.
 
@@ -121,13 +136,13 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) {
     bool AddUserToIVUsers = false;
     if (LI->getLoopFor(User->getParent()) != L) {
       if (isa<PHINode>(User) || Processed.count(User) ||
-          !AddUsersIfInteresting(User)) {
+          !AddUsersIfInteresting(User, Phi)) {
         DEBUG(dbgs() << "FOUND USER in other loop: " << *User << '\n'
                      << "   OF SCEV: " << *ISE << '\n');
         AddUserToIVUsers = true;
       }
     } else if (Processed.count(User) ||
-               !AddUsersIfInteresting(User)) {
+               !AddUsersIfInteresting(User, Phi)) {
       DEBUG(dbgs() << "FOUND USER: " << *User << '\n'
                    << "   OF SCEV: " << *ISE << '\n');
       AddUserToIVUsers = true;
@@ -135,9 +150,11 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) {
 
     if (AddUserToIVUsers) {
       // Okay, we found a user that we cannot reduce.
-      IVUses.push_back(new IVStrideUse(this, User, I));
+      IVUses.push_back(new IVStrideUse(this, User, I, Phi));
       IVStrideUse &NewUse = IVUses.back();
-      // Transform the expression into a normalized form.
+      // Autodetect the post-inc loop set, populating NewUse.PostIncLoops.
+      // The regular return value here is discarded; instead of recording
+      // it, we just recompute it when we need it.
       ISE = TransformForPostIncUse(NormalizeAutodetect,
                                    ISE, User, I,
                                    NewUse.PostIncLoops,
@@ -148,8 +165,8 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) {
   return true;
 }
 
-IVStrideUse &IVUsers::AddUser(Instruction *User, Value *Operand) {
-  IVUses.push_back(new IVStrideUse(this, User, Operand));
+IVStrideUse &IVUsers::AddUser(Instruction *User, Value *Operand, PHINode *Phi) {
+  IVUses.push_back(new IVStrideUse(this, User, Operand, Phi));
   return IVUses.back();
 }
 
@@ -177,7 +194,7 @@ bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) {
   // them by stride.  Start by finding all of the PHI nodes in the header for
   // this loop.  If they are induction variables, inspect their uses.
   for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I)
-    (void)AddUsersIfInteresting(I);
+    (void)AddUsersIfInteresting(I, cast<PHINode>(I));
 
   return false;
 }
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index 47f91cf..efde598 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -66,21 +66,13 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB) {
 
       ImmutableCallSite CS(cast<Instruction>(II));
 
-      // If this function contains a call to setjmp or _setjmp, never inline
-      // it.  This is a hack because we depend on the user marking their local
-      // variables as volatile if they are live across a setjmp call, and they
-      // probably won't do this in callers.
       if (const Function *F = CS.getCalledFunction()) {
         // If a function is both internal and has a single use, then it is 
         // extremely likely to get inlined in the future (it was probably 
         // exposed by an interleaved devirtualization pass).
         if (F->hasInternalLinkage() && F->hasOneUse())
           ++NumInlineCandidates;
-        
-        if (F->isDeclaration() && 
-            (F->getName() == "setjmp" || F->getName() == "_setjmp"))
-          callsSetJmp = true;
-       
+
         // If this call is to function itself, then the function is recursive.
         // Inlining it into other functions is a bad idea, because this is
         // basically just a form of loop peeling, and our metrics aren't useful
@@ -226,6 +218,13 @@ unsigned CodeMetrics::CountCodeReductionForAlloca(Value *V) {
 /// analyzeFunction - Fill in the current structure with information gleaned
 /// from the specified function.
 void CodeMetrics::analyzeFunction(Function *F) {
+  // If this function contains a call to setjmp or _setjmp, never inline
+  // it.  This is a hack because we depend on the user marking their local
+  // variables as volatile if they are live across a setjmp call, and they
+  // probably won't do this in callers.
+  if (F->callsFunctionThatReturnsTwice())
+    callsSetJmp = true;
+
   // Look at the size of the callee.
   for (Function::const_iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
     analyzeBasicBlock(&*BB);
@@ -501,7 +500,7 @@ InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS,
     return InlineCost::getAlways();
     
   if (CalleeFI->Metrics.usesDynamicAlloca) {
-    // Get infomation about the caller.
+    // Get information about the caller.
     FunctionInfo &CallerFI = CachedFunctionInfo[Caller];
 
     // If we haven't calculated this information yet, do so now.
@@ -549,7 +548,7 @@ InlineCost InlineCostAnalyzer::getSpecializationCost(Function *Callee,
 
   int Cost = 0;
   
-  // Look at the orginal size of the callee.  Each instruction counts as 5.
+  // Look at the original size of the callee.  Each instruction counts as 5.
   Cost += CalleeFI->Metrics.NumInsts * InlineConstants::InstrCost;
 
   // Offset that with the amount of code that can be constant-folded
@@ -594,7 +593,7 @@ InlineCostAnalyzer::growCachedCostInfo(Function *Caller, Function *Callee) {
   CodeMetrics &CallerMetrics = CachedFunctionInfo[Caller].Metrics;
 
   // For small functions we prefer to recalculate the cost for better accuracy.
-  if (CallerMetrics.NumBlocks < 10 || CallerMetrics.NumInsts < 1000) {
+  if (CallerMetrics.NumBlocks < 10 && CallerMetrics.NumInsts < 1000) {
     resetCachedCostInfo(Caller);
     return;
   }
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 9dd5f05..9d78f8b 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -18,6 +18,7 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "instsimplify"
+#include "llvm/Operator.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/ConstantFolding.h"
@@ -900,6 +901,109 @@ Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, const TargetData *TD,
   return ::SimplifyFDivInst(Op0, Op1, TD, DT, RecursionLimit);
 }
 
+/// SimplifyRem - Given operands for an SRem or URem, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
+                          const TargetData *TD, const DominatorTree *DT,
+                          unsigned MaxRecurse) {
+  if (Constant *C0 = dyn_cast<Constant>(Op0)) {
+    if (Constant *C1 = dyn_cast<Constant>(Op1)) {
+      Constant *Ops[] = { C0, C1 };
+      return ConstantFoldInstOperands(Opcode, C0->getType(), Ops, 2, TD);
+    }
+  }
+
+  // X % undef -> undef
+  if (match(Op1, m_Undef()))
+    return Op1;
+
+  // undef % X -> 0
+  if (match(Op0, m_Undef()))
+    return Constant::getNullValue(Op0->getType());
+
+  // 0 % X -> 0, we don't need to preserve faults!
+  if (match(Op0, m_Zero()))
+    return Op0;
+
+  // X % 0 -> undef, we don't need to preserve faults!
+  if (match(Op1, m_Zero()))
+    return UndefValue::get(Op0->getType());
+
+  // X % 1 -> 0
+  if (match(Op1, m_One()))
+    return Constant::getNullValue(Op0->getType());
+
+  if (Op0->getType()->isIntegerTy(1))
+    // It can't be remainder by zero, hence it must be remainder by one.
+    return Constant::getNullValue(Op0->getType());
+
+  // X % X -> 0
+  if (Op0 == Op1)
+    return Constant::getNullValue(Op0->getType());
+
+  // If the operation is with the result of a select instruction, check whether
+  // operating on either branch of the select always yields the same value.
+  if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
+    if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, TD, DT, MaxRecurse))
+      return V;
+
+  // If the operation is with the result of a phi instruction, check whether
+  // operating on all incoming values of the phi always yields the same value.
+  if (isa<PHINode>(Op0) || isa<PHINode>(Op1))
+    if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, TD, DT, MaxRecurse))
+      return V;
+
+  return 0;
+}
+
+/// SimplifySRemInst - Given operands for an SRem, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifySRemInst(Value *Op0, Value *Op1, const TargetData *TD,
+                               const DominatorTree *DT, unsigned MaxRecurse) {
+  if (Value *V = SimplifyRem(Instruction::SRem, Op0, Op1, TD, DT, MaxRecurse))
+    return V;
+
+  return 0;
+}
+
+Value *llvm::SimplifySRemInst(Value *Op0, Value *Op1, const TargetData *TD,
+                              const DominatorTree *DT) {
+  return ::SimplifySRemInst(Op0, Op1, TD, DT, RecursionLimit);
+}
+
+/// SimplifyURemInst - Given operands for a URem, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifyURemInst(Value *Op0, Value *Op1, const TargetData *TD,
+                               const DominatorTree *DT, unsigned MaxRecurse) {
+  if (Value *V = SimplifyRem(Instruction::URem, Op0, Op1, TD, DT, MaxRecurse))
+    return V;
+
+  return 0;
+}
+
+Value *llvm::SimplifyURemInst(Value *Op0, Value *Op1, const TargetData *TD,
+                              const DominatorTree *DT) {
+  return ::SimplifyURemInst(Op0, Op1, TD, DT, RecursionLimit);
+}
+
+static Value *SimplifyFRemInst(Value *Op0, Value *Op1, const TargetData *,
+                               const DominatorTree *, unsigned) {
+  // undef % X -> undef    (the undef could be a snan).
+  if (match(Op0, m_Undef()))
+    return Op0;
+
+  // X % undef -> undef
+  if (match(Op1, m_Undef()))
+    return Op1;
+
+  return 0;
+}
+
+Value *llvm::SimplifyFRemInst(Value *Op0, Value *Op1, const TargetData *TD,
+                              const DominatorTree *DT) {
+  return ::SimplifyFRemInst(Op0, Op1, TD, DT, RecursionLimit);
+}
+
 /// SimplifyShift - Given operands for an Shl, LShr or AShr, see if we can
 /// fold the result.  If not, this returns null.
 static Value *SimplifyShift(unsigned Opcode, Value *Op0, Value *Op1,
@@ -1272,6 +1376,26 @@ static const Type *GetCompareTy(Value *Op) {
   return CmpInst::makeCmpResultType(Op->getType());
 }
 
+/// ExtractEquivalentCondition - Rummage around inside V looking for something
+/// equivalent to the comparison "LHS Pred RHS".  Return such a value if found,
+/// otherwise return null.  Helper function for analyzing max/min idioms.
+static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred,
+                                         Value *LHS, Value *RHS) {
+  SelectInst *SI = dyn_cast<SelectInst>(V);
+  if (!SI)
+    return 0;
+  CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
+  if (!Cmp)
+    return 0;
+  Value *CmpLHS = Cmp->getOperand(0), *CmpRHS = Cmp->getOperand(1);
+  if (Pred == Cmp->getPredicate() && LHS == CmpLHS && RHS == CmpRHS)
+    return Cmp;
+  if (Pred == CmpInst::getSwappedPredicate(Cmp->getPredicate()) &&
+      LHS == CmpRHS && RHS == CmpLHS)
+    return Cmp;
+  return 0;
+}
+
 /// SimplifyICmpInst - Given operands for an ICmpInst, see if we can
 /// fold the result.  If not, this returns null.
 static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
@@ -1354,46 +1478,48 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
     default:
       assert(false && "Unknown ICmp predicate!");
     case ICmpInst::ICMP_ULT:
-      return ConstantInt::getFalse(LHS->getContext());
+      // getNullValue also works for vectors, unlike getFalse.
+      return Constant::getNullValue(ITy);
     case ICmpInst::ICMP_UGE:
-      return ConstantInt::getTrue(LHS->getContext());
+      // getAllOnesValue also works for vectors, unlike getTrue.
+      return ConstantInt::getAllOnesValue(ITy);
     case ICmpInst::ICMP_EQ:
     case ICmpInst::ICMP_ULE:
       if (isKnownNonZero(LHS, TD))
-        return ConstantInt::getFalse(LHS->getContext());
+        return Constant::getNullValue(ITy);
       break;
     case ICmpInst::ICMP_NE:
     case ICmpInst::ICMP_UGT:
       if (isKnownNonZero(LHS, TD))
-        return ConstantInt::getTrue(LHS->getContext());
+        return ConstantInt::getAllOnesValue(ITy);
       break;
     case ICmpInst::ICMP_SLT:
       ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, TD);
       if (LHSKnownNegative)
-        return ConstantInt::getTrue(LHS->getContext());
+        return ConstantInt::getAllOnesValue(ITy);
       if (LHSKnownNonNegative)
-        return ConstantInt::getFalse(LHS->getContext());
+        return Constant::getNullValue(ITy);
       break;
     case ICmpInst::ICMP_SLE:
       ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, TD);
       if (LHSKnownNegative)
-        return ConstantInt::getTrue(LHS->getContext());
+        return ConstantInt::getAllOnesValue(ITy);
       if (LHSKnownNonNegative && isKnownNonZero(LHS, TD))
-        return ConstantInt::getFalse(LHS->getContext());
+        return Constant::getNullValue(ITy);
       break;
     case ICmpInst::ICMP_SGE:
       ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, TD);
       if (LHSKnownNegative)
-        return ConstantInt::getFalse(LHS->getContext());
+        return Constant::getNullValue(ITy);
       if (LHSKnownNonNegative)
-        return ConstantInt::getTrue(LHS->getContext());
+        return ConstantInt::getAllOnesValue(ITy);
       break;
     case ICmpInst::ICMP_SGT:
       ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, TD);
       if (LHSKnownNegative)
-        return ConstantInt::getFalse(LHS->getContext());
+        return Constant::getNullValue(ITy);
       if (LHSKnownNonNegative && isKnownNonZero(LHS, TD))
-        return ConstantInt::getTrue(LHS->getContext());
+        return ConstantInt::getAllOnesValue(ITy);
       break;
     }
   }
@@ -1685,7 +1811,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
     case ICmpInst::ICMP_EQ:
     case ICmpInst::ICMP_UGT:
     case ICmpInst::ICMP_UGE:
-      return ConstantInt::getFalse(RHS->getContext());
+      // getNullValue also works for vectors, unlike getFalse.
+      return Constant::getNullValue(ITy);
     case ICmpInst::ICMP_SLT:
     case ICmpInst::ICMP_SLE:
       ComputeSignBit(LHS, KnownNonNegative, KnownNegative, TD);
@@ -1695,7 +1822,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
     case ICmpInst::ICMP_NE:
     case ICmpInst::ICMP_ULT:
     case ICmpInst::ICMP_ULE:
-      return ConstantInt::getTrue(RHS->getContext());
+      // getAllOnesValue also works for vectors, unlike getTrue.
+      return Constant::getAllOnesValue(ITy);
     }
   }
   if (RBO && match(RBO, m_URem(m_Value(), m_Specific(LHS)))) {
@@ -1712,7 +1840,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
     case ICmpInst::ICMP_NE:
     case ICmpInst::ICMP_UGT:
     case ICmpInst::ICMP_UGE:
-      return ConstantInt::getTrue(RHS->getContext());
+      // getAllOnesValue also works for vectors, unlike getTrue.
+      return Constant::getAllOnesValue(ITy);
     case ICmpInst::ICMP_SLT:
     case ICmpInst::ICMP_SLE:
       ComputeSignBit(RHS, KnownNonNegative, KnownNegative, TD);
@@ -1722,7 +1851,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
     case ICmpInst::ICMP_EQ:
     case ICmpInst::ICMP_ULT:
     case ICmpInst::ICMP_ULE:
-      return ConstantInt::getFalse(RHS->getContext());
+      // getNullValue also works for vectors, unlike getFalse.
+      return Constant::getNullValue(ITy);
     }
   }
 
@@ -1737,7 +1867,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
       // fall-through
     case Instruction::SDiv:
     case Instruction::AShr:
-      if (!LBO->isExact() && !RBO->isExact())
+      if (!LBO->isExact() || !RBO->isExact())
         break;
       if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0),
                                       RBO->getOperand(0), TD, DT, MaxRecurse-1))
@@ -1758,6 +1888,194 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
     }
   }
 
+  // Simplify comparisons involving max/min.
+  Value *A, *B;
+  CmpInst::Predicate P = CmpInst::BAD_ICMP_PREDICATE;
+  CmpInst::Predicate EqP; // Chosen so that "A == max/min(A,B)" iff "A EqP B".
+
+  // Signed variants on "max(a,b)>=a -> true".
+  if (match(LHS, m_SMax(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) {
+    if (A != RHS) std::swap(A, B); // smax(A, B) pred A.
+    EqP = CmpInst::ICMP_SGE; // "A == smax(A, B)" iff "A sge B".
+    // We analyze this as smax(A, B) pred A.
+    P = Pred;
+  } else if (match(RHS, m_SMax(m_Value(A), m_Value(B))) &&
+             (A == LHS || B == LHS)) {
+    if (A != LHS) std::swap(A, B); // A pred smax(A, B).
+    EqP = CmpInst::ICMP_SGE; // "A == smax(A, B)" iff "A sge B".
+    // We analyze this as smax(A, B) swapped-pred A.
+    P = CmpInst::getSwappedPredicate(Pred);
+  } else if (match(LHS, m_SMin(m_Value(A), m_Value(B))) &&
+             (A == RHS || B == RHS)) {
+    if (A != RHS) std::swap(A, B); // smin(A, B) pred A.
+    EqP = CmpInst::ICMP_SLE; // "A == smin(A, B)" iff "A sle B".
+    // We analyze this as smax(-A, -B) swapped-pred -A.
+    // Note that we do not need to actually form -A or -B thanks to EqP.
+    P = CmpInst::getSwappedPredicate(Pred);
+  } else if (match(RHS, m_SMin(m_Value(A), m_Value(B))) &&
+             (A == LHS || B == LHS)) {
+    if (A != LHS) std::swap(A, B); // A pred smin(A, B).
+    EqP = CmpInst::ICMP_SLE; // "A == smin(A, B)" iff "A sle B".
+    // We analyze this as smax(-A, -B) pred -A.
+    // Note that we do not need to actually form -A or -B thanks to EqP.
+    P = Pred;
+  }
+  if (P != CmpInst::BAD_ICMP_PREDICATE) {
+    // Cases correspond to "max(A, B) p A".
+    switch (P) {
+    default:
+      break;
+    case CmpInst::ICMP_EQ:
+    case CmpInst::ICMP_SLE:
+      // Equivalent to "A EqP B".  This may be the same as the condition tested
+      // in the max/min; if so, we can just return that.
+      if (Value *V = ExtractEquivalentCondition(LHS, EqP, A, B))
+        return V;
+      if (Value *V = ExtractEquivalentCondition(RHS, EqP, A, B))
+        return V;
+      // Otherwise, see if "A EqP B" simplifies.
+      if (MaxRecurse)
+        if (Value *V = SimplifyICmpInst(EqP, A, B, TD, DT, MaxRecurse-1))
+          return V;
+      break;
+    case CmpInst::ICMP_NE:
+    case CmpInst::ICMP_SGT: {
+      CmpInst::Predicate InvEqP = CmpInst::getInversePredicate(EqP);
+      // Equivalent to "A InvEqP B".  This may be the same as the condition
+      // tested in the max/min; if so, we can just return that.
+      if (Value *V = ExtractEquivalentCondition(LHS, InvEqP, A, B))
+        return V;
+      if (Value *V = ExtractEquivalentCondition(RHS, InvEqP, A, B))
+        return V;
+      // Otherwise, see if "A InvEqP B" simplifies.
+      if (MaxRecurse)
+        if (Value *V = SimplifyICmpInst(InvEqP, A, B, TD, DT, MaxRecurse-1))
+          return V;
+      break;
+    }
+    case CmpInst::ICMP_SGE:
+      // Always true.
+      return Constant::getAllOnesValue(ITy);
+    case CmpInst::ICMP_SLT:
+      // Always false.
+      return Constant::getNullValue(ITy);
+    }
+  }
+
+  // Unsigned variants on "max(a,b)>=a -> true".
+  P = CmpInst::BAD_ICMP_PREDICATE;
+  if (match(LHS, m_UMax(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) {
+    if (A != RHS) std::swap(A, B); // umax(A, B) pred A.
+    EqP = CmpInst::ICMP_UGE; // "A == umax(A, B)" iff "A uge B".
+    // We analyze this as umax(A, B) pred A.
+    P = Pred;
+  } else if (match(RHS, m_UMax(m_Value(A), m_Value(B))) &&
+             (A == LHS || B == LHS)) {
+    if (A != LHS) std::swap(A, B); // A pred umax(A, B).
+    EqP = CmpInst::ICMP_UGE; // "A == umax(A, B)" iff "A uge B".
+    // We analyze this as umax(A, B) swapped-pred A.
+    P = CmpInst::getSwappedPredicate(Pred);
+  } else if (match(LHS, m_UMin(m_Value(A), m_Value(B))) &&
+             (A == RHS || B == RHS)) {
+    if (A != RHS) std::swap(A, B); // umin(A, B) pred A.
+    EqP = CmpInst::ICMP_ULE; // "A == umin(A, B)" iff "A ule B".
+    // We analyze this as umax(-A, -B) swapped-pred -A.
+    // Note that we do not need to actually form -A or -B thanks to EqP.
+    P = CmpInst::getSwappedPredicate(Pred);
+  } else if (match(RHS, m_UMin(m_Value(A), m_Value(B))) &&
+             (A == LHS || B == LHS)) {
+    if (A != LHS) std::swap(A, B); // A pred umin(A, B).
+    EqP = CmpInst::ICMP_ULE; // "A == umin(A, B)" iff "A ule B".
+    // We analyze this as umax(-A, -B) pred -A.
+    // Note that we do not need to actually form -A or -B thanks to EqP.
+    P = Pred;
+  }
+  if (P != CmpInst::BAD_ICMP_PREDICATE) {
+    // Cases correspond to "max(A, B) p A".
+    switch (P) {
+    default:
+      break;
+    case CmpInst::ICMP_EQ:
+    case CmpInst::ICMP_ULE:
+      // Equivalent to "A EqP B".  This may be the same as the condition tested
+      // in the max/min; if so, we can just return that.
+      if (Value *V = ExtractEquivalentCondition(LHS, EqP, A, B))
+        return V;
+      if (Value *V = ExtractEquivalentCondition(RHS, EqP, A, B))
+        return V;
+      // Otherwise, see if "A EqP B" simplifies.
+      if (MaxRecurse)
+        if (Value *V = SimplifyICmpInst(EqP, A, B, TD, DT, MaxRecurse-1))
+          return V;
+      break;
+    case CmpInst::ICMP_NE:
+    case CmpInst::ICMP_UGT: {
+      CmpInst::Predicate InvEqP = CmpInst::getInversePredicate(EqP);
+      // Equivalent to "A InvEqP B".  This may be the same as the condition
+      // tested in the max/min; if so, we can just return that.
+      if (Value *V = ExtractEquivalentCondition(LHS, InvEqP, A, B))
+        return V;
+      if (Value *V = ExtractEquivalentCondition(RHS, InvEqP, A, B))
+        return V;
+      // Otherwise, see if "A InvEqP B" simplifies.
+      if (MaxRecurse)
+        if (Value *V = SimplifyICmpInst(InvEqP, A, B, TD, DT, MaxRecurse-1))
+          return V;
+      break;
+    }
+    case CmpInst::ICMP_UGE:
+      // Always true.
+      return Constant::getAllOnesValue(ITy);
+    case CmpInst::ICMP_ULT:
+      // Always false.
+      return Constant::getNullValue(ITy);
+    }
+  }
+
+  // Variants on "max(x,y) >= min(x,z)".
+  Value *C, *D;
+  if (match(LHS, m_SMax(m_Value(A), m_Value(B))) &&
+      match(RHS, m_SMin(m_Value(C), m_Value(D))) &&
+      (A == C || A == D || B == C || B == D)) {
+    // max(x, ?) pred min(x, ?).
+    if (Pred == CmpInst::ICMP_SGE)
+      // Always true.
+      return Constant::getAllOnesValue(ITy);
+    if (Pred == CmpInst::ICMP_SLT)
+      // Always false.
+      return Constant::getNullValue(ITy);
+  } else if (match(LHS, m_SMin(m_Value(A), m_Value(B))) &&
+             match(RHS, m_SMax(m_Value(C), m_Value(D))) &&
+             (A == C || A == D || B == C || B == D)) {
+    // min(x, ?) pred max(x, ?).
+    if (Pred == CmpInst::ICMP_SLE)
+      // Always true.
+      return Constant::getAllOnesValue(ITy);
+    if (Pred == CmpInst::ICMP_SGT)
+      // Always false.
+      return Constant::getNullValue(ITy);
+  } else if (match(LHS, m_UMax(m_Value(A), m_Value(B))) &&
+             match(RHS, m_UMin(m_Value(C), m_Value(D))) &&
+             (A == C || A == D || B == C || B == D)) {
+    // max(x, ?) pred min(x, ?).
+    if (Pred == CmpInst::ICMP_UGE)
+      // Always true.
+      return Constant::getAllOnesValue(ITy);
+    if (Pred == CmpInst::ICMP_ULT)
+      // Always false.
+      return Constant::getNullValue(ITy);
+  } else if (match(LHS, m_UMin(m_Value(A), m_Value(B))) &&
+             match(RHS, m_UMax(m_Value(C), m_Value(D))) &&
+             (A == C || A == D || B == C || B == D)) {
+    // min(x, ?) pred max(x, ?).
+    if (Pred == CmpInst::ICMP_ULE)
+      // Always true.
+      return Constant::getAllOnesValue(ITy);
+    if (Pred == CmpInst::ICMP_UGT)
+      // Always false.
+      return Constant::getNullValue(ITy);
+  }
+
   // If the comparison is with the result of a select instruction, check whether
   // comparing with either branch of the select always yields the same value.
   if (isa<SelectInst>(LHS) || isa<SelectInst>(RHS))
@@ -1993,6 +2311,9 @@ static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
   case Instruction::SDiv: return SimplifySDivInst(LHS, RHS, TD, DT, MaxRecurse);
   case Instruction::UDiv: return SimplifyUDivInst(LHS, RHS, TD, DT, MaxRecurse);
   case Instruction::FDiv: return SimplifyFDivInst(LHS, RHS, TD, DT, MaxRecurse);
+  case Instruction::SRem: return SimplifySRemInst(LHS, RHS, TD, DT, MaxRecurse);
+  case Instruction::URem: return SimplifyURemInst(LHS, RHS, TD, DT, MaxRecurse);
+  case Instruction::FRem: return SimplifyFRemInst(LHS, RHS, TD, DT, MaxRecurse);
   case Instruction::Shl:
     return SimplifyShlInst(LHS, RHS, /*isNSW*/false, /*isNUW*/false,
                            TD, DT, MaxRecurse);
@@ -2087,6 +2408,15 @@ Value *llvm::SimplifyInstruction(Instruction *I, const TargetData *TD,
   case Instruction::FDiv:
     Result = SimplifyFDivInst(I->getOperand(0), I->getOperand(1), TD, DT);
     break;
+  case Instruction::SRem:
+    Result = SimplifySRemInst(I->getOperand(0), I->getOperand(1), TD, DT);
+    break;
+  case Instruction::URem:
+    Result = SimplifyURemInst(I->getOperand(0), I->getOperand(1), TD, DT);
+    break;
+  case Instruction::FRem:
+    Result = SimplifyFRemInst(I->getOperand(0), I->getOperand(1), TD, DT);
+    break;
   case Instruction::Shl:
     Result = SimplifyShlInst(I->getOperand(0), I->getOperand(1),
                              cast<BinaryOperator>(I)->hasNoSignedWrap(),
diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp
index 9e7da6c..6e27597 100644
--- a/lib/Analysis/LazyValueInfo.cpp
+++ b/lib/Analysis/LazyValueInfo.cpp
@@ -29,7 +29,6 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLExtras.h"
 #include <map>
-#include <set>
 #include <stack>
 using namespace llvm;
 
@@ -268,6 +267,8 @@ public:
 } // end anonymous namespace.
 
 namespace llvm {
+raw_ostream &operator<<(raw_ostream &OS, const LVILatticeVal &Val)
+    LLVM_ATTRIBUTE_USED;
 raw_ostream &operator<<(raw_ostream &OS, const LVILatticeVal &Val) {
   if (Val.isUndefined())
     return OS << "undefined";
@@ -588,16 +589,18 @@ static bool InstructionDereferencesPointer(Instruction *I, Value *Ptr) {
   }
   if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) {
     if (MI->isVolatile()) return false;
-    if (MI->getAddressSpace() != 0) return false;
 
     // FIXME: check whether it has a valuerange that excludes zero?
     ConstantInt *Len = dyn_cast<ConstantInt>(MI->getLength());
     if (!Len || Len->isZero()) return false;
 
-    if (MI->getRawDest() == Ptr || MI->getDest() == Ptr)
-      return true;
+    if (MI->getDestAddressSpace() == 0)
+      if (MI->getRawDest() == Ptr || MI->getDest() == Ptr)
+        return true;
     if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI))
-      return MTI->getRawSource() == Ptr || MTI->getSource() == Ptr;
+      if (MTI->getSourceAddressSpace() == 0)
+        if (MTI->getRawSource() == Ptr || MTI->getSource() == Ptr)
+          return true;
   }
   return false;
 }
diff --git a/lib/Analysis/Lint.cpp b/lib/Analysis/Lint.cpp
index fc7edc0..f130f30 100644
--- a/lib/Analysis/Lint.cpp
+++ b/lib/Analysis/Lint.cpp
@@ -606,7 +606,7 @@ Value *Lint::findValueImpl(Value *V, bool OffsetOk,
                                     Type::getInt64Ty(V->getContext())))
         return findValueImpl(CE->getOperand(0), OffsetOk, Visited);
     } else if (CE->getOpcode() == Instruction::ExtractValue) {
-      const SmallVector<unsigned, 4> &Indices = CE->getIndices();
+      ArrayRef<unsigned> Indices = CE->getIndices();
       if (Value *W = FindInsertedValue(CE->getOperand(0),
                                        Indices.begin(),
                                        Indices.end()))
diff --git a/lib/Analysis/Loads.cpp b/lib/Analysis/Loads.cpp
index 2ea27fb..c5c676b 100644
--- a/lib/Analysis/Loads.cpp
+++ b/lib/Analysis/Loads.cpp
@@ -17,6 +17,7 @@
 #include "llvm/GlobalAlias.h"
 #include "llvm/GlobalVariable.h"
 #include "llvm/IntrinsicInst.h"
+#include "llvm/Operator.h"
 using namespace llvm;
 
 /// AreEquivalentAddressValues - Test if A and B will obviously have the same
@@ -30,7 +31,7 @@ using namespace llvm;
 static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
   // Test if the values are trivially equivalent.
   if (A == B) return true;
-  
+
   // Test if the values come from identical arithmetic instructions.
   // Use isIdenticalToWhenDefined instead of isIdenticalTo because
   // this function is only used when one address use dominates the
@@ -41,7 +42,7 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
     if (const Instruction *BI = dyn_cast<Instruction>(B))
       if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
         return true;
-  
+
   // Otherwise they may not be equivalent.
   return false;
 }
diff --git a/lib/Analysis/MemDepPrinter.cpp b/lib/Analysis/MemDepPrinter.cpp
index 64d215c..2283db0 100644
--- a/lib/Analysis/MemDepPrinter.cpp
+++ b/lib/Analysis/MemDepPrinter.cpp
@@ -79,8 +79,8 @@ bool MemDepPrinter::runOnFunction(Function &F) {
 
     MemDepResult Res = MDA.getDependency(Inst);
     if (!Res.isNonLocal()) {
-      assert(Res.isClobber() != Res.isDef() &&
-             "Local dep should be def or clobber!");
+      assert((Res.isUnknown() || Res.isClobber() || Res.isDef()) &&
+              "Local dep should be unknown, def or clobber!");
       Deps[Inst].insert(std::make_pair(InstAndClobberFlag(Res.getInst(),
                                                           Res.isClobber()),
                                        static_cast<BasicBlock *>(0)));
@@ -92,8 +92,9 @@ bool MemDepPrinter::runOnFunction(Function &F) {
       for (MemoryDependenceAnalysis::NonLocalDepInfo::const_iterator
            I = NLDI.begin(), E = NLDI.end(); I != E; ++I) {
         const MemDepResult &Res = I->getResult();
-        assert(Res.isClobber() != Res.isDef() &&
-               "Resolved non-local call dep should be def or clobber!");
+        assert((Res.isUnknown() || Res.isClobber() || Res.isDef()) &&
+                "Resolved non-local call dep should be unknown, def or "
+                "clobber!");
         InstDeps.insert(std::make_pair(InstAndClobberFlag(Res.getInst(),
                                                           Res.isClobber()),
                                        I->getBB()));
@@ -148,16 +149,24 @@ void MemDepPrinter::print(raw_ostream &OS, const Module *M) const {
       bool isClobber = I->first.getInt();
       const BasicBlock *DepBB = I->second;
 
-      OS << "    " << (isClobber ? "Clobber" : "    Def");
+      OS << "    ";
+      if (!DepInst)
+        OS << "Unknown";
+      else if (isClobber)
+        OS << "Clobber";
+      else
+        OS << "    Def";
       if (DepBB) {
         OS << " in block ";
         WriteAsOperand(OS, DepBB, /*PrintType=*/false, M);
       }
-      OS << " from: ";
-      if (DepInst == Inst)
-        OS << "<unspecified>";
-      else
-        DepInst->print(OS);
+      if (DepInst) {
+        OS << " from: ";
+        if (DepInst == Inst)
+          OS << "<unspecified>";
+        else
+          DepInst->print(OS);
+      }
       OS << "\n";
     }
 
diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp
index 35043bdd..bba4482 100644
--- a/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -16,6 +16,7 @@
 
 #define DEBUG_TYPE "memdep"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Function.h"
@@ -46,6 +47,11 @@ STATISTIC(NumUncacheNonLocalPtr,
 STATISTIC(NumCacheCompleteNonLocalPtr,
           "Number of block queries that were completely cached");
 
+// Limit for the number of instructions to scan in a block.
+// FIXME: Figure out what a sane value is for this.
+//        (500 is relatively insane.)
+static const int BlockScanLimit = 500;
+
 char MemoryDependenceAnalysis::ID = 0;
   
 // Register this pass...
@@ -179,8 +185,16 @@ AliasAnalysis::ModRefResult GetLocation(const Instruction *Inst,
 MemDepResult MemoryDependenceAnalysis::
 getCallSiteDependencyFrom(CallSite CS, bool isReadOnlyCall,
                           BasicBlock::iterator ScanIt, BasicBlock *BB) {
+  unsigned Limit = BlockScanLimit;
+
   // Walk backwards through the block, looking for dependencies
   while (ScanIt != BB->begin()) {
+    // Limit the amount of scanning we do so we don't end up with quadratic
+    // running time on extreme testcases. 
+    --Limit;
+    if (!Limit)
+      return MemDepResult::getUnknown();
+
     Instruction *Inst = --ScanIt;
     
     // If this inst is a memory op, get the pointer it accessed
@@ -214,11 +228,101 @@ getCallSiteDependencyFrom(CallSite CS, bool isReadOnlyCall,
     }
   }
   
-  // No dependence found.  If this is the entry block of the function, it is a
-  // clobber, otherwise it is non-local.
+  // No dependence found.  If this is the entry block of the function, it is
+  // unknown, otherwise it is non-local.
   if (BB != &BB->getParent()->getEntryBlock())
     return MemDepResult::getNonLocal();
-  return MemDepResult::getClobber(ScanIt);
+  return MemDepResult::getUnknown();
+}
+
+/// isLoadLoadClobberIfExtendedToFullWidth - Return true if LI is a load that
+/// would fully overlap MemLoc if done as a wider legal integer load.
+///
+/// MemLocBase, MemLocOffset are lazily computed here the first time the
+/// base/offs of memloc is needed.
+static bool 
+isLoadLoadClobberIfExtendedToFullWidth(const AliasAnalysis::Location &MemLoc,
+                                       const Value *&MemLocBase,
+                                       int64_t &MemLocOffs,
+                                       const LoadInst *LI,
+                                       const TargetData *TD) {
+  // If we have no target data, we can't do this.
+  if (TD == 0) return false;
+
+  // If we haven't already computed the base/offset of MemLoc, do so now.
+  if (MemLocBase == 0)
+    MemLocBase = GetPointerBaseWithConstantOffset(MemLoc.Ptr, MemLocOffs, *TD);
+
+  unsigned Size = MemoryDependenceAnalysis::
+    getLoadLoadClobberFullWidthSize(MemLocBase, MemLocOffs, MemLoc.Size,
+                                    LI, *TD);
+  return Size != 0;
+}
+
+/// getLoadLoadClobberFullWidthSize - This is a little bit of analysis that
+/// looks at a memory location for a load (specified by MemLocBase, Offs,
+/// and Size) and compares it against a load.  If the specified load could
+/// be safely widened to a larger integer load that is 1) still efficient,
+/// 2) safe for the target, and 3) would provide the specified memory
+/// location value, then this function returns the size in bytes of the
+/// load width to use.  If not, this returns zero.
+unsigned MemoryDependenceAnalysis::
+getLoadLoadClobberFullWidthSize(const Value *MemLocBase, int64_t MemLocOffs,
+                                unsigned MemLocSize, const LoadInst *LI,
+                                const TargetData &TD) {
+  // We can only extend non-volatile integer loads.
+  if (!isa<IntegerType>(LI->getType()) || LI->isVolatile()) return 0;
+  
+  // Get the base of this load.
+  int64_t LIOffs = 0;
+  const Value *LIBase = 
+    GetPointerBaseWithConstantOffset(LI->getPointerOperand(), LIOffs, TD);
+  
+  // If the two pointers are not based on the same pointer, we can't tell that
+  // they are related.
+  if (LIBase != MemLocBase) return 0;
+  
+  // Okay, the two values are based on the same pointer, but returned as
+  // no-alias.  This happens when we have things like two byte loads at "P+1"
+  // and "P+3".  Check to see if increasing the size of the "LI" load up to its
+  // alignment (or the largest native integer type) will allow us to load all
+  // the bits required by MemLoc.
+  
+  // If MemLoc is before LI, then no widening of LI will help us out.
+  if (MemLocOffs < LIOffs) return 0;
+  
+  // Get the alignment of the load in bytes.  We assume that it is safe to load
+  // any legal integer up to this size without a problem.  For example, if we're
+  // looking at an i8 load on x86-32 that is known 1024 byte aligned, we can
+  // widen it up to an i32 load.  If it is known 2-byte aligned, we can widen it
+  // to i16.
+  unsigned LoadAlign = LI->getAlignment();
+
+  int64_t MemLocEnd = MemLocOffs+MemLocSize;
+  
+  // If no amount of rounding up will let MemLoc fit into LI, then bail out.
+  if (LIOffs+LoadAlign < MemLocEnd) return 0;
+  
+  // This is the size of the load to try.  Start with the next larger power of
+  // two.
+  unsigned NewLoadByteSize = LI->getType()->getPrimitiveSizeInBits()/8U;
+  NewLoadByteSize = NextPowerOf2(NewLoadByteSize);
+  
+  while (1) {
+    // If this load size is bigger than our known alignment or would not fit
+    // into a native integer register, then we fail.
+    if (NewLoadByteSize > LoadAlign ||
+        !TD.fitsInLegalInteger(NewLoadByteSize*8))
+      return 0;
+
+    // If a load of this width would include all of MemLoc, then we succeed.
+    if (LIOffs+NewLoadByteSize >= MemLocEnd)
+      return NewLoadByteSize;
+    
+    NewLoadByteSize <<= 1;
+  }
+  
+  return 0;
 }
 
 /// getPointerDependencyFrom - Return the instruction on which a memory
@@ -229,58 +333,39 @@ MemDepResult MemoryDependenceAnalysis::
 getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad, 
                          BasicBlock::iterator ScanIt, BasicBlock *BB) {
 
-  Value *InvariantTag = 0;
+  const Value *MemLocBase = 0;
+  int64_t MemLocOffset = 0;
+
+  unsigned Limit = BlockScanLimit;
 
   // Walk backwards through the basic block, looking for dependencies.
   while (ScanIt != BB->begin()) {
+    // Limit the amount of scanning we do so we don't end up with quadratic
+    // running time on extreme testcases.
+    --Limit;
+    if (!Limit)
+      return MemDepResult::getUnknown();
+
     Instruction *Inst = --ScanIt;
 
-    // If we're in an invariant region, no dependencies can be found before
-    // we pass an invariant-begin marker.
-    if (InvariantTag == Inst) {
-      InvariantTag = 0;
-      continue;
-    }
-    
     if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
       // Debug intrinsics don't (and can't) cause dependences.
       if (isa<DbgInfoIntrinsic>(II)) continue;
       
-      // If we pass an invariant-end marker, then we've just entered an
-      // invariant region and can start ignoring dependencies.
-      if (II->getIntrinsicID() == Intrinsic::invariant_end) {
-        // FIXME: This only considers queries directly on the invariant-tagged
-        // pointer, not on query pointers that are indexed off of them.  It'd
-        // be nice to handle that at some point.
-        AliasAnalysis::AliasResult R =
-          AA->alias(AliasAnalysis::Location(II->getArgOperand(2)), MemLoc);
-        if (R == AliasAnalysis::MustAlias)
-          InvariantTag = II->getArgOperand(0);
-
-        continue;
-      }
-
       // If we reach a lifetime begin or end marker, then the query ends here
       // because the value is undefined.
       if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
         // FIXME: This only considers queries directly on the invariant-tagged
         // pointer, not on query pointers that are indexed off of them.  It'd
-        // be nice to handle that at some point.
-        AliasAnalysis::AliasResult R =
-          AA->alias(AliasAnalysis::Location(II->getArgOperand(1)), MemLoc);
-        if (R == AliasAnalysis::MustAlias)
+        // be nice to handle that at some point (the right approach is to use
+        // GetPointerBaseWithConstantOffset).
+        if (AA->isMustAlias(AliasAnalysis::Location(II->getArgOperand(1)),
+                            MemLoc))
           return MemDepResult::getDef(II);
         continue;
       }
     }
 
-    // If we're querying on a load and we're in an invariant region, we're done
-    // at this point. Nothing a load depends on can live in an invariant region.
-    //
-    // FIXME: this will prevent us from returning load/load must-aliases, so GVN
-    // won't remove redundant loads.
-    if (isLoad && InvariantTag) continue;
-
     // Values depend on loads if the pointers are must aliased.  This means that
     // a load depends on another must aliased load from the same value.
     if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
@@ -288,27 +373,57 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad,
       
       // If we found a pointer, check if it could be the same as our pointer.
       AliasAnalysis::AliasResult R = AA->alias(LoadLoc, MemLoc);
-      if (R == AliasAnalysis::NoAlias)
-        continue;
       
-      // May-alias loads don't depend on each other without a dependence.
-      if (isLoad && R != AliasAnalysis::MustAlias)
+      if (isLoad) {
+        if (R == AliasAnalysis::NoAlias) {
+          // If this is an over-aligned integer load (for example,
+          // "load i8* %P, align 4") see if it would obviously overlap with the
+          // queried location if widened to a larger load (e.g. if the queried
+          // location is 1 byte at P+1).  If so, return it as a load/load
+          // clobber result, allowing the client to decide to widen the load if
+          // it wants to.
+          if (const IntegerType *ITy = dyn_cast<IntegerType>(LI->getType()))
+            if (LI->getAlignment()*8 > ITy->getPrimitiveSizeInBits() &&
+                isLoadLoadClobberIfExtendedToFullWidth(MemLoc, MemLocBase,
+                                                       MemLocOffset, LI, TD))
+              return MemDepResult::getClobber(Inst);
+          
+          continue;
+        }
+        
+        // Must aliased loads are defs of each other.
+        if (R == AliasAnalysis::MustAlias)
+          return MemDepResult::getDef(Inst);
+
+#if 0 // FIXME: Temporarily disabled. GVN is cleverly rewriting loads
+      // in terms of clobbering loads, but since it does this by looking
+      // at the clobbering load directly, it doesn't know about any
+      // phi translation that may have happened along the way.
+
+        // If we have a partial alias, then return this as a clobber for the
+        // client to handle.
+        if (R == AliasAnalysis::PartialAlias)
+          return MemDepResult::getClobber(Inst);
+#endif
+        
+        // Random may-alias loads don't depend on each other without a
+        // dependence.
+        continue;
+      }
+
+      // Stores don't depend on other no-aliased accesses.
+      if (R == AliasAnalysis::NoAlias)
         continue;
 
       // Stores don't alias loads from read-only memory.
-      if (!isLoad && AA->pointsToConstantMemory(LoadLoc))
+      if (AA->pointsToConstantMemory(LoadLoc))
         continue;
 
-      // Stores depend on may and must aliased loads, loads depend on must-alias
-      // loads.
+      // Stores depend on may/must aliased loads.
       return MemDepResult::getDef(Inst);
     }
     
     if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
-      // There can't be stores to the value we care about inside an 
-      // invariant region.
-      if (InvariantTag) continue;
-      
       // If alias analysis can tell that this store is guaranteed to not modify
       // the query pointer, ignore it.  Use getModRefInfo to handle cases where
       // the query pointer points to constant memory etc.
@@ -341,8 +456,7 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad,
         (isa<CallInst>(Inst) && extractMallocCall(Inst))) {
       const Value *AccessPtr = GetUnderlyingObject(MemLoc.Ptr, TD);
       
-      if (AccessPtr == Inst ||
-          AA->alias(Inst, 1, AccessPtr, 1) == AliasAnalysis::MustAlias)
+      if (AccessPtr == Inst || AA->isMustAlias(Inst, AccessPtr))
         return MemDepResult::getDef(Inst);
       continue;
     }
@@ -353,9 +467,6 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad,
       // If the call has no effect on the queried pointer, just ignore it.
       continue;
     case AliasAnalysis::Mod:
-      // If we're in an invariant region, we can ignore calls that ONLY
-      // modify the pointer.
-      if (InvariantTag) continue;
       return MemDepResult::getClobber(Inst);
     case AliasAnalysis::Ref:
       // If the call is known to never store to the pointer, and if this is a
@@ -368,11 +479,11 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad,
     }
   }
   
-  // No dependence found.  If this is the entry block of the function, it is a
-  // clobber, otherwise it is non-local.
+  // No dependence found.  If this is the entry block of the function, it is
+  // unknown, otherwise it is non-local.
   if (BB != &BB->getParent()->getEntryBlock())
     return MemDepResult::getNonLocal();
-  return MemDepResult::getClobber(ScanIt);
+  return MemDepResult::getUnknown();
 }
 
 /// getDependency - Return the instruction on which a memory operation
@@ -400,12 +511,12 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) {
   
   // Do the scan.
   if (BasicBlock::iterator(QueryInst) == QueryParent->begin()) {
-    // No dependence found.  If this is the entry block of the function, it is a
-    // clobber, otherwise it is non-local.
+    // No dependence found.  If this is the entry block of the function, it is
+    // unknown, otherwise it is non-local.
     if (QueryParent != &QueryParent->getParent()->getEntryBlock())
       LocalCache = MemDepResult::getNonLocal();
     else
-      LocalCache = MemDepResult::getClobber(QueryInst);
+      LocalCache = MemDepResult::getUnknown();
   } else {
     AliasAnalysis::Location MemLoc;
     AliasAnalysis::ModRefResult MR = GetLocation(QueryInst, MemLoc, AA);
@@ -413,7 +524,7 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) {
       // If we can do a pointer scan, make it happen.
       bool isLoad = !(MR & AliasAnalysis::Mod);
       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(QueryInst))
-        isLoad |= II->getIntrinsicID() == Intrinsic::lifetime_end;
+        isLoad |= II->getIntrinsicID() == Intrinsic::lifetime_start;
 
       LocalCache = getPointerDependencyFrom(MemLoc, isLoad, ScanPos,
                                             QueryParent);
@@ -424,7 +535,7 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) {
                                              QueryParent);
     } else
       // Non-memory instruction.
-      LocalCache = MemDepResult::getClobber(--BasicBlock::iterator(ScanPos));
+      LocalCache = MemDepResult::getUnknown();
   }
   
   // Remember the result!
@@ -558,10 +669,10 @@ MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) {
       Dep = getCallSiteDependencyFrom(QueryCS, isReadonlyCall,ScanPos, DirtyBB);
     } else if (DirtyBB != &DirtyBB->getParent()->getEntryBlock()) {
       // No dependence found.  If this is the entry block of the function, it is
-      // a clobber, otherwise it is non-local.
+      // a clobber, otherwise it is unknown.
       Dep = MemDepResult::getNonLocal();
     } else {
-      Dep = MemDepResult::getClobber(ScanPos);
+      Dep = MemDepResult::getUnknown();
     }
     
     // If we had a dirty entry for the block, update it.  Otherwise, just add
@@ -617,7 +728,7 @@ getNonLocalPointerDependency(const AliasAnalysis::Location &Loc, bool isLoad,
     return;
   Result.clear();
   Result.push_back(NonLocalDepResult(FromBB,
-                                     MemDepResult::getClobber(FromBB->begin()),
+                                     MemDepResult::getUnknown(),
                                      const_cast<Value *>(Loc.Ptr)));
 }
 
@@ -679,7 +790,7 @@ GetNonLocalInfoForBlock(const AliasAnalysis::Location &Loc,
   // If the block has a dependency (i.e. it isn't completely transparent to
   // the value), remember the reverse association because we just added it
   // to Cache!
-  if (Dep.isNonLocal())
+  if (Dep.isNonLocal() || Dep.isUnknown())
     return Dep;
   
   // Keep the ReverseNonLocalPtrDeps map up to date so we can efficiently
@@ -853,6 +964,9 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
   SmallVector<BasicBlock*, 32> Worklist;
   Worklist.push_back(StartBB);
   
+  // PredList used inside loop.
+  SmallVector<std::pair<BasicBlock*, PHITransAddr>, 16> PredList;
+
   // Keep track of the entries that we know are sorted.  Previously cached
   // entries will all be sorted.  The entries we add we only sort on demand (we
   // don't insert every element into its sorted position).  We know that we
@@ -889,22 +1003,29 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
     // the same Pointer.
     if (!Pointer.NeedsPHITranslationFromBlock(BB)) {
       SkipFirstBlock = false;
+      SmallVector<BasicBlock*, 16> NewBlocks;
       for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI) {
         // Verify that we haven't looked at this block yet.
         std::pair<DenseMap<BasicBlock*,Value*>::iterator, bool>
           InsertRes = Visited.insert(std::make_pair(*PI, Pointer.getAddr()));
         if (InsertRes.second) {
           // First time we've looked at *PI.
-          Worklist.push_back(*PI);
+          NewBlocks.push_back(*PI);
           continue;
         }
         
         // If we have seen this block before, but it was with a different
         // pointer then we have a phi translation failure and we have to treat
         // this as a clobber.
-        if (InsertRes.first->second != Pointer.getAddr())
+        if (InsertRes.first->second != Pointer.getAddr()) {
+          // Make sure to clean up the Visited map before continuing on to
+          // PredTranslationFailure.
+          for (unsigned i = 0; i < NewBlocks.size(); i++)
+            Visited.erase(NewBlocks[i]);
           goto PredTranslationFailure;
+        }
       }
+      Worklist.append(NewBlocks.begin(), NewBlocks.end());
       continue;
     }
     
@@ -923,13 +1044,15 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
       NumSortedEntries = Cache->size();
     }
     Cache = 0;
-    
+
+    PredList.clear();
     for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI) {
       BasicBlock *Pred = *PI;
-      
+      PredList.push_back(std::make_pair(Pred, Pointer));
+
       // Get the PHI translated pointer in this predecessor.  This can fail if
       // not translatable, in which case the getAddr() returns null.
-      PHITransAddr PredPointer(Pointer);
+      PHITransAddr &PredPointer = PredList.back().second;
       PredPointer.PHITranslateValue(BB, Pred, 0);
 
       Value *PredPtrVal = PredPointer.getAddr();
@@ -943,6 +1066,9 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
         InsertRes = Visited.insert(std::make_pair(Pred, PredPtrVal));
 
       if (!InsertRes.second) {
+        // We found the pred; take it off the list of preds to visit.
+        PredList.pop_back();
+
         // If the predecessor was visited with PredPtr, then we already did
         // the analysis and can ignore it.
         if (InsertRes.first->second == PredPtrVal)
@@ -951,18 +1077,49 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
         // Otherwise, the block was previously analyzed with a different
         // pointer.  We can't represent the result of this case, so we just
         // treat this as a phi translation failure.
+
+        // Make sure to clean up the Visited map before continuing on to
+        // PredTranslationFailure.
+        for (unsigned i = 0; i < PredList.size(); i++)
+          Visited.erase(PredList[i].first);
+
         goto PredTranslationFailure;
       }
-      
+    }
+
+    // Actually process results here; this need to be a separate loop to avoid
+    // calling getNonLocalPointerDepFromBB for blocks we don't want to return
+    // any results for.  (getNonLocalPointerDepFromBB will modify our 
+    // datastructures in ways the code after the PredTranslationFailure label
+    // doesn't expect.)
+    for (unsigned i = 0; i < PredList.size(); i++) {
+      BasicBlock *Pred = PredList[i].first;
+      PHITransAddr &PredPointer = PredList[i].second;
+      Value *PredPtrVal = PredPointer.getAddr();
+
+      bool CanTranslate = true;
       // If PHI translation was unable to find an available pointer in this
       // predecessor, then we have to assume that the pointer is clobbered in
       // that predecessor.  We can still do PRE of the load, which would insert
       // a computation of the pointer in this predecessor.
-      if (PredPtrVal == 0) {
+      if (PredPtrVal == 0)
+        CanTranslate = false;
+
+      // FIXME: it is entirely possible that PHI translating will end up with
+      // the same value.  Consider PHI translating something like:
+      // X = phi [x, bb1], [y, bb2].  PHI translating for bb1 doesn't *need*
+      // to recurse here, pedantically speaking.
+
+      // If getNonLocalPointerDepFromBB fails here, that means the cached
+      // result conflicted with the Visited list; we have to conservatively
+      // assume it is unknown, but this also does not block PRE of the load.
+      if (!CanTranslate ||
+          getNonLocalPointerDepFromBB(PredPointer,
+                                      Loc.getWithNewPtr(PredPtrVal),
+                                      isLoad, Pred,
+                                      Result, Visited)) {
         // Add the entry to the Result list.
-        NonLocalDepResult Entry(Pred,
-                                MemDepResult::getClobber(Pred->getTerminator()),
-                                PredPtrVal);
+        NonLocalDepResult Entry(Pred, MemDepResult::getUnknown(), PredPtrVal);
         Result.push_back(Entry);
 
         // Since we had a phi translation failure, the cache for CacheKey won't
@@ -974,19 +1131,6 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
         NLPI.Pair = BBSkipFirstBlockPair();
         continue;
       }
-
-      // FIXME: it is entirely possible that PHI translating will end up with
-      // the same value.  Consider PHI translating something like:
-      // X = phi [x, bb1], [y, bb2].  PHI translating for bb1 doesn't *need*
-      // to recurse here, pedantically speaking.
-      
-      // If we have a problem phi translating, fall through to the code below
-      // to handle the failure condition.
-      if (getNonLocalPointerDepFromBB(PredPointer,
-                                      Loc.getWithNewPtr(PredPointer.getAddr()),
-                                      isLoad, Pred,
-                                      Result, Visited))
-        goto PredTranslationFailure;
     }
     
     // Refresh the CacheInfo/Cache pointer so that it isn't invalidated.
@@ -1003,6 +1147,9 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
     continue;
 
   PredTranslationFailure:
+    // The following code is "failure"; we can't produce a sane translation
+    // for the given block.  It assumes that we haven't modified any of
+    // our datastructures while processing the current block.
     
     if (Cache == 0) {
       // Refresh the CacheInfo/Cache pointer if it got invalidated.
@@ -1017,8 +1164,7 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
     // results from the set".  Clear out the indicator for this.
     CacheInfo->Pair = BBSkipFirstBlockPair();
     
-    // If *nothing* works, mark the pointer as being clobbered by the first
-    // instruction in this block.
+    // If *nothing* works, mark the pointer as unknown.
     //
     // If this is the magic first block, return this as a clobber of the whole
     // incoming value.  Since we can't phi translate to one of the predecessors,
@@ -1033,8 +1179,7 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
       
       assert(I->getResult().isNonLocal() &&
              "Should only be here with transparent block");
-      I->setResult(MemDepResult::getClobber(BB->begin()));
-      ReverseNonLocalPtrDeps[BB->begin()].insert(CacheKey);
+      I->setResult(MemDepResult::getUnknown());
       Result.push_back(NonLocalDepResult(I->getBB(), I->getResult(),
                                          Pointer.getAddr()));
       break;
diff --git a/lib/Analysis/PHITransAddr.cpp b/lib/Analysis/PHITransAddr.cpp
index 93da5a4..70dcd0d 100644
--- a/lib/Analysis/PHITransAddr.cpp
+++ b/lib/Analysis/PHITransAddr.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/PHITransAddr.h"
+#include "llvm/Constants.h"
 #include "llvm/Instructions.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/InstructionSimplify.h"
diff --git a/lib/Analysis/PathNumbering.cpp b/lib/Analysis/PathNumbering.cpp
index 5d3f6bb..7c584da 100644
--- a/lib/Analysis/PathNumbering.cpp
+++ b/lib/Analysis/PathNumbering.cpp
@@ -38,13 +38,10 @@
 #include "llvm/Support/TypeBuilder.h"
 #include "llvm/Support/raw_ostream.h"
 
-#include <map>
 #include <queue>
-#include <set>
 #include <stack>
 #include <string>
 #include <utility>
-#include <vector>
 #include <sstream>
 
 using namespace llvm;
@@ -286,7 +283,7 @@ void BallLarusDag::calculatePathNumbers() {
       BallLarusEdge* exitEdge = addEdge(node, getExit(), 0);
       exitEdge->setType(BallLarusEdge::SPLITEDGE_PHONY);
 
-      // Counters to handle the possibilty of a multi-graph
+      // Counters to handle the possibility of a multi-graph
       BasicBlock* oldTarget = 0;
       unsigned duplicateNumber = 0;
 
diff --git a/lib/Analysis/PathProfileVerifier.cpp b/lib/Analysis/PathProfileVerifier.cpp
index c549773..0ae734e 100644
--- a/lib/Analysis/PathProfileVerifier.cpp
+++ b/lib/Analysis/PathProfileVerifier.cpp
@@ -124,7 +124,7 @@ bool PathProfileVerifier::runOnModule (Module &M) {
       ProfilePathEdgeVector* pev = currentPath->getPathEdges();
       DEBUG(dbgs () << "path #" << currentPath->getNumber() << ": "
             << currentPath->getCount() << "\n");
-      // setup the entry edge (normally path profiling doens't care about this)
+      // setup the entry edge (normally path profiling doesn't care about this)
       if (currentPath->getFirstBlockInPath() == &F->getEntryBlock())
         edgeArray[arrayMap[0][currentPath->getFirstBlockInPath()][0]]
           += currentPath->getCount();
diff --git a/lib/Analysis/ProfileEstimatorPass.cpp b/lib/Analysis/ProfileEstimatorPass.cpp
index 667ee1c..b594e2b 100644
--- a/lib/Analysis/ProfileEstimatorPass.cpp
+++ b/lib/Analysis/ProfileEstimatorPass.cpp
@@ -140,7 +140,7 @@ void ProfileEstimatorPass::recurseBasicBlock(BasicBlock *BB) {
     // loop, thus the edge is a backedge, continue and do not check if the
     // value is valid.
     if (BBisHeader && BBLoop->contains(*bbi)) {
-      printEdgeError(edge, "but is backedge, continueing");
+      printEdgeError(edge, "but is backedge, continuing");
       continue;
     }
     // If the edges value is missing (and this is no loop header, and this is
diff --git a/lib/Analysis/ProfileInfo.cpp b/lib/Analysis/ProfileInfo.cpp
index 36f211e..173de2c 100644
--- a/lib/Analysis/ProfileInfo.cpp
+++ b/lib/Analysis/ProfileInfo.cpp
@@ -309,9 +309,9 @@ void ProfileInfoT<Function,BasicBlock>::
   removeEdge(oldedge);
 }
 
-/// Replaces all occurences of RmBB in the ProfilingInfo with DestBB.
+/// Replaces all occurrences of RmBB in the ProfilingInfo with DestBB.
 /// This checks all edges of the function the blocks reside in and replaces the
-/// occurences of RmBB with DestBB.
+/// occurrences of RmBB with DestBB.
 template<>
 void ProfileInfoT<Function,BasicBlock>::
         replaceAllUses(const BasicBlock *RmBB, const BasicBlock *DestBB) {
@@ -812,7 +812,7 @@ void ProfileInfoT<Function,BasicBlock>::repair(const Function *F) {
       }
       if (iw < 0) continue;
 
-      // Check the recieving end of the path if it can handle the flow.
+      // Check the receiving end of the path if it can handle the flow.
       double ow = getExecutionCount(Dest);
       Processed.clear();
       for (succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
diff --git a/lib/Analysis/ProfileInfoLoader.cpp b/lib/Analysis/ProfileInfoLoader.cpp
index 25481b2..eaa38da 100644
--- a/lib/Analysis/ProfileInfoLoader.cpp
+++ b/lib/Analysis/ProfileInfoLoader.cpp
@@ -19,7 +19,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include <cstdio>
 #include <cstdlib>
-#include <map>
 using namespace llvm;
 
 // ByteSwap - Byteswap 'Var' if 'Really' is true.
diff --git a/lib/Analysis/RegionPass.cpp b/lib/Analysis/RegionPass.cpp
index 3269dcc..80eda79 100644
--- a/lib/Analysis/RegionPass.cpp
+++ b/lib/Analysis/RegionPass.cpp
@@ -249,7 +249,7 @@ void RegionPass::assignPassManager(PMStack &PMS,
     assert (!PMS.empty() && "Unable to create Region Pass Manager");
     PMDataManager *PMD = PMS.top();
 
-    // [1] Create new Call Graph Pass Manager
+    // [1] Create new Region Pass Manager
     RGPM = new RGPassManager(PMD->getDepth() + 1);
     RGPM->populateInheritedAnalysis(PMS);
 
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 228974d..025718e 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -1035,6 +1035,93 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
   return S;
 }
 
+// Get the limit of a recurrence such that incrementing by Step cannot cause
+// signed overflow as long as the value of the recurrence within the loop does
+// not exceed this limit before incrementing.
+static const SCEV *getOverflowLimitForStep(const SCEV *Step,
+                                           ICmpInst::Predicate *Pred,
+                                           ScalarEvolution *SE) {
+  unsigned BitWidth = SE->getTypeSizeInBits(Step->getType());
+  if (SE->isKnownPositive(Step)) {
+    *Pred = ICmpInst::ICMP_SLT;
+    return SE->getConstant(APInt::getSignedMinValue(BitWidth) -
+                           SE->getSignedRange(Step).getSignedMax());
+  }
+  if (SE->isKnownNegative(Step)) {
+    *Pred = ICmpInst::ICMP_SGT;
+    return SE->getConstant(APInt::getSignedMaxValue(BitWidth) -
+                       SE->getSignedRange(Step).getSignedMin());
+  }
+  return 0;
+}
+
+// The recurrence AR has been shown to have no signed wrap. Typically, if we can
+// prove NSW for AR, then we can just as easily prove NSW for its preincrement
+// or postincrement sibling. This allows normalizing a sign extended AddRec as
+// such: {sext(Step + Start),+,Step} => {(Step + sext(Start),+,Step} As a
+// result, the expression "Step + sext(PreIncAR)" is congruent with
+// "sext(PostIncAR)"
+static const SCEV *getPreStartForSignExtend(const SCEVAddRecExpr *AR,
+                                            const Type *Ty,
+                                            ScalarEvolution *SE) {
+  const Loop *L = AR->getLoop();
+  const SCEV *Start = AR->getStart();
+  const SCEV *Step = AR->getStepRecurrence(*SE);
+
+  // Check for a simple looking step prior to loop entry.
+  const SCEVAddExpr *SA = dyn_cast<SCEVAddExpr>(Start);
+  if (!SA || SA->getNumOperands() != 2 || SA->getOperand(0) != Step)
+    return 0;
+
+  // This is a postinc AR. Check for overflow on the preinc recurrence using the
+  // same three conditions that getSignExtendedExpr checks.
+
+  // 1. NSW flags on the step increment.
+  const SCEV *PreStart = SA->getOperand(1);
+  const SCEVAddRecExpr *PreAR = dyn_cast<SCEVAddRecExpr>(
+    SE->getAddRecExpr(PreStart, Step, L, SCEV::FlagAnyWrap));
+
+  if (PreAR && PreAR->getNoWrapFlags(SCEV::FlagNSW))
+    return PreStart;
+
+  // 2. Direct overflow check on the step operation's expression.
+  unsigned BitWidth = SE->getTypeSizeInBits(AR->getType());
+  const Type *WideTy = IntegerType::get(SE->getContext(), BitWidth * 2);
+  const SCEV *OperandExtendedStart =
+    SE->getAddExpr(SE->getSignExtendExpr(PreStart, WideTy),
+                   SE->getSignExtendExpr(Step, WideTy));
+  if (SE->getSignExtendExpr(Start, WideTy) == OperandExtendedStart) {
+    // Cache knowledge of PreAR NSW.
+    if (PreAR)
+      const_cast<SCEVAddRecExpr *>(PreAR)->setNoWrapFlags(SCEV::FlagNSW);
+    // FIXME: this optimization needs a unit test
+    DEBUG(dbgs() << "SCEV: untested prestart overflow check\n");
+    return PreStart;
+  }
+
+  // 3. Loop precondition.
+  ICmpInst::Predicate Pred;
+  const SCEV *OverflowLimit = getOverflowLimitForStep(Step, &Pred, SE);
+
+  if (OverflowLimit &&
+      SE->isLoopEntryGuardedByCond(L, Pred, PreStart, OverflowLimit)) {
+    return PreStart;
+  }
+  return 0;
+}
+
+// Get the normalized sign-extended expression for this AddRec's Start.
+static const SCEV *getSignExtendAddRecStart(const SCEVAddRecExpr *AR,
+                                            const Type *Ty,
+                                            ScalarEvolution *SE) {
+  const SCEV *PreStart = getPreStartForSignExtend(AR, Ty, SE);
+  if (!PreStart)
+    return SE->getSignExtendExpr(AR->getStart(), Ty);
+
+  return SE->getAddExpr(SE->getSignExtendExpr(AR->getStepRecurrence(*SE), Ty),
+                        SE->getSignExtendExpr(PreStart, Ty));
+}
+
 const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
                                                const Type *Ty) {
   assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
@@ -1097,7 +1184,7 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
       // If we have special knowledge that this addrec won't overflow,
       // we don't need to do any further analysis.
       if (AR->getNoWrapFlags(SCEV::FlagNSW))
-        return getAddRecExpr(getSignExtendExpr(Start, Ty),
+        return getAddRecExpr(getSignExtendAddRecStart(AR, Ty, this),
                              getSignExtendExpr(Step, Ty),
                              L, SCEV::FlagNSW);
 
@@ -1133,7 +1220,7 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
             // Cache knowledge of AR NSW, which is propagated to this AddRec.
             const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
             // Return the expression with the addrec on the outside.
-            return getAddRecExpr(getSignExtendExpr(Start, Ty),
+            return getAddRecExpr(getSignExtendAddRecStart(AR, Ty, this),
                                  getSignExtendExpr(Step, Ty),
                                  L, AR->getNoWrapFlags());
           }
@@ -1149,7 +1236,7 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
             // Cache knowledge of AR NSW, which is propagated to this AddRec.
             const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
             // Return the expression with the addrec on the outside.
-            return getAddRecExpr(getSignExtendExpr(Start, Ty),
+            return getAddRecExpr(getSignExtendAddRecStart(AR, Ty, this),
                                  getZeroExtendExpr(Step, Ty),
                                  L, AR->getNoWrapFlags());
           }
@@ -1159,34 +1246,18 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
         // the addrec is safe. Also, if the entry is guarded by a comparison
         // with the start value and the backedge is guarded by a comparison
         // with the post-inc value, the addrec is safe.
-        if (isKnownPositive(Step)) {
-          const SCEV *N = getConstant(APInt::getSignedMinValue(BitWidth) -
-                                      getSignedRange(Step).getSignedMax());
-          if (isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_SLT, AR, N) ||
-              (isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SLT, Start, N) &&
-               isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_SLT,
-                                           AR->getPostIncExpr(*this), N))) {
-            // Cache knowledge of AR NSW, which is propagated to this AddRec.
-            const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
-            // Return the expression with the addrec on the outside.
-            return getAddRecExpr(getSignExtendExpr(Start, Ty),
-                                 getSignExtendExpr(Step, Ty),
-                                 L, AR->getNoWrapFlags());
-          }
-        } else if (isKnownNegative(Step)) {
-          const SCEV *N = getConstant(APInt::getSignedMaxValue(BitWidth) -
-                                      getSignedRange(Step).getSignedMin());
-          if (isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_SGT, AR, N) ||
-              (isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SGT, Start, N) &&
-               isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_SGT,
-                                           AR->getPostIncExpr(*this), N))) {
-            // Cache knowledge of AR NSW, which is propagated to this AddRec.
-            const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
-            // Return the expression with the addrec on the outside.
-            return getAddRecExpr(getSignExtendExpr(Start, Ty),
-                                 getSignExtendExpr(Step, Ty),
-                                 L, AR->getNoWrapFlags());
-          }
+        ICmpInst::Predicate Pred;
+        const SCEV *OverflowLimit = getOverflowLimitForStep(Step, &Pred, this);
+        if (OverflowLimit &&
+            (isLoopBackedgeGuardedByCond(L, Pred, AR, OverflowLimit) ||
+             (isLoopEntryGuardedByCond(L, Pred, Start, OverflowLimit) &&
+              isLoopBackedgeGuardedByCond(L, Pred, AR->getPostIncExpr(*this),
+                                          OverflowLimit)))) {
+          // Cache knowledge of AR NSW, then propagate NSW to the wide AddRec.
+          const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
+          return getAddRecExpr(getSignExtendAddRecStart(AR, Ty, this),
+                               getSignExtendExpr(Step, Ty),
+                               L, AR->getNoWrapFlags());
         }
       }
     }
@@ -1882,7 +1953,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
       // outer mul and the inner addrec are guaranteed to have no overflow.
       //
       // No self-wrap cannot be guaranteed after changing the step size, but
-      // will be infered if either NUW or NSW is true.
+      // will be inferred if either NUW or NSW is true.
       Flags = AddRec->getNoWrapFlags(clearFlags(Flags, SCEV::FlagNW));
       const SCEV *NewRec = getAddRecExpr(NewOps, AddRecLoop, Flags);
 
@@ -2015,7 +2086,7 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS,
           }
       }
       // (A+B)/C --> (A/C + B/C) if safe and A/C and B/C can be folded.
-      if (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(LHS)) {
+      if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(LHS)) {
         SmallVector<const SCEV *, 4> Operands;
         for (unsigned i = 0, e = A->getNumOperands(); i != e; ++i)
           Operands.push_back(getZeroExtendExpr(A->getOperand(i), ExtTy));
@@ -3783,24 +3854,25 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
   // update the value. The temporary CouldNotCompute value tells SCEV
   // code elsewhere that it shouldn't attempt to request a new
   // backedge-taken count, which could result in infinite recursion.
-  std::pair<std::map<const Loop *, BackedgeTakenInfo>::iterator, bool> Pair =
+  std::pair<DenseMap<const Loop *, BackedgeTakenInfo>::iterator, bool> Pair =
     BackedgeTakenCounts.insert(std::make_pair(L, getCouldNotCompute()));
   if (!Pair.second)
     return Pair.first->second;
 
-  BackedgeTakenInfo BECount = ComputeBackedgeTakenCount(L);
-  if (BECount.Exact != getCouldNotCompute()) {
-    assert(isLoopInvariant(BECount.Exact, L) &&
-           isLoopInvariant(BECount.Max, L) &&
+  BackedgeTakenInfo Result = getCouldNotCompute();
+  BackedgeTakenInfo Computed = ComputeBackedgeTakenCount(L);
+  if (Computed.Exact != getCouldNotCompute()) {
+    assert(isLoopInvariant(Computed.Exact, L) &&
+           isLoopInvariant(Computed.Max, L) &&
            "Computed backedge-taken count isn't loop invariant for loop!");
     ++NumTripCountsComputed;
 
     // Update the value in the map.
-    Pair.first->second = BECount;
+    Result = Computed;
   } else {
-    if (BECount.Max != getCouldNotCompute())
+    if (Computed.Max != getCouldNotCompute())
       // Update the value in the map.
-      Pair.first->second = BECount;
+      Result = Computed;
     if (isa<PHINode>(L->getHeader()->begin()))
       // Only count loops that have phi nodes as not being computable.
       ++NumTripCountsNotComputed;
@@ -3811,7 +3883,7 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
   // conservative estimates made without the benefit of trip count
   // information. This is similar to the code in forgetLoop, except that
   // it handles SCEVUnknown PHI nodes specially.
-  if (BECount.hasAnyInfo()) {
+  if (Computed.hasAnyInfo()) {
     SmallVector<Instruction *, 16> Worklist;
     PushLoopPHIs(L, Worklist);
 
@@ -3842,7 +3914,13 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
       PushDefUseChildren(I, Worklist);
     }
   }
-  return Pair.first->second;
+
+  // Re-lookup the insert position, since the call to
+  // ComputeBackedgeTakenCount above could result in a
+  // recusive call to getBackedgeTakenInfo (on a different
+  // loop), which would invalidate the iterator computed
+  // earlier.
+  return BackedgeTakenCounts.find(L)->second = Result;
 }
 
 /// forgetLoop - This method should be called by the client when it has
@@ -4426,7 +4504,7 @@ Constant *
 ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN,
                                                    const APInt &BEs,
                                                    const Loop *L) {
-  std::map<PHINode*, Constant*>::const_iterator I =
+  DenseMap<PHINode*, Constant*>::const_iterator I =
     ConstantEvolutionLoopExitValue.find(PN);
   if (I != ConstantEvolutionLoopExitValue.end())
     return I->second;
@@ -4694,9 +4772,15 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) {
       for (++i; i != e; ++i)
         NewOps.push_back(getSCEVAtScope(AddRec->getOperand(i), L));
 
-      AddRec = cast<SCEVAddRecExpr>(
+      const SCEV *FoldedRec =
         getAddRecExpr(NewOps, AddRec->getLoop(),
-                      AddRec->getNoWrapFlags(SCEV::FlagNW)));
+                      AddRec->getNoWrapFlags(SCEV::FlagNW));
+      AddRec = dyn_cast<SCEVAddRecExpr>(FoldedRec);
+      // The addrec may be folded to a nonrecurrence, for example, if the
+      // induction variable is multiplied by zero after constant folding. Go
+      // ahead and return the folded value.
+      if (!AddRec)
+        return FoldedRec;
       break;
     }
 
diff --git a/lib/Analysis/TypeBasedAliasAnalysis.cpp b/lib/Analysis/TypeBasedAliasAnalysis.cpp
index 40e18ab..0faf139 100644
--- a/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -31,7 +31,7 @@
 //
 // The second field identifies the type's parent node in the tree, or
 // is null or omitted for a root node. A type is considered to alias
-// all of its decendents and all of its ancestors in the tree. Also,
+// all of its descendants and all of its ancestors in the tree. Also,
 // a type is considered to alias all types in other trees, so that
 // bitcode produced from multiple front-ends is handled conservatively.
 //
@@ -59,6 +59,7 @@
 
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/Passes.h"
+#include "llvm/Constants.h"
 #include "llvm/LLVMContext.h"
 #include "llvm/Module.h"
 #include "llvm/Metadata.h"
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index a8117e6..dab5aeb 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -131,8 +131,18 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask,
     }
     return;
   }
+  
+  if (Argument *A = dyn_cast<Argument>(V)) {
+    // Get alignment information off byval arguments if specified in the IR.
+    if (A->hasByValAttr())
+      if (unsigned Align = A->getParamAlignment())
+        KnownZero = Mask & APInt::getLowBitsSet(BitWidth,
+                                                CountTrailingZeros_32(Align));
+    return;
+  }
 
-  KnownZero.clearAllBits(); KnownOne.clearAllBits();   // Start out not knowing anything.
+  // Start out not knowing anything.
+  KnownZero.clearAllBits(); KnownOne.clearAllBits();
 
   if (Depth == MaxDepth || Mask == 0)
     return;  // Limit search depth.
@@ -670,6 +680,10 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask,
         KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
         break;
       }
+      case Intrinsic::x86_sse42_crc32_64_8:
+      case Intrinsic::x86_sse42_crc32_64_64:
+        KnownZero = APInt::getHighBitsSet(64, 32);
+        break;
       }
     }
     break;
@@ -1328,7 +1342,7 @@ static Value *BuildSubAggregate(Value *From, Value* To, const Type *IndexedType,
         break;
       }
     }
-    // If we succesfully found a value for each of our subaggregates 
+    // If we successfully found a value for each of our subaggregates
     if (To)
       return To;
   }
@@ -1757,7 +1771,7 @@ llvm::GetUnderlyingObject(Value *V, const TargetData *TD, unsigned MaxLookup) {
     } else {
       // See if InstructionSimplify knows any relevant tricks.
       if (Instruction *I = dyn_cast<Instruction>(V))
-        // TODO: Aquire a DominatorTree and use it.
+        // TODO: Acquire a DominatorTree and use it.
         if (Value *Simplified = SimplifyInstruction(I, TD, 0)) {
           V = Simplified;
           continue;
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 857fa1e..85defca 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -308,16 +308,8 @@ lltok::Kind LLLexer::LexAt() {
   }
 
   // Handle GlobalVarName: @[-a-zA-Z$._][-a-zA-Z$._0-9]*
-  if (isalpha(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' ||
-      CurPtr[0] == '.' || CurPtr[0] == '_') {
-    ++CurPtr;
-    while (isalnum(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' ||
-           CurPtr[0] == '.' || CurPtr[0] == '_')
-      ++CurPtr;
-
-    StrVal.assign(TokStart+1, CurPtr);   // Skip @
+  if (ReadVarName())
     return lltok::GlobalVar;
-  }
 
   // Handle GlobalVarID: @[0-9]+
   if (isdigit(CurPtr[0])) {
@@ -334,6 +326,39 @@ lltok::Kind LLLexer::LexAt() {
   return lltok::Error;
 }
 
+/// ReadString - Read a string until the closing quote.
+lltok::Kind LLLexer::ReadString(lltok::Kind kind) {
+  const char *Start = CurPtr;
+  while (1) {
+    int CurChar = getNextChar();
+
+    if (CurChar == EOF) {
+      Error("end of file in string constant");
+      return lltok::Error;
+    }
+    if (CurChar == '"') {
+      StrVal.assign(Start, CurPtr-1);
+      UnEscapeLexed(StrVal);
+      return kind;
+    }
+  }
+}
+
+/// ReadVarName - Read the rest of a token containing a variable name.
+bool LLLexer::ReadVarName() {
+  const char *NameStart = CurPtr;
+  if (isalpha(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' ||
+      CurPtr[0] == '.' || CurPtr[0] == '_') {
+    ++CurPtr;
+    while (isalnum(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' ||
+           CurPtr[0] == '.' || CurPtr[0] == '_')
+      ++CurPtr;
+
+    StrVal.assign(NameStart, CurPtr);
+    return true;
+  }
+  return false;
+}
 
 /// LexPercent - Lex all tokens that start with a % character:
 ///   LocalVar   ::= %\"[^\"]*\"
@@ -343,33 +368,12 @@ lltok::Kind LLLexer::LexPercent() {
   // Handle LocalVarName: %\"[^\"]*\"
   if (CurPtr[0] == '"') {
     ++CurPtr;
-
-    while (1) {
-      int CurChar = getNextChar();
-
-      if (CurChar == EOF) {
-        Error("end of file in string constant");
-        return lltok::Error;
-      }
-      if (CurChar == '"') {
-        StrVal.assign(TokStart+2, CurPtr-1);
-        UnEscapeLexed(StrVal);
-        return lltok::LocalVar;
-      }
-    }
+    return ReadString(lltok::LocalVar);
   }
 
   // Handle LocalVarName: %[-a-zA-Z$._][-a-zA-Z$._0-9]*
-  if (isalpha(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' ||
-      CurPtr[0] == '.' || CurPtr[0] == '_') {
-    ++CurPtr;
-    while (isalnum(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' ||
-           CurPtr[0] == '.' || CurPtr[0] == '_')
-      ++CurPtr;
-
-    StrVal.assign(TokStart+1, CurPtr);   // Skip %
+  if (ReadVarName())
     return lltok::LocalVar;
-  }
 
   // Handle LocalVarID: %[0-9]+
   if (isdigit(CurPtr[0])) {
@@ -390,38 +394,16 @@ lltok::Kind LLLexer::LexPercent() {
 ///   QuoteLabel        "[^"]+":
 ///   StringConstant    "[^"]*"
 lltok::Kind LLLexer::LexQuote() {
-  while (1) {
-    int CurChar = getNextChar();
-
-    if (CurChar == EOF) {
-      Error("end of file in quoted string");
-      return lltok::Error;
-    }
-
-    if (CurChar != '"') continue;
-
-    if (CurPtr[0] != ':') {
-      StrVal.assign(TokStart+1, CurPtr-1);
-      UnEscapeLexed(StrVal);
-      return lltok::StringConstant;
-    }
+  lltok::Kind kind = ReadString(lltok::StringConstant);
+  if (kind == lltok::Error || kind == lltok::Eof)
+    return kind;
 
+  if (CurPtr[0] == ':') {
     ++CurPtr;
-    StrVal.assign(TokStart+1, CurPtr-2);
-    UnEscapeLexed(StrVal);
-    return lltok::LabelStr;
+    kind = lltok::LabelStr;
   }
-}
 
-static bool JustWhitespaceNewLine(const char *&Ptr) {
-  const char *ThisPtr = Ptr;
-  while (*ThisPtr == ' ' || *ThisPtr == '\t')
-    ++ThisPtr;
-  if (*ThisPtr == '\n' || *ThisPtr == '\r') {
-    Ptr = ThisPtr;
-    return true;
-  }
-  return false;
+  return kind;
 }
 
 /// LexExclaim:
@@ -429,13 +411,15 @@ static bool JustWhitespaceNewLine(const char *&Ptr) {
 ///    !
 lltok::Kind LLLexer::LexExclaim() {
   // Lex a metadata name as a MetadataVar.
-  if (isalpha(CurPtr[0])) {
+  if (isalpha(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' ||
+      CurPtr[0] == '.' || CurPtr[0] == '_' || CurPtr[0] == '\\') {
     ++CurPtr;
     while (isalnum(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' ||
-           CurPtr[0] == '.' || CurPtr[0] == '_')
+           CurPtr[0] == '.' || CurPtr[0] == '_' || CurPtr[0] == '\\')
       ++CurPtr;
 
     StrVal.assign(TokStart+1, CurPtr);   // Skip !
+    UnEscapeLexed(StrVal);
     return lltok::MetadataVar;
   }
   return lltok::exclaim;
@@ -565,6 +549,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(nest);
   KEYWORD(readnone);
   KEYWORD(readonly);
+  KEYWORD(uwtable);
 
   KEYWORD(inlinehint);
   KEYWORD(noinline);
@@ -576,6 +561,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(noimplicitfloat);
   KEYWORD(naked);
   KEYWORD(hotpatch);
+  KEYWORD(nonlazybind);
 
   KEYWORD(type);
   KEYWORD(opaque);
@@ -604,26 +590,6 @@ lltok::Kind LLLexer::LexIdentifier() {
   TYPEKEYWORD("x86_mmx",   Type::getX86_MMXTy(Context));
 #undef TYPEKEYWORD
 
-  // Handle special forms for autoupgrading.  Drop these in LLVM 3.0.  This is
-  // to avoid conflicting with the sext/zext instructions, below.
-  if (Len == 4 && !memcmp(StartChar, "sext", 4)) {
-    // Scan CurPtr ahead, seeing if there is just whitespace before the newline.
-    if (JustWhitespaceNewLine(CurPtr))
-      return lltok::kw_signext;
-  } else if (Len == 4 && !memcmp(StartChar, "zext", 4)) {
-    // Scan CurPtr ahead, seeing if there is just whitespace before the newline.
-    if (JustWhitespaceNewLine(CurPtr))
-      return lltok::kw_zeroext;
-  } else if (Len == 6 && !memcmp(StartChar, "malloc", 6)) {
-    // FIXME: Remove in LLVM 3.0.
-    // Autoupgrade malloc instruction.
-    return lltok::kw_malloc;
-  } else if (Len == 4 && !memcmp(StartChar, "free", 4)) {
-    // FIXME: Remove in LLVM 3.0.
-    // Autoupgrade malloc instruction.
-    return lltok::kw_free;
-  }
-
   // Keywords for instructions.
 #define INSTKEYWORD(STR, Enum) \
   if (Len == strlen(#STR) && !memcmp(StartChar, #STR, strlen(#STR))) { \
@@ -695,14 +661,6 @@ lltok::Kind LLLexer::LexIdentifier() {
     return lltok::kw_cc;
   }
 
-  // If this starts with "call", return it as CALL.  This is to support old
-  // broken .ll files.  FIXME: remove this with LLVM 3.0.
-  if (CurPtr-TokStart > 4 && !memcmp(TokStart, "call", 4)) {
-    CurPtr = TokStart+4;
-    UIntVal = Instruction::Call;
-    return lltok::kw_call;
-  }
-
   // Finally, if this isn't known, return an error.
   CurPtr = TokStart+1;
   return lltok::Error;
diff --git a/lib/AsmParser/LLLexer.h b/lib/AsmParser/LLLexer.h
index 09ae801..4fe705e 100644
--- a/lib/AsmParser/LLLexer.h
+++ b/lib/AsmParser/LLLexer.h
@@ -71,6 +71,9 @@ namespace llvm {
 
     int getNextChar();
     void SkipLineComment();
+    lltok::Kind ReadString(lltok::Kind kind);
+    bool ReadVarName();
+
     lltok::Kind LexIdentifier();
     lltok::Kind LexDigitOrNegative();
     lltok::Kind LexPositive();
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index 0c3237a..fa1d97d 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -59,24 +59,6 @@ bool LLParser::ValidateEndOfModule() {
   }
   
   
-  // Update auto-upgraded malloc calls to "malloc".
-  // FIXME: Remove in LLVM 3.0.
-  if (MallocF) {
-    MallocF->setName("malloc");
-    // If setName() does not set the name to "malloc", then there is already a 
-    // declaration of "malloc".  In that case, iterate over all calls to MallocF
-    // and get them to call the declared "malloc" instead.
-    if (MallocF->getName() != "malloc") {
-      Constant *RealMallocF = M->getFunction("malloc");
-      if (RealMallocF->getType() != MallocF->getType())
-        RealMallocF = ConstantExpr::getBitCast(RealMallocF, MallocF->getType());
-      MallocF->replaceAllUsesWith(RealMallocF);
-      MallocF->eraseFromParent();
-      MallocF = NULL;
-    }
-  }
-  
-  
   // If there are entries in ForwardRefBlockAddresses at this point, they are
   // references after the function was defined.  Resolve those now.
   while (!ForwardRefBlockAddresses.empty()) {
@@ -176,7 +158,6 @@ bool LLParser::ParseTopLevelEntities() {
     switch (Lex.getKind()) {
     default:         return TokError("expected top-level entity");
     case lltok::Eof: return false;
-    //case lltok::kw_define:
     case lltok::kw_declare: if (ParseDeclare()) return true; break;
     case lltok::kw_define:  if (ParseDefine()) return true; break;
     case lltok::kw_module:  if (ParseModuleAsm()) return true; break;
@@ -514,7 +495,7 @@ bool LLParser::ParseMDNodeID(MDNode *&Result) {
   if (Result) return false;
 
   // Otherwise, create MDNode forward reference.
-  MDNode *FwdNode = MDNode::getTemporary(Context, 0, 0);
+  MDNode *FwdNode = MDNode::getTemporary(Context, ArrayRef<Value*>());
   ForwardRefMDNodes[MID] = std::make_pair(FwdNode, Lex.getLoc());
   
   if (NumberedMetadata.size() <= MID)
@@ -572,7 +553,7 @@ bool LLParser::ParseStandaloneMetadata() {
       ParseToken(lltok::rbrace, "expected end of metadata node"))
     return true;
 
-  MDNode *Init = MDNode::get(Context, Elts.data(), Elts.size());
+  MDNode *Init = MDNode::get(Context, Elts);
   
   // See if this was forward referenced, if so, handle it.
   std::map<unsigned, std::pair<TrackingVH<MDNode>, LocTy> >::iterator
@@ -931,33 +912,23 @@ bool LLParser::ParseOptionalAddrSpace(unsigned &AddrSpace) {
 /// ParseOptionalAttrs - Parse a potentially empty attribute list.  AttrKind
 /// indicates what kind of attribute list this is: 0: function arg, 1: result,
 /// 2: function attr.
-/// 3: function arg after value: FIXME: REMOVE IN LLVM 3.0
 bool LLParser::ParseOptionalAttrs(unsigned &Attrs, unsigned AttrKind) {
   Attrs = Attribute::None;
   LocTy AttrLoc = Lex.getLoc();
 
   while (1) {
     switch (Lex.getKind()) {
-    case lltok::kw_sext:
-    case lltok::kw_zext:
-      // Treat these as signext/zeroext if they occur in the argument list after
-      // the value, as in "call i8 @foo(i8 10 sext)".  If they occur before the
-      // value, as in "call i8 @foo(i8 sext (" then it is part of a constant
-      // expr.
-      // FIXME: REMOVE THIS IN LLVM 3.0
-      if (AttrKind == 3) {
-        if (Lex.getKind() == lltok::kw_sext)
-          Attrs |= Attribute::SExt;
-        else
-          Attrs |= Attribute::ZExt;
-        break;
-      }
-      // FALL THROUGH.
     default:  // End of attributes.
       if (AttrKind != 2 && (Attrs & Attribute::FunctionOnly))
         return Error(AttrLoc, "invalid use of function-only attribute");
 
-      if (AttrKind != 0 && AttrKind != 3 && (Attrs & Attribute::ParameterOnly))
+      // As a hack, we allow "align 2" on functions as a synonym for
+      // "alignstack 2".
+      if (AttrKind == 2 &&
+          (Attrs & ~(Attribute::FunctionOnly | Attribute::Alignment)))
+        return Error(AttrLoc, "invalid use of attribute on a function");
+
+      if (AttrKind != 0 && (Attrs & Attribute::ParameterOnly))
         return Error(AttrLoc, "invalid use of parameter-only attribute");
 
       return false;
@@ -972,6 +943,7 @@ bool LLParser::ParseOptionalAttrs(unsigned &Attrs, unsigned AttrKind) {
 
     case lltok::kw_noreturn:        Attrs |= Attribute::NoReturn; break;
     case lltok::kw_nounwind:        Attrs |= Attribute::NoUnwind; break;
+    case lltok::kw_uwtable:         Attrs |= Attribute::UWTable; break;
     case lltok::kw_noinline:        Attrs |= Attribute::NoInline; break;
     case lltok::kw_readnone:        Attrs |= Attribute::ReadNone; break;
     case lltok::kw_readonly:        Attrs |= Attribute::ReadOnly; break;
@@ -984,6 +956,7 @@ bool LLParser::ParseOptionalAttrs(unsigned &Attrs, unsigned AttrKind) {
     case lltok::kw_noimplicitfloat: Attrs |= Attribute::NoImplicitFloat; break;
     case lltok::kw_naked:           Attrs |= Attribute::Naked; break;
     case lltok::kw_hotpatch:        Attrs |= Attribute::Hotpatch; break;
+    case lltok::kw_nonlazybind:     Attrs |= Attribute::NonLazyBind; break;
 
     case lltok::kw_alignstack: {
       unsigned Alignment;
@@ -1494,11 +1467,7 @@ bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
       return true;
 
     // Otherwise, handle normal operands.
-    if (ParseOptionalAttrs(ArgAttrs1, 0) ||
-        ParseValue(ArgTy, V, PFS) ||
-        // FIXME: Should not allow attributes after the argument, remove this
-        // in LLVM 3.0.
-        ParseOptionalAttrs(ArgAttrs2, 3))
+    if (ParseOptionalAttrs(ArgAttrs1, 0) || ParseValue(ArgTy, V, PFS))
       return true;
     ArgList.push_back(ParamInfo(ArgLoc, V, ArgAttrs1|ArgAttrs2));
   }
@@ -2498,7 +2467,7 @@ bool LLParser::ParseMetadataListValue(ValID &ID, PerFunctionState *PFS) {
       ParseToken(lltok::rbrace, "expected end of metadata node"))
     return true;
 
-  ID.MDNodeVal = MDNode::get(Context, Elts.data(), Elts.size());
+  ID.MDNodeVal = MDNode::get(Context, Elts);
   ID.Kind = ValID::t_MDNode;
   return false;
 }
@@ -2761,13 +2730,6 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
   // and do semantic checks.
   std::vector<const Type*> ParamTypeList;
   SmallVector<AttributeWithIndex, 8> Attrs;
-  // FIXME : In 3.0, stop accepting zext, sext and inreg as optional function
-  // attributes.
-  unsigned ObsoleteFuncAttrs = Attribute::ZExt|Attribute::SExt|Attribute::InReg;
-  if (FuncAttrs & ObsoleteFuncAttrs) {
-    RetAttrs |= FuncAttrs & ObsoleteFuncAttrs;
-    FuncAttrs &= ~ObsoleteFuncAttrs;
-  }
 
   if (RetAttrs != Attribute::None)
     Attrs.push_back(AttributeWithIndex::get(0, RetAttrs));
@@ -3003,7 +2965,6 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
   case lltok::kw_sub:
   case lltok::kw_mul:
   case lltok::kw_shl: {
-    LocTy ModifierLoc = Lex.getLoc();
     bool NUW = EatIfPresent(lltok::kw_nuw);
     bool NSW = EatIfPresent(lltok::kw_nsw);
     if (!NUW) NUW = EatIfPresent(lltok::kw_nuw);
@@ -3062,8 +3023,6 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
   case lltok::kw_tail:           return ParseCall(Inst, PFS, true);
   // Memory.
   case lltok::kw_alloca:         return ParseAlloc(Inst, PFS);
-  case lltok::kw_malloc:         return ParseAlloc(Inst, PFS, BB, false);
-  case lltok::kw_free:           return ParseFree(Inst, PFS, BB);
   case lltok::kw_load:           return ParseLoad(Inst, PFS, false);
   case lltok::kw_store:          return ParseStore(Inst, PFS, false);
   case lltok::kw_volatile:
@@ -3341,14 +3300,6 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
   Value *Callee;
   if (ConvertValIDToValue(PFTy, CalleeID, Callee, &PFS)) return true;
 
-  // FIXME: In LLVM 3.0, stop accepting zext, sext and inreg as optional
-  // function attributes.
-  unsigned ObsoleteFuncAttrs = Attribute::ZExt|Attribute::SExt|Attribute::InReg;
-  if (FnAttrs & ObsoleteFuncAttrs) {
-    RetAttrs |= FnAttrs & ObsoleteFuncAttrs;
-    FnAttrs &= ~ObsoleteFuncAttrs;
-  }
-
   // Set up the Attributes for the function.
   SmallVector<AttributeWithIndex, 8> Attrs;
   if (RetAttrs != Attribute::None)
@@ -3686,14 +3637,6 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
   Value *Callee;
   if (ConvertValIDToValue(PFTy, CalleeID, Callee, &PFS)) return true;
 
-  // FIXME: In LLVM 3.0, stop accepting zext, sext and inreg as optional
-  // function attributes.
-  unsigned ObsoleteFuncAttrs = Attribute::ZExt|Attribute::SExt|Attribute::InReg;
-  if (FnAttrs & ObsoleteFuncAttrs) {
-    RetAttrs |= FnAttrs & ObsoleteFuncAttrs;
-    FnAttrs &= ~ObsoleteFuncAttrs;
-  }
-
   // Set up the Attributes for the function.
   SmallVector<AttributeWithIndex, 8> Attrs;
   if (RetAttrs != Attribute::None)
@@ -3743,10 +3686,8 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
 //===----------------------------------------------------------------------===//
 
 /// ParseAlloc
-///   ::= 'malloc' Type (',' TypeAndValue)? (',' OptionalInfo)?
 ///   ::= 'alloca' Type (',' TypeAndValue)? (',' OptionalInfo)?
-int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS,
-                         BasicBlock* BB, bool isAlloca) {
+int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
   PATypeHolder Ty(Type::getVoidTy(Context));
   Value *Size = 0;
   LocTy SizeLoc;
@@ -3769,37 +3710,8 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS,
   if (Size && !Size->getType()->isIntegerTy())
     return Error(SizeLoc, "element count must have integer type");
 
-  if (isAlloca) {
-    Inst = new AllocaInst(Ty, Size, Alignment);
-    return AteExtraComma ? InstExtraComma : InstNormal;
-  }
-
-  // Autoupgrade old malloc instruction to malloc call.
-  // FIXME: Remove in LLVM 3.0.
-  if (Size && !Size->getType()->isIntegerTy(32))
-    return Error(SizeLoc, "element count must be i32");
-  const Type *IntPtrTy = Type::getInt32Ty(Context);
-  Constant *AllocSize = ConstantExpr::getSizeOf(Ty);
-  AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, IntPtrTy);
-  if (!MallocF)
-    // Prototype malloc as "void *(int32)".
-    // This function is renamed as "malloc" in ValidateEndOfModule().
-    MallocF = cast<Function>(
-       M->getOrInsertFunction("", Type::getInt8PtrTy(Context), IntPtrTy, NULL));
-  Inst = CallInst::CreateMalloc(BB, IntPtrTy, Ty, AllocSize, Size, MallocF);
-return AteExtraComma ? InstExtraComma : InstNormal;
-}
-
-/// ParseFree
-///   ::= 'free' TypeAndValue
-bool LLParser::ParseFree(Instruction *&Inst, PerFunctionState &PFS,
-                         BasicBlock* BB) {
-  Value *Val; LocTy Loc;
-  if (ParseTypeAndValue(Val, Loc, PFS)) return true;
-  if (!Val->getType()->isPointerTy())
-    return Error(Loc, "operand to free must be a pointer");
-  Inst = CallInst::CreateFree(Val, BB);
-  return false;
+  Inst = new AllocaInst(Ty, Size, Alignment);
+  return AteExtraComma ? InstExtraComma : InstNormal;
 }
 
 /// ParseLoad
diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h
index 93e7f77..bbc641c 100644
--- a/lib/AsmParser/LLParser.h
+++ b/lib/AsmParser/LLParser.h
@@ -131,11 +131,10 @@ namespace llvm {
     std::map<ValID, std::vector<std::pair<ValID, GlobalValue*> > >
       ForwardRefBlockAddresses;
     
-    Function *MallocF;
   public:
     LLParser(MemoryBuffer *F, SourceMgr &SM, SMDiagnostic &Err, Module *m) : 
       Context(m->getContext()), Lex(F, SM, Err, m->getContext()),
-      M(m), MallocF(NULL) {}
+      M(m) {}
     bool Run();
 
     LLVMContext& getContext() { return Context; }
@@ -359,9 +358,7 @@ namespace llvm {
     bool ParseShuffleVector(Instruction *&I, PerFunctionState &PFS);
     int ParsePHI(Instruction *&I, PerFunctionState &PFS);
     bool ParseCall(Instruction *&I, PerFunctionState &PFS, bool isTail);
-    int ParseAlloc(Instruction *&I, PerFunctionState &PFS,
-                    BasicBlock *BB = 0, bool isAlloca = true);
-    bool ParseFree(Instruction *&I, PerFunctionState &PFS, BasicBlock *BB);
+    int ParseAlloc(Instruction *&I, PerFunctionState &PFS);
     int ParseLoad(Instruction *&I, PerFunctionState &PFS, bool isVolatile);
     int ParseStore(Instruction *&I, PerFunctionState &PFS, bool isVolatile);
     bool ParseGetResult(Instruction *&I, PerFunctionState &PFS);
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index 576da19..ea01264 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -87,6 +87,7 @@ namespace lltok {
     kw_nest,
     kw_readnone,
     kw_readonly,
+    kw_uwtable,
 
     kw_inlinehint,
     kw_noinline,
@@ -98,6 +99,7 @@ namespace lltok {
     kw_noimplicitfloat,
     kw_naked,
     kw_hotpatch,
+    kw_nonlazybind,
 
     kw_type,
     kw_opaque,
@@ -120,7 +122,7 @@ namespace lltok {
     kw_ret, kw_br, kw_switch, kw_indirectbr, kw_invoke, kw_unwind,
     kw_unreachable,
 
-    kw_malloc, kw_alloca, kw_free, kw_load, kw_store, kw_getelementptr,
+    kw_alloca, kw_load, kw_store, kw_getelementptr,
 
     kw_extractelement, kw_insertelement, kw_shufflevector, kw_getresult,
     kw_extractvalue, kw_insertvalue, kw_blockaddress,
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 8223f76..bc995ae 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -301,8 +301,7 @@ void BitcodeReaderValueList::ResolveConstantForwardRefs() {
         NewC = ConstantVector::get(NewOps);
       } else {
         assert(isa<ConstantExpr>(UserC) && "Must be a ConstantExpr.");
-        NewC = cast<ConstantExpr>(UserC)->getWithOperands(&NewOps[0],
-                                                          NewOps.size());
+        NewC = cast<ConstantExpr>(UserC)->getWithOperands(NewOps);
       }
 
       UserC->replaceAllUsesWith(NewC);
@@ -350,7 +349,7 @@ Value *BitcodeReaderMDValueList::getValueFwdRef(unsigned Idx) {
   }
 
   // Create and return a placeholder, which will later be RAUW'd.
-  Value *V = MDNode::getTemporary(Context, 0, 0);
+  Value *V = MDNode::getTemporary(Context, ArrayRef<Value*>());
   MDValuePtrs[Idx] = V;
   return V;
 }
@@ -844,9 +843,7 @@ bool BitcodeReader::ParseMetadata() {
         else
           Elts.push_back(NULL);
       }
-      Value *V = MDNode::getWhenValsUnresolved(Context,
-                                               Elts.data(), Elts.size(),
-                                               IsFunctionLocal);
+      Value *V = MDNode::getWhenValsUnresolved(Context, Elts, IsFunctionLocal);
       IsFunctionLocal = false;
       MDValueList.AssignValue(V, NextMDValueNo++);
       break;
@@ -1591,8 +1588,18 @@ bool BitcodeReader::ParseBitcodeInto(Module *M) {
   while (!Stream.AtEndOfStream()) {
     unsigned Code = Stream.ReadCode();
 
-    if (Code != bitc::ENTER_SUBBLOCK)
+    if (Code != bitc::ENTER_SUBBLOCK) {
+
+      // The ranlib in xcode 4 will align archive members by appending newlines to the
+      // end of them. If this file size is a multiple of 4 but not 8, we have to read and
+      // ignore these final 4 bytes :-(
+      if (Stream.GetAbbrevIDWidth() == 2 && Code == 2 &&
+          Stream.Read(6) == 2 && Stream.Read(24) == 0xa0a0a &&
+	  Stream.AtEndOfStream())
+        return false;
+
       return Error("Invalid record at top-level");
+    }
 
     unsigned BlockID = Stream.ReadSubBlockID();
 
@@ -1845,7 +1852,6 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
         FunctionBBs[i] = BasicBlock::Create(Context, "", F);
       CurBB = FunctionBBs[0];
       continue;
-
         
     case bitc::FUNC_CODE_DEBUG_LOC_AGAIN:  // DEBUG_LOC_AGAIN
       // This record indicates that the last instruction is at the same
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index e34137f..bc218b3 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Operator.h"
 #include "llvm/TypeSymbolTable.h"
 #include "llvm/ValueSymbolTable.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
@@ -100,8 +101,6 @@ static unsigned GetEncodedBinaryOpcode(unsigned Opcode) {
   }
 }
 
-
-
 static void WriteStringRecord(unsigned Code, const std::string &Str,
                               unsigned AbbrevToUse, BitstreamWriter &Stream) {
   SmallVector<unsigned, 64> Vals;
@@ -447,7 +446,6 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
     Vals.clear();
   }
 
-
   // Emit the alias information.
   for (Module::const_alias_iterator AI = M->alias_begin(), E = M->alias_end();
        AI != E; ++AI) {
@@ -871,8 +869,6 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal,
         break;
       }
     } else if (const BlockAddress *BA = dyn_cast<BlockAddress>(C)) {
-      assert(BA->getFunction() == BA->getBasicBlock()->getParent() &&
-             "Malformed blockaddress");
       Code = bitc::CST_CODE_BLOCKADDRESS;
       Record.push_back(VE.getTypeID(BA->getFunction()->getType()));
       Record.push_back(VE.getValueID(BA->getFunction()));
@@ -1514,9 +1510,9 @@ static void WriteModule(const Module *M, BitstreamWriter &Stream) {
   WriteModuleMetadata(M, VE, Stream);
 
   // Emit function bodies.
-  for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I)
-    if (!I->isDeclaration())
-      WriteFunction(*I, VE, Stream);
+  for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F)
+    if (!F->isDeclaration())
+      WriteFunction(*F, VE, Stream);
 
   // Emit metadata.
   WriteModuleMetadataStore(M, Stream);
@@ -1548,40 +1544,7 @@ enum {
   DarwinBCHeaderSize = 5*4
 };
 
-/// isARMTriplet - Return true if the triplet looks like:
-/// arm-*, thumb-*, armv[0-9]-*, thumbv[0-9]-*, armv5te-*, or armv6t2-*.
-static bool isARMTriplet(const std::string &TT) {
-  size_t Pos = 0;
-  size_t Size = TT.size();
-  if (Size >= 6 &&
-      TT[0] == 't' && TT[1] == 'h' && TT[2] == 'u' &&
-      TT[3] == 'm' && TT[4] == 'b')
-    Pos = 5;
-  else if (Size >= 4 && TT[0] == 'a' && TT[1] == 'r' && TT[2] == 'm')
-    Pos = 3;
-  else
-    return false;
-
-  if (TT[Pos] == '-')
-    return true;
-  else if (TT[Pos] == 'v') {
-    if (Size >= Pos+4 &&
-        TT[Pos+1] == '6' && TT[Pos+2] == 't' && TT[Pos+3] == '2')
-      return true;
-    else if (Size >= Pos+4 &&
-             TT[Pos+1] == '5' && TT[Pos+2] == 't' && TT[Pos+3] == 'e')
-      return true;
-  } else
-    return false;
-  while (++Pos < Size && TT[Pos] != '-') {
-    if (!isdigit(TT[Pos]))
-      return false;
-  }
-  return true;
-}
-
-static void EmitDarwinBCHeader(BitstreamWriter &Stream,
-                               const std::string &TT) {
+static void EmitDarwinBCHeader(BitstreamWriter &Stream, const Triple &TT) {
   unsigned CPUType = ~0U;
 
   // Match x86_64-*, i[3-9]86-*, powerpc-*, powerpc64-*, arm-*, thumb-*,
@@ -1595,16 +1558,16 @@ static void EmitDarwinBCHeader(BitstreamWriter &Stream,
     DARWIN_CPU_TYPE_POWERPC    = 18
   };
 
-  if (TT.find("x86_64-") == 0)
+  Triple::ArchType Arch = TT.getArch();
+  if (Arch == Triple::x86_64)
     CPUType = DARWIN_CPU_TYPE_X86 | DARWIN_CPU_ARCH_ABI64;
-  else if (TT.size() >= 5 && TT[0] == 'i' && TT[2] == '8' && TT[3] == '6' &&
-           TT[4] == '-' && TT[1] - '3' < 6)
+  else if (Arch == Triple::x86)
     CPUType = DARWIN_CPU_TYPE_X86;
-  else if (TT.find("powerpc-") == 0)
+  else if (Arch == Triple::ppc)
     CPUType = DARWIN_CPU_TYPE_POWERPC;
-  else if (TT.find("powerpc64-") == 0)
+  else if (Arch == Triple::ppc64)
     CPUType = DARWIN_CPU_TYPE_POWERPC | DARWIN_CPU_ARCH_ABI64;
-  else if (isARMTriplet(TT))
+  else if (Arch == Triple::arm || Arch == Triple::thumb)
     CPUType = DARWIN_CPU_TYPE_ARM;
 
   // Traditional Bitcode starts after header.
@@ -1650,11 +1613,9 @@ void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out) {
 void llvm::WriteBitcodeToStream(const Module *M, BitstreamWriter &Stream) {
   // If this is darwin or another generic macho target, emit a file header and
   // trailer if needed.
-  bool isMacho =
-    M->getTargetTriple().find("-darwin") != std::string::npos ||
-    M->getTargetTriple().find("-macho") != std::string::npos;
-  if (isMacho)
-    EmitDarwinBCHeader(Stream, M->getTargetTriple());
+  Triple TT(M->getTargetTriple());
+  if (TT.isOSDarwin())
+    EmitDarwinBCHeader(Stream, TT);
 
   // Emit the file header.
   Stream.Emit((unsigned)'B', 8);
@@ -1667,6 +1628,6 @@ void llvm::WriteBitcodeToStream(const Module *M, BitstreamWriter &Stream) {
   // Emit the module.
   WriteModule(M, Stream);
 
-  if (isMacho)
+  if (TT.isOSDarwin())
     EmitDarwinBCTrailer(Stream, Stream.getBuffer().size());
 }
diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp
index 05078ca..5138c3c 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -363,7 +363,7 @@ void ValueEnumerator::EnumerateValue(const Value *V) {
       // Initializers for globals are handled explicitly elsewhere.
     } else if (isa<ConstantArray>(C) && cast<ConstantArray>(C)->isString()) {
       // Do not enumerate the initializers for an array of simple characters.
-      // The initializers just polute the value table, and we emit the strings
+      // The initializers just pollute the value table, and we emit the strings
       // specially.
     } else if (C->getNumOperands()) {
       // If a constant has operands, enumerate them.  This makes sure that if a
@@ -423,7 +423,7 @@ void ValueEnumerator::EnumerateOperandType(const Value *V) {
     // This constant may have operands, make sure to enumerate the types in
     // them.
     for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) {
-      const User *Op = C->getOperand(i);
+      const Value *Op = C->getOperand(i);
       
       // Don't enumerate basic blocks here, this happens as operands to
       // blockaddress.
@@ -452,7 +452,6 @@ void ValueEnumerator::EnumerateAttributes(const AttrListPtr &PAL) {
   }
 }
 
-
 void ValueEnumerator::incorporateFunction(const Function &F) {
   InstructionCount = 0;
   NumModuleValues = Values.size();
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index b520d8f..c23351b 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -16,6 +16,7 @@
 
 #define DEBUG_TYPE "post-RA-sched"
 #include "AggressiveAntiDepBreaker.h"
+#include "RegisterClassInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -114,12 +115,13 @@ bool AggressiveAntiDepState::IsLive(unsigned Reg)
 
 AggressiveAntiDepBreaker::
 AggressiveAntiDepBreaker(MachineFunction& MFi,
+                         const RegisterClassInfo &RCI,
                          TargetSubtarget::RegClassVector& CriticalPathRCs) :
   AntiDepBreaker(), MF(MFi),
   MRI(MF.getRegInfo()),
   TII(MF.getTarget().getInstrInfo()),
   TRI(MF.getTarget().getRegisterInfo()),
-  AllocatableSet(TRI->getAllocatableSet(MF)),
+  RegClassInfo(RCI),
   State(NULL) {
   /* Collect a bitset of all registers that are only broken if they
      are on the critical path. */
@@ -357,7 +359,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI,
     RegRefs = State->GetRegRefs();
 
   // Handle dead defs by simulating a last-use of the register just
-  // after the def. A dead def can occur because the def is truely
+  // after the def. A dead def can occur because the def is truly
   // dead, or because only a subregister is live at the def. If we
   // don't do this the dead def will be incorrectly merged into the
   // previous def.
@@ -618,9 +620,8 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
   const TargetRegisterClass *SuperRC =
     TRI->getMinimalPhysRegClass(SuperReg, MVT::Other);
 
-  const TargetRegisterClass::iterator RB = SuperRC->allocation_order_begin(MF);
-  const TargetRegisterClass::iterator RE = SuperRC->allocation_order_end(MF);
-  if (RB == RE) {
+  ArrayRef<unsigned> Order = RegClassInfo.getOrder(SuperRC);
+  if (Order.empty()) {
     DEBUG(dbgs() << "\tEmpty Super Regclass!!\n");
     return false;
   }
@@ -628,17 +629,17 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
   DEBUG(dbgs() << "\tFind Registers:");
 
   if (RenameOrder.count(SuperRC) == 0)
-    RenameOrder.insert(RenameOrderType::value_type(SuperRC, RE));
+    RenameOrder.insert(RenameOrderType::value_type(SuperRC, Order.size()));
 
-  const TargetRegisterClass::iterator OrigR = RenameOrder[SuperRC];
-  const TargetRegisterClass::iterator EndR = ((OrigR == RE) ? RB : OrigR);
-  TargetRegisterClass::iterator R = OrigR;
+  unsigned OrigR = RenameOrder[SuperRC];
+  unsigned EndR = ((OrigR == Order.size()) ? 0 : OrigR);
+  unsigned R = OrigR;
   do {
-    if (R == RB) R = RE;
+    if (R == 0) R = Order.size();
     --R;
-    const unsigned NewSuperReg = *R;
+    const unsigned NewSuperReg = Order[R];
     // Don't consider non-allocatable registers
-    if (!AllocatableSet.test(NewSuperReg)) continue;
+    if (!RegClassInfo.isAllocatable(NewSuperReg)) continue;
     // Don't replace a register with itself.
     if (NewSuperReg == SuperReg) continue;
 
@@ -719,7 +720,9 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
                               const std::vector<SUnit>& SUnits,
                               MachineBasicBlock::iterator Begin,
                               MachineBasicBlock::iterator End,
-                              unsigned InsertPosIndex) {
+                              unsigned InsertPosIndex,
+                              DbgValueVector &DbgValues) {
+
   std::vector<unsigned> &KillIndices = State->GetKillIndices();
   std::vector<unsigned> &DefIndices = State->GetDefIndices();
   std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
@@ -817,7 +820,7 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
         DEBUG(dbgs() << "\tAntidep reg: " << TRI->getName(AntiDepReg));
         assert(AntiDepReg != 0 && "Anti-dependence on reg0?");
 
-        if (!AllocatableSet.test(AntiDepReg)) {
+        if (!RegClassInfo.isAllocatable(AntiDepReg)) {
           // Don't break anti-dependencies on non-allocatable registers.
           DEBUG(dbgs() << " (non-allocatable)\n");
           continue;
@@ -923,14 +926,10 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
               // sure to update that as well.
               const SUnit *SU = MISUnitMap[Q->second.Operand->getParent()];
               if (!SU) continue;
-              for (unsigned i = 0, e = SU->DbgInstrList.size() ; i < e ; ++i) {
-                MachineInstr *DI = SU->DbgInstrList[i];
-                assert (DI->getNumOperands()==3 && DI->getOperand(0).isReg() &&
-                        DI->getOperand(0).getReg()
-                        && "Non register dbg_value attached to SUnit!");
-                if (DI->getOperand(0).getReg() == AntiDepReg)
-                  DI->getOperand(0).setReg(NewReg);
-              }
+              for (DbgValueVector::iterator DVI = DbgValues.begin(),
+                     DVE = DbgValues.end(); DVI != DVE; ++DVI)
+                if (DVI->second == Q->second.Operand->getParent())
+                  UpdateDbgValue(DVI->first, AntiDepReg, NewReg);
             }
 
             // We just went back in time and modified history; the
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.h b/lib/CodeGen/AggressiveAntiDepBreaker.h
index 9d715cc..e43fe65 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.h
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.h
@@ -30,6 +30,8 @@
 #include <map>
 
 namespace llvm {
+class RegisterClassInfo;
+
   /// Class AggressiveAntiDepState
   /// Contains all the state necessary for anti-dep breaking.
   class AggressiveAntiDepState {
@@ -117,11 +119,7 @@ namespace llvm {
     MachineRegisterInfo &MRI;
     const TargetInstrInfo *TII;
     const TargetRegisterInfo *TRI;
-
-    /// AllocatableSet - The set of allocatable registers.
-    /// We'll be ignoring anti-dependencies on non-allocatable registers,
-    /// because they may not be safe to break.
-    const BitVector AllocatableSet;
+    const RegisterClassInfo &RegClassInfo;
 
     /// CriticalPathSet - The set of registers that should only be
     /// renamed if they are on the critical path.
@@ -133,6 +131,7 @@ namespace llvm {
 
   public:
     AggressiveAntiDepBreaker(MachineFunction& MFi,
+                             const RegisterClassInfo &RCI,
                              TargetSubtarget::RegClassVector& CriticalPathRCs);
     ~AggressiveAntiDepBreaker();
 
@@ -146,7 +145,8 @@ namespace llvm {
     unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
                                    MachineBasicBlock::iterator Begin,
                                    MachineBasicBlock::iterator End,
-                                   unsigned InsertPosIndex);
+                                   unsigned InsertPosIndex,
+                                   DbgValueVector &DbgValues);
 
     /// Observe - Update liveness information to account for the current
     /// instruction, which will not be scheduled.
@@ -157,8 +157,8 @@ namespace llvm {
     void FinishBlock();
 
   private:
-    typedef std::map<const TargetRegisterClass *,
-                     TargetRegisterClass::const_iterator> RenameOrderType;
+    /// Keep track of a position in the allocation order for each regclass.
+    typedef std::map<const TargetRegisterClass *, unsigned> RenameOrderType;
 
     /// IsImplicitDefUse - Return true if MO represents a register
     /// that is both implicitly used and defined in MI
diff --git a/lib/CodeGen/AllocationOrder.cpp b/lib/CodeGen/AllocationOrder.cpp
index 20c7625..1005f10 100644
--- a/lib/CodeGen/AllocationOrder.cpp
+++ b/lib/CodeGen/AllocationOrder.cpp
@@ -15,6 +15,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AllocationOrder.h"
+#include "RegisterClassInfo.h"
 #include "VirtRegMap.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
@@ -23,8 +24,8 @@ using namespace llvm;
 // Compare VirtRegMap::getRegAllocPref().
 AllocationOrder::AllocationOrder(unsigned VirtReg,
                                  const VirtRegMap &VRM,
-                                 const BitVector &ReservedRegs)
-  : Pos(0), Reserved(ReservedRegs) {
+                                 const RegisterClassInfo &RegClassInfo)
+  : Begin(0), End(0), Pos(0), RCI(RegClassInfo), OwnedBegin(false) {
   const TargetRegisterClass *RC = VRM.getRegInfo().getRegClass(VirtReg);
   std::pair<unsigned, unsigned> HintPair =
     VRM.getRegInfo().getRegAllocationHint(VirtReg);
@@ -36,33 +37,43 @@ AllocationOrder::AllocationOrder(unsigned VirtReg,
   if (TargetRegisterInfo::isVirtualRegister(Hint))
     Hint = VRM.getPhys(Hint);
 
-  // The remaining allocation order may depend on the hint.
-  tie(Begin, End) = VRM.getTargetRegInfo()
-        .getAllocationOrder(RC, HintPair.first, Hint, VRM.getMachineFunction());
+  // The first hint pair component indicates a target-specific hint.
+  if (HintPair.first) {
+    const TargetRegisterInfo &TRI = VRM.getTargetRegInfo();
+    // The remaining allocation order may depend on the hint.
+    ArrayRef<unsigned> Order =
+      TRI.getRawAllocationOrder(RC, HintPair.first, Hint,
+                                VRM.getMachineFunction());
+    if (Order.empty())
+      return;
 
-  // Target-dependent hints require resolution.
-  if (HintPair.first)
-    Hint = VRM.getTargetRegInfo().ResolveRegAllocHint(HintPair.first, Hint,
-                                                      VRM.getMachineFunction());
+    // Copy the allocation order with reserved registers removed.
+    OwnedBegin = true;
+    unsigned *P = new unsigned[Order.size()];
+    Begin = P;
+    for (unsigned i = 0; i != Order.size(); ++i)
+      if (!RCI.isReserved(Order[i]))
+        *P++ = Order[i];
+    End = P;
+
+    // Target-dependent hints require resolution.
+    Hint = TRI.ResolveRegAllocHint(HintPair.first, Hint,
+                                   VRM.getMachineFunction());
+  } else {
+    // If there is no hint or just a normal hint, use the cached allocation
+    // order from RegisterClassInfo.
+    ArrayRef<unsigned> O = RCI.getOrder(RC);
+    Begin = O.begin();
+    End = O.end();
+  }
 
   // The hint must be a valid physreg for allocation.
   if (Hint && (!TargetRegisterInfo::isPhysicalRegister(Hint) ||
-               !RC->contains(Hint) || ReservedRegs.test(Hint)))
+               !RC->contains(Hint) || RCI.isReserved(Hint)))
     Hint = 0;
 }
 
-unsigned AllocationOrder::next() {
-  // First take the hint.
-  if (!Pos) {
-    Pos = Begin;
-    if (Hint)
-      return Hint;
-  }
-  // Then look at the order from TRI.
-  while(Pos != End) {
-    unsigned Reg = *Pos++;
-    if (Reg != Hint && !Reserved.test(Reg))
-      return Reg;
-  }
-  return 0;
+AllocationOrder::~AllocationOrder() {
+  if (OwnedBegin)
+    delete [] Begin;
 }
diff --git a/lib/CodeGen/AllocationOrder.h b/lib/CodeGen/AllocationOrder.h
index 61fd8f8..d1e48a1 100644
--- a/lib/CodeGen/AllocationOrder.h
+++ b/lib/CodeGen/AllocationOrder.h
@@ -19,15 +19,16 @@
 
 namespace llvm {
 
-class BitVector;
+class RegisterClassInfo;
 class VirtRegMap;
 
 class AllocationOrder {
   const unsigned *Begin;
   const unsigned *End;
   const unsigned *Pos;
-  const BitVector &Reserved;
+  const RegisterClassInfo &RCI;
   unsigned Hint;
+  bool OwnedBegin;
 public:
 
   /// AllocationOrder - Create a new AllocationOrder for VirtReg.
@@ -37,12 +38,28 @@ public:
   ///        TargetRegisterInfo::getReservedRegs().
   AllocationOrder(unsigned VirtReg,
                   const VirtRegMap &VRM,
-                  const BitVector &ReservedRegs);
+                  const RegisterClassInfo &RegClassInfo);
+
+  ~AllocationOrder();
 
   /// next - Return the next physical register in the allocation order, or 0.
   /// It is safe to call next again after it returned 0.
   /// It will keep returning 0 until rewind() is called.
-  unsigned next();
+  unsigned next() {
+    // First take the hint.
+    if (!Pos) {
+      Pos = Begin;
+      if (Hint)
+        return Hint;
+    }
+    // Then look at the order from TRI.
+    while (Pos != End) {
+      unsigned Reg = *Pos++;
+      if (Reg != Hint)
+        return Reg;
+    }
+    return 0;
+  }
 
   /// rewind - Start over from the beginning.
   void rewind() { Pos = 0; }
diff --git a/lib/CodeGen/AntiDepBreaker.h b/lib/CodeGen/AntiDepBreaker.h
index 086b757..df47f98 100644
--- a/lib/CodeGen/AntiDepBreaker.h
+++ b/lib/CodeGen/AntiDepBreaker.h
@@ -30,6 +30,9 @@ namespace llvm {
 /// anti-dependencies.
 class AntiDepBreaker {
 public:
+  typedef std::vector<std::pair<MachineInstr *, MachineInstr *> > 
+    DbgValueVector;
+
   virtual ~AntiDepBreaker();
 
   /// Start - Initialize anti-dep breaking for a new basic block.
@@ -40,9 +43,10 @@ public:
   /// the number of anti-dependencies broken.
   ///
   virtual unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
-                                MachineBasicBlock::iterator Begin,
-                                MachineBasicBlock::iterator End,
-                                unsigned InsertPosIndex) =0;
+                                         MachineBasicBlock::iterator Begin,
+                                         MachineBasicBlock::iterator End,
+                                         unsigned InsertPosIndex,
+                                         DbgValueVector &DbgValues) = 0;
   
   /// Observe - Update liveness information to account for the current
   /// instruction, which will not be scheduled.
@@ -52,6 +56,14 @@ public:
   
   /// Finish - Finish anti-dep breaking for a basic block.
   virtual void FinishBlock() =0;
+
+  /// UpdateDbgValue - Update DBG_VALUE if dependency breaker is updating
+  /// other machine instruction to use NewReg.
+  void UpdateDbgValue(MachineInstr *MI, unsigned OldReg, unsigned NewReg) {
+    assert (MI->isDebugValue() && "MI is not DBG_VALUE!");
+    if (MI && MI->getOperand(0).isReg() && MI->getOperand(0).getReg() == OldReg)
+      MI->getOperand(0).setReg(NewReg);
+  }
 };
 
 }
diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp
index 0db28a6..5861fa4 100644
--- a/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -52,7 +52,7 @@ void ARMException::EndModule() {
 /// being emitted immediately after the function entry point.
 void ARMException::BeginFunction(const MachineFunction *MF) {
   Asm->OutStreamer.EmitFnStart();
-  if (!Asm->MF->getFunction()->doesNotThrow() || UnwindTablesMandatory)
+  if (Asm->MF->getFunction()->needsUnwindTableEntry())
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin",
                                                   Asm->getFunctionNumber()));
 }
@@ -60,7 +60,7 @@ void ARMException::BeginFunction(const MachineFunction *MF) {
 /// EndFunction - Gather and emit post-function exception information.
 ///
 void ARMException::EndFunction() {
-  if (Asm->MF->getFunction()->doesNotThrow() && !UnwindTablesMandatory)
+  if (!Asm->MF->getFunction()->needsUnwindTableEntry())
     Asm->OutStreamer.EmitCantUnwind();
   else {
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_end",
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 61f5672..b544ff1 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -35,10 +35,12 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetAsmInfo.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Assembly/Writer.h"
 #include "llvm/ADT/SmallString.h"
@@ -191,22 +193,25 @@ bool AsmPrinter::doInitialization(Module &M) {
   if (MAI->doesSupportDebugInformation())
     DD = new DwarfDebug(this, &M);
 
-  if (MAI->doesSupportExceptionHandling())
-    switch (MAI->getExceptionHandlingType()) {
-    default:
-    case ExceptionHandling::DwarfTable:
-      DE = new DwarfTableException(this);
-      break;
-    case ExceptionHandling::DwarfCFI:
-      DE = new DwarfCFIException(this);
-      break;
-    case ExceptionHandling::ARM:
-      DE = new ARMException(this);
-      break;
-    }
+  switch (MAI->getExceptionHandlingType()) {
+  case ExceptionHandling::None:
+    return false;
+  case ExceptionHandling::SjLj:
+  case ExceptionHandling::DwarfCFI:
+    DE = new DwarfCFIException(this);
+    return false;
+  case ExceptionHandling::ARM:
+    DE = new ARMException(this);
+    return false;
+  case ExceptionHandling::Win64:
+    DE = new Win64Exception(this);
+    return false;
+  }
+#else
+  return false;
 #endif // ANDROID_TARGET_BUILD
 
-  return false;
+  llvm_unreachable("Unknown exception type.");
 }
 
 void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const {
@@ -271,7 +276,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
   }
 
   MCSymbol *GVSym = Mang->getSymbol(GV);
-  EmitVisibility(GVSym, GV->getVisibility());
+  EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
 
   if (!GV->hasInitializer())   // External globals require no extra code.
     return;
@@ -293,12 +298,6 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
   if (GVKind.isCommon() || GVKind.isBSSLocal()) {
     if (Size == 0) Size = 1;   // .comm Foo, 0 is undefined, avoid it.
 
-    if (isVerbose()) {
-      WriteAsOperand(OutStreamer.GetCommentOS(), GV,
-                     /*PrintType=*/false, GV->getParent());
-      OutStreamer.GetCommentOS() << '\n';
-    }
-
     // Handle common symbols.
     if (GVKind.isCommon()) {
       unsigned Align = 1 << AlignLog;
@@ -492,39 +491,11 @@ void AsmPrinter::EmitFunctionEntryLabel() {
 }
 
 
-static void EmitDebugLoc(DebugLoc DL, const MachineFunction *MF,
-                         raw_ostream &CommentOS) {
-  const LLVMContext &Ctx = MF->getFunction()->getContext();
-  if (!DL.isUnknown()) {          // Print source line info.
-    DIScope Scope(DL.getScope(Ctx));
-    // Omit the directory, because it's likely to be long and uninteresting.
-    if (Scope.Verify())
-      CommentOS << Scope.getFilename();
-    else
-      CommentOS << "<unknown>";
-    CommentOS << ':' << DL.getLine();
-    if (DL.getCol() != 0)
-      CommentOS << ':' << DL.getCol();
-    DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(DL.getInlinedAt(Ctx));
-    if (!InlinedAtDL.isUnknown()) {
-      CommentOS << "[ ";
-      EmitDebugLoc(InlinedAtDL, MF, CommentOS);
-      CommentOS << " ]";
-    }
-  }
-}
-
 /// EmitComments - Pretty-print comments for instructions.
 static void EmitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
   const MachineFunction *MF = MI.getParent()->getParent();
   const TargetMachine &TM = MF->getTarget();
 
-  DebugLoc DL = MI.getDebugLoc();
-  if (!DL.isUnknown()) {          // Print source line info.
-    EmitDebugLoc(DL, MF, CommentOS);
-    CommentOS << '\n';
-  }
-
   // Check for spills and reloads
   int FI;
 
@@ -631,6 +602,45 @@ static bool EmitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
   return true;
 }
 
+AsmPrinter::CFIMoveType AsmPrinter::needsCFIMoves() {
+  if (MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI &&
+      MF->getFunction()->needsUnwindTableEntry())
+    return CFI_M_EH;
+
+  if (MMI->hasDebugInfo())
+    return CFI_M_Debug;
+
+  return CFI_M_None;
+}
+
+bool AsmPrinter::needsSEHMoves() {
+  return MAI->getExceptionHandlingType() == ExceptionHandling::Win64 &&
+    MF->getFunction()->needsUnwindTableEntry();
+}
+
+void AsmPrinter::emitPrologLabel(const MachineInstr &MI) {
+  MCSymbol *Label = MI.getOperand(0).getMCSymbol();
+
+  if (MAI->getExceptionHandlingType() != ExceptionHandling::DwarfCFI)
+    return;
+
+  if (needsCFIMoves() == CFI_M_None)
+    return;
+
+  MachineModuleInfo &MMI = MF->getMMI();
+  std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+  bool FoundOne = false;
+  (void)FoundOne;
+  for (std::vector<MachineMove>::iterator I = Moves.begin(),
+         E = Moves.end(); I != E; ++I) {
+    if (I->getLabel() == Label) {
+      EmitCFIFrameMove(*I);
+      FoundOne = true;
+    }
+  }
+  assert(FoundOne);
+}
+
 /// EmitFunctionBody - This method emits the body and trailer for a
 /// function.
 void AsmPrinter::EmitFunctionBody() {
@@ -669,6 +679,9 @@ void AsmPrinter::EmitFunctionBody() {
 
       switch (II->getOpcode()) {
       case TargetOpcode::PROLOG_LABEL:
+        emitPrologLabel(*II);
+        break;
+
       case TargetOpcode::EH_LABEL:
       case TargetOpcode::GC_LABEL:
         OutStreamer.EmitLabel(II->getOperand(0).getMCSymbol());
@@ -689,6 +702,9 @@ void AsmPrinter::EmitFunctionBody() {
         if (isVerbose()) EmitKill(II, *this);
         break;
       default:
+        if (!TM.hasMCUseLoc())
+          MCLineEntry::Make(&OutStreamer, getCurrentSection());
+
         EmitInstruction(II);
         break;
       }
@@ -767,6 +783,53 @@ getDebugValueLocation(const MachineInstr *MI) const {
   return MachineLocation();
 }
 
+/// EmitDwarfRegOp - Emit dwarf register operation.
+void AsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  int Reg = TRI->getDwarfRegNum(MLoc.getReg(), false);
+
+  for (const unsigned *SR = TRI->getSuperRegisters(MLoc.getReg());
+       *SR && Reg < 0; ++SR) {
+    Reg = TRI->getDwarfRegNum(*SR, false);
+    // FIXME: Get the bit range this register uses of the superregister
+    // so that we can produce a DW_OP_bit_piece
+  }
+
+  // FIXME: Handle cases like a super register being encoded as
+  // DW_OP_reg 32 DW_OP_piece 4 DW_OP_reg 33
+
+  // FIXME: We have no reasonable way of handling errors in here. The
+  // caller might be in the middle of an dwarf expression. We should
+  // probably assert that Reg >= 0 once debug info generation is more mature.
+
+  if (int Offset =  MLoc.getOffset()) {
+    if (Reg < 32) {
+      OutStreamer.AddComment(
+        dwarf::OperationEncodingString(dwarf::DW_OP_breg0 + Reg));
+      EmitInt8(dwarf::DW_OP_breg0 + Reg);
+    } else {
+      OutStreamer.AddComment("DW_OP_bregx");
+      EmitInt8(dwarf::DW_OP_bregx);
+      OutStreamer.AddComment(Twine(Reg));
+      EmitULEB128(Reg);
+    }
+    EmitSLEB128(Offset);
+  } else {
+    if (Reg < 32) {
+      OutStreamer.AddComment(
+        dwarf::OperationEncodingString(dwarf::DW_OP_reg0 + Reg));
+      EmitInt8(dwarf::DW_OP_reg0 + Reg);
+    } else {
+      OutStreamer.AddComment("DW_OP_regx");
+      EmitInt8(dwarf::DW_OP_regx);
+      OutStreamer.AddComment(Twine(Reg));
+      EmitULEB128(Reg);
+    }
+  }
+
+  // FIXME: Produce a DW_OP_bit_piece if we used a superregister
+}
+
 bool AsmPrinter::doFinalization(Module &M) {
   // Emit global variables.
   for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
@@ -1879,7 +1942,7 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
     return false;
 
   // The predecessor has to be immediately before this block.
-  const MachineBasicBlock *Pred = *PI;
+  MachineBasicBlock *Pred = *PI;
 
   if (!Pred->isLayoutSuccessor(MBB))
     return false;
@@ -1888,9 +1951,28 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
   if (Pred->empty())
     return true;
 
-  // Otherwise, check the last instruction.
-  const MachineInstr &LastInst = Pred->back();
-  return !LastInst.getDesc().isBarrier();
+  // Check the terminators in the previous blocks
+  for (MachineBasicBlock::iterator II = Pred->getFirstTerminator(),
+         IE = Pred->end(); II != IE; ++II) {
+    MachineInstr &MI = *II;
+
+    // If it is not a simple branch, we are in a table somewhere.
+    if (!MI.getDesc().isBranch() || MI.getDesc().isIndirectBranch())
+      return false;
+
+    // If we are the operands of one of the branches, this is not
+    // a fall through.
+    for (MachineInstr::mop_iterator OI = MI.operands_begin(),
+           OE = MI.operands_end(); OI != OE; ++OI) {
+      const MachineOperand& OP = *OI;
+      if (OP.isJTI())
+        return false;
+      if (OP.isMBB() && OP.getMBB() == MBB)
+        return false;
+    }
+  }
+
+  return true;
 }
 
 
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
index 9c8184a..dd5b0e2 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -155,7 +155,7 @@ void AsmPrinter::EmitReference(const MCSymbol *Sym, unsigned Encoding) const {
   const TargetLoweringObjectFile &TLOF = getObjFileLowering();
   
   const MCExpr *Exp =
-    TLOF.getExprForDwarfReference(Sym, Mang, MMI, Encoding, OutStreamer);
+    TLOF.getExprForDwarfReference(Sym, Encoding, OutStreamer);
   OutStreamer.EmitAbsValue(Exp, GetSizeOfEncodedValue(Encoding));
 }
 
@@ -206,108 +206,28 @@ void AsmPrinter::EmitSectionOffset(const MCSymbol *Label,
 // Dwarf Lowering Routines
 //===----------------------------------------------------------------------===//
 
-
-/// EmitFrameMoves - Emit frame instructions to describe the layout of the
-/// frame.
-void AsmPrinter::EmitFrameMoves(const std::vector<MachineMove> &Moves,
-                                MCSymbol *BaseLabel, bool isEH) const {
-  const TargetRegisterInfo *RI = TM.getRegisterInfo();
-  
-  int stackGrowth = TM.getTargetData()->getPointerSize();
-  if (TM.getFrameLowering()->getStackGrowthDirection() !=
-      TargetFrameLowering::StackGrowsUp)
-    stackGrowth *= -1;
-  
-  for (unsigned i = 0, N = Moves.size(); i < N; ++i) {
-    const MachineMove &Move = Moves[i];
-    MCSymbol *Label = Move.getLabel();
-    // Throw out move if the label is invalid.
-    if (Label && !Label->isDefined()) continue; // Not emitted, in dead code.
-    
-    const MachineLocation &Dst = Move.getDestination();
-    const MachineLocation &Src = Move.getSource();
-    
-    // Advance row if new location.
-    if (BaseLabel && Label) {
-      MCSymbol *ThisSym = Label;
-      if (ThisSym != BaseLabel) {
-        EmitCFAByte(dwarf::DW_CFA_advance_loc4);
-        EmitLabelDifference(ThisSym, BaseLabel, 4);
-        BaseLabel = ThisSym;
-      }
-    }
-    
-    // If advancing cfa.
-    if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) {
-      assert(!Src.isReg() && "Machine move not supported yet.");
-      
-      if (Src.getReg() == MachineLocation::VirtualFP) {
-        EmitCFAByte(dwarf::DW_CFA_def_cfa_offset);
-      } else {
-        EmitCFAByte(dwarf::DW_CFA_def_cfa);
-        EmitULEB128(RI->getDwarfRegNum(Src.getReg(), isEH), "Register");
-      }
-      
-      EmitULEB128(-Src.getOffset(), "Offset");
-      continue;
-    }
-    
-    if (Src.isReg() && Src.getReg() == MachineLocation::VirtualFP) {
-      assert(Dst.isReg() && "Machine move not supported yet.");
-      EmitCFAByte(dwarf::DW_CFA_def_cfa_register);
-      EmitULEB128(RI->getDwarfRegNum(Dst.getReg(), isEH), "Register");
-      continue;
-    }
-    
-    unsigned Reg = RI->getDwarfRegNum(Src.getReg(), isEH);
-    int Offset = Dst.getOffset() / stackGrowth;
-    
-    if (Offset < 0) {
-      EmitCFAByte(dwarf::DW_CFA_offset_extended_sf);
-      EmitULEB128(Reg, "Reg");
-      EmitSLEB128(Offset, "Offset");
-    } else if (Reg < 64) {
-      EmitCFAByte(dwarf::DW_CFA_offset + Reg);
-      EmitULEB128(Offset, "Offset");
-    } else {
-      EmitCFAByte(dwarf::DW_CFA_offset_extended);
-      EmitULEB128(Reg, "Reg");
-      EmitULEB128(Offset, "Offset");
-    }
-  }
-}
-
-/// EmitFrameMoves - Emit frame instructions to describe the layout of the
-/// frame.
-void AsmPrinter::EmitCFIFrameMoves(const std::vector<MachineMove> &Moves) const {
+/// EmitCFIFrameMove - Emit a frame instruction.
+void AsmPrinter::EmitCFIFrameMove(const MachineMove &Move) const {
   const TargetRegisterInfo *RI = TM.getRegisterInfo();
 
-  for (unsigned i = 0, N = Moves.size(); i < N; ++i) {
-    const MachineMove &Move = Moves[i];
-    MCSymbol *Label = Move.getLabel();
-    // Throw out move if the label is invalid.
-    if (Label && !Label->isDefined()) continue; // Not emitted, in dead code.
-
-    const MachineLocation &Dst = Move.getDestination();
-    const MachineLocation &Src = Move.getSource();
-
-    // If advancing cfa.
-    if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) {
-      assert(!Src.isReg() && "Machine move not supported yet.");
+  const MachineLocation &Dst = Move.getDestination();
+  const MachineLocation &Src = Move.getSource();
 
-      if (Src.getReg() == MachineLocation::VirtualFP) {
-        OutStreamer.EmitCFIDefCfaOffset(-Src.getOffset());
-      } else {
-        assert("Machine move not supported yet");
-        // Reg + Offset
-      }
-    } else if (Src.isReg() && Src.getReg() == MachineLocation::VirtualFP) {
-      assert(Dst.isReg() && "Machine move not supported yet.");
-      OutStreamer.EmitCFIDefCfaRegister(RI->getDwarfRegNum(Dst.getReg(), true));
+  // If advancing cfa.
+  if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) {
+    if (Src.getReg() == MachineLocation::VirtualFP) {
+      OutStreamer.EmitCFIDefCfaOffset(-Src.getOffset());
     } else {
-      assert(!Dst.isReg() && "Machine move not supported yet.");
-      OutStreamer.EmitCFIOffset(RI->getDwarfRegNum(Src.getReg(), true),
-                                Dst.getOffset());
+      // Reg + Offset
+      OutStreamer.EmitCFIDefCfa(RI->getDwarfRegNum(Src.getReg(), true),
+                                Src.getOffset());
     }
+  } else if (Src.isReg() && Src.getReg() == MachineLocation::VirtualFP) {
+    assert(Dst.isReg() && "Machine move not supported yet.");
+    OutStreamer.EmitCFIDefCfaRegister(RI->getDwarfRegNum(Dst.getReg(), true));
+  } else {
+    assert(!Dst.isReg() && "Machine move not supported yet.");
+    OutStreamer.EmitCFIOffset(RI->getDwarfRegNum(Src.getReg(), true),
+                              Dst.getOffset());
   }
 }
diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt
index 1377e4d..4da7876 100644
--- a/lib/CodeGen/AsmPrinter/CMakeLists.txt
+++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt
@@ -5,9 +5,10 @@ add_llvm_library(LLVMAsmPrinter
   AsmPrinterInlineAsm.cpp
   DIE.cpp
   DwarfCFIException.cpp
+  DwarfCompileUnit.cpp
   DwarfDebug.cpp
   DwarfException.cpp
-  DwarfTableException.cpp
   OcamlGCPrinter.cpp
+  Win64Exception.cpp
   )
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index 68be2ee..91b7d08 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
@@ -40,92 +41,105 @@ using namespace llvm;
 
 DwarfCFIException::DwarfCFIException(AsmPrinter *A)
   : DwarfException(A),
-    shouldEmitTable(false), shouldEmitMoves(false), shouldEmitTableModule(false)
-    {}
+    shouldEmitPersonality(false), shouldEmitLSDA(false), shouldEmitMoves(false),
+    moveTypeModule(AsmPrinter::CFI_M_None) {}
 
 DwarfCFIException::~DwarfCFIException() {}
 
 /// EndModule - Emit all exception information that should come after the
 /// content.
 void DwarfCFIException::EndModule() {
-  if (!Asm->MAI->isExceptionHandlingDwarf())
-    return;
+  if (moveTypeModule == AsmPrinter::CFI_M_Debug)
+    Asm->OutStreamer.EmitCFISections(false, true);
 
-  if (!shouldEmitTableModule)
+  if (!Asm->MAI->isExceptionHandlingDwarf())
     return;
 
   const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+
   unsigned PerEncoding = TLOF.getPersonalityEncoding();
 
-  // Begin eh frame section.
-  Asm->OutStreamer.SwitchSection(TLOF.getEHFrameSection());
+  if ((PerEncoding & 0x70) != dwarf::DW_EH_PE_pcrel)
+    return;
 
   // Emit references to all used personality functions
+  bool AtLeastOne = false;
   const std::vector<const Function*> &Personalities = MMI->getPersonalities();
   for (size_t i = 0, e = Personalities.size(); i != e; ++i) {
-    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("personality", i));
-    Asm->EmitReference(Personalities[i], PerEncoding);
+    if (!Personalities[i])
+      continue;
+    MCSymbol *Sym = Asm->Mang->getSymbol(Personalities[i]);
+    TLOF.emitPersonalityValue(Asm->OutStreamer, Asm->TM, Sym);
+    AtLeastOne = true;
+  }
+
+  if (AtLeastOne && !TLOF.isFunctionEHFrameSymbolPrivate()) {
+    // This is a temporary hack to keep sections in the same order they
+    // were before. This lets us produce bit identical outputs while
+    // transitioning to CFI.
+    Asm->OutStreamer.SwitchSection(TLOF.getEHFrameSection());
   }
 }
 
 /// BeginFunction - Gather pre-function exception information. Assumes it's
 /// being emitted immediately after the function entry point.
 void DwarfCFIException::BeginFunction(const MachineFunction *MF) {
-  shouldEmitTable = shouldEmitMoves = false;
+  shouldEmitMoves = shouldEmitPersonality = shouldEmitLSDA = false;
 
   // If any landing pads survive, we need an EH table.
-  shouldEmitTable = !MMI->getLandingPads().empty();
+  bool hasLandingPads = !MMI->getLandingPads().empty();
 
   // See if we need frame move info.
-  shouldEmitMoves =
-    !Asm->MF->getFunction()->doesNotThrow() || UnwindTablesMandatory;
+  AsmPrinter::CFIMoveType MoveType = Asm->needsCFIMoves();
+  if (MoveType == AsmPrinter::CFI_M_EH ||
+      (MoveType == AsmPrinter::CFI_M_Debug &&
+       moveTypeModule == AsmPrinter::CFI_M_None))
+    moveTypeModule = MoveType;
 
-  if (shouldEmitMoves || shouldEmitTable)
-    // Assumes in correct section after the entry point.
-    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin",
-                                                  Asm->getFunctionNumber()));
+  shouldEmitMoves = MoveType != AsmPrinter::CFI_M_None;
 
-  shouldEmitTableModule |= shouldEmitTable;
+  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+  unsigned PerEncoding = TLOF.getPersonalityEncoding();
+  const Function *Per = MMI->getPersonalities()[MMI->getPersonalityIndex()];
 
-  if (shouldEmitMoves) {
-    const TargetFrameLowering *TFL = Asm->TM.getFrameLowering();
-    Asm->OutStreamer.EmitCFIStartProc();
+  shouldEmitPersonality = hasLandingPads &&
+    PerEncoding != dwarf::DW_EH_PE_omit && Per;
 
-    // Indicate locations of general callee saved registers in frame.
-    std::vector<MachineMove> Moves;
-    TFL->getInitialFrameState(Moves);
-    Asm->EmitCFIFrameMoves(Moves);
-    Asm->EmitCFIFrameMoves(MMI->getFrameMoves());
-  }
+  unsigned LSDAEncoding = TLOF.getLSDAEncoding();
+  shouldEmitLSDA = shouldEmitPersonality &&
+    LSDAEncoding != dwarf::DW_EH_PE_omit;
 
-  if (!shouldEmitTable)
+  if (!shouldEmitPersonality && !shouldEmitMoves)
     return;
 
-  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+  Asm->OutStreamer.EmitCFIStartProc();
+
+  // Indicate personality routine, if any.
+  if (!shouldEmitPersonality)
+    return;
+
+  const MCSymbol *Sym = TLOF.getCFIPersonalitySymbol(Per, Asm->Mang, MMI);
+  Asm->OutStreamer.EmitCFIPersonality(Sym, PerEncoding);
+
+  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin",
+                                                Asm->getFunctionNumber()));
 
   // Provide LSDA information.
-  unsigned LSDAEncoding = TLOF.getLSDAEncoding();
-  if (LSDAEncoding != dwarf::DW_EH_PE_omit)
-    Asm->OutStreamer.EmitCFILsda(Asm->GetTempSymbol("exception",
-                                                    Asm->getFunctionNumber()),
-                                 LSDAEncoding);
+  if (!shouldEmitLSDA)
+    return;
 
-  // Indicate personality routine, if any.
-  unsigned PerEncoding = TLOF.getPersonalityEncoding();
-  if (PerEncoding != dwarf::DW_EH_PE_omit &&
-      MMI->getPersonalities()[MMI->getPersonalityIndex()])
-    Asm->OutStreamer.EmitCFIPersonality(Asm->GetTempSymbol("personality",
-                                                    MMI->getPersonalityIndex()),
-                                        PerEncoding);
+  Asm->OutStreamer.EmitCFILsda(Asm->GetTempSymbol("exception",
+                                                  Asm->getFunctionNumber()),
+                               LSDAEncoding);
 }
 
 /// EndFunction - Gather and emit post-function exception information.
 ///
 void DwarfCFIException::EndFunction() {
-  if (!shouldEmitMoves && !shouldEmitTable) return;
+  if (!shouldEmitPersonality && !shouldEmitMoves)
+    return;
 
-  if (shouldEmitMoves)
-    Asm->OutStreamer.EmitCFIEndProc();
+  Asm->OutStreamer.EmitCFIEndProc();
 
   Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_end",
                                                 Asm->getFunctionNumber()));
@@ -133,6 +147,6 @@ void DwarfCFIException::EndFunction() {
   // Map all labels and get rid of any dead landing pads.
   MMI->TidyLandingPads();
 
-  if (shouldEmitTable)
+  if (shouldEmitPersonality)
     EmitExceptionTable();
 }
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
new file mode 100644
index 0000000..bff1a35
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -0,0 +1,1054 @@
+//===-- llvm/CodeGen/DwarfCompileUnit.cpp - Dwarf Compile Unit ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf compile unit.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dwarfdebug"
+
+#include "DwarfCompileUnit.h"
+#include "DwarfDebug.h"
+#include "llvm/Constants.h"
+#include "llvm/Analysis/DIBuilder.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+/// CompileUnit - Compile unit constructor.
+CompileUnit::CompileUnit(unsigned I, DIE *D, AsmPrinter *A, DwarfDebug *DW)
+  : ID(I), CUDie(D), Asm(A), DD(DW), IndexTyDie(0) {
+  DIEIntegerOne = new (DIEValueAllocator) DIEInteger(1);
+}
+
+/// ~CompileUnit - Destructor for compile unit.
+CompileUnit::~CompileUnit() {
+  for (unsigned j = 0, M = DIEBlocks.size(); j < M; ++j)
+    DIEBlocks[j]->~DIEBlock();
+}
+
+/// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug
+/// information entry.
+DIEEntry *CompileUnit::createDIEEntry(DIE *Entry) {
+  DIEEntry *Value = new (DIEValueAllocator) DIEEntry(Entry);
+  return Value;
+}
+
+/// addUInt - Add an unsigned integer attribute data and value.
+///
+void CompileUnit::addUInt(DIE *Die, unsigned Attribute,
+                          unsigned Form, uint64_t Integer) {
+  if (!Form) Form = DIEInteger::BestForm(false, Integer);
+  DIEValue *Value = Integer == 1 ?
+    DIEIntegerOne : new (DIEValueAllocator) DIEInteger(Integer);
+  Die->addValue(Attribute, Form, Value);
+}
+
+/// addSInt - Add an signed integer attribute data and value.
+///
+void CompileUnit::addSInt(DIE *Die, unsigned Attribute,
+                          unsigned Form, int64_t Integer) {
+  if (!Form) Form = DIEInteger::BestForm(true, Integer);
+  DIEValue *Value = new (DIEValueAllocator) DIEInteger(Integer);
+  Die->addValue(Attribute, Form, Value);
+}
+
+/// addString - Add a string attribute data and value. DIEString only
+/// keeps string reference.
+void CompileUnit::addString(DIE *Die, unsigned Attribute, unsigned Form,
+                            StringRef String) {
+  DIEValue *Value = new (DIEValueAllocator) DIEString(String);
+  Die->addValue(Attribute, Form, Value);
+}
+
+/// addLabel - Add a Dwarf label attribute data and value.
+///
+void CompileUnit::addLabel(DIE *Die, unsigned Attribute, unsigned Form,
+                           const MCSymbol *Label) {
+  DIEValue *Value = new (DIEValueAllocator) DIELabel(Label);
+  Die->addValue(Attribute, Form, Value);
+}
+
+/// addDelta - Add a label delta attribute data and value.
+///
+void CompileUnit::addDelta(DIE *Die, unsigned Attribute, unsigned Form,
+                           const MCSymbol *Hi, const MCSymbol *Lo) {
+  DIEValue *Value = new (DIEValueAllocator) DIEDelta(Hi, Lo);
+  Die->addValue(Attribute, Form, Value);
+}
+
+/// addDIEEntry - Add a DIE attribute data and value.
+///
+void CompileUnit::addDIEEntry(DIE *Die, unsigned Attribute, unsigned Form,
+                              DIE *Entry) {
+  Die->addValue(Attribute, Form, createDIEEntry(Entry));
+}
+
+
+/// addBlock - Add block data.
+///
+void CompileUnit::addBlock(DIE *Die, unsigned Attribute, unsigned Form,
+                           DIEBlock *Block) {
+  Block->ComputeSize(Asm);
+  DIEBlocks.push_back(Block); // Memoize so we can call the destructor later on.
+  Die->addValue(Attribute, Block->BestForm(), Block);
+}
+
+/// addSourceLine - Add location information to specified debug information
+/// entry.
+void CompileUnit::addSourceLine(DIE *Die, DIVariable V) {
+  // Verify variable.
+  if (!V.Verify())
+    return;
+  
+  unsigned Line = V.getLineNumber();
+  if (Line == 0)
+    return;
+  unsigned FileID = DD->GetOrCreateSourceID(V.getContext().getFilename(),
+                                            V.getContext().getDirectory());
+  assert(FileID && "Invalid file id");
+  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+}
+
+/// addSourceLine - Add location information to specified debug information
+/// entry.
+void CompileUnit::addSourceLine(DIE *Die, DIGlobalVariable G) {
+  // Verify global variable.
+  if (!G.Verify())
+    return;
+
+  unsigned Line = G.getLineNumber();
+  if (Line == 0)
+    return;
+  unsigned FileID = DD->GetOrCreateSourceID(G.getContext().getFilename(),
+                                            G.getContext().getDirectory());
+  assert(FileID && "Invalid file id");
+  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+}
+
+/// addSourceLine - Add location information to specified debug information
+/// entry.
+void CompileUnit::addSourceLine(DIE *Die, DISubprogram SP) {
+  // Verify subprogram.
+  if (!SP.Verify())
+    return;
+  // If the line number is 0, don't add it.
+  if (SP.getLineNumber() == 0)
+    return;
+
+  unsigned Line = SP.getLineNumber();
+  if (!SP.getContext().Verify())
+    return;
+  unsigned FileID = DD->GetOrCreateSourceID(SP.getFilename(), SP.getDirectory());
+  assert(FileID && "Invalid file id");
+  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+}
+
+/// addSourceLine - Add location information to specified debug information
+/// entry.
+void CompileUnit::addSourceLine(DIE *Die, DIType Ty) {
+  // Verify type.
+  if (!Ty.Verify())
+    return;
+
+  unsigned Line = Ty.getLineNumber();
+  if (Line == 0 || !Ty.getContext().Verify())
+    return;
+  unsigned FileID = DD->GetOrCreateSourceID(Ty.getFilename(), Ty.getDirectory());
+  assert(FileID && "Invalid file id");
+  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+}
+
+/// addSourceLine - Add location information to specified debug information
+/// entry.
+void CompileUnit::addSourceLine(DIE *Die, DINameSpace NS) {
+  // Verify namespace.
+  if (!NS.Verify())
+    return;
+
+  unsigned Line = NS.getLineNumber();
+  if (Line == 0)
+    return;
+  StringRef FN = NS.getFilename();
+
+  unsigned FileID = DD->GetOrCreateSourceID(FN, NS.getDirectory());
+  assert(FileID && "Invalid file id");
+  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+}
+
+/// addVariableAddress - Add DW_AT_location attribute for a 
+/// DbgVariable based on provided MachineLocation.
+void CompileUnit::addVariableAddress(DbgVariable *&DV, DIE *Die, 
+                                     MachineLocation Location) {
+  if (DV->variableHasComplexAddress())
+    addComplexAddress(DV, Die, dwarf::DW_AT_location, Location);
+  else if (DV->isBlockByrefVariable())
+    addBlockByrefAddress(DV, Die, dwarf::DW_AT_location, Location);
+  else
+    addAddress(Die, dwarf::DW_AT_location, Location);
+}
+
+/// addRegisterOp - Add register operand.
+void CompileUnit::addRegisterOp(DIE *TheDie, unsigned Reg) {
+  const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
+  unsigned DWReg = RI->getDwarfRegNum(Reg, false);
+  if (DWReg < 32)
+    addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + DWReg);
+  else {
+    addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_regx);
+    addUInt(TheDie, 0, dwarf::DW_FORM_udata, DWReg);
+  }
+}
+
+/// addRegisterOffset - Add register offset.
+void CompileUnit::addRegisterOffset(DIE *TheDie, unsigned Reg,
+                                    int64_t Offset) {
+  const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
+  unsigned DWReg = RI->getDwarfRegNum(Reg, false);
+  const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo();
+  if (Reg == TRI->getFrameRegister(*Asm->MF))
+    // If variable offset is based in frame register then use fbreg.
+    addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_fbreg);
+  else if (DWReg < 32)
+    addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + DWReg);
+  else {
+    addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx);
+    addUInt(TheDie, 0, dwarf::DW_FORM_udata, DWReg);
+  }
+  addSInt(TheDie, 0, dwarf::DW_FORM_sdata, Offset);
+}
+
+/// addAddress - Add an address attribute to a die based on the location
+/// provided.
+void CompileUnit::addAddress(DIE *Die, unsigned Attribute,
+                             const MachineLocation &Location) {
+  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
+
+  if (Location.isReg())
+    addRegisterOp(Block, Location.getReg());
+  else
+    addRegisterOffset(Block, Location.getReg(), Location.getOffset());
+
+  // Now attach the location information to the DIE.
+  addBlock(Die, Attribute, 0, Block);
+}
+
+/// addComplexAddress - Start with the address based on the location provided,
+/// and generate the DWARF information necessary to find the actual variable
+/// given the extra address information encoded in the DIVariable, starting from
+/// the starting location.  Add the DWARF information to the die.
+///
+void CompileUnit::addComplexAddress(DbgVariable *&DV, DIE *Die,
+                                    unsigned Attribute,
+                                    const MachineLocation &Location) {
+  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
+  unsigned N = DV->getNumAddrElements();
+  unsigned i = 0;
+  if (Location.isReg()) {
+    if (N >= 2 && DV->getAddrElement(0) == DIBuilder::OpPlus) {
+      // If first address element is OpPlus then emit
+      // DW_OP_breg + Offset instead of DW_OP_reg + Offset.
+      addRegisterOffset(Block, Location.getReg(), DV->getAddrElement(1));
+      i = 2;
+    } else
+      addRegisterOp(Block, Location.getReg());
+  }
+  else
+    addRegisterOffset(Block, Location.getReg(), Location.getOffset());
+
+  for (;i < N; ++i) {
+    uint64_t Element = DV->getAddrElement(i);
+    if (Element == DIBuilder::OpPlus) {
+      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+      addUInt(Block, 0, dwarf::DW_FORM_udata, DV->getAddrElement(++i));
+    } else if (Element == DIBuilder::OpDeref) {
+      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+    } else llvm_unreachable("unknown DIBuilder Opcode");
+  }
+
+  // Now attach the location information to the DIE.
+  addBlock(Die, Attribute, 0, Block);
+}
+
+/* Byref variables, in Blocks, are declared by the programmer as "SomeType
+   VarName;", but the compiler creates a __Block_byref_x_VarName struct, and
+   gives the variable VarName either the struct, or a pointer to the struct, as
+   its type.  This is necessary for various behind-the-scenes things the
+   compiler needs to do with by-reference variables in Blocks.
+
+   However, as far as the original *programmer* is concerned, the variable
+   should still have type 'SomeType', as originally declared.
+
+   The function getBlockByrefType dives into the __Block_byref_x_VarName
+   struct to find the original type of the variable, which is then assigned to
+   the variable's Debug Information Entry as its real type.  So far, so good.
+   However now the debugger will expect the variable VarName to have the type
+   SomeType.  So we need the location attribute for the variable to be an
+   expression that explains to the debugger how to navigate through the
+   pointers and struct to find the actual variable of type SomeType.
+
+   The following function does just that.  We start by getting
+   the "normal" location for the variable. This will be the location
+   of either the struct __Block_byref_x_VarName or the pointer to the
+   struct __Block_byref_x_VarName.
+
+   The struct will look something like:
+
+   struct __Block_byref_x_VarName {
+     ... <various fields>
+     struct __Block_byref_x_VarName *forwarding;
+     ... <various other fields>
+     SomeType VarName;
+     ... <maybe more fields>
+   };
+
+   If we are given the struct directly (as our starting point) we
+   need to tell the debugger to:
+
+   1).  Add the offset of the forwarding field.
+
+   2).  Follow that pointer to get the real __Block_byref_x_VarName
+   struct to use (the real one may have been copied onto the heap).
+
+   3).  Add the offset for the field VarName, to find the actual variable.
+
+   If we started with a pointer to the struct, then we need to
+   dereference that pointer first, before the other steps.
+   Translating this into DWARF ops, we will need to append the following
+   to the current location description for the variable:
+
+   DW_OP_deref                    -- optional, if we start with a pointer
+   DW_OP_plus_uconst <forward_fld_offset>
+   DW_OP_deref
+   DW_OP_plus_uconst <varName_fld_offset>
+
+   That is what this function does.  */
+
+/// addBlockByrefAddress - Start with the address based on the location
+/// provided, and generate the DWARF information necessary to find the
+/// actual Block variable (navigating the Block struct) based on the
+/// starting location.  Add the DWARF information to the die.  For
+/// more information, read large comment just above here.
+///
+void CompileUnit::addBlockByrefAddress(DbgVariable *&DV, DIE *Die,
+                                       unsigned Attribute,
+                                       const MachineLocation &Location) {
+  DIType Ty = DV->getType();
+  DIType TmpTy = Ty;
+  unsigned Tag = Ty.getTag();
+  bool isPointer = false;
+
+  StringRef varName = DV->getName();
+
+  if (Tag == dwarf::DW_TAG_pointer_type) {
+    DIDerivedType DTy = DIDerivedType(Ty);
+    TmpTy = DTy.getTypeDerivedFrom();
+    isPointer = true;
+  }
+
+  DICompositeType blockStruct = DICompositeType(TmpTy);
+
+  // Find the __forwarding field and the variable field in the __Block_byref
+  // struct.
+  DIArray Fields = blockStruct.getTypeArray();
+  DIDescriptor varField = DIDescriptor();
+  DIDescriptor forwardingField = DIDescriptor();
+
+  for (unsigned i = 0, N = Fields.getNumElements(); i < N; ++i) {
+    DIDescriptor Element = Fields.getElement(i);
+    DIDerivedType DT = DIDerivedType(Element);
+    StringRef fieldName = DT.getName();
+    if (fieldName == "__forwarding")
+      forwardingField = Element;
+    else if (fieldName == varName)
+      varField = Element;
+  }
+
+  // Get the offsets for the forwarding field and the variable field.
+  unsigned forwardingFieldOffset =
+    DIDerivedType(forwardingField).getOffsetInBits() >> 3;
+  unsigned varFieldOffset =
+    DIDerivedType(varField).getOffsetInBits() >> 3;
+
+  // Decode the original location, and use that as the start of the byref
+  // variable's location.
+  const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
+  unsigned Reg = RI->getDwarfRegNum(Location.getReg(), false);
+  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
+
+  if (Location.isReg()) {
+    if (Reg < 32)
+      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + Reg);
+    else {
+      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_regx);
+      addUInt(Block, 0, dwarf::DW_FORM_udata, Reg);
+    }
+  } else {
+    if (Reg < 32)
+      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + Reg);
+    else {
+      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx);
+      addUInt(Block, 0, dwarf::DW_FORM_udata, Reg);
+    }
+
+    addUInt(Block, 0, dwarf::DW_FORM_sdata, Location.getOffset());
+  }
+
+  // If we started with a pointer to the __Block_byref... struct, then
+  // the first thing we need to do is dereference the pointer (DW_OP_deref).
+  if (isPointer)
+    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+
+  // Next add the offset for the '__forwarding' field:
+  // DW_OP_plus_uconst ForwardingFieldOffset.  Note there's no point in
+  // adding the offset if it's 0.
+  if (forwardingFieldOffset > 0) {
+    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+    addUInt(Block, 0, dwarf::DW_FORM_udata, forwardingFieldOffset);
+  }
+
+  // Now dereference the __forwarding field to get to the real __Block_byref
+  // struct:  DW_OP_deref.
+  addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+
+  // Now that we've got the real __Block_byref... struct, add the offset
+  // for the variable's field to get to the location of the actual variable:
+  // DW_OP_plus_uconst varFieldOffset.  Again, don't add if it's 0.
+  if (varFieldOffset > 0) {
+    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+    addUInt(Block, 0, dwarf::DW_FORM_udata, varFieldOffset);
+  }
+
+  // Now attach the location information to the DIE.
+  addBlock(Die, Attribute, 0, Block);
+}
+
+/// addConstantValue - Add constant value entry in variable DIE.
+bool CompileUnit::addConstantValue(DIE *Die, const MachineOperand &MO,
+                                   DIType Ty) {
+  assert (MO.isImm() && "Invalid machine operand!");
+  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
+  unsigned form = dwarf::DW_FORM_udata;
+  switch (Ty.getSizeInBits()) {
+    case 8: form = dwarf::DW_FORM_data1; break;
+    case 16: form = dwarf::DW_FORM_data2; break;
+    case 32: form = dwarf::DW_FORM_data4; break;
+    case 64: form = dwarf::DW_FORM_data8; break;
+    default: break;
+  }
+
+  DIBasicType BTy(Ty);
+  if (BTy.Verify() &&
+      (BTy.getEncoding()  == dwarf::DW_ATE_signed 
+       || BTy.getEncoding() == dwarf::DW_ATE_signed_char))
+    addSInt(Block, 0, form, MO.getImm());
+  else
+    addUInt(Block, 0, form, MO.getImm());
+
+  addBlock(Die, dwarf::DW_AT_const_value, 0, Block);
+  return true;
+}
+
+/// addConstantFPValue - Add constant value entry in variable DIE.
+bool CompileUnit::addConstantFPValue(DIE *Die, const MachineOperand &MO) {
+  assert (MO.isFPImm() && "Invalid machine operand!");
+  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
+  APFloat FPImm = MO.getFPImm()->getValueAPF();
+
+  // Get the raw data form of the floating point.
+  const APInt FltVal = FPImm.bitcastToAPInt();
+  const char *FltPtr = (const char*)FltVal.getRawData();
+
+  int NumBytes = FltVal.getBitWidth() / 8; // 8 bits per byte.
+  bool LittleEndian = Asm->getTargetData().isLittleEndian();
+  int Incr = (LittleEndian ? 1 : -1);
+  int Start = (LittleEndian ? 0 : NumBytes - 1);
+  int Stop = (LittleEndian ? NumBytes : -1);
+
+  // Output the constant to DWARF one byte at a time.
+  for (; Start != Stop; Start += Incr)
+    addUInt(Block, 0, dwarf::DW_FORM_data1,
+            (unsigned char)0xFF & FltPtr[Start]);
+
+  addBlock(Die, dwarf::DW_AT_const_value, 0, Block);
+  return true;
+}
+
+/// addConstantValue - Add constant value entry in variable DIE.
+bool CompileUnit::addConstantValue(DIE *Die, ConstantInt *CI,
+                                   bool Unsigned) {
+  unsigned CIBitWidth = CI->getBitWidth();
+  if (CIBitWidth <= 64) {
+    unsigned form = 0;
+    switch (CIBitWidth) {
+    case 8: form = dwarf::DW_FORM_data1; break;
+    case 16: form = dwarf::DW_FORM_data2; break;
+    case 32: form = dwarf::DW_FORM_data4; break;
+    case 64: form = dwarf::DW_FORM_data8; break;
+    default: 
+      form = Unsigned ? dwarf::DW_FORM_udata : dwarf::DW_FORM_sdata;
+    }
+    if (Unsigned)
+      addUInt(Die, dwarf::DW_AT_const_value, form, CI->getZExtValue());
+    else
+      addSInt(Die, dwarf::DW_AT_const_value, form, CI->getSExtValue());
+    return true;
+  }
+
+  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
+
+  // Get the raw data form of the large APInt.
+  const APInt Val = CI->getValue();
+  const char *Ptr = (const char*)Val.getRawData();
+
+  int NumBytes = Val.getBitWidth() / 8; // 8 bits per byte.
+  bool LittleEndian = Asm->getTargetData().isLittleEndian();
+  int Incr = (LittleEndian ? 1 : -1);
+  int Start = (LittleEndian ? 0 : NumBytes - 1);
+  int Stop = (LittleEndian ? NumBytes : -1);
+
+  // Output the constant to DWARF one byte at a time.
+  for (; Start != Stop; Start += Incr)
+    addUInt(Block, 0, dwarf::DW_FORM_data1,
+            (unsigned char)0xFF & Ptr[Start]);
+
+  addBlock(Die, dwarf::DW_AT_const_value, 0, Block);
+  return true;
+}
+
+/// addTemplateParams - Add template parameters in buffer.
+void CompileUnit::addTemplateParams(DIE &Buffer, DIArray TParams) {
+  // Add template parameters.
+  for (unsigned i = 0, e = TParams.getNumElements(); i != e; ++i) {
+    DIDescriptor Element = TParams.getElement(i);
+    if (Element.isTemplateTypeParameter())
+      Buffer.addChild(getOrCreateTemplateTypeParameterDIE(
+                        DITemplateTypeParameter(Element)));
+    else if (Element.isTemplateValueParameter())
+      Buffer.addChild(getOrCreateTemplateValueParameterDIE(
+                        DITemplateValueParameter(Element)));
+  }
+
+}
+/// addToContextOwner - Add Die into the list of its context owner's children.
+void CompileUnit::addToContextOwner(DIE *Die, DIDescriptor Context) {
+  if (Context.isType()) {
+    DIE *ContextDIE = getOrCreateTypeDIE(DIType(Context));
+    ContextDIE->addChild(Die);
+  } else if (Context.isNameSpace()) {
+    DIE *ContextDIE = getOrCreateNameSpace(DINameSpace(Context));
+    ContextDIE->addChild(Die);
+  } else if (Context.isSubprogram()) {
+    DIE *ContextDIE = DD->createSubprogramDIE(DISubprogram(Context));
+    ContextDIE->addChild(Die);
+  } else if (DIE *ContextDIE = getDIE(Context))
+    ContextDIE->addChild(Die);
+  else
+    addDie(Die);
+}
+
+/// getOrCreateTypeDIE - Find existing DIE or create new DIE for the
+/// given DIType.
+DIE *CompileUnit::getOrCreateTypeDIE(DIType Ty) {
+  DIE *TyDIE = getDIE(Ty);
+  if (TyDIE)
+    return TyDIE;
+
+  // Create new type.
+  TyDIE = new DIE(dwarf::DW_TAG_base_type);
+  insertDIE(Ty, TyDIE);
+  if (Ty.isBasicType())
+    constructTypeDIE(*TyDIE, DIBasicType(Ty));
+  else if (Ty.isCompositeType())
+    constructTypeDIE(*TyDIE, DICompositeType(Ty));
+  else {
+    assert(Ty.isDerivedType() && "Unknown kind of DIType");
+    constructTypeDIE(*TyDIE, DIDerivedType(Ty));
+  }
+
+  addToContextOwner(TyDIE, Ty.getContext());
+  return TyDIE;
+}
+
+/// addType - Add a new type attribute to the specified entity.
+void CompileUnit::addType(DIE *Entity, DIType Ty) {
+  if (!Ty.Verify())
+    return;
+
+  // Check for pre-existence.
+  DIEEntry *Entry = getDIEEntry(Ty);
+  // If it exists then use the existing value.
+  if (Entry) {
+    Entity->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, Entry);
+    return;
+  }
+
+  // Construct type.
+  DIE *Buffer = getOrCreateTypeDIE(Ty);
+
+  // Set up proxy.
+  Entry = createDIEEntry(Buffer);
+  insertDIEEntry(Ty, Entry);
+  Entity->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, Entry);
+
+  // If this is a complete composite type then include it in the
+  // list of global types.
+  addGlobalType(Ty);
+}
+
+/// addGlobalType - Add a new global type to the compile unit.
+///
+void CompileUnit::addGlobalType(DIType Ty) {
+  DIDescriptor Context = Ty.getContext();
+  if (Ty.isCompositeType() && !Ty.getName().empty() && !Ty.isForwardDecl() 
+      && (Context.isCompileUnit() || Context.isFile() || Context.isNameSpace()))
+    if (DIEEntry *Entry = getDIEEntry(Ty))
+      GlobalTypes[Ty.getName()] = Entry->getEntry();
+}
+
+/// addPubTypes - Add type for pubtypes section.
+void CompileUnit::addPubTypes(DISubprogram SP) {
+  DICompositeType SPTy = SP.getType();
+  unsigned SPTag = SPTy.getTag();
+  if (SPTag != dwarf::DW_TAG_subroutine_type)
+    return;
+
+  DIArray Args = SPTy.getTypeArray();
+  for (unsigned i = 0, e = Args.getNumElements(); i != e; ++i) {
+    DIType ATy(Args.getElement(i));
+    if (!ATy.Verify())
+      continue;
+    addGlobalType(ATy);
+  }
+}
+
+/// constructTypeDIE - Construct basic type die from DIBasicType.
+void CompileUnit::constructTypeDIE(DIE &Buffer, DIBasicType BTy) {
+  // Get core information.
+  StringRef Name = BTy.getName();
+  Buffer.setTag(dwarf::DW_TAG_base_type);
+  addUInt(&Buffer, dwarf::DW_AT_encoding,  dwarf::DW_FORM_data1,
+          BTy.getEncoding());
+
+  // Add name if not anonymous or intermediate type.
+  if (!Name.empty())
+    addString(&Buffer, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
+  uint64_t Size = BTy.getSizeInBits() >> 3;
+  addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
+}
+
+/// constructTypeDIE - Construct derived type die from DIDerivedType.
+void CompileUnit::constructTypeDIE(DIE &Buffer, DIDerivedType DTy) {
+  // Get core information.
+  StringRef Name = DTy.getName();
+  uint64_t Size = DTy.getSizeInBits() >> 3;
+  unsigned Tag = DTy.getTag();
+
+  // FIXME - Workaround for templates.
+  if (Tag == dwarf::DW_TAG_inheritance) Tag = dwarf::DW_TAG_reference_type;
+
+  Buffer.setTag(Tag);
+
+  // Map to main type, void will not have a type.
+  DIType FromTy = DTy.getTypeDerivedFrom();
+  addType(&Buffer, FromTy);
+
+  // Add name if not anonymous or intermediate type.
+  if (!Name.empty())
+    addString(&Buffer, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
+
+  // Add size if non-zero (derived types might be zero-sized.)
+  if (Size)
+    addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
+
+  // Add source line info if available and TyDesc is not a forward declaration.
+  if (!DTy.isForwardDecl())
+    addSourceLine(&Buffer, DTy);
+}
+
+/// constructTypeDIE - Construct type DIE from DICompositeType.
+void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
+  // Get core information.
+  StringRef Name = CTy.getName();
+
+  uint64_t Size = CTy.getSizeInBits() >> 3;
+  unsigned Tag = CTy.getTag();
+  Buffer.setTag(Tag);
+
+  switch (Tag) {
+  case dwarf::DW_TAG_vector_type:
+  case dwarf::DW_TAG_array_type:
+    constructArrayTypeDIE(Buffer, &CTy);
+    break;
+  case dwarf::DW_TAG_enumeration_type: {
+    DIArray Elements = CTy.getTypeArray();
+
+    // Add enumerators to enumeration type.
+    for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
+      DIE *ElemDie = NULL;
+      DIDescriptor Enum(Elements.getElement(i));
+      if (Enum.isEnumerator()) {
+        ElemDie = constructEnumTypeDIE(DIEnumerator(Enum));
+        Buffer.addChild(ElemDie);
+      }
+    }
+  }
+    break;
+  case dwarf::DW_TAG_subroutine_type: {
+    // Add return type.
+    DIArray Elements = CTy.getTypeArray();
+    DIDescriptor RTy = Elements.getElement(0);
+    addType(&Buffer, DIType(RTy));
+
+    bool isPrototyped = true;
+    // Add arguments.
+    for (unsigned i = 1, N = Elements.getNumElements(); i < N; ++i) {
+      DIDescriptor Ty = Elements.getElement(i);
+      if (Ty.isUnspecifiedParameter()) {
+        DIE *Arg = new DIE(dwarf::DW_TAG_unspecified_parameters);
+        Buffer.addChild(Arg);
+        isPrototyped = false;
+      } else {
+        DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
+        addType(Arg, DIType(Ty));
+        Buffer.addChild(Arg);
+      }
+    }
+    // Add prototype flag.
+    if (isPrototyped)
+      addUInt(&Buffer, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1);
+  }
+    break;
+  case dwarf::DW_TAG_structure_type:
+  case dwarf::DW_TAG_union_type:
+  case dwarf::DW_TAG_class_type: {
+    // Add elements to structure type.
+    DIArray Elements = CTy.getTypeArray();
+
+    // A forward struct declared type may not have elements available.
+    unsigned N = Elements.getNumElements();
+    if (N == 0)
+      break;
+
+    // Add elements to structure type.
+    for (unsigned i = 0; i < N; ++i) {
+      DIDescriptor Element = Elements.getElement(i);
+      DIE *ElemDie = NULL;
+      if (Element.isSubprogram()) {
+        DISubprogram SP(Element);
+        ElemDie = DD->createSubprogramDIE(DISubprogram(Element));
+        if (SP.isProtected())
+          addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
+                  dwarf::DW_ACCESS_protected);
+        else if (SP.isPrivate())
+          addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
+                  dwarf::DW_ACCESS_private);
+        else 
+          addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
+            dwarf::DW_ACCESS_public);
+        if (SP.isExplicit())
+          addUInt(ElemDie, dwarf::DW_AT_explicit, dwarf::DW_FORM_flag, 1);
+      }
+      else if (Element.isVariable()) {
+        DIVariable DV(Element);
+        ElemDie = new DIE(dwarf::DW_TAG_variable);
+        addString(ElemDie, dwarf::DW_AT_name, dwarf::DW_FORM_string,
+                  DV.getName());
+        addType(ElemDie, DV.getType());
+        addUInt(ElemDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
+        addUInt(ElemDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
+        addSourceLine(ElemDie, DV);
+      } else if (Element.isDerivedType())
+        ElemDie = createMemberDIE(DIDerivedType(Element));
+      else
+        continue;
+      Buffer.addChild(ElemDie);
+    }
+
+    if (CTy.isAppleBlockExtension())
+      addUInt(&Buffer, dwarf::DW_AT_APPLE_block, dwarf::DW_FORM_flag, 1);
+
+    unsigned RLang = CTy.getRunTimeLang();
+    if (RLang)
+      addUInt(&Buffer, dwarf::DW_AT_APPLE_runtime_class,
+              dwarf::DW_FORM_data1, RLang);
+
+    DICompositeType ContainingType = CTy.getContainingType();
+    if (DIDescriptor(ContainingType).isCompositeType())
+      addDIEEntry(&Buffer, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4,
+                  getOrCreateTypeDIE(DIType(ContainingType)));
+    else {
+      DIDescriptor Context = CTy.getContext();
+      addToContextOwner(&Buffer, Context);
+    }
+
+    if (CTy.isObjcClassComplete())
+      addUInt(&Buffer, dwarf::DW_AT_APPLE_objc_complete_type,
+              dwarf::DW_FORM_flag, 1);
+
+    if (Tag == dwarf::DW_TAG_class_type) 
+      addTemplateParams(Buffer, CTy.getTemplateParams());
+
+    break;
+  }
+  default:
+    break;
+  }
+
+  // Add name if not anonymous or intermediate type.
+  if (!Name.empty())
+    addString(&Buffer, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
+
+  if (Tag == dwarf::DW_TAG_enumeration_type || Tag == dwarf::DW_TAG_class_type
+      || Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type)
+    {
+    // Add size if non-zero (derived types might be zero-sized.)
+    if (Size)
+      addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
+    else {
+      // Add zero size if it is not a forward declaration.
+      if (CTy.isForwardDecl())
+        addUInt(&Buffer, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
+      else
+        addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, 0);
+    }
+
+    // Add source line info if available.
+    if (!CTy.isForwardDecl())
+      addSourceLine(&Buffer, CTy);
+  }
+}
+
+/// getOrCreateTemplateTypeParameterDIE - Find existing DIE or create new DIE 
+/// for the given DITemplateTypeParameter.
+DIE *
+CompileUnit::getOrCreateTemplateTypeParameterDIE(DITemplateTypeParameter TP) {
+  DIE *ParamDIE = getDIE(TP);
+  if (ParamDIE)
+    return ParamDIE;
+
+  ParamDIE = new DIE(dwarf::DW_TAG_template_type_parameter);
+  addType(ParamDIE, TP.getType());
+  addString(ParamDIE, dwarf::DW_AT_name, dwarf::DW_FORM_string, TP.getName());
+  return ParamDIE;
+}
+
+/// getOrCreateTemplateValueParameterDIE - Find existing DIE or create new DIE 
+/// for the given DITemplateValueParameter.
+DIE *
+CompileUnit::getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TPV) {
+  DIE *ParamDIE = getDIE(TPV);
+  if (ParamDIE)
+    return ParamDIE;
+
+  ParamDIE = new DIE(dwarf::DW_TAG_template_value_parameter);
+  addType(ParamDIE, TPV.getType());
+  if (!TPV.getName().empty())
+    addString(ParamDIE, dwarf::DW_AT_name, dwarf::DW_FORM_string, TPV.getName());
+  addUInt(ParamDIE, dwarf::DW_AT_const_value, dwarf::DW_FORM_udata, 
+          TPV.getValue());
+  return ParamDIE;
+}
+
+/// getOrCreateNameSpace - Create a DIE for DINameSpace.
+DIE *CompileUnit::getOrCreateNameSpace(DINameSpace NS) {
+  DIE *NDie = getDIE(NS);
+  if (NDie)
+    return NDie;
+  NDie = new DIE(dwarf::DW_TAG_namespace);
+  insertDIE(NS, NDie);
+  if (!NS.getName().empty())
+    addString(NDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, NS.getName());
+  addSourceLine(NDie, NS);
+  addToContextOwner(NDie, NS.getContext());
+  return NDie;
+}
+
+/// constructSubrangeDIE - Construct subrange DIE from DISubrange.
+void CompileUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy){
+  DIE *DW_Subrange = new DIE(dwarf::DW_TAG_subrange_type);
+  addDIEEntry(DW_Subrange, dwarf::DW_AT_type, dwarf::DW_FORM_ref4, IndexTy);
+  int64_t L = SR.getLo();
+  int64_t H = SR.getHi();
+
+  // The L value defines the lower bounds which is typically zero for C/C++. The
+  // H value is the upper bounds.  Values are 64 bit.  H - L + 1 is the size
+  // of the array. If L > H then do not emit DW_AT_lower_bound and 
+  // DW_AT_upper_bound attributes. If L is zero and H is also zero then the
+  // array has one element and in such case do not emit lower bound.
+
+  if (L > H) {
+    Buffer.addChild(DW_Subrange);
+    return;
+  }
+  if (L)
+    addSInt(DW_Subrange, dwarf::DW_AT_lower_bound, 0, L);
+  addSInt(DW_Subrange, dwarf::DW_AT_upper_bound, 0, H);
+  Buffer.addChild(DW_Subrange);
+}
+
+/// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
+void CompileUnit::constructArrayTypeDIE(DIE &Buffer,
+                                        DICompositeType *CTy) {
+  Buffer.setTag(dwarf::DW_TAG_array_type);
+  if (CTy->getTag() == dwarf::DW_TAG_vector_type)
+    addUInt(&Buffer, dwarf::DW_AT_GNU_vector, dwarf::DW_FORM_flag, 1);
+
+  // Emit derived type.
+  addType(&Buffer, CTy->getTypeDerivedFrom());
+  DIArray Elements = CTy->getTypeArray();
+
+  // Get an anonymous type for index type.
+  DIE *IdxTy = getIndexTyDie();
+  if (!IdxTy) {
+    // Construct an anonymous type for index type.
+    IdxTy = new DIE(dwarf::DW_TAG_base_type);
+    addUInt(IdxTy, dwarf::DW_AT_byte_size, 0, sizeof(int32_t));
+    addUInt(IdxTy, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
+            dwarf::DW_ATE_signed);
+    addDie(IdxTy);
+    setIndexTyDie(IdxTy);
+  }
+
+  // Add subranges to array type.
+  for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
+    DIDescriptor Element = Elements.getElement(i);
+    if (Element.getTag() == dwarf::DW_TAG_subrange_type)
+      constructSubrangeDIE(Buffer, DISubrange(Element), IdxTy);
+  }
+}
+
+/// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
+DIE *CompileUnit::constructEnumTypeDIE(DIEnumerator ETy) {
+  DIE *Enumerator = new DIE(dwarf::DW_TAG_enumerator);
+  StringRef Name = ETy.getName();
+  addString(Enumerator, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
+  int64_t Value = ETy.getEnumValue();
+  addSInt(Enumerator, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata, Value);
+  return Enumerator;
+}
+
+/// createMemberDIE - Create new member DIE.
+DIE *CompileUnit::createMemberDIE(DIDerivedType DT) {
+  DIE *MemberDie = new DIE(DT.getTag());
+  StringRef Name = DT.getName();
+  if (!Name.empty())
+    addString(MemberDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
+
+  addType(MemberDie, DT.getTypeDerivedFrom());
+
+  addSourceLine(MemberDie, DT);
+
+  DIEBlock *MemLocationDie = new (DIEValueAllocator) DIEBlock();
+  addUInt(MemLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+
+  uint64_t Size = DT.getSizeInBits();
+  uint64_t FieldSize = DT.getOriginalTypeSize();
+
+  if (Size != FieldSize) {
+    // Handle bitfield.
+    addUInt(MemberDie, dwarf::DW_AT_byte_size, 0, DT.getOriginalTypeSize()>>3);
+    addUInt(MemberDie, dwarf::DW_AT_bit_size, 0, DT.getSizeInBits());
+
+    uint64_t Offset = DT.getOffsetInBits();
+    uint64_t AlignMask = ~(DT.getAlignInBits() - 1);
+    uint64_t HiMark = (Offset + FieldSize) & AlignMask;
+    uint64_t FieldOffset = (HiMark - FieldSize);
+    Offset -= FieldOffset;
+
+    // Maybe we need to work from the other end.
+    if (Asm->getTargetData().isLittleEndian())
+      Offset = FieldSize - (Offset + Size);
+    addUInt(MemberDie, dwarf::DW_AT_bit_offset, 0, Offset);
+
+    // Here WD_AT_data_member_location points to the anonymous
+    // field that includes this bit field.
+    addUInt(MemLocationDie, 0, dwarf::DW_FORM_udata, FieldOffset >> 3);
+
+  } else
+    // This is not a bitfield.
+    addUInt(MemLocationDie, 0, dwarf::DW_FORM_udata, DT.getOffsetInBits() >> 3);
+
+  if (DT.getTag() == dwarf::DW_TAG_inheritance
+      && DT.isVirtual()) {
+
+    // For C++, virtual base classes are not at fixed offset. Use following
+    // expression to extract appropriate offset from vtable.
+    // BaseAddr = ObAddr + *((*ObAddr) - Offset)
+
+    DIEBlock *VBaseLocationDie = new (DIEValueAllocator) DIEBlock();
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_dup);
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_udata, DT.getOffsetInBits());
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_minus);
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+
+    addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0,
+             VBaseLocationDie);
+  } else
+    addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0, MemLocationDie);
+
+  if (DT.isProtected())
+    addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
+            dwarf::DW_ACCESS_protected);
+  else if (DT.isPrivate())
+    addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
+            dwarf::DW_ACCESS_private);
+  // Otherwise C++ member and base classes are considered public.
+  else if (DT.getCompileUnit().getLanguage() == dwarf::DW_LANG_C_plus_plus)
+    addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
+            dwarf::DW_ACCESS_public);
+  if (DT.isVirtual())
+    addUInt(MemberDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_flag,
+            dwarf::DW_VIRTUALITY_virtual);
+
+  // Objective-C properties.
+  StringRef PropertyName = DT.getObjCPropertyName();
+  if (!PropertyName.empty()) {
+    addString(MemberDie, dwarf::DW_AT_APPLE_property_name, dwarf::DW_FORM_string,
+              PropertyName);
+    StringRef GetterName = DT.getObjCPropertyGetterName();
+    if (!GetterName.empty())
+      addString(MemberDie, dwarf::DW_AT_APPLE_property_getter,
+                dwarf::DW_FORM_string, GetterName);
+    StringRef SetterName = DT.getObjCPropertySetterName();
+    if (!SetterName.empty())
+      addString(MemberDie, dwarf::DW_AT_APPLE_property_setter,
+                dwarf::DW_FORM_string, SetterName);
+    unsigned PropertyAttributes = 0;
+    if (DT.isReadOnlyObjCProperty())
+      PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_readonly;
+    if (DT.isReadWriteObjCProperty())
+      PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_readwrite;
+    if (DT.isAssignObjCProperty())
+      PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_assign;
+    if (DT.isRetainObjCProperty())
+      PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_retain;
+    if (DT.isCopyObjCProperty())
+      PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_copy;
+    if (DT.isNonAtomicObjCProperty())
+      PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_nonatomic;
+    if (PropertyAttributes)
+      addUInt(MemberDie, dwarf::DW_AT_APPLE_property_attribute, 0, 
+              PropertyAttributes);
+  }
+  return MemberDie;
+}
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
new file mode 100644
index 0000000..60a9b28
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -0,0 +1,280 @@
+//===-- llvm/CodeGen/DwarfCompileUnit.h - Dwarf Compile Unit ---*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf compile unit.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
+#define CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
+
+#include "DIE.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/OwningPtr.h"
+
+namespace llvm {
+
+class DwarfDebug;
+class MachineLocation;
+class MachineOperand;
+class ConstantInt;
+class DbgVariable;
+
+//===----------------------------------------------------------------------===//
+/// CompileUnit - This dwarf writer support class manages information associate
+/// with a source file.
+class CompileUnit {
+  /// ID - File identifier for source.
+  ///
+  unsigned ID;
+
+  /// Die - Compile unit debug information entry.
+  ///
+  const OwningPtr<DIE> CUDie;
+
+  /// Asm - Target of Dwarf emission.
+  AsmPrinter *Asm;
+
+  DwarfDebug *DD;
+
+  /// IndexTyDie - An anonymous type for index type.  Owned by CUDie.
+  DIE *IndexTyDie;
+
+  /// MDNodeToDieMap - Tracks the mapping of unit level debug informaton
+  /// variables to debug information entries.
+  DenseMap<const MDNode *, DIE *> MDNodeToDieMap;
+
+  /// MDNodeToDIEEntryMap - Tracks the mapping of unit level debug informaton
+  /// descriptors to debug information entries using a DIEEntry proxy.
+  DenseMap<const MDNode *, DIEEntry *> MDNodeToDIEEntryMap;
+
+  /// Globals - A map of globally visible named entities for this unit.
+  ///
+  StringMap<DIE*> Globals;
+
+  /// GlobalTypes - A map of globally visible types for this unit.
+  ///
+  StringMap<DIE*> GlobalTypes;
+
+  /// DIEBlocks - A list of all the DIEBlocks in use.
+  std::vector<DIEBlock *> DIEBlocks;
+
+public:
+  CompileUnit(unsigned I, DIE *D, AsmPrinter *A, DwarfDebug *DW);
+  ~CompileUnit();
+
+  // Accessors.
+  unsigned getID()                  const { return ID; }
+  DIE* getCUDie()                   const { return CUDie.get(); }
+  const StringMap<DIE*> &getGlobals()     const { return Globals; }
+  const StringMap<DIE*> &getGlobalTypes() const { return GlobalTypes; }
+
+  /// hasContent - Return true if this compile unit has something to write out.
+  ///
+  bool hasContent() const { return !CUDie->getChildren().empty(); }
+
+  /// addGlobal - Add a new global entity to the compile unit.
+  ///
+  void addGlobal(StringRef Name, DIE *Die) { Globals[Name] = Die; }
+
+  /// addGlobalType - Add a new global type to the compile unit.
+  ///
+  void addGlobalType(DIType Ty);
+
+  /// getDIE - Returns the debug information entry map slot for the
+  /// specified debug variable.
+  DIE *getDIE(const MDNode *N) { return MDNodeToDieMap.lookup(N); }
+
+  DIEBlock *getDIEBlock() { 
+    return new (DIEValueAllocator) DIEBlock();
+  }
+
+  /// insertDIE - Insert DIE into the map.
+  void insertDIE(const MDNode *N, DIE *D) {
+    MDNodeToDieMap.insert(std::make_pair(N, D));
+  }
+
+  /// getDIEEntry - Returns the debug information entry for the specified
+  /// debug variable.
+  DIEEntry *getDIEEntry(const MDNode *N) {
+    DenseMap<const MDNode *, DIEEntry *>::iterator I =
+      MDNodeToDIEEntryMap.find(N);
+    if (I == MDNodeToDIEEntryMap.end())
+      return NULL;
+    return I->second;
+  }
+
+  /// insertDIEEntry - Insert debug information entry into the map.
+  void insertDIEEntry(const MDNode *N, DIEEntry *E) {
+    MDNodeToDIEEntryMap.insert(std::make_pair(N, E));
+  }
+
+  /// addDie - Adds or interns the DIE to the compile unit.
+  ///
+  void addDie(DIE *Buffer) {
+    this->CUDie->addChild(Buffer);
+  }
+
+  // getIndexTyDie - Get an anonymous type for index type.
+  DIE *getIndexTyDie() {
+    return IndexTyDie;
+  }
+
+  // setIndexTyDie - Set D as anonymous type for index which can be reused
+  // later.
+  void setIndexTyDie(DIE *D) {
+    IndexTyDie = D;
+  }
+public:
+
+  /// addUInt - Add an unsigned integer attribute data and value.
+  ///
+  void addUInt(DIE *Die, unsigned Attribute, unsigned Form, uint64_t Integer);
+
+  /// addSInt - Add an signed integer attribute data and value.
+  ///
+  void addSInt(DIE *Die, unsigned Attribute, unsigned Form, int64_t Integer);
+
+  /// addString - Add a string attribute data and value.
+  ///
+  void addString(DIE *Die, unsigned Attribute, unsigned Form,
+                 const StringRef Str);
+
+  /// addLabel - Add a Dwarf label attribute data and value.
+  ///
+  void addLabel(DIE *Die, unsigned Attribute, unsigned Form,
+                const MCSymbol *Label);
+
+  /// addDelta - Add a label delta attribute data and value.
+  ///
+  void addDelta(DIE *Die, unsigned Attribute, unsigned Form,
+                const MCSymbol *Hi, const MCSymbol *Lo);
+
+  /// addDIEEntry - Add a DIE attribute data and value.
+  ///
+  void addDIEEntry(DIE *Die, unsigned Attribute, unsigned Form, DIE *Entry);
+  
+  /// addBlock - Add block data.
+  ///
+  void addBlock(DIE *Die, unsigned Attribute, unsigned Form, DIEBlock *Block);
+
+  /// addSourceLine - Add location information to specified debug information
+  /// entry.
+  void addSourceLine(DIE *Die, DIVariable V);
+  void addSourceLine(DIE *Die, DIGlobalVariable G);
+  void addSourceLine(DIE *Die, DISubprogram SP);
+  void addSourceLine(DIE *Die, DIType Ty);
+  void addSourceLine(DIE *Die, DINameSpace NS);
+
+  /// addAddress - Add an address attribute to a die based on the location
+  /// provided.
+  void addAddress(DIE *Die, unsigned Attribute,
+                  const MachineLocation &Location);
+
+  /// addConstantValue - Add constant value entry in variable DIE.
+  bool addConstantValue(DIE *Die, const MachineOperand &MO, DIType Ty);
+  bool addConstantValue(DIE *Die, ConstantInt *CI, bool Unsigned);
+
+  /// addConstantFPValue - Add constant value entry in variable DIE.
+  bool addConstantFPValue(DIE *Die, const MachineOperand &MO);
+
+  /// addTemplateParams - Add template parameters in buffer.
+  void addTemplateParams(DIE &Buffer, DIArray TParams);
+
+  /// addRegisterOp - Add register operand.
+  void addRegisterOp(DIE *TheDie, unsigned Reg);
+
+  /// addRegisterOffset - Add register offset.
+  void addRegisterOffset(DIE *TheDie, unsigned Reg, int64_t Offset);
+
+  /// addComplexAddress - Start with the address based on the location provided,
+  /// and generate the DWARF information necessary to find the actual variable
+  /// (navigating the extra location information encoded in the type) based on
+  /// the starting location.  Add the DWARF information to the die.
+  ///
+  void addComplexAddress(DbgVariable *&DV, DIE *Die, unsigned Attribute,
+                         const MachineLocation &Location);
+
+  // FIXME: Should be reformulated in terms of addComplexAddress.
+  /// addBlockByrefAddress - Start with the address based on the location
+  /// provided, and generate the DWARF information necessary to find the
+  /// actual Block variable (navigating the Block struct) based on the
+  /// starting location.  Add the DWARF information to the die.  Obsolete,
+  /// please use addComplexAddress instead.
+  ///
+  void addBlockByrefAddress(DbgVariable *&DV, DIE *Die, unsigned Attribute,
+                            const MachineLocation &Location);
+
+  /// addVariableAddress - Add DW_AT_location attribute for a 
+  /// DbgVariable based on provided MachineLocation.
+  void addVariableAddress(DbgVariable *&DV, DIE *Die, MachineLocation Location);
+
+  /// addToContextOwner - Add Die into the list of its context owner's children.
+  void addToContextOwner(DIE *Die, DIDescriptor Context);
+
+  /// addType - Add a new type attribute to the specified entity.
+  void addType(DIE *Entity, DIType Ty);
+
+  /// getOrCreateNameSpace - Create a DIE for DINameSpace.
+  DIE *getOrCreateNameSpace(DINameSpace NS);
+
+  /// getOrCreateTypeDIE - Find existing DIE or create new DIE for the
+  /// given DIType.
+  DIE *getOrCreateTypeDIE(DIType Ty);
+
+  /// getOrCreateTemplateTypeParameterDIE - Find existing DIE or create new DIE 
+  /// for the given DITemplateTypeParameter.
+  DIE *getOrCreateTemplateTypeParameterDIE(DITemplateTypeParameter TP);
+
+  /// getOrCreateTemplateValueParameterDIE - Find existing DIE or create new DIE 
+  /// for the given DITemplateValueParameter.
+  DIE *getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TVP);
+
+  /// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug
+  /// information entry.
+  DIEEntry *createDIEEntry(DIE *Entry);
+
+  void addPubTypes(DISubprogram SP);
+
+  /// constructTypeDIE - Construct basic type die from DIBasicType.
+  void constructTypeDIE(DIE &Buffer,
+                        DIBasicType BTy);
+
+  /// constructTypeDIE - Construct derived type die from DIDerivedType.
+  void constructTypeDIE(DIE &Buffer,
+                        DIDerivedType DTy);
+
+  /// constructTypeDIE - Construct type DIE from DICompositeType.
+  void constructTypeDIE(DIE &Buffer,
+                        DICompositeType CTy);
+
+  /// constructSubrangeDIE - Construct subrange DIE from DISubrange.
+  void constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy);
+
+  /// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
+  void constructArrayTypeDIE(DIE &Buffer, 
+                             DICompositeType *CTy);
+
+  /// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
+  DIE *constructEnumTypeDIE(DIEnumerator ETy);
+
+  /// createMemberDIE - Create new member DIE.
+  DIE *createMemberDIE(DIDerivedType DT);
+
+private:
+
+  // DIEValueAllocator - All DIEValues are allocated through this allocator.
+  BumpPtrAllocator DIEValueAllocator;
+  DIEInteger *DIEIntegerOne;
+};
+
+} // end llvm namespace
+#endif
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index bad87c1..8845bfa 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -14,6 +14,7 @@
 #define DEBUG_TYPE "dwarfdebug"
 #include "DwarfDebug.h"
 #include "DIE.h"
+#include "DwarfCompileUnit.h"
 #include "llvm/Constants.h"
 #include "llvm/Module.h"
 #include "llvm/Instructions.h"
@@ -52,13 +53,9 @@ static cl::opt<bool> DisableDebugInfoPrinting("disable-debug-info-print",
      cl::desc("Disable debug info printing"));
 
 static cl::opt<bool> UnknownLocations("use-unknown-locations", cl::Hidden,
-     cl::desc("Make an absense of debug location information explicit."),
+     cl::desc("Make an absence of debug location information explicit."),
      cl::init(false));
 
-#ifndef NDEBUG
-STATISTIC(BlocksWithoutLineNo, "Number of blocks without any line number");
-#endif
-
 namespace {
   const char *DWARFGroupName = "DWARF Emission";
   const char *DbgTimerName = "DWARF Debug Writer";
@@ -72,189 +69,56 @@ static const unsigned InitAbbreviationsSetSize = 9; // log2(512)
 
 namespace llvm {
 
-//===----------------------------------------------------------------------===//
-/// CompileUnit - This dwarf writer support class manages information associate
-/// with a source file.
-class CompileUnit {
-  /// ID - File identifier for source.
-  ///
-  unsigned ID;
-
-  /// Die - Compile unit debug information entry.
-  ///
-  const OwningPtr<DIE> CUDie;
-
-  /// IndexTyDie - An anonymous type for index type.  Owned by CUDie.
-  DIE *IndexTyDie;
-
-  /// MDNodeToDieMap - Tracks the mapping of unit level debug informaton
-  /// variables to debug information entries.
-  DenseMap<const MDNode *, DIE *> MDNodeToDieMap;
-
-  /// MDNodeToDIEEntryMap - Tracks the mapping of unit level debug informaton
-  /// descriptors to debug information entries using a DIEEntry proxy.
-  DenseMap<const MDNode *, DIEEntry *> MDNodeToDIEEntryMap;
-
-  /// Globals - A map of globally visible named entities for this unit.
-  ///
-  StringMap<DIE*> Globals;
-
-  /// GlobalTypes - A map of globally visible types for this unit.
-  ///
-  StringMap<DIE*> GlobalTypes;
-
-public:
-  CompileUnit(unsigned I, DIE *D)
-    : ID(I), CUDie(D), IndexTyDie(0) {}
-
-  // Accessors.
-  unsigned getID()                  const { return ID; }
-  DIE* getCUDie()                   const { return CUDie.get(); }
-  const StringMap<DIE*> &getGlobals()     const { return Globals; }
-  const StringMap<DIE*> &getGlobalTypes() const { return GlobalTypes; }
-
-  /// hasContent - Return true if this compile unit has something to write out.
-  ///
-  bool hasContent() const { return !CUDie->getChildren().empty(); }
-
-  /// addGlobal - Add a new global entity to the compile unit.
-  ///
-  void addGlobal(StringRef Name, DIE *Die) { Globals[Name] = Die; }
-
-  /// addGlobalType - Add a new global type to the compile unit.
-  ///
-  void addGlobalType(StringRef Name, DIE *Die) {
-    GlobalTypes[Name] = Die;
-  }
-
-  /// getDIE - Returns the debug information entry map slot for the
-  /// specified debug variable.
-  DIE *getDIE(const MDNode *N) { return MDNodeToDieMap.lookup(N); }
-
-  /// insertDIE - Insert DIE into the map.
-  void insertDIE(const MDNode *N, DIE *D) {
-    MDNodeToDieMap.insert(std::make_pair(N, D));
-  }
-
-  /// getDIEEntry - Returns the debug information entry for the speciefied
-  /// debug variable.
-  DIEEntry *getDIEEntry(const MDNode *N) {
-    DenseMap<const MDNode *, DIEEntry *>::iterator I =
-      MDNodeToDIEEntryMap.find(N);
-    if (I == MDNodeToDIEEntryMap.end())
-      return NULL;
-    return I->second;
-  }
-
-  /// insertDIEEntry - Insert debug information entry into the map.
-  void insertDIEEntry(const MDNode *N, DIEEntry *E) {
-    MDNodeToDIEEntryMap.insert(std::make_pair(N, E));
-  }
-
-  /// addDie - Adds or interns the DIE to the compile unit.
-  ///
-  void addDie(DIE *Buffer) {
-    this->CUDie->addChild(Buffer);
-  }
-
-  // getIndexTyDie - Get an anonymous type for index type.
-  DIE *getIndexTyDie() {
-    return IndexTyDie;
-  }
-
-  // setIndexTyDie - Set D as anonymous type for index which can be reused
-  // later.
-  void setIndexTyDie(DIE *D) {
-    IndexTyDie = D;
-  }
-
-};
-
-//===----------------------------------------------------------------------===//
-/// DbgVariable - This class is used to track local variable information.
-///
-class DbgVariable {
-  DIVariable Var;                    // Variable Descriptor.
-  DIE *TheDIE;                       // Variable DIE.
-  unsigned DotDebugLocOffset;        // Offset in DotDebugLocEntries.
-public:
-  // AbsVar may be NULL.
-  DbgVariable(DIVariable V) : Var(V), TheDIE(0), DotDebugLocOffset(~0U) {}
-
-  // Accessors.
-  DIVariable getVariable()           const { return Var; }
-  void setDIE(DIE *D)                      { TheDIE = D; }
-  DIE *getDIE()                      const { return TheDIE; }
-  void setDotDebugLocOffset(unsigned O)    { DotDebugLocOffset = O; }
-  unsigned getDotDebugLocOffset()    const { return DotDebugLocOffset; }
-  StringRef getName()                const { return Var.getName(); }
-  unsigned getTag()                  const { return Var.getTag(); }
-  bool variableHasComplexAddress()   const {
-    assert(Var.Verify() && "Invalid complex DbgVariable!");
-    return Var.hasComplexAddress();
-  }
-  bool isBlockByrefVariable()        const {
-    assert(Var.Verify() && "Invalid complex DbgVariable!");
-    return Var.isBlockByrefVariable();
-  }
-  unsigned getNumAddrElements()      const { 
-    assert(Var.Verify() && "Invalid complex DbgVariable!");
-    return Var.getNumAddrElements();
-  }
-  uint64_t getAddrElement(unsigned i) const {
-    return Var.getAddrElement(i);
-  }
-  DIType getType()               const {
-    DIType Ty = Var.getType();
-    // FIXME: isBlockByrefVariable should be reformulated in terms of complex
-    // addresses instead.
-    if (Var.isBlockByrefVariable()) {
-      /* Byref variables, in Blocks, are declared by the programmer as
-         "SomeType VarName;", but the compiler creates a
-         __Block_byref_x_VarName struct, and gives the variable VarName
-         either the struct, or a pointer to the struct, as its type.  This
-         is necessary for various behind-the-scenes things the compiler
-         needs to do with by-reference variables in blocks.
-         
-         However, as far as the original *programmer* is concerned, the
-         variable should still have type 'SomeType', as originally declared.
-         
-         The following function dives into the __Block_byref_x_VarName
-         struct to find the original type of the variable.  This will be
-         passed back to the code generating the type for the Debug
-         Information Entry for the variable 'VarName'.  'VarName' will then
-         have the original type 'SomeType' in its debug information.
-         
-         The original type 'SomeType' will be the type of the field named
-         'VarName' inside the __Block_byref_x_VarName struct.
-         
-         NOTE: In order for this to not completely fail on the debugger
-         side, the Debug Information Entry for the variable VarName needs to
-         have a DW_AT_location that tells the debugger how to unwind through
-         the pointers and __Block_byref_x_VarName struct to find the actual
-         value of the variable.  The function addBlockByrefType does this.  */
-      DIType subType = Ty;
-      unsigned tag = Ty.getTag();
-      
-      if (tag == dwarf::DW_TAG_pointer_type) {
-        DIDerivedType DTy = DIDerivedType(Ty);
-        subType = DTy.getTypeDerivedFrom();
-      }
-      
-      DICompositeType blockStruct = DICompositeType(subType);
-      DIArray Elements = blockStruct.getTypeArray();
-      
-      for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
-        DIDescriptor Element = Elements.getElement(i);
-        DIDerivedType DT = DIDerivedType(Element);
-        if (getName() == DT.getName())
-          return (DT.getTypeDerivedFrom());
-      }
-      return Ty;
+DIType DbgVariable::getType()               const {
+  DIType Ty = Var.getType();
+  // FIXME: isBlockByrefVariable should be reformulated in terms of complex
+  // addresses instead.
+  if (Var.isBlockByrefVariable()) {
+    /* Byref variables, in Blocks, are declared by the programmer as
+       "SomeType VarName;", but the compiler creates a
+       __Block_byref_x_VarName struct, and gives the variable VarName
+       either the struct, or a pointer to the struct, as its type.  This
+       is necessary for various behind-the-scenes things the compiler
+       needs to do with by-reference variables in blocks.
+       
+       However, as far as the original *programmer* is concerned, the
+       variable should still have type 'SomeType', as originally declared.
+       
+       The following function dives into the __Block_byref_x_VarName
+       struct to find the original type of the variable.  This will be
+       passed back to the code generating the type for the Debug
+       Information Entry for the variable 'VarName'.  'VarName' will then
+       have the original type 'SomeType' in its debug information.
+       
+       The original type 'SomeType' will be the type of the field named
+       'VarName' inside the __Block_byref_x_VarName struct.
+       
+       NOTE: In order for this to not completely fail on the debugger
+       side, the Debug Information Entry for the variable VarName needs to
+       have a DW_AT_location that tells the debugger how to unwind through
+       the pointers and __Block_byref_x_VarName struct to find the actual
+       value of the variable.  The function addBlockByrefType does this.  */
+    DIType subType = Ty;
+    unsigned tag = Ty.getTag();
+    
+    if (tag == dwarf::DW_TAG_pointer_type) {
+      DIDerivedType DTy = DIDerivedType(Ty);
+      subType = DTy.getTypeDerivedFrom();
+    }
+    
+    DICompositeType blockStruct = DICompositeType(subType);
+    DIArray Elements = blockStruct.getTypeArray();
+    
+    for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
+      DIDescriptor Element = Elements.getElement(i);
+      DIDerivedType DT = DIDerivedType(Element);
+      if (getName() == DT.getName())
+        return (DT.getTypeDerivedFrom());
     }
     return Ty;
   }
-};
+  return Ty;
+}
 
 //===----------------------------------------------------------------------===//
 /// DbgRange - This is used to track range of instructions with identical
@@ -392,19 +256,16 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
     CurrentFnDbgScope(0), PrevLabel(NULL) {
   NextStringPoolNumber = 0;
 
-  DwarfFrameSectionSym = DwarfInfoSectionSym = DwarfAbbrevSectionSym = 0;
+  DwarfInfoSectionSym = DwarfAbbrevSectionSym = 0;
   DwarfStrSectionSym = TextSectionSym = 0;
   DwarfDebugRangeSectionSym = DwarfDebugLocSectionSym = 0;
   FunctionBeginSym = FunctionEndSym = 0;
-  DIEIntegerOne = new (DIEValueAllocator) DIEInteger(1);
   {
     NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled);
     beginModule(M);
   }
 }
 DwarfDebug::~DwarfDebug() {
-  for (unsigned j = 0, M = DIEBlocks.size(); j < M; ++j)
-    DIEBlocks[j]->~DIEBlock();
 }
 
 MCSymbol *DwarfDebug::getStringPoolEntry(StringRef Str) {
@@ -439,858 +300,6 @@ void DwarfDebug::assignAbbrevNumber(DIEAbbrev &Abbrev) {
   }
 }
 
-/// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug
-/// information entry.
-DIEEntry *DwarfDebug::createDIEEntry(DIE *Entry) {
-  DIEEntry *Value = new (DIEValueAllocator) DIEEntry(Entry);
-  return Value;
-}
-
-/// addUInt - Add an unsigned integer attribute data and value.
-///
-void DwarfDebug::addUInt(DIE *Die, unsigned Attribute,
-                         unsigned Form, uint64_t Integer) {
-  if (!Form) Form = DIEInteger::BestForm(false, Integer);
-  DIEValue *Value = Integer == 1 ?
-    DIEIntegerOne : new (DIEValueAllocator) DIEInteger(Integer);
-  Die->addValue(Attribute, Form, Value);
-}
-
-/// addSInt - Add an signed integer attribute data and value.
-///
-void DwarfDebug::addSInt(DIE *Die, unsigned Attribute,
-                         unsigned Form, int64_t Integer) {
-  if (!Form) Form = DIEInteger::BestForm(true, Integer);
-  DIEValue *Value = new (DIEValueAllocator) DIEInteger(Integer);
-  Die->addValue(Attribute, Form, Value);
-}
-
-/// addString - Add a string attribute data and value. DIEString only
-/// keeps string reference.
-void DwarfDebug::addString(DIE *Die, unsigned Attribute, unsigned Form,
-                           StringRef String) {
-  DIEValue *Value = new (DIEValueAllocator) DIEString(String);
-  Die->addValue(Attribute, Form, Value);
-}
-
-/// addLabel - Add a Dwarf label attribute data and value.
-///
-void DwarfDebug::addLabel(DIE *Die, unsigned Attribute, unsigned Form,
-                          const MCSymbol *Label) {
-  DIEValue *Value = new (DIEValueAllocator) DIELabel(Label);
-  Die->addValue(Attribute, Form, Value);
-}
-
-/// addDelta - Add a label delta attribute data and value.
-///
-void DwarfDebug::addDelta(DIE *Die, unsigned Attribute, unsigned Form,
-                          const MCSymbol *Hi, const MCSymbol *Lo) {
-  DIEValue *Value = new (DIEValueAllocator) DIEDelta(Hi, Lo);
-  Die->addValue(Attribute, Form, Value);
-}
-
-/// addDIEEntry - Add a DIE attribute data and value.
-///
-void DwarfDebug::addDIEEntry(DIE *Die, unsigned Attribute, unsigned Form,
-                             DIE *Entry) {
-  Die->addValue(Attribute, Form, createDIEEntry(Entry));
-}
-
-
-/// addBlock - Add block data.
-///
-void DwarfDebug::addBlock(DIE *Die, unsigned Attribute, unsigned Form,
-                          DIEBlock *Block) {
-  Block->ComputeSize(Asm);
-  DIEBlocks.push_back(Block); // Memoize so we can call the destructor later on.
-  Die->addValue(Attribute, Block->BestForm(), Block);
-}
-
-/// addSourceLine - Add location information to specified debug information
-/// entry.
-void DwarfDebug::addSourceLine(DIE *Die, DIVariable V) {
-  // Verify variable.
-  if (!V.Verify())
-    return;
-
-  unsigned Line = V.getLineNumber();
-  if (Line == 0)
-    return;
-  unsigned FileID = GetOrCreateSourceID(V.getContext().getFilename(),
-                                        V.getContext().getDirectory());
-  assert(FileID && "Invalid file id");
-  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
-  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
-}
-
-/// addSourceLine - Add location information to specified debug information
-/// entry.
-void DwarfDebug::addSourceLine(DIE *Die, DIGlobalVariable G) {
-  // Verify global variable.
-  if (!G.Verify())
-    return;
-
-  unsigned Line = G.getLineNumber();
-  if (Line == 0)
-    return;
-  unsigned FileID = GetOrCreateSourceID(G.getContext().getFilename(),
-                                        G.getContext().getDirectory());
-  assert(FileID && "Invalid file id");
-  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
-  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
-}
-
-/// addSourceLine - Add location information to specified debug information
-/// entry.
-void DwarfDebug::addSourceLine(DIE *Die, DISubprogram SP) {
-  // Verify subprogram.
-  if (!SP.Verify())
-    return;
-  // If the line number is 0, don't add it.
-  if (SP.getLineNumber() == 0)
-    return;
-
-  unsigned Line = SP.getLineNumber();
-  if (!SP.getContext().Verify())
-    return;
-  unsigned FileID = GetOrCreateSourceID(SP.getFilename(), SP.getDirectory());
-  assert(FileID && "Invalid file id");
-  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
-  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
-}
-
-/// addSourceLine - Add location information to specified debug information
-/// entry.
-void DwarfDebug::addSourceLine(DIE *Die, DIType Ty) {
-  // Verify type.
-  if (!Ty.Verify())
-    return;
-
-  unsigned Line = Ty.getLineNumber();
-  if (Line == 0 || !Ty.getContext().Verify())
-    return;
-  unsigned FileID = GetOrCreateSourceID(Ty.getFilename(), Ty.getDirectory());
-  assert(FileID && "Invalid file id");
-  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
-  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
-}
-
-/// addSourceLine - Add location information to specified debug information
-/// entry.
-void DwarfDebug::addSourceLine(DIE *Die, DINameSpace NS) {
-  // Verify namespace.
-  if (!NS.Verify())
-    return;
-
-  unsigned Line = NS.getLineNumber();
-  if (Line == 0)
-    return;
-  StringRef FN = NS.getFilename();
-
-  unsigned FileID = GetOrCreateSourceID(FN, NS.getDirectory());
-  assert(FileID && "Invalid file id");
-  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
-  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
-}
-
-/// addVariableAddress - Add DW_AT_location attribute for a DbgVariable based
-/// on provided frame index.
-void DwarfDebug::addVariableAddress(DbgVariable *&DV, DIE *Die, int64_t FI) {
-  MachineLocation Location;
-  unsigned FrameReg;
-  const TargetFrameLowering *TFI = Asm->TM.getFrameLowering();
-  int Offset = TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg);
-  Location.set(FrameReg, Offset);
-
-  if (DV->variableHasComplexAddress())
-    addComplexAddress(DV, Die, dwarf::DW_AT_location, Location);
-  else if (DV->isBlockByrefVariable())
-    addBlockByrefAddress(DV, Die, dwarf::DW_AT_location, Location);
-  else
-    addAddress(Die, dwarf::DW_AT_location, Location);
-}
-
-/// addComplexAddress - Start with the address based on the location provided,
-/// and generate the DWARF information necessary to find the actual variable
-/// given the extra address information encoded in the DIVariable, starting from
-/// the starting location.  Add the DWARF information to the die.
-///
-void DwarfDebug::addComplexAddress(DbgVariable *&DV, DIE *Die,
-                                   unsigned Attribute,
-                                   const MachineLocation &Location) {
-  DIType Ty = DV->getType();
-
-  // Decode the original location, and use that as the start of the byref
-  // variable's location.
-  const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
-  unsigned Reg = RI->getDwarfRegNum(Location.getReg(), false);
-  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
-
-  if (Location.isReg()) {
-    if (Reg < 32) {
-      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + Reg);
-    } else {
-      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_regx);
-      addUInt(Block, 0, dwarf::DW_FORM_udata, Reg);
-    }
-  } else {
-    if (Reg < 32)
-      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + Reg);
-    else {
-      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx);
-      addUInt(Block, 0, dwarf::DW_FORM_udata, Reg);
-    }
-
-    addUInt(Block, 0, dwarf::DW_FORM_sdata, Location.getOffset());
-  }
-
-  for (unsigned i = 0, N = DV->getNumAddrElements(); i < N; ++i) {
-    uint64_t Element = DV->getAddrElement(i);
-
-    if (Element == DIBuilder::OpPlus) {
-      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
-      addUInt(Block, 0, dwarf::DW_FORM_udata, DV->getAddrElement(++i));
-    } else if (Element == DIBuilder::OpDeref) {
-      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
-    } else llvm_unreachable("unknown DIBuilder Opcode");
-  }
-
-  // Now attach the location information to the DIE.
-  addBlock(Die, Attribute, 0, Block);
-}
-
-/* Byref variables, in Blocks, are declared by the programmer as "SomeType
-   VarName;", but the compiler creates a __Block_byref_x_VarName struct, and
-   gives the variable VarName either the struct, or a pointer to the struct, as
-   its type.  This is necessary for various behind-the-scenes things the
-   compiler needs to do with by-reference variables in Blocks.
-
-   However, as far as the original *programmer* is concerned, the variable
-   should still have type 'SomeType', as originally declared.
-
-   The function getBlockByrefType dives into the __Block_byref_x_VarName
-   struct to find the original type of the variable, which is then assigned to
-   the variable's Debug Information Entry as its real type.  So far, so good.
-   However now the debugger will expect the variable VarName to have the type
-   SomeType.  So we need the location attribute for the variable to be an
-   expression that explains to the debugger how to navigate through the
-   pointers and struct to find the actual variable of type SomeType.
-
-   The following function does just that.  We start by getting
-   the "normal" location for the variable. This will be the location
-   of either the struct __Block_byref_x_VarName or the pointer to the
-   struct __Block_byref_x_VarName.
-
-   The struct will look something like:
-
-   struct __Block_byref_x_VarName {
-     ... <various fields>
-     struct __Block_byref_x_VarName *forwarding;
-     ... <various other fields>
-     SomeType VarName;
-     ... <maybe more fields>
-   };
-
-   If we are given the struct directly (as our starting point) we
-   need to tell the debugger to:
-
-   1).  Add the offset of the forwarding field.
-
-   2).  Follow that pointer to get the real __Block_byref_x_VarName
-   struct to use (the real one may have been copied onto the heap).
-
-   3).  Add the offset for the field VarName, to find the actual variable.
-
-   If we started with a pointer to the struct, then we need to
-   dereference that pointer first, before the other steps.
-   Translating this into DWARF ops, we will need to append the following
-   to the current location description for the variable:
-
-   DW_OP_deref                    -- optional, if we start with a pointer
-   DW_OP_plus_uconst <forward_fld_offset>
-   DW_OP_deref
-   DW_OP_plus_uconst <varName_fld_offset>
-
-   That is what this function does.  */
-
-/// addBlockByrefAddress - Start with the address based on the location
-/// provided, and generate the DWARF information necessary to find the
-/// actual Block variable (navigating the Block struct) based on the
-/// starting location.  Add the DWARF information to the die.  For
-/// more information, read large comment just above here.
-///
-void DwarfDebug::addBlockByrefAddress(DbgVariable *&DV, DIE *Die,
-                                      unsigned Attribute,
-                                      const MachineLocation &Location) {
-  DIType Ty = DV->getType();
-  DIType TmpTy = Ty;
-  unsigned Tag = Ty.getTag();
-  bool isPointer = false;
-
-  StringRef varName = DV->getName();
-
-  if (Tag == dwarf::DW_TAG_pointer_type) {
-    DIDerivedType DTy = DIDerivedType(Ty);
-    TmpTy = DTy.getTypeDerivedFrom();
-    isPointer = true;
-  }
-
-  DICompositeType blockStruct = DICompositeType(TmpTy);
-
-  // Find the __forwarding field and the variable field in the __Block_byref
-  // struct.
-  DIArray Fields = blockStruct.getTypeArray();
-  DIDescriptor varField = DIDescriptor();
-  DIDescriptor forwardingField = DIDescriptor();
-
-  for (unsigned i = 0, N = Fields.getNumElements(); i < N; ++i) {
-    DIDescriptor Element = Fields.getElement(i);
-    DIDerivedType DT = DIDerivedType(Element);
-    StringRef fieldName = DT.getName();
-    if (fieldName == "__forwarding")
-      forwardingField = Element;
-    else if (fieldName == varName)
-      varField = Element;
-  }
-
-  // Get the offsets for the forwarding field and the variable field.
-  unsigned forwardingFieldOffset =
-    DIDerivedType(forwardingField).getOffsetInBits() >> 3;
-  unsigned varFieldOffset =
-    DIDerivedType(varField).getOffsetInBits() >> 3;
-
-  // Decode the original location, and use that as the start of the byref
-  // variable's location.
-  const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
-  unsigned Reg = RI->getDwarfRegNum(Location.getReg(), false);
-  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
-
-  if (Location.isReg()) {
-    if (Reg < 32)
-      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + Reg);
-    else {
-      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_regx);
-      addUInt(Block, 0, dwarf::DW_FORM_udata, Reg);
-    }
-  } else {
-    if (Reg < 32)
-      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + Reg);
-    else {
-      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx);
-      addUInt(Block, 0, dwarf::DW_FORM_udata, Reg);
-    }
-
-    addUInt(Block, 0, dwarf::DW_FORM_sdata, Location.getOffset());
-  }
-
-  // If we started with a pointer to the __Block_byref... struct, then
-  // the first thing we need to do is dereference the pointer (DW_OP_deref).
-  if (isPointer)
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
-
-  // Next add the offset for the '__forwarding' field:
-  // DW_OP_plus_uconst ForwardingFieldOffset.  Note there's no point in
-  // adding the offset if it's 0.
-  if (forwardingFieldOffset > 0) {
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
-    addUInt(Block, 0, dwarf::DW_FORM_udata, forwardingFieldOffset);
-  }
-
-  // Now dereference the __forwarding field to get to the real __Block_byref
-  // struct:  DW_OP_deref.
-  addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
-
-  // Now that we've got the real __Block_byref... struct, add the offset
-  // for the variable's field to get to the location of the actual variable:
-  // DW_OP_plus_uconst varFieldOffset.  Again, don't add if it's 0.
-  if (varFieldOffset > 0) {
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
-    addUInt(Block, 0, dwarf::DW_FORM_udata, varFieldOffset);
-  }
-
-  // Now attach the location information to the DIE.
-  addBlock(Die, Attribute, 0, Block);
-}
-
-/// addAddress - Add an address attribute to a die based on the location
-/// provided.
-void DwarfDebug::addAddress(DIE *Die, unsigned Attribute,
-                            const MachineLocation &Location) {
-  const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
-  unsigned Reg = RI->getDwarfRegNum(Location.getReg(), false);
-  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
-
-  if (RI->getFrameRegister(*Asm->MF) == Location.getReg()
-      && Location.getOffset()) {
-    // If variable offset is based in frame register then use fbreg.
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_fbreg);
-    addSInt(Block, 0, dwarf::DW_FORM_sdata, Location.getOffset());
-    addBlock(Die, Attribute, 0, Block);
-    return;
-  }
-
-  if (Location.isReg()) {
-    if (Reg < 32) {
-      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + Reg);
-    } else {
-      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_regx);
-      addUInt(Block, 0, dwarf::DW_FORM_udata, Reg);
-    }
-  } else {
-    if (Reg < 32) {
-      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + Reg);
-    } else {
-      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx);
-      addUInt(Block, 0, dwarf::DW_FORM_udata, Reg);
-    }
-
-    addUInt(Block, 0, dwarf::DW_FORM_sdata, Location.getOffset());
-  }
-
-  addBlock(Die, Attribute, 0, Block);
-}
-
-/// addRegisterAddress - Add register location entry in variable DIE.
-bool DwarfDebug::addRegisterAddress(DIE *Die, const MachineOperand &MO) {
-  assert (MO.isReg() && "Invalid machine operand!");
-  if (!MO.getReg())
-    return false;
-  MachineLocation Location;
-  Location.set(MO.getReg());
-  addAddress(Die, dwarf::DW_AT_location, Location);
-  return true;
-}
-
-/// addConstantValue - Add constant value entry in variable DIE.
-bool DwarfDebug::addConstantValue(DIE *Die, const MachineOperand &MO) {
-  assert (MO.isImm() && "Invalid machine operand!");
-  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
-  unsigned Imm = MO.getImm();
-  addUInt(Block, 0, dwarf::DW_FORM_udata, Imm);
-  addBlock(Die, dwarf::DW_AT_const_value, 0, Block);
-  return true;
-}
-
-/// addConstantFPValue - Add constant value entry in variable DIE.
-bool DwarfDebug::addConstantFPValue(DIE *Die, const MachineOperand &MO) {
-  assert (MO.isFPImm() && "Invalid machine operand!");
-  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
-  APFloat FPImm = MO.getFPImm()->getValueAPF();
-
-  // Get the raw data form of the floating point.
-  const APInt FltVal = FPImm.bitcastToAPInt();
-  const char *FltPtr = (const char*)FltVal.getRawData();
-
-  int NumBytes = FltVal.getBitWidth() / 8; // 8 bits per byte.
-  bool LittleEndian = Asm->getTargetData().isLittleEndian();
-  int Incr = (LittleEndian ? 1 : -1);
-  int Start = (LittleEndian ? 0 : NumBytes - 1);
-  int Stop = (LittleEndian ? NumBytes : -1);
-
-  // Output the constant to DWARF one byte at a time.
-  for (; Start != Stop; Start += Incr)
-    addUInt(Block, 0, dwarf::DW_FORM_data1,
-            (unsigned char)0xFF & FltPtr[Start]);
-
-  addBlock(Die, dwarf::DW_AT_const_value, 0, Block);
-  return true;
-}
-
-/// addConstantValue - Add constant value entry in variable DIE.
-bool DwarfDebug::addConstantValue(DIE *Die, ConstantInt *CI,
-                                  bool Unsigned) {
-  if (CI->getBitWidth() <= 64) {
-    if (Unsigned)
-      addUInt(Die, dwarf::DW_AT_const_value, dwarf::DW_FORM_udata,
-              CI->getZExtValue());
-    else
-      addSInt(Die, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata,
-              CI->getSExtValue());
-    return true;
-  }
-
-  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
-
-  // Get the raw data form of the large APInt.
-  const APInt Val = CI->getValue();
-  const char *Ptr = (const char*)Val.getRawData();
-
-  int NumBytes = Val.getBitWidth() / 8; // 8 bits per byte.
-  bool LittleEndian = Asm->getTargetData().isLittleEndian();
-  int Incr = (LittleEndian ? 1 : -1);
-  int Start = (LittleEndian ? 0 : NumBytes - 1);
-  int Stop = (LittleEndian ? NumBytes : -1);
-
-  // Output the constant to DWARF one byte at a time.
-  for (; Start != Stop; Start += Incr)
-    addUInt(Block, 0, dwarf::DW_FORM_data1,
-            (unsigned char)0xFF & Ptr[Start]);
-
-  addBlock(Die, dwarf::DW_AT_const_value, 0, Block);
-  return true;
-}
-
-/// addTemplateParams - Add template parameters in buffer.
-void DwarfDebug::addTemplateParams(DIE &Buffer, DIArray TParams) {
-  // Add template parameters.
-  for (unsigned i = 0, e = TParams.getNumElements(); i != e; ++i) {
-    DIDescriptor Element = TParams.getElement(i);
-    if (Element.isTemplateTypeParameter())
-      Buffer.addChild(getOrCreateTemplateTypeParameterDIE(
-                        DITemplateTypeParameter(Element)));
-    else if (Element.isTemplateValueParameter())
-      Buffer.addChild(getOrCreateTemplateValueParameterDIE(
-                        DITemplateValueParameter(Element)));
-  }
-
-}
-/// addToContextOwner - Add Die into the list of its context owner's children.
-void DwarfDebug::addToContextOwner(DIE *Die, DIDescriptor Context) {
-  if (Context.isType()) {
-    DIE *ContextDIE = getOrCreateTypeDIE(DIType(Context));
-    ContextDIE->addChild(Die);
-  } else if (Context.isNameSpace()) {
-    DIE *ContextDIE = getOrCreateNameSpace(DINameSpace(Context));
-    ContextDIE->addChild(Die);
-  } else if (Context.isSubprogram()) {
-    DIE *ContextDIE = createSubprogramDIE(DISubprogram(Context));
-    ContextDIE->addChild(Die);
-  } else if (DIE *ContextDIE = getCompileUnit(Context)->getDIE(Context))
-    ContextDIE->addChild(Die);
-  else
-    getCompileUnit(Context)->addDie(Die);
-}
-
-/// getOrCreateTypeDIE - Find existing DIE or create new DIE for the
-/// given DIType.
-DIE *DwarfDebug::getOrCreateTypeDIE(DIType Ty) {
-  CompileUnit *TypeCU = getCompileUnit(Ty);
-  DIE *TyDIE = TypeCU->getDIE(Ty);
-  if (TyDIE)
-    return TyDIE;
-
-  // Create new type.
-  TyDIE = new DIE(dwarf::DW_TAG_base_type);
-  TypeCU->insertDIE(Ty, TyDIE);
-  if (Ty.isBasicType())
-    constructTypeDIE(*TyDIE, DIBasicType(Ty));
-  else if (Ty.isCompositeType())
-    constructTypeDIE(*TyDIE, DICompositeType(Ty));
-  else {
-    assert(Ty.isDerivedType() && "Unknown kind of DIType");
-    constructTypeDIE(*TyDIE, DIDerivedType(Ty));
-  }
-
-  addToContextOwner(TyDIE, Ty.getContext());
-  return TyDIE;
-}
-
-/// addType - Add a new type attribute to the specified entity.
-void DwarfDebug::addType(DIE *Entity, DIType Ty) {
-  if (!Ty.Verify())
-    return;
-
-  // Check for pre-existence.
-  CompileUnit *TypeCU = getCompileUnit(Ty);
-  DIEEntry *Entry = TypeCU->getDIEEntry(Ty);
-  // If it exists then use the existing value.
-  if (Entry) {
-    Entity->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, Entry);
-    return;
-  }
-
-  // Construct type.
-  DIE *Buffer = getOrCreateTypeDIE(Ty);
-
-  // Set up proxy.
-  Entry = createDIEEntry(Buffer);
-  TypeCU->insertDIEEntry(Ty, Entry);
-
-  Entity->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, Entry);
-}
-
-/// constructTypeDIE - Construct basic type die from DIBasicType.
-void DwarfDebug::constructTypeDIE(DIE &Buffer, DIBasicType BTy) {
-  // Get core information.
-  StringRef Name = BTy.getName();
-  Buffer.setTag(dwarf::DW_TAG_base_type);
-  addUInt(&Buffer, dwarf::DW_AT_encoding,  dwarf::DW_FORM_data1,
-          BTy.getEncoding());
-
-  // Add name if not anonymous or intermediate type.
-  if (!Name.empty())
-    addString(&Buffer, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
-  uint64_t Size = BTy.getSizeInBits() >> 3;
-  addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
-}
-
-/// constructTypeDIE - Construct derived type die from DIDerivedType.
-void DwarfDebug::constructTypeDIE(DIE &Buffer, DIDerivedType DTy) {
-  // Get core information.
-  StringRef Name = DTy.getName();
-  uint64_t Size = DTy.getSizeInBits() >> 3;
-  unsigned Tag = DTy.getTag();
-
-  // FIXME - Workaround for templates.
-  if (Tag == dwarf::DW_TAG_inheritance) Tag = dwarf::DW_TAG_reference_type;
-
-  Buffer.setTag(Tag);
-
-  // Map to main type, void will not have a type.
-  DIType FromTy = DTy.getTypeDerivedFrom();
-  addType(&Buffer, FromTy);
-
-  // Add name if not anonymous or intermediate type.
-  if (!Name.empty())
-    addString(&Buffer, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
-
-  // Add size if non-zero (derived types might be zero-sized.)
-  if (Size)
-    addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
-
-  // Add source line info if available and TyDesc is not a forward declaration.
-  if (!DTy.isForwardDecl())
-    addSourceLine(&Buffer, DTy);
-}
-
-/// constructTypeDIE - Construct type DIE from DICompositeType.
-void DwarfDebug::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
-  // Get core information.
-  StringRef Name = CTy.getName();
-
-  uint64_t Size = CTy.getSizeInBits() >> 3;
-  unsigned Tag = CTy.getTag();
-  Buffer.setTag(Tag);
-
-  switch (Tag) {
-  case dwarf::DW_TAG_vector_type:
-  case dwarf::DW_TAG_array_type:
-    constructArrayTypeDIE(Buffer, &CTy);
-    break;
-  case dwarf::DW_TAG_enumeration_type: {
-    DIArray Elements = CTy.getTypeArray();
-
-    // Add enumerators to enumeration type.
-    for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
-      DIE *ElemDie = NULL;
-      DIDescriptor Enum(Elements.getElement(i));
-      if (Enum.isEnumerator()) {
-        ElemDie = constructEnumTypeDIE(DIEnumerator(Enum));
-        Buffer.addChild(ElemDie);
-      }
-    }
-  }
-    break;
-  case dwarf::DW_TAG_subroutine_type: {
-    // Add return type.
-    DIArray Elements = CTy.getTypeArray();
-    DIDescriptor RTy = Elements.getElement(0);
-    addType(&Buffer, DIType(RTy));
-
-    bool isPrototyped = true;
-    // Add arguments.
-    for (unsigned i = 1, N = Elements.getNumElements(); i < N; ++i) {
-      DIDescriptor Ty = Elements.getElement(i);
-      if (Ty.isUnspecifiedParameter()) {
-        DIE *Arg = new DIE(dwarf::DW_TAG_unspecified_parameters);
-        Buffer.addChild(Arg);
-        isPrototyped = false;
-      } else {
-        DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
-        addType(Arg, DIType(Ty));
-        Buffer.addChild(Arg);
-      }
-    }
-    // Add prototype flag.
-    if (isPrototyped)
-      addUInt(&Buffer, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1);
-  }
-    break;
-  case dwarf::DW_TAG_structure_type:
-  case dwarf::DW_TAG_union_type:
-  case dwarf::DW_TAG_class_type: {
-    // Add elements to structure type.
-    DIArray Elements = CTy.getTypeArray();
-
-    // A forward struct declared type may not have elements available.
-    unsigned N = Elements.getNumElements();
-    if (N == 0)
-      break;
-
-    // Add elements to structure type.
-    for (unsigned i = 0; i < N; ++i) {
-      DIDescriptor Element = Elements.getElement(i);
-      DIE *ElemDie = NULL;
-      if (Element.isSubprogram()) {
-        DISubprogram SP(Element);
-        ElemDie = createSubprogramDIE(DISubprogram(Element));
-        if (SP.isProtected())
-          addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
-                  dwarf::DW_ACCESS_protected);
-        else if (SP.isPrivate())
-          addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
-                  dwarf::DW_ACCESS_private);
-        else 
-          addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
-            dwarf::DW_ACCESS_public);
-        if (SP.isExplicit())
-          addUInt(ElemDie, dwarf::DW_AT_explicit, dwarf::DW_FORM_flag, 1);
-      }
-      else if (Element.isVariable()) {
-        DIVariable DV(Element);
-        ElemDie = new DIE(dwarf::DW_TAG_variable);
-        addString(ElemDie, dwarf::DW_AT_name, dwarf::DW_FORM_string,
-                  DV.getName());
-        addType(ElemDie, DV.getType());
-        addUInt(ElemDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
-        addUInt(ElemDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
-        addSourceLine(ElemDie, DV);
-      } else if (Element.isDerivedType())
-        ElemDie = createMemberDIE(DIDerivedType(Element));
-      else
-        continue;
-      Buffer.addChild(ElemDie);
-    }
-
-    if (CTy.isAppleBlockExtension())
-      addUInt(&Buffer, dwarf::DW_AT_APPLE_block, dwarf::DW_FORM_flag, 1);
-
-    unsigned RLang = CTy.getRunTimeLang();
-    if (RLang)
-      addUInt(&Buffer, dwarf::DW_AT_APPLE_runtime_class,
-              dwarf::DW_FORM_data1, RLang);
-
-    DICompositeType ContainingType = CTy.getContainingType();
-    if (DIDescriptor(ContainingType).isCompositeType())
-      addDIEEntry(&Buffer, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4,
-                  getOrCreateTypeDIE(DIType(ContainingType)));
-    else {
-      DIDescriptor Context = CTy.getContext();
-      addToContextOwner(&Buffer, Context);
-    }
-
-    if (Tag == dwarf::DW_TAG_class_type) 
-      addTemplateParams(Buffer, CTy.getTemplateParams());
-
-    break;
-  }
-  default:
-    break;
-  }
-
-  // Add name if not anonymous or intermediate type.
-  if (!Name.empty())
-    addString(&Buffer, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
-
-  if (Tag == dwarf::DW_TAG_enumeration_type || Tag == dwarf::DW_TAG_class_type
-      || Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type)
-    {
-    // Add size if non-zero (derived types might be zero-sized.)
-    if (Size)
-      addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
-    else {
-      // Add zero size if it is not a forward declaration.
-      if (CTy.isForwardDecl())
-        addUInt(&Buffer, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
-      else
-        addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, 0);
-    }
-
-    // Add source line info if available.
-    if (!CTy.isForwardDecl())
-      addSourceLine(&Buffer, CTy);
-  }
-}
-
-/// getOrCreateTemplateTypeParameterDIE - Find existing DIE or create new DIE 
-/// for the given DITemplateTypeParameter.
-DIE *
-DwarfDebug::getOrCreateTemplateTypeParameterDIE(DITemplateTypeParameter TP) {
-  CompileUnit *TypeCU = getCompileUnit(TP);
-  DIE *ParamDIE = TypeCU->getDIE(TP);
-  if (ParamDIE)
-    return ParamDIE;
-
-  ParamDIE = new DIE(dwarf::DW_TAG_template_type_parameter);
-  addType(ParamDIE, TP.getType());
-  addString(ParamDIE, dwarf::DW_AT_name, dwarf::DW_FORM_string, TP.getName());
-  return ParamDIE;
-}
-
-/// getOrCreateTemplateValueParameterDIE - Find existing DIE or create new DIE 
-/// for the given DITemplateValueParameter.
-DIE *
-DwarfDebug::getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TPV) {
-  CompileUnit *TVCU = getCompileUnit(TPV);
-  DIE *ParamDIE = TVCU->getDIE(TPV);
-  if (ParamDIE)
-    return ParamDIE;
-
-  ParamDIE = new DIE(dwarf::DW_TAG_template_value_parameter);
-  addType(ParamDIE, TPV.getType());
-  if (!TPV.getName().empty())
-    addString(ParamDIE, dwarf::DW_AT_name, dwarf::DW_FORM_string, TPV.getName());
-  addUInt(ParamDIE, dwarf::DW_AT_const_value, dwarf::DW_FORM_udata, 
-          TPV.getValue());
-  return ParamDIE;
-}
-
-/// constructSubrangeDIE - Construct subrange DIE from DISubrange.
-void DwarfDebug::constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy){
-  int64_t L = SR.getLo();
-  int64_t H = SR.getHi();
-  DIE *DW_Subrange = new DIE(dwarf::DW_TAG_subrange_type);
-
-  addDIEEntry(DW_Subrange, dwarf::DW_AT_type, dwarf::DW_FORM_ref4, IndexTy);
-  if (L)
-    addSInt(DW_Subrange, dwarf::DW_AT_lower_bound, 0, L);
-  addSInt(DW_Subrange, dwarf::DW_AT_upper_bound, 0, H);
-
-  Buffer.addChild(DW_Subrange);
-}
-
-/// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
-void DwarfDebug::constructArrayTypeDIE(DIE &Buffer,
-                                       DICompositeType *CTy) {
-  Buffer.setTag(dwarf::DW_TAG_array_type);
-  if (CTy->getTag() == dwarf::DW_TAG_vector_type)
-    addUInt(&Buffer, dwarf::DW_AT_GNU_vector, dwarf::DW_FORM_flag, 1);
-
-  // Emit derived type.
-  addType(&Buffer, CTy->getTypeDerivedFrom());
-  DIArray Elements = CTy->getTypeArray();
-
-  // Get an anonymous type for index type.
-  CompileUnit *TheCU = getCompileUnit(*CTy);
-  DIE *IdxTy = TheCU->getIndexTyDie();
-  if (!IdxTy) {
-    // Construct an anonymous type for index type.
-    IdxTy = new DIE(dwarf::DW_TAG_base_type);
-    addUInt(IdxTy, dwarf::DW_AT_byte_size, 0, sizeof(int32_t));
-    addUInt(IdxTy, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
-            dwarf::DW_ATE_signed);
-    TheCU->addDie(IdxTy);
-    TheCU->setIndexTyDie(IdxTy);
-  }
-
-  // Add subranges to array type.
-  for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
-    DIDescriptor Element = Elements.getElement(i);
-    if (Element.getTag() == dwarf::DW_TAG_subrange_type)
-      constructSubrangeDIE(Buffer, DISubrange(Element), IdxTy);
-  }
-}
-
-/// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
-DIE *DwarfDebug::constructEnumTypeDIE(DIEnumerator ETy) {
-  DIE *Enumerator = new DIE(dwarf::DW_TAG_enumerator);
-  StringRef Name = ETy.getName();
-  addString(Enumerator, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
-  int64_t Value = ETy.getEnumValue();
-  addSInt(Enumerator, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata, Value);
-  return Enumerator;
-}
-
 /// getRealLinkageName - If special LLVM prefix that is used to inform the asm
 /// printer to not emit usual symbol prefix before the symbol name is used then
 /// return linkage name after skipping this special LLVM prefix.
@@ -1301,84 +310,6 @@ static StringRef getRealLinkageName(StringRef LinkageName) {
   return LinkageName;
 }
 
-/// createMemberDIE - Create new member DIE.
-DIE *DwarfDebug::createMemberDIE(DIDerivedType DT) {
-  DIE *MemberDie = new DIE(DT.getTag());
-  StringRef Name = DT.getName();
-  if (!Name.empty())
-    addString(MemberDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
-
-  addType(MemberDie, DT.getTypeDerivedFrom());
-
-  addSourceLine(MemberDie, DT);
-
-  DIEBlock *MemLocationDie = new (DIEValueAllocator) DIEBlock();
-  addUInt(MemLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
-
-  uint64_t Size = DT.getSizeInBits();
-  uint64_t FieldSize = DT.getOriginalTypeSize();
-
-  if (Size != FieldSize) {
-    // Handle bitfield.
-    addUInt(MemberDie, dwarf::DW_AT_byte_size, 0, DT.getOriginalTypeSize()>>3);
-    addUInt(MemberDie, dwarf::DW_AT_bit_size, 0, DT.getSizeInBits());
-
-    uint64_t Offset = DT.getOffsetInBits();
-    uint64_t AlignMask = ~(DT.getAlignInBits() - 1);
-    uint64_t HiMark = (Offset + FieldSize) & AlignMask;
-    uint64_t FieldOffset = (HiMark - FieldSize);
-    Offset -= FieldOffset;
-
-    // Maybe we need to work from the other end.
-    if (Asm->getTargetData().isLittleEndian())
-      Offset = FieldSize - (Offset + Size);
-    addUInt(MemberDie, dwarf::DW_AT_bit_offset, 0, Offset);
-
-    // Here WD_AT_data_member_location points to the anonymous
-    // field that includes this bit field.
-    addUInt(MemLocationDie, 0, dwarf::DW_FORM_udata, FieldOffset >> 3);
-
-  } else
-    // This is not a bitfield.
-    addUInt(MemLocationDie, 0, dwarf::DW_FORM_udata, DT.getOffsetInBits() >> 3);
-
-  if (DT.getTag() == dwarf::DW_TAG_inheritance
-      && DT.isVirtual()) {
-
-    // For C++, virtual base classes are not at fixed offset. Use following
-    // expression to extract appropriate offset from vtable.
-    // BaseAddr = ObAddr + *((*ObAddr) - Offset)
-
-    DIEBlock *VBaseLocationDie = new (DIEValueAllocator) DIEBlock();
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_dup);
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_udata, DT.getOffsetInBits());
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_minus);
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
-
-    addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0,
-             VBaseLocationDie);
-  } else
-    addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0, MemLocationDie);
-
-  if (DT.isProtected())
-    addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
-            dwarf::DW_ACCESS_protected);
-  else if (DT.isPrivate())
-    addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
-            dwarf::DW_ACCESS_private);
-  // Otherwise C++ member and base classes are considered public.
-  else if (DT.getCompileUnit().getLanguage() == dwarf::DW_LANG_C_plus_plus)
-    addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
-            dwarf::DW_ACCESS_public);
-  if (DT.isVirtual())
-    addUInt(MemberDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_flag,
-            dwarf::DW_VIRTUALITY_virtual);
-  return MemberDie;
-}
-
 /// createSubprogramDIE - Create new DIE using SP.
 DIE *DwarfDebug::createSubprogramDIE(DISubprogram SP) {
   CompileUnit *SPCU = getCompileUnit(SP);
@@ -1387,19 +318,35 @@ DIE *DwarfDebug::createSubprogramDIE(DISubprogram SP) {
     return SPDie;
 
   SPDie = new DIE(dwarf::DW_TAG_subprogram);
-  // Constructors and operators for anonymous aggregates do not have names.
-  if (!SP.getName().empty())
-    addString(SPDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, SP.getName());
+  
+  // DW_TAG_inlined_subroutine may refer to this DIE.
+  SPCU->insertDIE(SP, SPDie);
+  
+  // Add to context owner.
+  SPCU->addToContextOwner(SPDie, SP.getContext());
+
+  // Add function template parameters.
+  SPCU->addTemplateParams(*SPDie, SP.getTemplateParams());
 
   StringRef LinkageName = SP.getLinkageName();
   if (!LinkageName.empty())
-    addString(SPDie, dwarf::DW_AT_MIPS_linkage_name, dwarf::DW_FORM_string,
-              getRealLinkageName(LinkageName));
+    SPCU->addString(SPDie, dwarf::DW_AT_MIPS_linkage_name, dwarf::DW_FORM_string,
+                    getRealLinkageName(LinkageName));
 
-  addSourceLine(SPDie, SP);
+  // If this DIE is going to refer declaration info using AT_specification
+  // then there is no need to add other attributes.
+  if (SP.getFunctionDeclaration().isSubprogram())
+    return SPDie;
+
+  // Constructors and operators for anonymous aggregates do not have names.
+  if (!SP.getName().empty())
+    SPCU->addString(SPDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, 
+                    SP.getName());
+
+  SPCU->addSourceLine(SPDie, SP);
 
   if (SP.isPrototyped()) 
-    addUInt(SPDie, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1);
+    SPCU->addUInt(SPDie, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1);
 
   // Add Return Type.
   DICompositeType SPTy = SP.getType();
@@ -1407,24 +354,24 @@ DIE *DwarfDebug::createSubprogramDIE(DISubprogram SP) {
   unsigned SPTag = SPTy.getTag();
 
   if (Args.getNumElements() == 0 || SPTag != dwarf::DW_TAG_subroutine_type)
-    addType(SPDie, SPTy);
+    SPCU->addType(SPDie, SPTy);
   else
-    addType(SPDie, DIType(Args.getElement(0)));
+    SPCU->addType(SPDie, DIType(Args.getElement(0)));
 
   unsigned VK = SP.getVirtuality();
   if (VK) {
-    addUInt(SPDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_flag, VK);
-    DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
-    addUInt(Block, 0, dwarf::DW_FORM_udata, SP.getVirtualIndex());
-    addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, 0, Block);
+    SPCU->addUInt(SPDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_flag, VK);
+    DIEBlock *Block = SPCU->getDIEBlock();
+    SPCU->addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+    SPCU->addUInt(Block, 0, dwarf::DW_FORM_udata, SP.getVirtualIndex());
+    SPCU->addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, 0, Block);
     ContainingTypeMap.insert(std::make_pair(SPDie,
                                             SP.getContainingType()));
   }
 
   if (!SP.isDefinition()) {
-    addUInt(SPDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
-
+    SPCU->addUInt(SPDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
+    
     // Add arguments. Do not add arguments for subprogram definition. They will
     // be handled while processing variables.
     DICompositeType SPTy = SP.getType();
@@ -1435,35 +382,26 @@ DIE *DwarfDebug::createSubprogramDIE(DISubprogram SP) {
       for (unsigned i = 1, N =  Args.getNumElements(); i < N; ++i) {
         DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
         DIType ATy = DIType(DIType(Args.getElement(i)));
-        addType(Arg, ATy);
+        SPCU->addType(Arg, ATy);
         if (ATy.isArtificial())
-          addUInt(Arg, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1);
+          SPCU->addUInt(Arg, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1);
         SPDie->addChild(Arg);
       }
   }
 
   if (SP.isArtificial())
-    addUInt(SPDie, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1);
+    SPCU->addUInt(SPDie, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1);
 
   if (!SP.isLocalToUnit())
-    addUInt(SPDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
+    SPCU->addUInt(SPDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
 
   if (SP.isOptimized())
-    addUInt(SPDie, dwarf::DW_AT_APPLE_optimized, dwarf::DW_FORM_flag, 1);
+    SPCU->addUInt(SPDie, dwarf::DW_AT_APPLE_optimized, dwarf::DW_FORM_flag, 1);
 
   if (unsigned isa = Asm->getISAEncoding()) {
-    addUInt(SPDie, dwarf::DW_AT_APPLE_isa, dwarf::DW_FORM_flag, isa);
+    SPCU->addUInt(SPDie, dwarf::DW_AT_APPLE_isa, dwarf::DW_FORM_flag, isa);
   }
 
-  // Add function template parameters.
-  addTemplateParams(*SPDie, SP.getTemplateParams());
-
-  // DW_TAG_inlined_subroutine may refer to this DIE.
-  SPCU->insertDIE(SP, SPDie);
-
-  // Add to context owner.
-  addToContextOwner(SPDie, SP.getContext());
-
   return SPDie;
 }
 
@@ -1518,51 +456,57 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(const MDNode *SPNode) {
   assert(SPDie && "Unable to find subprogram DIE!");
   DISubprogram SP(SPNode);
 
-  // There is not any need to generate specification DIE for a function
-  // defined at compile unit level. If a function is defined inside another
-  // function then gdb prefers the definition at top level and but does not
-  // expect specification DIE in parent function. So avoid creating
-  // specification DIE for a function defined inside a function.
-  if (SP.isDefinition() && !SP.getContext().isCompileUnit() &&
-      !SP.getContext().isFile() &&
-      !isSubprogramContext(SP.getContext())) {
-    addUInt(SPDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
-
-    // Add arguments.
-    DICompositeType SPTy = SP.getType();
-    DIArray Args = SPTy.getTypeArray();
-    unsigned SPTag = SPTy.getTag();
-    if (SPTag == dwarf::DW_TAG_subroutine_type)
-      for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) {
-        DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
-        DIType ATy = DIType(DIType(Args.getElement(i)));
-        addType(Arg, ATy);
-        if (ATy.isArtificial())
-          addUInt(Arg, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1);
-        SPDie->addChild(Arg);
-      }
-    DIE *SPDeclDie = SPDie;
-    SPDie = new DIE(dwarf::DW_TAG_subprogram);
-    addDIEEntry(SPDie, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4,
-                SPDeclDie);
-    SPCU->addDie(SPDie);
+  DISubprogram SPDecl = SP.getFunctionDeclaration();
+  if (SPDecl.isSubprogram())
+    // Refer function declaration directly.
+    SPCU->addDIEEntry(SPDie, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4,
+                      createSubprogramDIE(SPDecl));
+  else {
+    // There is not any need to generate specification DIE for a function
+    // defined at compile unit level. If a function is defined inside another
+    // function then gdb prefers the definition at top level and but does not
+    // expect specification DIE in parent function. So avoid creating
+    // specification DIE for a function defined inside a function.
+    if (SP.isDefinition() && !SP.getContext().isCompileUnit() &&
+        !SP.getContext().isFile() &&
+        !isSubprogramContext(SP.getContext())) {
+      SPCU-> addUInt(SPDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
+      
+      // Add arguments.
+      DICompositeType SPTy = SP.getType();
+      DIArray Args = SPTy.getTypeArray();
+      unsigned SPTag = SPTy.getTag();
+      if (SPTag == dwarf::DW_TAG_subroutine_type)
+        for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) {
+          DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
+          DIType ATy = DIType(DIType(Args.getElement(i)));
+          SPCU->addType(Arg, ATy);
+          if (ATy.isArtificial())
+            SPCU->addUInt(Arg, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1);
+          SPDie->addChild(Arg);
+        }
+      DIE *SPDeclDie = SPDie;
+      SPDie = new DIE(dwarf::DW_TAG_subprogram);
+      SPCU->addDIEEntry(SPDie, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4,
+                        SPDeclDie);
+      SPCU->addDie(SPDie);
+    }
   }
-
   // Pick up abstract subprogram DIE.
   if (DIE *AbsSPDIE = AbstractSPDies.lookup(SPNode)) {
     SPDie = new DIE(dwarf::DW_TAG_subprogram);
-    addDIEEntry(SPDie, dwarf::DW_AT_abstract_origin,
-                dwarf::DW_FORM_ref4, AbsSPDIE);
+    SPCU->addDIEEntry(SPDie, dwarf::DW_AT_abstract_origin,
+                      dwarf::DW_FORM_ref4, AbsSPDIE);
     SPCU->addDie(SPDie);
   }
 
-  addLabel(SPDie, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr,
-           Asm->GetTempSymbol("func_begin", Asm->getFunctionNumber()));
-  addLabel(SPDie, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr,
-           Asm->GetTempSymbol("func_end", Asm->getFunctionNumber()));
+  SPCU->addLabel(SPDie, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr,
+                 Asm->GetTempSymbol("func_begin", Asm->getFunctionNumber()));
+  SPCU->addLabel(SPDie, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr,
+                 Asm->GetTempSymbol("func_end", Asm->getFunctionNumber()));
   const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
   MachineLocation Location(RI->getFrameRegister(*Asm->MF));
-  addAddress(SPDie, dwarf::DW_AT_frame_base, Location);
+  SPCU->addAddress(SPDie, dwarf::DW_AT_frame_base, Location);
 
   return SPDie;
 }
@@ -1579,13 +523,14 @@ DIE *DwarfDebug::constructLexicalScopeDIE(DbgScope *Scope) {
   if (Ranges.empty())
     return 0;
 
+  CompileUnit *TheCU = getCompileUnit(Scope->getScopeNode());
   SmallVector<DbgRange, 4>::const_iterator RI = Ranges.begin();
   if (Ranges.size() > 1) {
     // .debug_range section has not been laid out yet. Emit offset in
     // .debug_range as a uint, size 4, for now. emitDIE will handle
     // DW_AT_ranges appropriately.
-    addUInt(ScopeDIE, dwarf::DW_AT_ranges, dwarf::DW_FORM_data4,
-            DebugRangeSymbols.size() * Asm->getTargetData().getPointerSize());
+    TheCU->addUInt(ScopeDIE, dwarf::DW_AT_ranges, dwarf::DW_FORM_data4,
+                   DebugRangeSymbols.size() * Asm->getTargetData().getPointerSize());
     for (SmallVector<DbgRange, 4>::const_iterator RI = Ranges.begin(),
          RE = Ranges.end(); RI != RE; ++RI) {
       DebugRangeSymbols.push_back(getLabelBeforeInsn(RI->first));
@@ -1604,8 +549,8 @@ DIE *DwarfDebug::constructLexicalScopeDIE(DbgScope *Scope) {
   assert(Start->isDefined() && "Invalid starting label for an inlined scope!");
   assert(End->isDefined() && "Invalid end label for an inlined scope!");
 
-  addLabel(ScopeDIE, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, Start);
-  addLabel(ScopeDIE, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr, End);
+  TheCU->addLabel(ScopeDIE, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, Start);
+  TheCU->addLabel(ScopeDIE, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr, End);
 
   return ScopeDIE;
 }
@@ -1639,17 +584,19 @@ DIE *DwarfDebug::constructInlinedScopeDIE(DbgScope *Scope) {
   if (!Scope->getScopeNode())
     return NULL;
   DIScope DS(Scope->getScopeNode());
-  DIE *ScopeDIE = new DIE(dwarf::DW_TAG_inlined_subroutine);
-
   DISubprogram InlinedSP = getDISubprogram(DS);
   CompileUnit *TheCU = getCompileUnit(InlinedSP);
   DIE *OriginDIE = TheCU->getDIE(InlinedSP);
-  assert(OriginDIE && "Unable to find Origin DIE!");
-  addDIEEntry(ScopeDIE, dwarf::DW_AT_abstract_origin,
-              dwarf::DW_FORM_ref4, OriginDIE);
+  if (!OriginDIE) {
+    DEBUG(dbgs() << "Unable to find original DIE for inlined subprogram.");
+    return NULL;
+  }
+  DIE *ScopeDIE = new DIE(dwarf::DW_TAG_inlined_subroutine);
+  TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_abstract_origin,
+                     dwarf::DW_FORM_ref4, OriginDIE);
 
-  addLabel(ScopeDIE, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, StartLabel);
-  addLabel(ScopeDIE, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr, EndLabel);
+  TheCU->addLabel(ScopeDIE, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, StartLabel);
+  TheCU->addLabel(ScopeDIE, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr, EndLabel);
 
   InlinedSubprogramDIEs.insert(OriginDIE);
 
@@ -1665,8 +612,8 @@ DIE *DwarfDebug::constructInlinedScopeDIE(DbgScope *Scope) {
     I->second.push_back(std::make_pair(StartLabel, ScopeDIE));
 
   DILocation DL(Scope->getInlinedAt());
-  addUInt(ScopeDIE, dwarf::DW_AT_call_file, 0, TheCU->getID());
-  addUInt(ScopeDIE, dwarf::DW_AT_call_line, 0, DL.getLineNumber());
+  TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_file, 0, TheCU->getID());
+  TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_line, 0, DL.getLineNumber());
 
   return ScopeDIE;
 }
@@ -1695,7 +642,7 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) {
 
   // Define variable debug information entry.
   DIE *VariableDie = new DIE(Tag);
-
+  CompileUnit *VariableCU = getCompileUnit(DV->getVariable());
   DIE *AbsDIE = NULL;
   DenseMap<const DbgVariable *, const DbgVariable *>::iterator
     V2AVI = VarToAbstractVarMap.find(DV);
@@ -1703,20 +650,23 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) {
     AbsDIE = V2AVI->second->getDIE();
 
   if (AbsDIE)
-    addDIEEntry(VariableDie, dwarf::DW_AT_abstract_origin,
-                dwarf::DW_FORM_ref4, AbsDIE);
+    VariableCU->addDIEEntry(VariableDie, dwarf::DW_AT_abstract_origin,
+                       dwarf::DW_FORM_ref4, AbsDIE);
   else {
-    addString(VariableDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
-    addSourceLine(VariableDie, DV->getVariable());
+    VariableCU->addString(VariableDie, dwarf::DW_AT_name, dwarf::DW_FORM_string,
+                          Name);
+    VariableCU->addSourceLine(VariableDie, DV->getVariable());
 
     // Add variable type.
-    addType(VariableDie, DV->getType());
+    VariableCU->addType(VariableDie, DV->getType());
   }
 
   if (Tag == dwarf::DW_TAG_formal_parameter && DV->getType().isArtificial())
-    addUInt(VariableDie, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1);
+    VariableCU->addUInt(VariableDie, dwarf::DW_AT_artificial, 
+                        dwarf::DW_FORM_flag, 1);
   else if (DIVariable(DV->getVariable()).isArtificial())
-    addUInt(VariableDie, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1);
+    VariableCU->addUInt(VariableDie, dwarf::DW_AT_artificial, 
+                        dwarf::DW_FORM_flag, 1);
 
   if (Scope->isAbstractScope()) {
     DV->setDIE(VariableDie);
@@ -1727,7 +677,7 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) {
 
   unsigned Offset = DV->getDotDebugLocOffset();
   if (Offset != ~0U) {
-    addLabel(VariableDie, dwarf::DW_AT_location, dwarf::DW_FORM_data4,
+    VariableCU->addLabel(VariableDie, dwarf::DW_AT_location, dwarf::DW_FORM_data4,
              Asm->GetTempSymbol("debug_loc", Offset));
     DV->setDIE(VariableDie);
     UseDotDebugLocEntry.insert(VariableDie);
@@ -1747,22 +697,31 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) {
         const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo();
         if (DVInsn->getOperand(1).isImm() &&
             TRI->getFrameRegister(*Asm->MF) == RegOp.getReg()) {
-          addVariableAddress(DV, VariableDie, DVInsn->getOperand(1).getImm());
-          updated = true;
-        } else
-          updated = addRegisterAddress(VariableDie, RegOp);
+          unsigned FrameReg = 0;
+          const TargetFrameLowering *TFI = Asm->TM.getFrameLowering();
+          int Offset = 
+            TFI->getFrameIndexReference(*Asm->MF, 
+                                        DVInsn->getOperand(1).getImm(), 
+                                        FrameReg);
+          MachineLocation Location(FrameReg, Offset);
+          VariableCU->addVariableAddress(DV, VariableDie, Location);
+          
+        } else if (RegOp.getReg())
+          VariableCU->addVariableAddress(DV, VariableDie, 
+                                         MachineLocation(RegOp.getReg()));
+        updated = true;
       }
       else if (DVInsn->getOperand(0).isImm())
-        updated = addConstantValue(VariableDie, DVInsn->getOperand(0));
+        updated = 
+          VariableCU->addConstantValue(VariableDie, DVInsn->getOperand(0),
+                                       DV->getType());
       else if (DVInsn->getOperand(0).isFPImm())
         updated =
-          addConstantFPValue(VariableDie, DVInsn->getOperand(0));
+          VariableCU->addConstantFPValue(VariableDie, DVInsn->getOperand(0));
     } else {
-      MachineLocation Location = Asm->getDebugValueLocation(DVInsn);
-      if (Location.getReg()) {
-        addAddress(VariableDie, dwarf::DW_AT_location, Location);
-        updated = true;
-      }
+      VariableCU->addVariableAddress(DV, VariableDie, 
+                                     Asm->getDebugValueLocation(DVInsn));
+      updated = true;
     }
     if (!updated) {
       // If variableDie is not updated then DBG_VALUE instruction does not
@@ -1776,35 +735,20 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) {
 
   // .. else use frame index, if available.
   int FI = 0;
-  if (findVariableFrameIndex(DV, &FI))
-    addVariableAddress(DV, VariableDie, FI);
-  
+  if (findVariableFrameIndex(DV, &FI)) {
+    unsigned FrameReg = 0;
+    const TargetFrameLowering *TFI = Asm->TM.getFrameLowering();
+    int Offset = 
+      TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg);
+    MachineLocation Location(FrameReg, Offset);
+    VariableCU->addVariableAddress(DV, VariableDie, Location);
+  }
+
   DV->setDIE(VariableDie);
   return VariableDie;
 
 }
 
-void DwarfDebug::addPubTypes(DISubprogram SP) {
-  DICompositeType SPTy = SP.getType();
-  unsigned SPTag = SPTy.getTag();
-  if (SPTag != dwarf::DW_TAG_subroutine_type)
-    return;
-
-  DIArray Args = SPTy.getTypeArray();
-  for (unsigned i = 0, e = Args.getNumElements(); i != e; ++i) {
-    DIType ATy(Args.getElement(i));
-    if (!ATy.Verify())
-      continue;
-    DICompositeType CATy = getDICompositeType(ATy);
-    if (DIDescriptor(CATy).Verify() && !CATy.getName().empty()
-        && !CATy.isForwardDecl()) {
-      CompileUnit *TheCU = getCompileUnit(CATy);
-      if (DIEEntry *Entry = TheCU->getDIEEntry(CATy))
-        TheCU->addGlobalType(CATy.getName(), Entry->getEntry());
-    }
-  }
-}
-
 /// constructScopeDIE - Construct a DIE for this scope.
 DIE *DwarfDebug::constructScopeDIE(DbgScope *Scope) {
   if (!Scope || !Scope->getScopeNode())
@@ -1858,7 +802,7 @@ DIE *DwarfDebug::constructScopeDIE(DbgScope *Scope) {
     ScopeDIE->addChild(*I);
 
   if (DS.isSubprogram())
-    addPubTypes(DISubprogram(DS));
+    getCompileUnit(DS)->addPubTypes(DISubprogram(DS));
 
  return ScopeDIE;
 }
@@ -1875,11 +819,9 @@ unsigned DwarfDebug::GetOrCreateSourceID(StringRef FileName,
     return GetOrCreateSourceID("<stdin>", StringRef());
 
   // MCStream expects full path name as filename.
-  if (!DirName.empty() && !FileName.startswith("/")) {
-    std::string FullPathName(DirName.data());
-    if (!DirName.endswith("/"))
-      FullPathName += "/";
-    FullPathName += FileName.data();
+  if (!DirName.empty() && !sys::path::is_absolute(FileName)) {
+    SmallString<128> FullPathName = DirName;
+    sys::path::append(FullPathName, FileName);
     // Here FullPathName will be copied into StringMap by GetOrCreateSourceID.
     return GetOrCreateSourceID(StringRef(FullPathName), StringRef());
   }
@@ -1897,21 +839,6 @@ unsigned DwarfDebug::GetOrCreateSourceID(StringRef FileName,
   return SrcId;
 }
 
-/// getOrCreateNameSpace - Create a DIE for DINameSpace.
-DIE *DwarfDebug::getOrCreateNameSpace(DINameSpace NS) {
-  CompileUnit *TheCU = getCompileUnit(NS);
-  DIE *NDie = TheCU->getDIE(NS);
-  if (NDie)
-    return NDie;
-  NDie = new DIE(dwarf::DW_TAG_namespace);
-  TheCU->insertDIE(NS, NDie);
-  if (!NS.getName().empty())
-    addString(NDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, NS.getName());
-  addSourceLine(NDie, NS);
-  addToContextOwner(NDie, NS.getContext());
-  return NDie;
-}
-
 /// constructCompileUnit - Create new CompileUnit for the given
 /// metadata node with tag DW_TAG_compile_unit.
 void DwarfDebug::constructCompileUnit(const MDNode *N) {
@@ -1921,37 +848,37 @@ void DwarfDebug::constructCompileUnit(const MDNode *N) {
   unsigned ID = GetOrCreateSourceID(FN, Dir);
 
   DIE *Die = new DIE(dwarf::DW_TAG_compile_unit);
-  addString(Die, dwarf::DW_AT_producer, dwarf::DW_FORM_string,
-            DIUnit.getProducer());
-  addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2,
-          DIUnit.getLanguage());
-  addString(Die, dwarf::DW_AT_name, dwarf::DW_FORM_string, FN);
+  CompileUnit *NewCU = new CompileUnit(ID, Die, Asm, this);
+  NewCU->addString(Die, dwarf::DW_AT_producer, dwarf::DW_FORM_string,
+                   DIUnit.getProducer());
+  NewCU->addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2,
+                 DIUnit.getLanguage());
+  NewCU->addString(Die, dwarf::DW_AT_name, dwarf::DW_FORM_string, FN);
   // Use DW_AT_entry_pc instead of DW_AT_low_pc/DW_AT_high_pc pair. This
   // simplifies debug range entries.
-  addUInt(Die, dwarf::DW_AT_entry_pc, dwarf::DW_FORM_addr, 0);
+  NewCU->addUInt(Die, dwarf::DW_AT_entry_pc, dwarf::DW_FORM_addr, 0);
   // DW_AT_stmt_list is a offset of line number information for this
   // compile unit in debug_line section.
-  if (Asm->MAI->doesDwarfUsesAbsoluteLabelForStmtList())
-    addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_addr,
-             Asm->GetTempSymbol("section_line"));
+  if(Asm->MAI->doesDwarfRequireRelocationForSectionOffset())
+    NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4,
+                    Asm->GetTempSymbol("section_line"));
   else
-    addUInt(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0);
+    NewCU->addUInt(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0);
 
   if (!Dir.empty())
-    addString(Die, dwarf::DW_AT_comp_dir, dwarf::DW_FORM_string, Dir);
+    NewCU->addString(Die, dwarf::DW_AT_comp_dir, dwarf::DW_FORM_string, Dir);
   if (DIUnit.isOptimized())
-    addUInt(Die, dwarf::DW_AT_APPLE_optimized, dwarf::DW_FORM_flag, 1);
+    NewCU->addUInt(Die, dwarf::DW_AT_APPLE_optimized, dwarf::DW_FORM_flag, 1);
 
   StringRef Flags = DIUnit.getFlags();
   if (!Flags.empty())
-    addString(Die, dwarf::DW_AT_APPLE_flags, dwarf::DW_FORM_string, Flags);
-
+    NewCU->addString(Die, dwarf::DW_AT_APPLE_flags, dwarf::DW_FORM_string, Flags);
+  
   unsigned RVer = DIUnit.getRunTimeVersion();
   if (RVer)
-    addUInt(Die, dwarf::DW_AT_APPLE_major_runtime_vers,
+    NewCU->addUInt(Die, dwarf::DW_AT_APPLE_major_runtime_vers,
             dwarf::DW_FORM_data1, RVer);
 
-  CompileUnit *NewCU = new CompileUnit(ID, Die);
   if (!FirstCU)
     FirstCU = NewCU;
   CUMap.insert(std::make_pair(N, NewCU));
@@ -2047,38 +974,34 @@ void DwarfDebug::constructGlobalVariableDIE(const MDNode *N) {
   bool isGlobalVariable = GV.getGlobal() != NULL;
 
   // Add name.
-  addString(VariableDIE, dwarf::DW_AT_name, dwarf::DW_FORM_string,
-            GV.getDisplayName());
+  TheCU->addString(VariableDIE, dwarf::DW_AT_name, dwarf::DW_FORM_string,
+                   GV.getDisplayName());
   StringRef LinkageName = GV.getLinkageName();
   if (!LinkageName.empty() && isGlobalVariable)
-    addString(VariableDIE, dwarf::DW_AT_MIPS_linkage_name, dwarf::DW_FORM_string,
-              getRealLinkageName(LinkageName));
+    TheCU->addString(VariableDIE, dwarf::DW_AT_MIPS_linkage_name, 
+                     dwarf::DW_FORM_string,
+                     getRealLinkageName(LinkageName));
   // Add type.
-  addType(VariableDIE, GTy);
-  if (GTy.isCompositeType() && !GTy.getName().empty()
-      && !GTy.isForwardDecl()) {
-    DIEEntry *Entry = TheCU->getDIEEntry(GTy);
-    assert(Entry && "Missing global type!");
-    TheCU->addGlobalType(GTy.getName(), Entry->getEntry());
-  }
+  TheCU->addType(VariableDIE, GTy);
+
   // Add scoping info.
   if (!GV.isLocalToUnit()) {
-    addUInt(VariableDIE, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
+    TheCU->addUInt(VariableDIE, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
     // Expose as global. 
     TheCU->addGlobal(GV.getName(), VariableDIE);
   }
   // Add line number info.
-  addSourceLine(VariableDIE, GV);
+  TheCU->addSourceLine(VariableDIE, GV);
   // Add to map.
   TheCU->insertDIE(N, VariableDIE);
   // Add to context owner.
   DIDescriptor GVContext = GV.getContext();
-  addToContextOwner(VariableDIE, GVContext);
+  TheCU->addToContextOwner(VariableDIE, GVContext);
   // Add location.
   if (isGlobalVariable) {
     DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
-    addLabel(Block, 0, dwarf::DW_FORM_udata,
+    TheCU->addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
+    TheCU->addLabel(Block, 0, dwarf::DW_FORM_udata,
              Asm->Mang->getSymbol(GV.getGlobal()));
     // Do not create specification DIE if context is either compile unit
     // or a subprogram.
@@ -2086,28 +1009,28 @@ void DwarfDebug::constructGlobalVariableDIE(const MDNode *N) {
         !GVContext.isFile() && !isSubprogramContext(GVContext)) {
       // Create specification DIE.
       DIE *VariableSpecDIE = new DIE(dwarf::DW_TAG_variable);
-      addDIEEntry(VariableSpecDIE, dwarf::DW_AT_specification,
+      TheCU->addDIEEntry(VariableSpecDIE, dwarf::DW_AT_specification,
                   dwarf::DW_FORM_ref4, VariableDIE);
-      addBlock(VariableSpecDIE, dwarf::DW_AT_location, 0, Block);
-      addUInt(VariableDIE, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
+      TheCU->addBlock(VariableSpecDIE, dwarf::DW_AT_location, 0, Block);
+      TheCU->addUInt(VariableDIE, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
       TheCU->addDie(VariableSpecDIE);
     } else {
-      addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
+      TheCU->addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
     } 
   } else if (ConstantInt *CI = 
              dyn_cast_or_null<ConstantInt>(GV.getConstant()))
-    addConstantValue(VariableDIE, CI, isUnsignedDIType(GTy));
+    TheCU->addConstantValue(VariableDIE, CI, isUnsignedDIType(GTy));
   else if (const ConstantExpr *CE = getMergedGlobalExpr(N->getOperand(11))) {
     // GV is a merged global.
     DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
-    addLabel(Block, 0, dwarf::DW_FORM_udata,
-             Asm->Mang->getSymbol(cast<GlobalValue>(CE->getOperand(0))));
+    TheCU->addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
+    TheCU->addLabel(Block, 0, dwarf::DW_FORM_udata,
+                    Asm->Mang->getSymbol(cast<GlobalValue>(CE->getOperand(0))));
     ConstantInt *CII = cast<ConstantInt>(CE->getOperand(2));
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
-    addUInt(Block, 0, dwarf::DW_FORM_udata, CII->getZExtValue());
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
-    addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
+    TheCU->addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+    TheCU->addUInt(Block, 0, dwarf::DW_FORM_udata, CII->getZExtValue());
+    TheCU->addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+    TheCU->addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
   }
 
   return;
@@ -2133,7 +1056,7 @@ void DwarfDebug::constructSubprogramDIE(const MDNode *N) {
   TheCU->insertDIE(N, SubprogramDie);
 
   // Add to context owner.
-  addToContextOwner(SubprogramDie, SP.getContext());
+  TheCU->addToContextOwner(SubprogramDie, SP.getContext());
 
   // Expose as global.
   TheCU->addGlobal(SP.getName(), SubprogramDie);
@@ -2148,52 +1071,80 @@ void DwarfDebug::beginModule(Module *M) {
   if (DisableDebugInfoPrinting)
     return;
 
-  DebugInfoFinder DbgFinder;
-  DbgFinder.processModule(*M);
-
-  bool HasDebugInfo = false;
+  // If module has named metadata anchors then use them, otherwise scan the module
+  // using debug info finder to collect debug info.
+  NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
+  if (CU_Nodes) {
+
+    NamedMDNode *GV_Nodes = M->getNamedMetadata("llvm.dbg.gv");
+    NamedMDNode *SP_Nodes = M->getNamedMetadata("llvm.dbg.sp");
+    if (!GV_Nodes && !SP_Nodes)
+      // If there are not any global variables or any functions then
+      // there is not any debug info in this module.
+      return;
+
+    for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i)
+      constructCompileUnit(CU_Nodes->getOperand(i));
+
+    if (GV_Nodes)
+      for (unsigned i = 0, e = GV_Nodes->getNumOperands(); i != e; ++i)
+        constructGlobalVariableDIE(GV_Nodes->getOperand(i));
+
+    if (SP_Nodes)
+      for (unsigned i = 0, e = SP_Nodes->getNumOperands(); i != e; ++i)
+        constructSubprogramDIE(SP_Nodes->getOperand(i));
+    
+  } else {
 
-  // Scan all the compile-units to see if there are any marked as the main unit.
-  // if not, we do not generate debug info.
-  for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
-       E = DbgFinder.compile_unit_end(); I != E; ++I) {
-    if (DICompileUnit(*I).isMain()) {
-      HasDebugInfo = true;
-      break;
+    DebugInfoFinder DbgFinder;
+    DbgFinder.processModule(*M);
+    
+    bool HasDebugInfo = false;
+    // Scan all the compile-units to see if there are any marked as the main unit.
+    // if not, we do not generate debug info.
+    for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
+           E = DbgFinder.compile_unit_end(); I != E; ++I) {
+      if (DICompileUnit(*I).isMain()) {
+        HasDebugInfo = true;
+        break;
+      }
     }
+    if (!HasDebugInfo) return;
+    
+    // Create all the compile unit DIEs.
+    for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
+           E = DbgFinder.compile_unit_end(); I != E; ++I)
+      constructCompileUnit(*I);
+    
+    // Create DIEs for each global variable.
+    for (DebugInfoFinder::iterator I = DbgFinder.global_variable_begin(),
+           E = DbgFinder.global_variable_end(); I != E; ++I)
+      constructGlobalVariableDIE(*I);
+    
+    // Create DIEs for each subprogram.
+    for (DebugInfoFinder::iterator I = DbgFinder.subprogram_begin(),
+           E = DbgFinder.subprogram_end(); I != E; ++I)
+      constructSubprogramDIE(*I);
   }
-
-  if (!HasDebugInfo) return;
-
+  
   // Tell MMI that we have debug info.
   MMI->setDebugInfoAvailability(true);
-
+  
   // Emit initial sections.
   EmitSectionLabels();
 
-  // Create all the compile unit DIEs.
-  for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
-         E = DbgFinder.compile_unit_end(); I != E; ++I)
-    constructCompileUnit(*I);
-
-  // Create DIEs for each subprogram.
-  for (DebugInfoFinder::iterator I = DbgFinder.subprogram_begin(),
-         E = DbgFinder.subprogram_end(); I != E; ++I)
-    constructSubprogramDIE(*I);
-
-  // Create DIEs for each global variable.
-  for (DebugInfoFinder::iterator I = DbgFinder.global_variable_begin(),
-         E = DbgFinder.global_variable_end(); I != E; ++I)
-    constructGlobalVariableDIE(*I);
-
   //getOrCreateTypeDIE
   if (NamedMDNode *NMD = M->getNamedMetadata("llvm.dbg.enum"))
-    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
-      getOrCreateTypeDIE(DIType(NMD->getOperand(i)));
+    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+      DIType Ty(NMD->getOperand(i));
+      getCompileUnit(Ty)->getOrCreateTypeDIE(Ty);
+    }
 
   if (NamedMDNode *NMD = M->getNamedMetadata("llvm.dbg.ty"))
-    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
-      getOrCreateTypeDIE(DIType(NMD->getOperand(i)));
+    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+      DIType Ty(NMD->getOperand(i));
+      getCompileUnit(Ty)->getOrCreateTypeDIE(Ty);
+    }
 
   // Prime section data.
   SectionMap.insert(Asm->getObjFileLowering().getTextSection());
@@ -2244,7 +1195,7 @@ void DwarfDebug::endModule() {
   for (SmallPtrSet<DIE *, 4>::iterator AI = InlinedSubprogramDIEs.begin(),
          AE = InlinedSubprogramDIEs.end(); AI != AE; ++AI) {
     DIE *ISP = *AI;
-    addUInt(ISP, dwarf::DW_AT_inline, 0, dwarf::DW_INL_inlined);
+    FirstCU->addUInt(ISP, dwarf::DW_AT_inline, 0, dwarf::DW_INL_inlined);
   }
 
   for (DenseMap<DIE *, const MDNode *>::iterator CI = ContainingTypeMap.begin(),
@@ -2254,7 +1205,8 @@ void DwarfDebug::endModule() {
     if (!N) continue;
     DIE *NDie = getCompileUnit(N)->getDIE(N);
     if (!NDie) continue;
-    addDIEEntry(SPDie, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, NDie);
+    getCompileUnit(N)->addDIEEntry(SPDie, dwarf::DW_AT_containing_type, 
+                                   dwarf::DW_FORM_ref4, NDie);
   }
 
   // Standard sections final addresses.
@@ -2269,14 +1221,6 @@ void DwarfDebug::endModule() {
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("section_end", i));
   }
 
-  // Emit common frame information.
-  emitCommonDebugFrame();
-
-  // Emit function debug frame information
-  for (std::vector<FunctionDebugFrameInfo>::iterator I = DebugFrames.begin(),
-         E = DebugFrames.end(); I != E; ++I)
-    emitFunctionDebugFrame(*I);
-
   // Compute DIE offsets and sizes.
   computeSizeAndOffsets();
 
@@ -2464,15 +1408,10 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,
            HI = History.begin(), HE = History.end(); HI != HE; ++HI) {
       const MachineInstr *Begin = *HI;
       assert(Begin->isDebugValue() && "Invalid History entry");
-      MachineLocation MLoc;
-      if (Begin->getNumOperands() == 3) {
-        if (Begin->getOperand(0).isReg() && Begin->getOperand(1).isImm())
-          MLoc.set(Begin->getOperand(0).getReg(), Begin->getOperand(1).getImm());
-      } else
-        MLoc = Asm->getDebugValueLocation(Begin);
 
-      // FIXME: emitDebugLoc only understands registers.
-      if (!MLoc.getReg())
+      // Check if DBG_VALUE is truncating a range.
+      if (Begin->getNumOperands() > 1 && Begin->getOperand(0).isReg()
+          && !Begin->getOperand(0).getReg())
         continue;
 
       // Compute the range for a register location.
@@ -2481,7 +1420,7 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,
 
       if (HI + 1 == HE)
         // If Begin is the last instruction in History then its value is valid
-        // until the end of the funtion.
+        // until the end of the function.
         SLabel = FunctionEndSym;
       else {
         const MachineInstr *End = HI[1];
@@ -2496,7 +1435,25 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,
       }
 
       // The value is valid until the next DBG_VALUE or clobber.
-      DotDebugLocEntries.push_back(DotDebugLocEntry(FLabel, SLabel, MLoc));
+      MachineLocation MLoc;
+      if (Begin->getNumOperands() == 3) {
+        if (Begin->getOperand(0).isReg() && Begin->getOperand(1).isImm()) {
+          MLoc.set(Begin->getOperand(0).getReg(), 
+                   Begin->getOperand(1).getImm());
+          DotDebugLocEntries.
+            push_back(DotDebugLocEntry(FLabel, SLabel, MLoc, Var));
+        }
+        // FIXME: Handle isFPImm also.
+        else if (Begin->getOperand(0).isImm()) {
+          DotDebugLocEntries.
+            push_back(DotDebugLocEntry(FLabel, SLabel, 
+                                       Begin->getOperand(0).getImm()));
+        }
+      } else {
+        MLoc = Asm->getDebugValueLocation(Begin);
+        DotDebugLocEntries.
+          push_back(DotDebugLocEntry(FLabel, SLabel, MLoc, Var));
+      }
     }
     DotDebugLocEntries.push_back(DotDebugLocEntry());
   }
@@ -2533,12 +1490,17 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
   if (!MI->isDebugValue()) {
     DebugLoc DL = MI->getDebugLoc();
     if (DL != PrevInstLoc && (!DL.isUnknown() || UnknownLocations)) {
+      unsigned Flags = DWARF2_FLAG_IS_STMT;
       PrevInstLoc = DL;
+      if (DL == PrologEndLoc) {
+        Flags |= DWARF2_FLAG_PROLOGUE_END;
+        PrologEndLoc = DebugLoc();
+      }
       if (!DL.isUnknown()) {
         const MDNode *Scope = DL.getScope(Asm->MF->getFunction()->getContext());
-        recordSourceLine(DL.getLine(), DL.getCol(), Scope);
+        recordSourceLine(DL.getLine(), DL.getCol(), Scope, Flags);
       } else
-        recordSourceLine(0, 0, 0);
+        recordSourceLine(0, 0, 0, 0);
     }
   }
 
@@ -2850,41 +1812,22 @@ void DwarfDebug::identifyScopeMarkers() {
   }
 }
 
-/// FindFirstDebugLoc - Find the first debug location in the function. This
-/// is intended to be an approximation for the source position of the
-/// beginning of the function.
-static DebugLoc FindFirstDebugLoc(const MachineFunction *MF) {
-  for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
-       I != E; ++I)
-    for (MachineBasicBlock::const_iterator MBBI = I->begin(), MBBE = I->end();
-         MBBI != MBBE; ++MBBI) {
-      DebugLoc DL = MBBI->getDebugLoc();
-      if (!DL.isUnknown())
-        return DL;
-    }
-  return DebugLoc();
+/// getScopeNode - Get MDNode for DebugLoc's scope.
+static MDNode *getScopeNode(DebugLoc DL, const LLVMContext &Ctx) {
+  if (MDNode *InlinedAt = DL.getInlinedAt(Ctx))
+    return getScopeNode(DebugLoc::getFromDILocation(InlinedAt), Ctx);
+  return DL.getScope(Ctx);
 }
 
-#ifndef NDEBUG
-/// CheckLineNumbers - Count basicblocks whose instructions do not have any
-/// line number information.
-static void CheckLineNumbers(const MachineFunction *MF) {
-  for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
-       I != E; ++I) {
-    bool FoundLineNo = false;
-    for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end();
-         II != IE; ++II) {
-      const MachineInstr *MI = II;
-      if (!MI->getDebugLoc().isUnknown()) {
-        FoundLineNo = true;
-        break;
-      }
-    }
-    if (!FoundLineNo && I->size())
-      ++BlocksWithoutLineNo;      
-  }
+/// getFnDebugLoc - Walk up the scope chain of given debug loc and find
+/// line number  info for the function.
+static DebugLoc getFnDebugLoc(DebugLoc DL, const LLVMContext &Ctx) {
+  const MDNode *Scope = getScopeNode(DL, Ctx);
+  DISubprogram SP = getDISubprogram(Scope);
+  if (SP.Verify()) 
+    return DebugLoc::get(SP.getLineNumber(), 0, SP);
+  return DebugLoc();
 }
-#endif
 
 /// beginFunction - Gather pre-function debug information.  Assumes being
 /// emitted immediately after the function entry point.
@@ -2892,44 +1835,16 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
   if (!MMI->hasDebugInfo()) return;
   if (!extractScopeInformation()) return;
 
-#ifndef NDEBUG
-  CheckLineNumbers(MF);
-#endif
-
   FunctionBeginSym = Asm->GetTempSymbol("func_begin",
                                         Asm->getFunctionNumber());
   // Assumes in correct section after the entry point.
   Asm->OutStreamer.EmitLabel(FunctionBeginSym);
 
-  // Emit label for the implicitly defined dbg.stoppoint at the start of the
-  // function.
-  DebugLoc FDL = FindFirstDebugLoc(MF);
-  if (FDL.isUnknown()) return;
-
-  const MDNode *Scope = FDL.getScope(MF->getFunction()->getContext());
-  const MDNode *TheScope = 0;
-
-  DISubprogram SP = getDISubprogram(Scope);
-  unsigned Line, Col;
-  if (SP.Verify()) {
-    Line = SP.getLineNumber();
-    Col = 0;
-    TheScope = SP;
-  } else {
-    Line = FDL.getLine();
-    Col = FDL.getCol();
-    TheScope = Scope;
-  }
-
-  recordSourceLine(Line, Col, TheScope);
-
   assert(UserVariables.empty() && DbgValues.empty() && "Maps weren't cleaned");
 
   /// ProcessedArgs - Collection of arguments already processed.
   SmallPtrSet<const MDNode *, 8> ProcessedArgs;
-
   const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo();
-
   /// LiveUserVar - Map physreg numbers to the MDNode they contain.
   std::vector<const MDNode*> LiveUserVar(TRI->getNumRegs());
 
@@ -2995,6 +1910,11 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
         if (!MI->isLabel())
           AtBlockEntry = false;
 
+        // First known non DBG_VALUE location marks beginning of function
+        // body.
+        if (PrologEndLoc.isUnknown() && !MI->getDebugLoc().isUnknown())
+          PrologEndLoc = MI->getDebugLoc();
+
         // Check if the instruction clobbers any registers with debug vars.
         for (MachineInstr::const_mop_iterator MOI = MI->operands_begin(),
                MOE = MI->operands_end(); MOI != MOE; ++MOI) {
@@ -3063,6 +1983,15 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
 
   PrevInstLoc = DebugLoc();
   PrevLabel = FunctionBeginSym;
+
+  // Record beginning of function.
+  if (!PrologEndLoc.isUnknown()) {
+    DebugLoc FnStartDL = getFnDebugLoc(PrologEndLoc,
+                                       MF->getFunction()->getContext());
+    recordSourceLine(FnStartDL.getLine(), FnStartDL.getCol(),
+                     FnStartDL.getScope(MF->getFunction()->getContext()),
+                     DWARF2_FLAG_IS_STMT);
+  }
 }
 
 /// endFunction - Gather and emit post-function debug information.
@@ -3109,8 +2038,9 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
     DIE *CurFnDIE = constructScopeDIE(CurrentFnDbgScope);
 
     if (!DisableFramePointerElim(*MF))
-      addUInt(CurFnDIE, dwarf::DW_AT_APPLE_omit_frame_ptr,
-              dwarf::DW_FORM_flag, 1);
+      getCompileUnit(CurrentFnDbgScope->getScopeNode())->addUInt(CurFnDIE, 
+                                                                 dwarf::DW_AT_APPLE_omit_frame_ptr,
+                                                                 dwarf::DW_FORM_flag, 1);
 
 
     DebugFrames.push_back(FunctionDebugFrameInfo(Asm->getFunctionNumber(),
@@ -3119,7 +2049,7 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
 
   // Clear debug info
   CurrentFnDbgScope = NULL;
-  CurrentFnArguments.clear();
+  DeleteContainerPointers(CurrentFnArguments);
   DbgVariableToFrameIndexMap.clear();
   VarToAbstractVarMap.clear();
   DbgVariableToDbgInstMap.clear();
@@ -3176,7 +2106,8 @@ DbgScope *DwarfDebug::findDbgScope(const MachineInstr *MInsn) {
 /// recordSourceLine - Register a source line with debug info. Returns the
 /// unique label that was emitted and which provides correspondence to
 /// the source line list.
-void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S){
+void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S,
+                                  unsigned Flags) {
   StringRef Fn;
   StringRef Dir;
   unsigned Src = 1;
@@ -3204,9 +2135,8 @@ void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S){
 
     Src = GetOrCreateSourceID(Fn, Dir);
   }
-
-  Asm->OutStreamer.EmitDwarfLocDirective(Src, Line, Col, DWARF2_FLAG_IS_STMT,
-                                         0, 0);
+  Asm->OutStreamer.EmitDwarfLocDirective(Src, Line, Col, Flags,
+                                         0, 0, Fn);
 }
 
 //===----------------------------------------------------------------------===//
@@ -3264,17 +2194,15 @@ DwarfDebug::computeSizeAndOffset(DIE *Die, unsigned Offset, bool Last) {
 /// computeSizeAndOffsets - Compute the size and offset of all the DIEs.
 ///
 void DwarfDebug::computeSizeAndOffsets() {
-  unsigned PrevOffset = 0;
   for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
          E = CUMap.end(); I != E; ++I) {
     // Compute size of compile unit header.
-    static unsigned Offset = PrevOffset +
+    unsigned Offset = 
       sizeof(int32_t) + // Length of Compilation Unit Info
       sizeof(int16_t) + // DWARF version number
       sizeof(int32_t) + // Offset Into Abbrev. Section
       sizeof(int8_t);   // Pointer Size (in bytes)
     computeSizeAndOffset(I->second->getCUDie(), Offset, true);
-    PrevOffset = Offset;
   }
 }
 
@@ -3296,11 +2224,6 @@ void DwarfDebug::EmitSectionLabels() {
   const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
 
   // Dwarf sections base addresses.
-  if (Asm->MAI->doesDwarfRequireFrameSection()) {
-    DwarfFrameSectionSym =
-      EmitSectionSym(Asm, TLOF.getDwarfFrameSection(), "section_debug_frame");
-   }
-
   DwarfInfoSectionSym =
     EmitSectionSym(Asm, TLOF.getDwarfInfoSection(), "section_info");
   DwarfAbbrevSectionSym =
@@ -3435,8 +2358,7 @@ void DwarfDebug::emitDebugInfo() {
     unsigned ContentSize = Die->getSize() +
       sizeof(int16_t) + // DWARF version number
       sizeof(int32_t) + // Offset Into Abbrev. Section
-      sizeof(int8_t) +  // Pointer Size (in bytes)
-      sizeof(int32_t);  // FIXME - extra pad for gdb bug.
+      sizeof(int8_t);   // Pointer Size (in bytes)
 
     Asm->OutStreamer.AddComment("Length of Compilation Unit Info");
     Asm->EmitInt32(ContentSize);
@@ -3449,12 +2371,6 @@ void DwarfDebug::emitDebugInfo() {
     Asm->EmitInt8(Asm->getTargetData().getPointerSize());
 
     emitDIE(Die);
-    // FIXME - extra padding for gdb bug.
-    Asm->OutStreamer.AddComment("4 extra padding bytes for GDB");
-    Asm->EmitInt8(0);
-    Asm->EmitInt8(0);
-    Asm->EmitInt8(0);
-    Asm->EmitInt8(0);
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("info_end", TheCU->getID()));
   }
 }
@@ -3515,91 +2431,6 @@ void DwarfDebug::emitEndOfLineMatrix(unsigned SectionEnd) {
   Asm->EmitInt8(1);
 }
 
-/// emitCommonDebugFrame - Emit common frame info into a debug frame section.
-///
-void DwarfDebug::emitCommonDebugFrame() {
-  if (!Asm->MAI->doesDwarfRequireFrameSection())
-    return;
-
-  int stackGrowth = Asm->getTargetData().getPointerSize();
-  if (Asm->TM.getFrameLowering()->getStackGrowthDirection() ==
-      TargetFrameLowering::StackGrowsDown)
-    stackGrowth *= -1;
-
-  // Start the dwarf frame section.
-  Asm->OutStreamer.SwitchSection(
-                              Asm->getObjFileLowering().getDwarfFrameSection());
-
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_frame_common"));
-  Asm->OutStreamer.AddComment("Length of Common Information Entry");
-  Asm->EmitLabelDifference(Asm->GetTempSymbol("debug_frame_common_end"),
-                           Asm->GetTempSymbol("debug_frame_common_begin"), 4);
-
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_frame_common_begin"));
-  Asm->OutStreamer.AddComment("CIE Identifier Tag");
-  Asm->EmitInt32((int)dwarf::DW_CIE_ID);
-  Asm->OutStreamer.AddComment("CIE Version");
-  Asm->EmitInt8(dwarf::DW_CIE_VERSION);
-  Asm->OutStreamer.AddComment("CIE Augmentation");
-  Asm->OutStreamer.EmitIntValue(0, 1, /*addrspace*/0); // nul terminator.
-  Asm->EmitULEB128(1, "CIE Code Alignment Factor");
-  Asm->EmitSLEB128(stackGrowth, "CIE Data Alignment Factor");
-  Asm->OutStreamer.AddComment("CIE RA Column");
-  const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
-  const TargetFrameLowering *TFI = Asm->TM.getFrameLowering();
-  Asm->EmitInt8(RI->getDwarfRegNum(RI->getRARegister(), false));
-
-  std::vector<MachineMove> Moves;
-  TFI->getInitialFrameState(Moves);
-
-  Asm->EmitFrameMoves(Moves, 0, false);
-
-  Asm->EmitAlignment(2);
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_frame_common_end"));
-}
-
-/// emitFunctionDebugFrame - Emit per function frame info into a debug frame
-/// section.
-void DwarfDebug::
-emitFunctionDebugFrame(const FunctionDebugFrameInfo &DebugFrameInfo) {
-  if (!Asm->MAI->doesDwarfRequireFrameSection())
-    return;
-
-  // Start the dwarf frame section.
-  Asm->OutStreamer.SwitchSection(
-                              Asm->getObjFileLowering().getDwarfFrameSection());
-
-  Asm->OutStreamer.AddComment("Length of Frame Information Entry");
-  MCSymbol *DebugFrameBegin =
-    Asm->GetTempSymbol("debug_frame_begin", DebugFrameInfo.Number);
-  MCSymbol *DebugFrameEnd =
-    Asm->GetTempSymbol("debug_frame_end", DebugFrameInfo.Number);
-  Asm->EmitLabelDifference(DebugFrameEnd, DebugFrameBegin, 4);
-
-  Asm->OutStreamer.EmitLabel(DebugFrameBegin);
-
-  Asm->OutStreamer.AddComment("FDE CIE offset");
-  Asm->EmitSectionOffset(Asm->GetTempSymbol("debug_frame_common"),
-                         DwarfFrameSectionSym);
-
-  Asm->OutStreamer.AddComment("FDE initial location");
-  MCSymbol *FuncBeginSym =
-    Asm->GetTempSymbol("func_begin", DebugFrameInfo.Number);
-  Asm->OutStreamer.EmitSymbolValue(FuncBeginSym,
-                                   Asm->getTargetData().getPointerSize(),
-                                   0/*AddrSpace*/);
-
-
-  Asm->OutStreamer.AddComment("FDE address range");
-  Asm->EmitLabelDifference(Asm->GetTempSymbol("func_end",DebugFrameInfo.Number),
-                           FuncBeginSym, Asm->getTargetData().getPointerSize());
-
-  Asm->EmitFrameMoves(DebugFrameInfo.Moves, FuncBeginSym, false);
-
-  Asm->EmitAlignment(2);
-  Asm->OutStreamer.EmitLabel(DebugFrameEnd);
-}
-
 /// emitDebugPubNames - Emit visible names into a debug pubnames section.
 ///
 void DwarfDebug::emitDebugPubNames() {
@@ -3760,33 +2591,63 @@ void DwarfDebug::emitDebugLoc() {
     } else {
       Asm->OutStreamer.EmitSymbolValue(Entry.Begin, Size, 0);
       Asm->OutStreamer.EmitSymbolValue(Entry.End, Size, 0);
-      const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
-      unsigned Reg = RI->getDwarfRegNum(Entry.Loc.getReg(), false);
-      if (int Offset =  Entry.Loc.getOffset()) {
-        // If the value is at a certain offset from frame register then
-        // use DW_OP_fbreg.
-        unsigned OffsetSize = Offset ? MCAsmInfo::getSLEB128Size(Offset) : 1;
-        Asm->OutStreamer.AddComment("Loc expr size");
-        Asm->EmitInt16(1 + OffsetSize);
-        Asm->OutStreamer.AddComment(
-          dwarf::OperationEncodingString(dwarf::DW_OP_fbreg));
-        Asm->EmitInt8(dwarf::DW_OP_fbreg);
-        Asm->OutStreamer.AddComment("Offset");
-        Asm->EmitSLEB128(Offset);
-      } else {
-        if (Reg < 32) {
-          Asm->OutStreamer.AddComment("Loc expr size");
-          Asm->EmitInt16(1);
-          Asm->OutStreamer.AddComment(
-            dwarf::OperationEncodingString(dwarf::DW_OP_reg0 + Reg));
-          Asm->EmitInt8(dwarf::DW_OP_reg0 + Reg);
+      DIVariable DV(Entry.Variable);
+      Asm->OutStreamer.AddComment("Loc expr size");
+      MCSymbol *begin = Asm->OutStreamer.getContext().CreateTempSymbol();
+      MCSymbol *end = Asm->OutStreamer.getContext().CreateTempSymbol();
+      Asm->EmitLabelDifference(end, begin, 2);
+      Asm->OutStreamer.EmitLabel(begin);
+      if (Entry.isConstant()) {
+        DIBasicType BTy(DV.getType());
+        if (BTy.Verify() &&
+            (BTy.getEncoding()  == dwarf::DW_ATE_signed 
+             || BTy.getEncoding() == dwarf::DW_ATE_signed_char)) {
+          Asm->OutStreamer.AddComment("DW_OP_consts");
+          Asm->EmitInt8(dwarf::DW_OP_consts);
+          Asm->EmitSLEB128(Entry.getConstant());
         } else {
-          Asm->OutStreamer.AddComment("Loc expr size");
-          Asm->EmitInt16(1 + MCAsmInfo::getULEB128Size(Reg));
-          Asm->EmitInt8(dwarf::DW_OP_regx);
-          Asm->EmitULEB128(Reg);
+          Asm->OutStreamer.AddComment("DW_OP_constu");
+          Asm->EmitInt8(dwarf::DW_OP_constu);
+          Asm->EmitULEB128(Entry.getConstant());
         }
+      } else if (DV.hasComplexAddress()) {
+        unsigned N = DV.getNumAddrElements();
+        unsigned i = 0;
+        if (N >= 2 && DV.getAddrElement(0) == DIBuilder::OpPlus) {
+          if (Entry.Loc.getOffset()) {
+            i = 2;
+            Asm->EmitDwarfRegOp(Entry.Loc);
+            Asm->OutStreamer.AddComment("DW_OP_deref");
+            Asm->EmitInt8(dwarf::DW_OP_deref);
+            Asm->OutStreamer.AddComment("DW_OP_plus_uconst");
+            Asm->EmitInt8(dwarf::DW_OP_plus_uconst);
+            Asm->EmitSLEB128(DV.getAddrElement(1));
+          } else {
+            // If first address element is OpPlus then emit
+            // DW_OP_breg + Offset instead of DW_OP_reg + Offset.
+            MachineLocation Loc(Entry.Loc.getReg(), DV.getAddrElement(1));
+            Asm->EmitDwarfRegOp(Loc);
+            i = 2;
+          }
+        } else {
+          Asm->EmitDwarfRegOp(Entry.Loc);
+        }
+
+        // Emit remaining complex address elements.
+        for (; i < N; ++i) {
+          uint64_t Element = DV.getAddrElement(i);
+          if (Element == DIBuilder::OpPlus) {
+            Asm->EmitInt8(dwarf::DW_OP_plus_uconst);
+            Asm->EmitULEB128(DV.getAddrElement(++i));
+          } else if (Element == DIBuilder::OpDeref)
+            Asm->EmitInt8(dwarf::DW_OP_deref);
+          else llvm_unreachable("unknown Opcode found in complex address");
+        }
+      } else {
+        // Regular entry.
+        Asm->EmitDwarfRegOp(Entry.Loc);
       }
+      Asm->OutStreamer.EmitLabel(end);
     }
   }
 }
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 4aeefde..abda2e6 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -16,6 +16,7 @@
 
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/Analysis/DebugInfo.h"
 #include "DIE.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/FoldingSet.h"
@@ -39,21 +40,6 @@ class DIEAbbrev;
 class DIE;
 class DIEBlock;
 class DIEEntry;
-class DIArray;
-class DIEnumerator;
-class DIDescriptor;
-class DIVariable;
-class DIGlobal;
-class DIGlobalVariable;
-class DISubprogram;
-class DIBasicType;
-class DIDerivedType;
-class DIType;
-class DINameSpace;
-class DISubrange;
-class DICompositeType;
-class DITemplateTypeParameter;
-class DITemplateValueParameter;
 
 //===----------------------------------------------------------------------===//
 /// SrcLineInfo - This class is used to record source line correspondence.
@@ -80,10 +66,21 @@ typedef struct DotDebugLocEntry {
   const MCSymbol *Begin;
   const MCSymbol *End;
   MachineLocation Loc;
+  const MDNode *Variable;
   bool Merged;
-  DotDebugLocEntry() : Begin(0), End(0), Merged(false) {}
-  DotDebugLocEntry(const MCSymbol *B, const MCSymbol *E, MachineLocation &L) 
-    : Begin(B), End(E), Loc(L), Merged(false) {}
+  bool Constant;
+  int64_t iConstant;
+  DotDebugLocEntry() 
+    : Begin(0), End(0), Variable(0), Merged(false), 
+      Constant(false), iConstant(0) {}
+  DotDebugLocEntry(const MCSymbol *B, const MCSymbol *E, MachineLocation &L,
+                   const MDNode *V) 
+    : Begin(B), End(E), Loc(L), Variable(V), Merged(false), 
+      Constant(false), iConstant(0) {}
+  DotDebugLocEntry(const MCSymbol *B, const MCSymbol *E, int64_t i)
+    : Begin(B), End(E), Variable(0), Merged(false), 
+      Constant(true), iConstant(i) {}
+
   /// Empty entries are also used as a trigger to emit temp label. Such
   /// labels are referenced is used to find debug_loc offset for a given DIE.
   bool isEmpty() { return Begin == 0 && End == 0; }
@@ -94,8 +91,47 @@ typedef struct DotDebugLocEntry {
     Next->Begin = Begin;
     Merged = true;
   }
+  bool isConstant() { return Constant; }
+  int64_t getConstant() { return iConstant; }
 } DotDebugLocEntry;
 
+//===----------------------------------------------------------------------===//
+/// DbgVariable - This class is used to track local variable information.
+///
+class DbgVariable {
+  DIVariable Var;                    // Variable Descriptor.
+  DIE *TheDIE;                       // Variable DIE.
+  unsigned DotDebugLocOffset;        // Offset in DotDebugLocEntries.
+public:
+  // AbsVar may be NULL.
+  DbgVariable(DIVariable V) : Var(V), TheDIE(0), DotDebugLocOffset(~0U) {}
+
+  // Accessors.
+  DIVariable getVariable()           const { return Var; }
+  void setDIE(DIE *D)                      { TheDIE = D; }
+  DIE *getDIE()                      const { return TheDIE; }
+  void setDotDebugLocOffset(unsigned O)    { DotDebugLocOffset = O; }
+  unsigned getDotDebugLocOffset()    const { return DotDebugLocOffset; }
+  StringRef getName()                const { return Var.getName(); }
+  unsigned getTag()                  const { return Var.getTag(); }
+  bool variableHasComplexAddress()   const {
+    assert(Var.Verify() && "Invalid complex DbgVariable!");
+    return Var.hasComplexAddress();
+  }
+  bool isBlockByrefVariable()        const {
+    assert(Var.Verify() && "Invalid complex DbgVariable!");
+    return Var.isBlockByrefVariable();
+  }
+  unsigned getNumAddrElements()      const { 
+    assert(Var.Verify() && "Invalid complex DbgVariable!");
+    return Var.getNumAddrElements();
+  }
+  uint64_t getAddrElement(unsigned i) const {
+    return Var.getAddrElement(i);
+  }
+  DIType getType() const;
+};
+
 class DwarfDebug {
   /// Asm - Target of Dwarf emission.
   AsmPrinter *Asm;
@@ -122,12 +158,6 @@ class DwarfDebug {
   /// id mapped to a unique id.
   StringMap<unsigned> SourceIdMap;
 
-  /// DIEBlocks - A list of all the DIEBlocks in use.
-  std::vector<DIEBlock *> DIEBlocks;
-
-  // DIEValueAllocator - All DIEValues are allocated through this allocator.
-  BumpPtrAllocator DIEValueAllocator;
-
   /// StringPool - A String->Symbol mapping of strings used by indirect
   /// references.
   StringMap<std::pair<MCSymbol*, unsigned> > StringPool;
@@ -198,8 +228,6 @@ class DwarfDebug {
   /// corresponds to the MDNode mapped with the subprogram DIE.
   DenseMap<DIE *, const MDNode *> ContainingTypeMap;
 
-  typedef SmallVector<DbgScope *, 2> ScopeVector;
-
   /// InlineInfo - Keep track of inlined functions and their location.  This
   /// information is used to populate debug_inlined section.
   typedef std::pair<const MCSymbol *, DIE *> InlineInfoLabels;
@@ -236,6 +264,10 @@ class DwarfDebug {
   DebugLoc PrevInstLoc;
   MCSymbol *PrevLabel;
 
+  /// PrologEndLoc - This location indicates end of function prologue and
+  /// beginning of function body.
+  DebugLoc PrologEndLoc;
+
   struct FunctionDebugFrameInfo {
     unsigned Number;
     std::vector<MachineMove> Moves;
@@ -246,157 +278,23 @@ class DwarfDebug {
 
   std::vector<FunctionDebugFrameInfo> DebugFrames;
 
+  // DIEValueAllocator - All DIEValues are allocated through this allocator.
+  BumpPtrAllocator DIEValueAllocator;
+
   // Section Symbols: these are assembler temporary labels that are emitted at
   // the beginning of each supported dwarf section.  These are used to form
   // section offsets and are created by EmitSectionLabels.
-  MCSymbol *DwarfFrameSectionSym, *DwarfInfoSectionSym, *DwarfAbbrevSectionSym;
+  MCSymbol *DwarfInfoSectionSym, *DwarfAbbrevSectionSym;
   MCSymbol *DwarfStrSectionSym, *TextSectionSym, *DwarfDebugRangeSectionSym;
   MCSymbol *DwarfDebugLocSectionSym;
   MCSymbol *FunctionBeginSym, *FunctionEndSym;
 
-  DIEInteger *DIEIntegerOne;
-
 private:
 
   /// assignAbbrevNumber - Define a unique number for the abbreviation.
   ///
   void assignAbbrevNumber(DIEAbbrev &Abbrev);
 
-  /// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug
-  /// information entry.
-  DIEEntry *createDIEEntry(DIE *Entry);
-
-  /// addUInt - Add an unsigned integer attribute data and value.
-  ///
-  void addUInt(DIE *Die, unsigned Attribute, unsigned Form, uint64_t Integer);
-
-  /// addSInt - Add an signed integer attribute data and value.
-  ///
-  void addSInt(DIE *Die, unsigned Attribute, unsigned Form, int64_t Integer);
-
-  /// addString - Add a string attribute data and value.
-  ///
-  void addString(DIE *Die, unsigned Attribute, unsigned Form,
-                 const StringRef Str);
-
-  /// addLabel - Add a Dwarf label attribute data and value.
-  ///
-  void addLabel(DIE *Die, unsigned Attribute, unsigned Form,
-                const MCSymbol *Label);
-
-  /// addDelta - Add a label delta attribute data and value.
-  ///
-  void addDelta(DIE *Die, unsigned Attribute, unsigned Form,
-                const MCSymbol *Hi, const MCSymbol *Lo);
-
-  /// addDIEEntry - Add a DIE attribute data and value.
-  ///
-  void addDIEEntry(DIE *Die, unsigned Attribute, unsigned Form, DIE *Entry);
-  
-  /// addBlock - Add block data.
-  ///
-  void addBlock(DIE *Die, unsigned Attribute, unsigned Form, DIEBlock *Block);
-
-  /// addSourceLine - Add location information to specified debug information
-  /// entry.
-  void addSourceLine(DIE *Die, DIVariable V);
-  void addSourceLine(DIE *Die, DIGlobalVariable G);
-  void addSourceLine(DIE *Die, DISubprogram SP);
-  void addSourceLine(DIE *Die, DIType Ty);
-  void addSourceLine(DIE *Die, DINameSpace NS);
-
-  /// addAddress - Add an address attribute to a die based on the location
-  /// provided.
-  void addAddress(DIE *Die, unsigned Attribute,
-                  const MachineLocation &Location);
-
-  /// addRegisterAddress - Add register location entry in variable DIE.
-  bool addRegisterAddress(DIE *Die, const MachineOperand &MO);
-
-  /// addConstantValue - Add constant value entry in variable DIE.
-  bool addConstantValue(DIE *Die, const MachineOperand &MO);
-  bool addConstantValue(DIE *Die, ConstantInt *CI, bool Unsigned);
-
-  /// addConstantFPValue - Add constant value entry in variable DIE.
-  bool addConstantFPValue(DIE *Die, const MachineOperand &MO);
-
-  /// addTemplateParams - Add template parameters in buffer.
-  void addTemplateParams(DIE &Buffer, DIArray TParams);
-
-  /// addComplexAddress - Start with the address based on the location provided,
-  /// and generate the DWARF information necessary to find the actual variable
-  /// (navigating the extra location information encoded in the type) based on
-  /// the starting location.  Add the DWARF information to the die.
-  ///
-  void addComplexAddress(DbgVariable *&DV, DIE *Die, unsigned Attribute,
-                         const MachineLocation &Location);
-
-  // FIXME: Should be reformulated in terms of addComplexAddress.
-  /// addBlockByrefAddress - Start with the address based on the location
-  /// provided, and generate the DWARF information necessary to find the
-  /// actual Block variable (navigating the Block struct) based on the
-  /// starting location.  Add the DWARF information to the die.  Obsolete,
-  /// please use addComplexAddress instead.
-  ///
-  void addBlockByrefAddress(DbgVariable *&DV, DIE *Die, unsigned Attribute,
-                            const MachineLocation &Location);
-
-  /// addVariableAddress - Add DW_AT_location attribute for a DbgVariable based
-  /// on provided frame index.
-  void addVariableAddress(DbgVariable *&DV, DIE *Die, int64_t FI);
-
-  /// addToContextOwner - Add Die into the list of its context owner's children.
-  void addToContextOwner(DIE *Die, DIDescriptor Context);
-
-  /// addType - Add a new type attribute to the specified entity.
-  void addType(DIE *Entity, DIType Ty);
-
- 
-  /// getOrCreateNameSpace - Create a DIE for DINameSpace.
-  DIE *getOrCreateNameSpace(DINameSpace NS);
-
-  /// getOrCreateTypeDIE - Find existing DIE or create new DIE for the
-  /// given DIType.
-  DIE *getOrCreateTypeDIE(DIType Ty);
-
-  /// getOrCreateTemplateTypeParameterDIE - Find existing DIE or create new DIE 
-  /// for the given DITemplateTypeParameter.
-  DIE *getOrCreateTemplateTypeParameterDIE(DITemplateTypeParameter TP);
-
-  /// getOrCreateTemplateValueParameterDIE - Find existing DIE or create new DIE 
-  /// for the given DITemplateValueParameter.
-  DIE *getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TVP);
-
-  void addPubTypes(DISubprogram SP);
-
-  /// constructTypeDIE - Construct basic type die from DIBasicType.
-  void constructTypeDIE(DIE &Buffer,
-                        DIBasicType BTy);
-
-  /// constructTypeDIE - Construct derived type die from DIDerivedType.
-  void constructTypeDIE(DIE &Buffer,
-                        DIDerivedType DTy);
-
-  /// constructTypeDIE - Construct type DIE from DICompositeType.
-  void constructTypeDIE(DIE &Buffer,
-                        DICompositeType CTy);
-
-  /// constructSubrangeDIE - Construct subrange DIE from DISubrange.
-  void constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy);
-
-  /// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
-  void constructArrayTypeDIE(DIE &Buffer, 
-                             DICompositeType *CTy);
-
-  /// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
-  DIE *constructEnumTypeDIE(DIEnumerator ETy);
-
-  /// createMemberDIE - Create new member DIE.
-  DIE *createMemberDIE(DIDerivedType DT);
-
-  /// createSubprogramDIE - Create new DIE using SP.
-  DIE *createSubprogramDIE(DISubprogram SP);
-
   /// getOrCreateDbgScope - Create DbgScope for the scope.
   DbgScope *getOrCreateDbgScope(const MDNode *Scope, const MDNode *InlinedAt);
 
@@ -455,14 +353,6 @@ private:
   ///
   void emitEndOfLineMatrix(unsigned SectionEnd);
 
-  /// emitCommonDebugFrame - Emit common frame info into a debug frame section.
-  ///
-  void emitCommonDebugFrame();
-
-  /// emitFunctionDebugFrame - Emit per function frame info into a debug frame
-  /// section.
-  void emitFunctionDebugFrame(const FunctionDebugFrameInfo &DebugFrameInfo);
-
   /// emitDebugPubNames - Emit visible names into a debug pubnames section.
   ///
   void emitDebugPubNames();
@@ -511,11 +401,6 @@ private:
   ///  inlining instance.
   void emitDebugInlineInfo();
 
-  /// GetOrCreateSourceID - Look up the source id with the given directory and
-  /// source file names. If none currently exists, create a new id and insert it
-  /// in the SourceIds map.
-  unsigned GetOrCreateSourceID(StringRef DirName, StringRef FullName);
-
   /// constructCompileUnit - Create new CompileUnit for the given 
   /// metadata node with tag DW_TAG_compile_unit.
   void constructCompileUnit(const MDNode *N);
@@ -532,7 +417,8 @@ private:
   /// recordSourceLine - Register a source line with debug info. Returns the
   /// unique label that was emitted and which provides correspondence to
   /// the source line list.
-  void recordSourceLine(unsigned Line, unsigned Col, const MDNode *Scope);
+  void recordSourceLine(unsigned Line, unsigned Col, const MDNode *Scope,
+                        unsigned Flags);
   
   /// recordVariableFrameIndex - Record a variable's index.
   void recordVariableFrameIndex(const DbgVariable *V, int Index);
@@ -611,6 +497,14 @@ public:
 
   /// endInstruction - Prcess end of an instruction.
   void endInstruction(const MachineInstr *MI);
+
+  /// GetOrCreateSourceID - Look up the source id with the given directory and
+  /// source file names. If none currently exists, create a new id and insert it
+  /// in the SourceIds map.
+  unsigned GetOrCreateSourceID(StringRef DirName, StringRef FullName);
+
+  /// createSubprogramDIE - Create new DIE using SP.
+  DIE *createSubprogramDIE(DISubprogram SP);
 };
 } // End of namespace llvm
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.h b/lib/CodeGen/AsmPrinter/DwarfException.h
index 06b1de6..b5f86ab 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.h
+++ b/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -15,6 +15,7 @@
 #define LLVM_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/AsmPrinter.h"
 #include <vector>
 
 namespace llvm {
@@ -140,17 +141,20 @@ public:
 };
 
 class DwarfCFIException : public DwarfException {
-  /// shouldEmitTable - Per-function flag to indicate if EH tables should
-  /// be emitted.
-  bool shouldEmitTable;
+  /// shouldEmitPersonality - Per-function flag to indicate if .cfi_personality
+  /// should be emitted.
+  bool shouldEmitPersonality;
+
+  /// shouldEmitLSDA - Per-function flag to indicate if .cfi_lsda
+  /// should be emitted.
+  bool shouldEmitLSDA;
 
   /// shouldEmitMoves - Per-function flag to indicate if frame moves info
   /// should be emitted.
   bool shouldEmitMoves;
 
-  /// shouldEmitTableModule - Per-module flag to indicate if EH tables
-  /// should be emitted.
-  bool shouldEmitTableModule;
+  AsmPrinter::CFIMoveType moveTypeModule;
+
 public:
   //===--------------------------------------------------------------------===//
   // Main entry points.
@@ -170,7 +174,7 @@ public:
   virtual void EndFunction();
 };
 
-class DwarfTableException : public DwarfException {
+class ARMException : public DwarfException {
   /// shouldEmitTable - Per-function flag to indicate if EH tables should
   /// be emitted.
   bool shouldEmitTable;
@@ -182,48 +186,12 @@ class DwarfTableException : public DwarfException {
   /// shouldEmitTableModule - Per-module flag to indicate if EH tables
   /// should be emitted.
   bool shouldEmitTableModule;
-
-  /// shouldEmitMovesModule - Per-module flag to indicate if frame moves
-  /// should be emitted.
-  bool shouldEmitMovesModule;
-
-  struct FunctionEHFrameInfo {
-    MCSymbol *FunctionEHSym;  // L_foo.eh
-    unsigned Number;
-    unsigned PersonalityIndex;
-    bool adjustsStack;
-    bool hasLandingPads;
-    std::vector<MachineMove> Moves;
-    const Function *function;
-
-    FunctionEHFrameInfo(MCSymbol *EHSym, unsigned Num, unsigned P,
-                        bool hC, bool hL,
-                        const std::vector<MachineMove> &M,
-                        const Function *f):
-      FunctionEHSym(EHSym), Number(Num), PersonalityIndex(P),
-      adjustsStack(hC), hasLandingPads(hL), Moves(M), function (f) { }
-  };
-
-  std::vector<FunctionEHFrameInfo> EHFrames;
-
-  /// UsesLSDA - Indicates whether an FDE that uses the CIE at the given index
-  /// uses an LSDA. If so, then we need to encode that information in the CIE's
-  /// augmentation.
-  DenseMap<unsigned, bool> UsesLSDA;
-
-  /// EmitCIE - Emit a Common Information Entry (CIE). This holds information
-  /// that is shared among many Frame Description Entries.  There is at least
-  /// one CIE in every non-empty .debug_frame section.
-  void EmitCIE(const Function *Personality, unsigned Index);
-
-  /// EmitFDE - Emit the Frame Description Entry (FDE) for the function.
-  void EmitFDE(const FunctionEHFrameInfo &EHFrameInfo);
 public:
   //===--------------------------------------------------------------------===//
   // Main entry points.
   //
-  DwarfTableException(AsmPrinter *A);
-  virtual ~DwarfTableException();
+  ARMException(AsmPrinter *A);
+  virtual ~ARMException();
 
   /// EndModule - Emit all exception information that should come after the
   /// content.
@@ -237,25 +205,25 @@ public:
   virtual void EndFunction();
 };
 
+class Win64Exception : public DwarfException {
+  /// shouldEmitPersonality - Per-function flag to indicate if personality
+  /// info should be emitted.
+  bool shouldEmitPersonality;
 
-class ARMException : public DwarfException {
-  /// shouldEmitTable - Per-function flag to indicate if EH tables should
-  /// be emitted.
-  bool shouldEmitTable;
+  /// shouldEmitLSDA - Per-function flag to indicate if the LSDA
+  /// should be emitted.
+  bool shouldEmitLSDA;
 
   /// shouldEmitMoves - Per-function flag to indicate if frame moves info
   /// should be emitted.
   bool shouldEmitMoves;
 
-  /// shouldEmitTableModule - Per-module flag to indicate if EH tables
-  /// should be emitted.
-  bool shouldEmitTableModule;
 public:
   //===--------------------------------------------------------------------===//
   // Main entry points.
   //
-  ARMException(AsmPrinter *A);
-  virtual ~ARMException();
+  Win64Exception(AsmPrinter *A);
+  virtual ~Win64Exception();
 
   /// EndModule - Emit all exception information that should come after the
   /// content.
diff --git a/lib/CodeGen/AsmPrinter/DwarfTableException.cpp b/lib/CodeGen/AsmPrinter/DwarfTableException.cpp
deleted file mode 100644
index 7519011..0000000
--- a/lib/CodeGen/AsmPrinter/DwarfTableException.cpp
+++ /dev/null
@@ -1,349 +0,0 @@
-//===-- CodeGen/AsmPrinter/DwarfTableException.cpp - Dwarf Exception Impl --==//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains support for writing DWARF exception info into asm files.
-// The implementation emits all the necessary tables "by hands".
-//
-//===----------------------------------------------------------------------===//
-
-#include "DwarfException.h"
-#include "llvm/Module.h"
-#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineLocation.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/Twine.h"
-using namespace llvm;
-
-DwarfTableException::DwarfTableException(AsmPrinter *A)
-  :  DwarfException(A),
-     shouldEmitTable(false), shouldEmitMoves(false),
-     shouldEmitTableModule(false), shouldEmitMovesModule(false) {}
-
-DwarfTableException::~DwarfTableException() {}
-
-/// EmitCIE - Emit a Common Information Entry (CIE). This holds information that
-/// is shared among many Frame Description Entries.  There is at least one CIE
-/// in every non-empty .debug_frame section.
-void DwarfTableException::EmitCIE(const Function *PersonalityFn, unsigned Index) {
-  // Size and sign of stack growth.
-  int stackGrowth = Asm->getTargetData().getPointerSize();
-  if (Asm->TM.getFrameLowering()->getStackGrowthDirection() ==
-      TargetFrameLowering::StackGrowsDown)
-    stackGrowth *= -1;
-
-  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
-
-  // Begin eh frame section.
-  Asm->OutStreamer.SwitchSection(TLOF.getEHFrameSection());
-
-  MCSymbol *EHFrameSym;
-  if (TLOF.isFunctionEHFrameSymbolPrivate())
-    EHFrameSym = Asm->GetTempSymbol("EH_frame", Index);
-  else
-    EHFrameSym = Asm->OutContext.GetOrCreateSymbol(Twine("EH_frame") +
-                                                   Twine(Index));
-  Asm->OutStreamer.EmitLabel(EHFrameSym);
-
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("section_eh_frame", Index));
-
-  // Define base labels.
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_frame_common", Index));
-
-  // Define the eh frame length.
-  Asm->OutStreamer.AddComment("Length of Common Information Entry");
-  Asm->EmitLabelDifference(Asm->GetTempSymbol("eh_frame_common_end", Index),
-                           Asm->GetTempSymbol("eh_frame_common_begin", Index),
-                           4);
-
-  // EH frame header.
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_frame_common_begin",Index));
-  Asm->OutStreamer.AddComment("CIE Identifier Tag");
-  Asm->OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/);
-  Asm->OutStreamer.AddComment("DW_CIE_VERSION");
-  Asm->OutStreamer.EmitIntValue(dwarf::DW_CIE_VERSION, 1/*size*/, 0/*addr*/);
-
-  // The personality presence indicates that language specific information will
-  // show up in the eh frame.  Find out how we are supposed to lower the
-  // personality function reference:
-
-  unsigned LSDAEncoding = TLOF.getLSDAEncoding();
-  unsigned FDEEncoding = TLOF.getFDEEncoding();
-  unsigned PerEncoding = TLOF.getPersonalityEncoding();
-
-  char Augmentation[6] = { 0 };
-  unsigned AugmentationSize = 0;
-  char *APtr = Augmentation + 1;
-
-  if (PersonalityFn) {
-    // There is a personality function.
-    *APtr++ = 'P';
-    AugmentationSize += 1 + Asm->GetSizeOfEncodedValue(PerEncoding);
-  }
-
-  if (UsesLSDA[Index]) {
-    // An LSDA pointer is in the FDE augmentation.
-    *APtr++ = 'L';
-    ++AugmentationSize;
-  }
-
-  if (FDEEncoding != dwarf::DW_EH_PE_absptr) {
-    // A non-default pointer encoding for the FDE.
-    *APtr++ = 'R';
-    ++AugmentationSize;
-  }
-
-  if (APtr != Augmentation + 1)
-    Augmentation[0] = 'z';
-
-  Asm->OutStreamer.AddComment("CIE Augmentation");
-  Asm->OutStreamer.EmitBytes(StringRef(Augmentation, strlen(Augmentation)+1),0);
-
-  // Round out reader.
-  Asm->EmitULEB128(1, "CIE Code Alignment Factor");
-  Asm->EmitSLEB128(stackGrowth, "CIE Data Alignment Factor");
-  Asm->OutStreamer.AddComment("CIE Return Address Column");
-
-  const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
-  const TargetFrameLowering *TFI = Asm->TM.getFrameLowering();
-  Asm->EmitInt8(RI->getDwarfRegNum(RI->getRARegister(), true));
-
-  if (Augmentation[0]) {
-    Asm->EmitULEB128(AugmentationSize, "Augmentation Size");
-
-    // If there is a personality, we need to indicate the function's location.
-    if (PersonalityFn) {
-      Asm->EmitEncodingByte(PerEncoding, "Personality");
-      Asm->OutStreamer.AddComment("Personality");
-      Asm->EmitReference(PersonalityFn, PerEncoding);
-    }
-    if (UsesLSDA[Index])
-      Asm->EmitEncodingByte(LSDAEncoding, "LSDA");
-    if (FDEEncoding != dwarf::DW_EH_PE_absptr)
-      Asm->EmitEncodingByte(FDEEncoding, "FDE");
-  }
-
-  // Indicate locations of general callee saved registers in frame.
-  std::vector<MachineMove> Moves;
-  TFI->getInitialFrameState(Moves);
-  Asm->EmitFrameMoves(Moves, 0, true);
-
-  // On Darwin the linker honors the alignment of eh_frame, which means it must
-  // be 8-byte on 64-bit targets to match what gcc does.  Otherwise you get
-  // holes which confuse readers of eh_frame.
-  Asm->EmitAlignment(Asm->getTargetData().getPointerSize() == 4 ? 2 : 3);
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_frame_common_end", Index));
-}
-
-/// EmitFDE - Emit the Frame Description Entry (FDE) for the function.
-void DwarfTableException::EmitFDE(const FunctionEHFrameInfo &EHFrameInfo) {
-  assert(!EHFrameInfo.function->hasAvailableExternallyLinkage() &&
-         "Should not emit 'available externally' functions at all");
-
-  const Function *TheFunc = EHFrameInfo.function;
-  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
-
-  unsigned LSDAEncoding = TLOF.getLSDAEncoding();
-  unsigned FDEEncoding = TLOF.getFDEEncoding();
-
-  Asm->OutStreamer.SwitchSection(TLOF.getEHFrameSection());
-
-  // Externally visible entry into the functions eh frame info. If the
-  // corresponding function is static, this should not be externally visible.
-  if (!TheFunc->hasLocalLinkage() && TLOF.isFunctionEHSymbolGlobal())
-    Asm->OutStreamer.EmitSymbolAttribute(EHFrameInfo.FunctionEHSym,MCSA_Global);
-
-  // If corresponding function is weak definition, this should be too.
-  if (TheFunc->isWeakForLinker() && Asm->MAI->getWeakDefDirective())
-    Asm->OutStreamer.EmitSymbolAttribute(EHFrameInfo.FunctionEHSym,
-                                         MCSA_WeakDefinition);
-
-  // If corresponding function is hidden, this should be too.
-  if (TheFunc->hasHiddenVisibility())
-    if (MCSymbolAttr HiddenAttr = Asm->MAI->getHiddenVisibilityAttr())
-      Asm->OutStreamer.EmitSymbolAttribute(EHFrameInfo.FunctionEHSym,
-                                           HiddenAttr);
-
-  // If there are no calls then you can't unwind.  This may mean we can omit the
-  // EH Frame, but some environments do not handle weak absolute symbols. If
-  // UnwindTablesMandatory is set we cannot do this optimization; the unwind
-  // info is to be available for non-EH uses.
-  if (!EHFrameInfo.adjustsStack && !UnwindTablesMandatory &&
-      (!TheFunc->isWeakForLinker() ||
-       !Asm->MAI->getWeakDefDirective() ||
-       TLOF.getSupportsWeakOmittedEHFrame())) {
-    Asm->OutStreamer.EmitAssignment(EHFrameInfo.FunctionEHSym,
-                                    MCConstantExpr::Create(0, Asm->OutContext));
-    // This name has no connection to the function, so it might get
-    // dead-stripped when the function is not, erroneously.  Prohibit
-    // dead-stripping unconditionally.
-    if (Asm->MAI->hasNoDeadStrip())
-      Asm->OutStreamer.EmitSymbolAttribute(EHFrameInfo.FunctionEHSym,
-                                           MCSA_NoDeadStrip);
-  } else {
-    Asm->OutStreamer.EmitLabel(EHFrameInfo.FunctionEHSym);
-
-    // EH frame header.
-    Asm->OutStreamer.AddComment("Length of Frame Information Entry");
-    Asm->EmitLabelDifference(
-                Asm->GetTempSymbol("eh_frame_end", EHFrameInfo.Number),
-                Asm->GetTempSymbol("eh_frame_begin", EHFrameInfo.Number), 4);
-
-    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_frame_begin",
-                                                  EHFrameInfo.Number));
-
-    Asm->OutStreamer.AddComment("FDE CIE offset");
-    Asm->EmitLabelDifference(
-                       Asm->GetTempSymbol("eh_frame_begin", EHFrameInfo.Number),
-                       Asm->GetTempSymbol("eh_frame_common",
-                                          EHFrameInfo.PersonalityIndex), 4);
-
-    MCSymbol *EHFuncBeginSym =
-      Asm->GetTempSymbol("eh_func_begin", EHFrameInfo.Number);
-
-    Asm->OutStreamer.AddComment("FDE initial location");
-    Asm->EmitReference(EHFuncBeginSym, FDEEncoding);
-
-    Asm->OutStreamer.AddComment("FDE address range");
-    Asm->EmitLabelDifference(Asm->GetTempSymbol("eh_func_end",
-                                                EHFrameInfo.Number),
-                             EHFuncBeginSym,
-                             Asm->GetSizeOfEncodedValue(FDEEncoding));
-
-    // If there is a personality and landing pads then point to the language
-    // specific data area in the exception table.
-    if (MMI->getPersonalities()[0] != NULL) {
-      unsigned Size = Asm->GetSizeOfEncodedValue(LSDAEncoding);
-
-      Asm->EmitULEB128(Size, "Augmentation size");
-      Asm->OutStreamer.AddComment("Language Specific Data Area");
-      if (EHFrameInfo.hasLandingPads)
-        Asm->EmitReference(Asm->GetTempSymbol("exception", EHFrameInfo.Number),
-                           LSDAEncoding);
-      else
-        Asm->OutStreamer.EmitIntValue(0, Size/*size*/, 0/*addrspace*/);
-
-    } else {
-      Asm->EmitULEB128(0, "Augmentation size");
-    }
-
-    // Indicate locations of function specific callee saved registers in frame.
-    Asm->EmitFrameMoves(EHFrameInfo.Moves, EHFuncBeginSym, true);
-
-    // On Darwin the linker honors the alignment of eh_frame, which means it
-    // must be 8-byte on 64-bit targets to match what gcc does.  Otherwise you
-    // get holes which confuse readers of eh_frame.
-    Asm->EmitAlignment(Asm->getTargetData().getPointerSize() == 4 ? 2 : 3);
-    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_frame_end",
-                                                  EHFrameInfo.Number));
-
-    // If the function is marked used, this table should be also.  We cannot
-    // make the mark unconditional in this case, since retaining the table also
-    // retains the function in this case, and there is code around that depends
-    // on unused functions (calling undefined externals) being dead-stripped to
-    // link correctly.  Yes, there really is.
-    if (MMI->isUsedFunction(EHFrameInfo.function))
-      if (Asm->MAI->hasNoDeadStrip())
-        Asm->OutStreamer.EmitSymbolAttribute(EHFrameInfo.FunctionEHSym,
-                                             MCSA_NoDeadStrip);
-  }
-  Asm->OutStreamer.AddBlankLine();
-}
-
-/// EndModule - Emit all exception information that should come after the
-/// content.
-void DwarfTableException::EndModule() {
-  if (!Asm->MAI->isExceptionHandlingDwarf())
-    return;
-
-  if (!shouldEmitMovesModule && !shouldEmitTableModule)
-    return;
-
-  const std::vector<const Function*> &Personalities = MMI->getPersonalities();
-
-  for (unsigned I = 0, E = Personalities.size(); I < E; ++I)
-    EmitCIE(Personalities[I], I);
-
-  for (std::vector<FunctionEHFrameInfo>::iterator
-         I = EHFrames.begin(), E = EHFrames.end(); I != E; ++I)
-    EmitFDE(*I);
-}
-
-/// BeginFunction - Gather pre-function exception information. Assumes it's
-/// being emitted immediately after the function entry point.
-void DwarfTableException::BeginFunction(const MachineFunction *MF) {
-  shouldEmitTable = shouldEmitMoves = false;
-
-  // If any landing pads survive, we need an EH table.
-  shouldEmitTable = !MMI->getLandingPads().empty();
-
-  // See if we need frame move info.
-  shouldEmitMoves =
-    !Asm->MF->getFunction()->doesNotThrow() || UnwindTablesMandatory;
-
-  if (shouldEmitMoves || shouldEmitTable)
-    // Assumes in correct section after the entry point.
-    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin",
-                                                  Asm->getFunctionNumber()));
-
-  shouldEmitTableModule |= shouldEmitTable;
-  shouldEmitMovesModule |= shouldEmitMoves;
-}
-
-/// EndFunction - Gather and emit post-function exception information.
-///
-void DwarfTableException::EndFunction() {
-  if (!shouldEmitMoves && !shouldEmitTable) return;
-
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_end",
-                                                Asm->getFunctionNumber()));
-
-  // Record if this personality index uses a landing pad.
-  bool HasLandingPad = !MMI->getLandingPads().empty();
-  UsesLSDA[MMI->getPersonalityIndex()] |= HasLandingPad;
-
-  // Map all labels and get rid of any dead landing pads.
-  MMI->TidyLandingPads();
-
-  if (HasLandingPad)
-    EmitExceptionTable();
-
-  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
-  MCSymbol *FunctionEHSym =
-    Asm->GetSymbolWithGlobalValueBase(Asm->MF->getFunction(), ".eh",
-                                      TLOF.isFunctionEHFrameSymbolPrivate());
-
-  // Save EH frame information
-  EHFrames.
-    push_back(FunctionEHFrameInfo(FunctionEHSym,
-                                  Asm->getFunctionNumber(),
-                                  MMI->getPersonalityIndex(),
-                                  Asm->MF->getFrameInfo()->adjustsStack(),
-                                  !MMI->getLandingPads().empty(),
-                                  MMI->getFrameMoves(),
-                                  Asm->MF->getFunction()));
-}
diff --git a/lib/CodeGen/AsmPrinter/Win64Exception.cpp b/lib/CodeGen/AsmPrinter/Win64Exception.cpp
new file mode 100644
index 0000000..c2ad5eb
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/Win64Exception.cpp
@@ -0,0 +1,116 @@
+//===-- CodeGen/AsmPrinter/Win64Exception.cpp - Dwarf Exception Impl ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing Win64 exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfException.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+using namespace llvm;
+
+Win64Exception::Win64Exception(AsmPrinter *A)
+  : DwarfException(A),
+    shouldEmitPersonality(false), shouldEmitLSDA(false), shouldEmitMoves(false)
+    {}
+
+Win64Exception::~Win64Exception() {}
+
+/// EndModule - Emit all exception information that should come after the
+/// content.
+void Win64Exception::EndModule() {
+}
+
+/// BeginFunction - Gather pre-function exception information. Assumes it's
+/// being emitted immediately after the function entry point.
+void Win64Exception::BeginFunction(const MachineFunction *MF) {
+  shouldEmitMoves = shouldEmitPersonality = shouldEmitLSDA = false;
+
+  // If any landing pads survive, we need an EH table.
+  bool hasLandingPads = !MMI->getLandingPads().empty();
+
+  shouldEmitMoves = Asm->needsSEHMoves();
+
+  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+  unsigned PerEncoding = TLOF.getPersonalityEncoding();
+  const Function *Per = MMI->getPersonalities()[MMI->getPersonalityIndex()];
+
+  shouldEmitPersonality = hasLandingPads &&
+    PerEncoding != dwarf::DW_EH_PE_omit && Per;
+
+  unsigned LSDAEncoding = TLOF.getLSDAEncoding();
+  shouldEmitLSDA = shouldEmitPersonality &&
+    LSDAEncoding != dwarf::DW_EH_PE_omit;
+
+  if (!shouldEmitPersonality && !shouldEmitMoves)
+    return;
+
+  Asm->OutStreamer.EmitWin64EHStartProc(Asm->CurrentFnSym);
+
+  if (!shouldEmitPersonality)
+    return;
+
+  MCSymbol *GCCHandlerSym =
+    Asm->GetExternalSymbolSymbol("_GCC_specific_handler");
+  Asm->OutStreamer.EmitWin64EHHandler(GCCHandlerSym, true, true);
+
+  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin",
+                                                Asm->getFunctionNumber()));
+}
+
+/// EndFunction - Gather and emit post-function exception information.
+///
+void Win64Exception::EndFunction() {
+  if (!shouldEmitPersonality && !shouldEmitMoves)
+    return;
+
+  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_end",
+                                                Asm->getFunctionNumber()));
+
+  // Map all labels and get rid of any dead landing pads.
+  MMI->TidyLandingPads();
+
+  if (shouldEmitPersonality) {
+    const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+    const Function *Per = MMI->getPersonalities()[MMI->getPersonalityIndex()];
+    const MCSymbol *Sym = TLOF.getCFIPersonalitySymbol(Per, Asm->Mang, MMI);
+
+    Asm->OutStreamer.PushSection();
+    Asm->OutStreamer.EmitWin64EHHandlerData();
+    Asm->OutStreamer.EmitValue(MCSymbolRefExpr::Create(Sym, Asm->OutContext),
+                               4);
+    EmitExceptionTable();
+    Asm->OutStreamer.PopSection();
+  }
+  Asm->OutStreamer.EmitWin64EHEndProc();
+}
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index 78a8743..d95f77e 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -41,6 +41,7 @@ using namespace llvm;
 STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
 STATISTIC(NumBranchOpts, "Number of branches optimized");
 STATISTIC(NumTailMerge , "Number of block tails merged");
+STATISTIC(NumHoist     , "Number of times common instructions are hoisted");
 
 static cl::opt<cl::boolOrDefault> FlagEnableTailMerge("enable-tail-merge",
                               cl::init(cl::BOU_UNSET), cl::Hidden);
@@ -65,7 +66,7 @@ namespace {
   public:
     static char ID;
     explicit BranchFolderPass(bool defaultEnableTailMerge)
-      : MachineFunctionPass(ID), BranchFolder(defaultEnableTailMerge) {}
+      : MachineFunctionPass(ID), BranchFolder(defaultEnableTailMerge, true) {}
 
     virtual bool runOnMachineFunction(MachineFunction &MF);
     virtual const char *getPassName() const { return "Control Flow Optimizer"; }
@@ -86,12 +87,14 @@ bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) {
 }
 
 
-BranchFolder::BranchFolder(bool defaultEnableTailMerge) {
+BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist) {
   switch (FlagEnableTailMerge) {
   case cl::BOU_UNSET: EnableTailMerge = defaultEnableTailMerge; break;
   case cl::BOU_TRUE: EnableTailMerge = true; break;
   case cl::BOU_FALSE: EnableTailMerge = false; break;
   }
+
+  EnableHoistCommonCode = CommonHoist;
 }
 
 /// RemoveDeadBlock - Remove the specified dead machine basic block from the
@@ -105,6 +108,9 @@ void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
   while (!MBB->succ_empty())
     MBB->removeSuccessor(MBB->succ_end()-1);
 
+  // Avoid matching if this pointer gets reused.
+  TriedMerging.erase(MBB);
+
   // Remove the block.
   MF->erase(MBB);
 }
@@ -168,6 +174,8 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF,
                                     MachineModuleInfo *mmi) {
   if (!tii) return false;
 
+  TriedMerging.clear();
+
   TII = tii;
   TRI = tri;
   MMI = mmi;
@@ -186,9 +194,10 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF,
 
   bool MadeChangeThisIteration = true;
   while (MadeChangeThisIteration) {
-    MadeChangeThisIteration = false;
-    MadeChangeThisIteration |= TailMergeBlocks(MF);
-    MadeChangeThisIteration |= OptimizeBranches(MF);
+    MadeChangeThisIteration    = TailMergeBlocks(MF);
+    MadeChangeThisIteration   |= OptimizeBranches(MF);
+    if (EnableHoistCommonCode)
+      MadeChangeThisIteration |= HoistCommonCode(MF);
     MadeChange |= MadeChangeThisIteration;
   }
 
@@ -795,14 +804,21 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
 
   // First find blocks with no successors.
   MergePotentials.clear();
-  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
+       I != E && MergePotentials.size() < TailMergeThreshold; ++I) {
+    if (TriedMerging.count(I))
+      continue;
     if (I->succ_empty())
       MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(I), I));
   }
 
+  // If this is a large problem, avoid visiting the same basic blocks
+  // multiple times.
+  if (MergePotentials.size() == TailMergeThreshold)
+    for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
+      TriedMerging.insert(MergePotentials[i].getBlock());
   // See if we can do any tail merging on those.
-  if (MergePotentials.size() < TailMergeThreshold &&
-      MergePotentials.size() >= 2)
+  if (MergePotentials.size() >= 2)
     MadeChange |= TryTailMergeBlocks(NULL, NULL);
 
   // Look at blocks (IBB) with multiple predecessors (PBB).
@@ -826,15 +842,17 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
 
   for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
        I != E; ++I) {
-    if (I->pred_size() >= 2 && I->pred_size() < TailMergeThreshold) {
+    if (I->pred_size() >= 2) {
       SmallPtrSet<MachineBasicBlock *, 8> UniquePreds;
       MachineBasicBlock *IBB = I;
       MachineBasicBlock *PredBB = prior(I);
       MergePotentials.clear();
       for (MachineBasicBlock::pred_iterator P = I->pred_begin(),
                                             E2 = I->pred_end();
-           P != E2; ++P) {
+           P != E2 && MergePotentials.size() < TailMergeThreshold; ++P) {
         MachineBasicBlock *PBB = *P;
+        if (TriedMerging.count(PBB))
+          continue;
         // Skip blocks that loop to themselves, can't tail merge these.
         if (PBB == IBB)
           continue;
@@ -887,6 +905,11 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
           MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(PBB), *P));
         }
       }
+      // If this is a large problem, avoid visiting the same basic blocks
+      // multiple times.
+      if (MergePotentials.size() == TailMergeThreshold)
+        for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
+          TriedMerging.insert(MergePotentials[i].getBlock());
       if (MergePotentials.size() >= 2)
         MadeChange |= TryTailMergeBlocks(IBB, PredBB);
       // Reinsert an unconditional branch if needed.
@@ -910,7 +933,8 @@ bool BranchFolder::OptimizeBranches(MachineFunction &MF) {
   // Make sure blocks are numbered in order
   MF.RenumberBlocks();
 
-  for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ) {
+  for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
+       I != E; ) {
     MachineBasicBlock *MBB = I++;
     MadeChange |= OptimizeBlock(MBB);
 
@@ -1048,9 +1072,25 @@ ReoptimizeBlock:
     // AnalyzeBranch.
     if (PriorCond.empty() && !PriorTBB && MBB->pred_size() == 1 &&
         PrevBB.succ_size() == 1 &&
-        !MBB->hasAddressTaken()) {
+        !MBB->hasAddressTaken() && !MBB->isLandingPad()) {
       DEBUG(dbgs() << "\nMerging into block: " << PrevBB
                    << "From MBB: " << *MBB);
+      // Remove redundant DBG_VALUEs first.
+      if (PrevBB.begin() != PrevBB.end()) {
+        MachineBasicBlock::iterator PrevBBIter = PrevBB.end();
+        --PrevBBIter;
+        MachineBasicBlock::iterator MBBIter = MBB->begin();
+        // Check if DBG_VALUE at the end of PrevBB is identical to the 
+        // DBG_VALUE at the beginning of MBB.
+        while (PrevBBIter != PrevBB.begin() && MBBIter != MBB->end()
+               && PrevBBIter->isDebugValue() && MBBIter->isDebugValue()) {
+          if (!MBBIter->isIdenticalTo(PrevBBIter))
+            break;
+          MachineInstr *DuplicateDbg = MBBIter;
+          ++MBBIter; -- PrevBBIter;
+          DuplicateDbg->eraseFromParent();
+        }
+      }
       PrevBB.splice(PrevBB.end(), MBB, MBB->begin(), MBB->end());
       PrevBB.removeSuccessor(PrevBB.succ_begin());;
       assert(PrevBB.succ_empty());
@@ -1339,3 +1379,282 @@ ReoptimizeBlock:
 
   return MadeChange;
 }
+
+//===----------------------------------------------------------------------===//
+//  Hoist Common Code
+//===----------------------------------------------------------------------===//
+
+/// HoistCommonCode - Hoist common instruction sequences at the start of basic
+/// blocks to their common predecessor.
+bool BranchFolder::HoistCommonCode(MachineFunction &MF) {
+  bool MadeChange = false;
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ) {
+    MachineBasicBlock *MBB = I++;
+    MadeChange |= HoistCommonCodeInSuccs(MBB);
+  }
+
+  return MadeChange;
+}
+
+/// findFalseBlock - BB has a fallthrough. Find its 'false' successor given
+/// its 'true' successor.
+static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB,
+                                         MachineBasicBlock *TrueBB) {
+  for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+         E = BB->succ_end(); SI != E; ++SI) {
+    MachineBasicBlock *SuccBB = *SI;
+    if (SuccBB != TrueBB)
+      return SuccBB;
+  }
+  return NULL;
+}
+
+/// findHoistingInsertPosAndDeps - Find the location to move common instructions
+/// in successors to. The location is ususally just before the terminator,
+/// however if the terminator is a conditional branch and its previous
+/// instruction is the flag setting instruction, the previous instruction is
+/// the preferred location. This function also gathers uses and defs of the
+/// instructions from the insertion point to the end of the block. The data is
+/// used by HoistCommonCodeInSuccs to ensure safety.
+static
+MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
+                                                  const TargetInstrInfo *TII,
+                                                  const TargetRegisterInfo *TRI,
+                                                  SmallSet<unsigned,4> &Uses,
+                                                  SmallSet<unsigned,4> &Defs) {
+  MachineBasicBlock::iterator Loc = MBB->getFirstTerminator();
+  if (!TII->isUnpredicatedTerminator(Loc))
+    return MBB->end();
+
+  for (unsigned i = 0, e = Loc->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = Loc->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+    if (MO.isUse()) {
+      Uses.insert(Reg);
+      for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS)
+        Uses.insert(*AS);
+    } else if (!MO.isDead())
+      // Don't try to hoist code in the rare case the terminator defines a
+      // register that is later used.
+      return MBB->end();
+  }
+
+  if (Uses.empty())
+    return Loc;
+  if (Loc == MBB->begin())
+    return MBB->end();
+
+  // The terminator is probably a conditional branch, try not to separate the
+  // branch from condition setting instruction.
+  MachineBasicBlock::iterator PI = Loc;
+  --PI;
+  while (PI != MBB->begin() && Loc->isDebugValue())
+    --PI;
+
+  bool IsDef = false;
+  for (unsigned i = 0, e = PI->getNumOperands(); !IsDef && i != e; ++i) {
+    const MachineOperand &MO = PI->getOperand(i);
+    if (!MO.isReg() || MO.isUse())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+    if (Uses.count(Reg))
+      IsDef = true;
+  }
+  if (!IsDef)
+    // The condition setting instruction is not just before the conditional
+    // branch.
+    return Loc;
+
+  // Be conservative, don't insert instruction above something that may have
+  // side-effects. And since it's potentially bad to separate flag setting
+  // instruction from the conditional branch, just abort the optimization
+  // completely.
+  // Also avoid moving code above predicated instruction since it's hard to
+  // reason about register liveness with predicated instruction.
+  bool DontMoveAcrossStore = true;
+  if (!PI->isSafeToMove(TII, 0, DontMoveAcrossStore) ||
+      TII->isPredicated(PI))
+    return MBB->end();
+
+
+  // Find out what registers are live. Note this routine is ignoring other live
+  // registers which are only used by instructions in successor blocks.
+  for (unsigned i = 0, e = PI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = PI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+    if (MO.isUse()) {
+      Uses.insert(Reg);
+      for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS)
+        Uses.insert(*AS);
+    } else {
+      if (Uses.count(Reg)) {
+        Uses.erase(Reg);
+        for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR)
+          Uses.erase(*SR); // Use getSubRegisters to be conservative
+      }
+      Defs.insert(Reg);
+      for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS)
+        Defs.insert(*AS);
+    }
+  }
+
+  return PI;
+}
+
+/// HoistCommonCodeInSuccs - If the successors of MBB has common instruction
+/// sequence at the start of the function, move the instructions before MBB
+/// terminator if it's legal.
+bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  if (TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, true) || !TBB || Cond.empty())
+    return false;
+
+  if (!FBB) FBB = findFalseBlock(MBB, TBB);
+  if (!FBB)
+    // Malformed bcc? True and false blocks are the same?
+    return false;
+
+  // Restrict the optimization to cases where MBB is the only predecessor,
+  // it is an obvious win.
+  if (TBB->pred_size() > 1 || FBB->pred_size() > 1)
+    return false;
+
+  // Find a suitable position to hoist the common instructions to. Also figure
+  // out which registers are used or defined by instructions from the insertion
+  // point to the end of the block.
+  SmallSet<unsigned, 4> Uses, Defs;
+  MachineBasicBlock::iterator Loc =
+    findHoistingInsertPosAndDeps(MBB, TII, TRI, Uses, Defs);
+  if (Loc == MBB->end())
+    return false;
+
+  bool HasDups = false;
+  SmallVector<unsigned, 4> LocalDefs;
+  SmallSet<unsigned, 4> LocalDefsSet;
+  MachineBasicBlock::iterator TIB = TBB->begin();
+  MachineBasicBlock::iterator FIB = FBB->begin();
+  MachineBasicBlock::iterator TIE = TBB->end();
+  MachineBasicBlock::iterator FIE = FBB->end();
+  while (TIB != TIE && FIB != FIE) {
+    // Skip dbg_value instructions. These do not count.
+    if (TIB->isDebugValue()) {
+      while (TIB != TIE && TIB->isDebugValue())
+        ++TIB;
+      if (TIB == TIE)
+        break;
+    }
+    if (FIB->isDebugValue()) {
+      while (FIB != FIE && FIB->isDebugValue())
+        ++FIB;
+      if (FIB == FIE)
+        break;
+    }
+    if (!TIB->isIdenticalTo(FIB, MachineInstr::CheckKillDead))
+      break;
+
+    if (TII->isPredicated(TIB))
+      // Hard to reason about register liveness with predicated instruction.
+      break;
+
+    bool IsSafe = true;
+    for (unsigned i = 0, e = TIB->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = TIB->getOperand(i);
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (!Reg)
+        continue;
+      if (MO.isDef()) {
+        if (Uses.count(Reg)) {
+          // Avoid clobbering a register that's used by the instruction at
+          // the point of insertion.
+          IsSafe = false;
+          break;
+        }
+
+        if (Defs.count(Reg) && !MO.isDead()) {
+          // Don't hoist the instruction if the def would be clobber by the
+          // instruction at the point insertion. FIXME: This is overly
+          // conservative. It should be possible to hoist the instructions
+          // in BB2 in the following example:
+          // BB1:
+          // r1, eflag = op1 r2, r3
+          // brcc eflag
+          //
+          // BB2:
+          // r1 = op2, ...
+          //    = op3, r1<kill>
+          IsSafe = false;
+          break;
+        }
+      } else if (!LocalDefsSet.count(Reg)) {
+        if (Defs.count(Reg)) {
+          // Use is defined by the instruction at the point of insertion.
+          IsSafe = false;
+          break;
+        }
+      }
+    }
+    if (!IsSafe)
+      break;
+
+    bool DontMoveAcrossStore = true;
+    if (!TIB->isSafeToMove(TII, 0, DontMoveAcrossStore))
+      break;
+
+    // Track local defs so we can update liveins.
+    for (unsigned i = 0, e = TIB->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = TIB->getOperand(i);
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (!Reg)
+        continue;
+      if (MO.isDef()) {
+        if (!MO.isDead()) {
+          LocalDefs.push_back(Reg);
+          LocalDefsSet.insert(Reg);
+          for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR)
+            LocalDefsSet.insert(*SR);
+        }
+      } else if (MO.isKill() && LocalDefsSet.count(Reg)) {
+        LocalDefsSet.erase(Reg);
+        for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR)
+          LocalDefsSet.erase(*SR);
+      }
+    }
+
+    HasDups = true;;
+    ++TIB;
+    ++FIB;
+  }
+
+  if (!HasDups)
+    return false;
+
+  MBB->splice(Loc, TBB, TBB->begin(), TIB);
+  FBB->erase(FBB->begin(), FIB);
+
+  // Update livein's.
+  for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
+    unsigned Def = LocalDefs[i];
+    if (LocalDefsSet.count(Def)) {
+      TBB->addLiveIn(Def);
+      FBB->addLiveIn(Def);
+    }
+  }
+
+  ++NumHoist;
+  return true;
+}
diff --git a/lib/CodeGen/BranchFolding.h b/lib/CodeGen/BranchFolding.h
index 15dfa7f..4ed42c0 100644
--- a/lib/CodeGen/BranchFolding.h
+++ b/lib/CodeGen/BranchFolding.h
@@ -10,6 +10,7 @@
 #ifndef LLVM_CODEGEN_BRANCHFOLDING_HPP
 #define LLVM_CODEGEN_BRANCHFOLDING_HPP
 
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include <vector>
 
@@ -19,11 +20,10 @@ namespace llvm {
   class RegScavenger;
   class TargetInstrInfo;
   class TargetRegisterInfo;
-  template<typename T> class SmallVectorImpl;
 
   class BranchFolder {
   public:
-    explicit BranchFolder(bool defaultEnableTailMerge);
+    explicit BranchFolder(bool defaultEnableTailMerge, bool CommonHoist);
 
     bool OptimizeFunction(MachineFunction &MF,
                           const TargetInstrInfo *tii,
@@ -48,6 +48,7 @@ namespace llvm {
     };
     typedef std::vector<MergePotentialsElt>::iterator MPIterator;
     std::vector<MergePotentialsElt> MergePotentials;
+    SmallPtrSet<const MachineBasicBlock*, 2> TriedMerging;
 
     class SameTailElt {
       MPIterator MPIter;
@@ -85,6 +86,7 @@ namespace llvm {
     std::vector<SameTailElt> SameTails;
 
     bool EnableTailMerge;
+    bool EnableHoistCommonCode;
     const TargetInstrInfo *TII;
     const TargetRegisterInfo *TRI;
     MachineModuleInfo *MMI;
@@ -110,6 +112,9 @@ namespace llvm {
     bool OptimizeBlock(MachineBasicBlock *MBB);
     void RemoveDeadBlock(MachineBasicBlock *MBB);
     bool OptimizeImpDefsBlock(MachineBasicBlock *MBB);
+
+    bool HoistCommonCode(MachineFunction &MF);
+    bool HoistCommonCodeInSuccs(MachineBasicBlock *MBB);
   };
 }
 
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 2ca3859..aef4ff2 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -33,6 +33,7 @@ add_llvm_library(LLVMCodeGen
   LocalStackSlotAllocation.cpp
   LowerSubregs.cpp
   MachineBasicBlock.cpp
+  MachineBranchProbabilityInfo.cpp
   MachineCSE.cpp
   MachineDominators.cpp
   MachineFunction.cpp
@@ -67,6 +68,7 @@ add_llvm_library(LLVMCodeGen
   RegAllocGreedy.cpp
   RegAllocLinearScan.cpp
   RegAllocPBQP.cpp
+  RegisterClassInfo.cpp
   RegisterCoalescer.cpp
   RegisterScavenging.cpp
   RenderMachineFunction.cpp
diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp
index 86ab2b6..5d722ee 100644
--- a/lib/CodeGen/CalcSpillWeights.cpp
+++ b/lib/CodeGen/CalcSpillWeights.cpp
@@ -87,8 +87,8 @@ static unsigned copyHint(const MachineInstr *mi, unsigned reg,
 }
 
 void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
-  MachineRegisterInfo &mri = mf_.getRegInfo();
-  const TargetRegisterInfo &tri = *mf_.getTarget().getRegisterInfo();
+  MachineRegisterInfo &mri = MF.getRegInfo();
+  const TargetRegisterInfo &tri = *MF.getTarget().getRegisterInfo();
   MachineBasicBlock *mbb = 0;
   MachineLoop *loop = 0;
   unsigned loopDepth = 0;
@@ -118,7 +118,7 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
       // Get loop info for mi.
       if (mi->getParent() != mbb) {
         mbb = mi->getParent();
-        loop = loops_.getLoopFor(mbb);
+        loop = Loops.getLoopFor(mbb);
         loopDepth = loop ? loop->getLoopDepth() : 0;
         isExiting = loop ? loop->isLoopExiting(mbb) : false;
       }
@@ -129,7 +129,7 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
       weight = LiveIntervals::getSpillWeight(writes, reads, loopDepth);
 
       // Give extra weight to what looks like a loop induction variable update.
-      if (writes && isExiting && lis_.isLiveOutOfMBB(li, mbb))
+      if (writes && isExiting && LIS.isLiveOutOfMBB(li, mbb))
         weight *= 3;
 
       totalWeight += weight;
@@ -141,9 +141,9 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
     unsigned hint = copyHint(mi, li.reg, tri, mri);
     if (!hint)
       continue;
-    float hweight = hint_[hint] += weight;
+    float hweight = Hint[hint] += weight;
     if (TargetRegisterInfo::isPhysicalRegister(hint)) {
-      if (hweight > bestPhys && lis_.isAllocatable(hint))
+      if (hweight > bestPhys && LIS.isAllocatable(hint))
         bestPhys = hweight, hintPhys = hint;
     } else {
       if (hweight > bestVirt)
@@ -151,7 +151,7 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
     }
   }
 
-  hint_.clear();
+  Hint.clear();
 
   // Always prefer the physreg hint.
   if (unsigned hint = hintPhys ? hintPhys : hintVirt) {
@@ -165,7 +165,7 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
     return;
 
   // Mark li as unspillable if all live ranges are tiny.
-  if (li.isZeroLength()) {
+  if (li.isZeroLength(LIS.getSlotIndexes())) {
     li.markNotSpillable();
     return;
   }
@@ -176,7 +176,7 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
   // FIXME: this gets much more complicated once we support non-trivial
   // re-materialization.
   bool isLoad = false;
-  if (lis_.isReMaterializable(li, 0, isLoad)) {
+  if (LIS.isReMaterializable(li, 0, isLoad)) {
     if (isLoad)
       totalWeight *= 0.9F;
     else
@@ -187,50 +187,29 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
 }
 
 void VirtRegAuxInfo::CalculateRegClass(unsigned reg) {
-  MachineRegisterInfo &mri = mf_.getRegInfo();
-  const TargetRegisterInfo *tri = mf_.getTarget().getRegisterInfo();
-  const TargetRegisterClass *orc = mri.getRegClass(reg);
-  SmallPtrSet<const TargetRegisterClass*,8> rcs;
-
-  for (MachineRegisterInfo::reg_nodbg_iterator I = mri.reg_nodbg_begin(reg),
-       E = mri.reg_nodbg_end(); I != E; ++I) {
-    // The targets don't have accurate enough regclass descriptions that we can
-    // handle subregs. We need something similar to
-    // TRI::getMatchingSuperRegClass, but returning a super class instead of a
-    // sub class.
-    if (I.getOperand().getSubReg()) {
-      DEBUG(dbgs() << "Cannot handle subregs: " << I.getOperand() << '\n');
-      return;
-    }
-    if (const TargetRegisterClass *rc =
-                                I->getDesc().getRegClass(I.getOperandNo(), tri))
-      rcs.insert(rc);
-  }
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+  const TargetRegisterClass *OldRC = MRI.getRegClass(reg);
+  const TargetRegisterClass *NewRC = TRI->getLargestLegalSuperClass(OldRC);
 
-  // If we found no regclass constraints, just leave reg as is.
-  // In theory, we could inflate to the largest superclass of reg's existing
-  // class, but that might not be legal for the current cpu setting.
-  // This could happen if reg is only used by COPY instructions, so we may need
-  // to improve on this.
-  if (rcs.empty()) {
+  // Stop early if there is no room to grow.
+  if (NewRC == OldRC)
     return;
-  }
 
-  // Compute the intersection of all classes in rcs.
-  // This ought to be independent of iteration order, but if the target register
-  // classes don't form a proper algebra, it is possible to get different
-  // results. The solution is to make sure the intersection of any two register
-  // classes is also a register class or the null set.
-  const TargetRegisterClass *rc = 0;
-  for (SmallPtrSet<const TargetRegisterClass*,8>::iterator I = rcs.begin(),
-         E = rcs.end(); I != E; ++I) {
-    rc = rc ? getCommonSubClass(rc, *I) : *I;
-    assert(rc && "Incompatible regclass constraints found");
+  // Accumulate constraints from all uses.
+  for (MachineRegisterInfo::reg_nodbg_iterator I = MRI.reg_nodbg_begin(reg),
+       E = MRI.reg_nodbg_end(); I != E; ++I) {
+    // TRI doesn't have accurate enough information to model this yet.
+    if (I.getOperand().getSubReg())
+      return;
+    const TargetRegisterClass *OpRC =
+      I->getDesc().getRegClass(I.getOperandNo(), TRI);
+    if (OpRC)
+      NewRC = getCommonSubClass(NewRC, OpRC);
+    if (!NewRC || NewRC == OldRC)
+      return;
   }
-
-  if (rc == orc)
-    return;
-  DEBUG(dbgs() << "Inflating " << orc->getName() << ':' << PrintReg(reg)
-               << " to " << rc->getName() <<".\n");
-  mri.setRegClass(reg, rc);
+  DEBUG(dbgs() << "Inflating " << OldRC->getName() << ':' << PrintReg(reg)
+               << " to " << NewRC->getName() <<".\n");
+  MRI.setRegClass(reg, NewRC);
 }
diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp
index ecd69a0..14eb054 100644
--- a/lib/CodeGen/CallingConvLower.cpp
+++ b/lib/CodeGen/CallingConvLower.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
@@ -22,19 +23,22 @@
 #include "llvm/Target/TargetLowering.h"
 using namespace llvm;
 
-CCState::CCState(CallingConv::ID CC, bool isVarArg, const TargetMachine &tm,
-                 SmallVector<CCValAssign, 16> &locs, LLVMContext &C)
-  : CallingConv(CC), IsVarArg(isVarArg), TM(tm),
-    TRI(*TM.getRegisterInfo()), Locs(locs), Context(C) {
+CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf,
+                 const TargetMachine &tm, SmallVector<CCValAssign, 16> &locs,
+                 LLVMContext &C)
+  : CallingConv(CC), IsVarArg(isVarArg), MF(mf), TM(tm),
+    TRI(*TM.getRegisterInfo()), Locs(locs), Context(C),
+    CallOrPrologue(Unknown) {
   // No stack is used.
   StackOffset = 0;
-  
+
+  clearFirstByValReg();
   UsedRegs.resize((TRI.getNumRegs()+31)/32);
 }
 
-// HandleByVal - Allocate a stack slot large enough to pass an argument by
-// value. The size and alignment information of the argument is encoded in its
-// parameter attribute.
+// HandleByVal - Allocate space on the stack large enough to pass an argument
+// by value. The size and alignment information of the argument is encoded in
+// its parameter attribute.
 void CCState::HandleByVal(unsigned ValNo, MVT ValVT,
                           MVT LocVT, CCValAssign::LocInfo LocInfo,
                           int MinSize, int MinAlign,
@@ -45,10 +49,11 @@ void CCState::HandleByVal(unsigned ValNo, MVT ValVT,
     Size = MinSize;
   if (MinAlign > (int)Align)
     Align = MinAlign;
+  if (MF.getFrameInfo()->getMaxAlignment() < Align)
+    MF.getFrameInfo()->setMaxAlignment(Align);
+  TM.getTargetLowering()->HandleByVal(this, Size);
   unsigned Offset = AllocateStack(Size, Align);
-
   addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
-  TM.getTargetLowering()->HandleByVal(const_cast<CCState*>(this));
 }
 
 /// MarkAllocated - Mark a register and all of its aliases as allocated.
diff --git a/lib/CodeGen/CodePlacementOpt.cpp b/lib/CodeGen/CodePlacementOpt.cpp
index e37356a..270c337 100644
--- a/lib/CodeGen/CodePlacementOpt.cpp
+++ b/lib/CodeGen/CodePlacementOpt.cpp
@@ -254,7 +254,7 @@ bool CodePlacementOpt::MoveDiscontiguousLoopBlocks(MachineFunction &MF,
 
   // Determine a position to move orphaned loop blocks to. If TopMBB is not
   // entered via fallthrough and BotMBB is exited via fallthrough, prepend them
-  // to the top of the loop to avoid loosing that fallthrough. Otherwise append
+  // to the top of the loop to avoid losing that fallthrough. Otherwise append
   // them to the bottom, even if it previously had a fallthrough, on the theory
   // that it's worth an extra branch to keep the loop contiguous.
   MachineFunction::iterator InsertPt =
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp
index f79598d..4cac453 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -27,12 +27,12 @@
 using namespace llvm;
 
 CriticalAntiDepBreaker::
-CriticalAntiDepBreaker(MachineFunction& MFi) :
+CriticalAntiDepBreaker(MachineFunction& MFi, const RegisterClassInfo &RCI) :
   AntiDepBreaker(), MF(MFi),
   MRI(MF.getRegInfo()),
   TII(MF.getTarget().getInstrInfo()),
   TRI(MF.getTarget().getRegisterInfo()),
-  AllocatableSet(TRI->getAllocatableSet(MF)),
+  RegClassInfo(RCI),
   Classes(TRI->getNumRegs(), static_cast<const TargetRegisterClass *>(0)),
   KillIndices(TRI->getNumRegs(), 0),
   DefIndices(TRI->getNumRegs(), 0) {}
@@ -385,11 +385,9 @@ CriticalAntiDepBreaker::findSuitableFreeRegister(RegRefIter RegRefBegin,
                                                  unsigned LastNewReg,
                                                  const TargetRegisterClass *RC)
 {
-  for (TargetRegisterClass::iterator R = RC->allocation_order_begin(MF),
-       RE = RC->allocation_order_end(MF); R != RE; ++R) {
-    unsigned NewReg = *R;
-    // Don't consider non-allocatable registers
-    if (!AllocatableSet.test(NewReg)) continue;
+  ArrayRef<unsigned> Order = RegClassInfo.getOrder(RC);
+  for (unsigned i = 0; i != Order.size(); ++i) {
+    unsigned NewReg = Order[i];
     // Don't replace a register with itself.
     if (NewReg == AntiDepReg) continue;
     // Don't replace a register with one that was recently used to repair
@@ -421,7 +419,8 @@ unsigned CriticalAntiDepBreaker::
 BreakAntiDependencies(const std::vector<SUnit>& SUnits,
                       MachineBasicBlock::iterator Begin,
                       MachineBasicBlock::iterator End,
-                      unsigned InsertPosIndex) {
+                      unsigned InsertPosIndex,
+                      DbgValueVector &DbgValues) {
   // The code below assumes that there is at least one instruction,
   // so just duck out immediately if the block is empty.
   if (SUnits.empty()) return 0;
@@ -533,7 +532,7 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits,
         if (Edge->getKind() == SDep::Anti) {
           AntiDepReg = Edge->getReg();
           assert(AntiDepReg != 0 && "Anti-dependence on reg0?");
-          if (!AllocatableSet.test(AntiDepReg))
+          if (!RegClassInfo.isAllocatable(AntiDepReg))
             // Don't break anti-dependencies on non-allocatable registers.
             AntiDepReg = 0;
           else if (KeepRegs.count(AntiDepReg))
@@ -628,14 +627,10 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits,
           // as well.
           const SUnit *SU = MISUnitMap[Q->second->getParent()];
           if (!SU) continue;
-          for (unsigned i = 0, e = SU->DbgInstrList.size() ; i < e ; ++i) {
-            MachineInstr *DI = SU->DbgInstrList[i];
-            assert (DI->getNumOperands()==3 && DI->getOperand(0).isReg() &&
-                    DI->getOperand(0).getReg()
-                    && "Non register dbg_value attached to SUnit!");
-            if (DI->getOperand(0).getReg() == AntiDepReg)
-              DI->getOperand(0).setReg(NewReg);
-          }
+          for (DbgValueVector::iterator DVI = DbgValues.begin(),
+                 DVE = DbgValues.end(); DVI != DVE; ++DVI)
+            if (DVI->second == Q->second->getParent())
+              UpdateDbgValue(DVI->first, AntiDepReg, NewReg);
         }
 
         // We just went back in time and modified history; the
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.h b/lib/CodeGen/CriticalAntiDepBreaker.h
index 0daaef2..0710780 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.h
+++ b/lib/CodeGen/CriticalAntiDepBreaker.h
@@ -17,6 +17,7 @@
 #define LLVM_CODEGEN_CRITICALANTIDEPBREAKER_H
 
 #include "AntiDepBreaker.h"
+#include "RegisterClassInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -27,6 +28,7 @@
 #include <map>
 
 namespace llvm {
+class RegisterClassInfo;
 class TargetInstrInfo;
 class TargetRegisterInfo;
 
@@ -35,6 +37,7 @@ class TargetRegisterInfo;
     MachineRegisterInfo &MRI;
     const TargetInstrInfo *TII;
     const TargetRegisterInfo *TRI;
+    const RegisterClassInfo &RegClassInfo;
 
     /// AllocatableSet - The set of allocatable registers.
     /// We'll be ignoring anti-dependencies on non-allocatable registers,
@@ -66,7 +69,7 @@ class TargetRegisterInfo;
     SmallSet<unsigned, 4> KeepRegs;
 
   public:
-    CriticalAntiDepBreaker(MachineFunction& MFi);
+    CriticalAntiDepBreaker(MachineFunction& MFi, const RegisterClassInfo&);
     ~CriticalAntiDepBreaker();
 
     /// Start - Initialize anti-dep breaking for a new basic block.
@@ -79,7 +82,8 @@ class TargetRegisterInfo;
     unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
                                    MachineBasicBlock::iterator Begin,
                                    MachineBasicBlock::iterator End,
-                                   unsigned InsertPosIndex);
+                                   unsigned InsertPosIndex,
+                                   DbgValueVector &DbgValues);
 
     /// Observe - Update liveness information to account for the current
     /// instruction, which will not be scheduled.
diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp
index 34b1a39..22c5465 100644
--- a/lib/CodeGen/DwarfEHPrepare.cpp
+++ b/lib/CodeGen/DwarfEHPrepare.cpp
@@ -30,6 +30,7 @@ using namespace llvm;
 
 STATISTIC(NumLandingPadsSplit,     "Number of landing pads split");
 STATISTIC(NumUnwindsLowered,       "Number of unwind instructions lowered");
+STATISTIC(NumResumesLowered,       "Number of eh.resume calls lowered");
 STATISTIC(NumExceptionValuesMoved, "Number of eh.exception calls moved");
 
 namespace {
@@ -63,7 +64,7 @@ namespace {
     BBSet LandingPads;
 
     bool NormalizeLandingPads();
-    bool LowerUnwinds();
+    bool LowerUnwindsAndResumes();
     bool MoveExceptionValueCalls();
 
     Instruction *CreateExceptionValueCall(BasicBlock *BB);
@@ -251,10 +252,7 @@ bool DwarfEHPrepare::HandleURoRInvokes() {
 
   if (!URoR) {
     URoR = F->getParent()->getFunction("_Unwind_Resume_or_Rethrow");
-    if (!URoR) {
-      URoR = F->getParent()->getFunction("_Unwind_SjLj_Resume");
-      if (!URoR) return CleanupSelectors(CatchAllSels);
-    }
+    if (!URoR) return CleanupSelectors(CatchAllSels);
   }
 
   SmallPtrSet<InvokeInst*, 32> URoRInvokes;
@@ -480,20 +478,25 @@ bool DwarfEHPrepare::NormalizeLandingPads() {
 /// rethrowing any previously caught exception.  This will crash horribly
 /// at runtime if there is no such exception: using unwind to throw a new
 /// exception is currently not supported.
-bool DwarfEHPrepare::LowerUnwinds() {
-  SmallVector<TerminatorInst*, 16> UnwindInsts;
-
-  for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
-    TerminatorInst *TI = I->getTerminator();
-    if (isa<UnwindInst>(TI))
-      UnwindInsts.push_back(TI);
+bool DwarfEHPrepare::LowerUnwindsAndResumes() {
+  SmallVector<Instruction*, 16> ResumeInsts;
+
+  for (Function::iterator fi = F->begin(), fe = F->end(); fi != fe; ++fi) {
+    for (BasicBlock::iterator bi = fi->begin(), be = fi->end(); bi != be; ++bi){
+      if (isa<UnwindInst>(bi))
+        ResumeInsts.push_back(bi);
+      else if (CallInst *call = dyn_cast<CallInst>(bi))
+        if (Function *fn = dyn_cast<Function>(call->getCalledValue()))
+          if (fn->getName() == "llvm.eh.resume")
+            ResumeInsts.push_back(bi);
+    }
   }
 
-  if (UnwindInsts.empty()) return false;
+  if (ResumeInsts.empty()) return false;
 
   // Find the rewind function if we didn't already.
   if (!RewindFunction) {
-    LLVMContext &Ctx = UnwindInsts[0]->getContext();
+    LLVMContext &Ctx = ResumeInsts[0]->getContext();
     std::vector<const Type*>
       Params(1, Type::getInt8PtrTy(Ctx));
     FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx),
@@ -504,24 +507,36 @@ bool DwarfEHPrepare::LowerUnwinds() {
 
   bool Changed = false;
 
-  for (SmallVectorImpl<TerminatorInst*>::iterator
-         I = UnwindInsts.begin(), E = UnwindInsts.end(); I != E; ++I) {
-    TerminatorInst *TI = *I;
+  for (SmallVectorImpl<Instruction*>::iterator
+         I = ResumeInsts.begin(), E = ResumeInsts.end(); I != E; ++I) {
+    Instruction *RI = *I;
 
-    // Replace the unwind instruction with a call to _Unwind_Resume (or the
-    // appropriate target equivalent) followed by an UnreachableInst.
+    // Replace the resuming instruction with a call to _Unwind_Resume (or the
+    // appropriate target equivalent).
+
+    llvm::Value *ExnValue;
+    if (isa<UnwindInst>(RI))
+      ExnValue = CreateExceptionValueCall(RI->getParent());
+    else
+      ExnValue = cast<CallInst>(RI)->getArgOperand(0);
 
     // Create the call...
-    CallInst *CI = CallInst::Create(RewindFunction,
-                                    CreateExceptionValueCall(TI->getParent()),
-                                    "", TI);
+    CallInst *CI = CallInst::Create(RewindFunction, ExnValue, "", RI);
     CI->setCallingConv(TLI->getLibcallCallingConv(RTLIB::UNWIND_RESUME));
-    // ...followed by an UnreachableInst.
-    new UnreachableInst(TI->getContext(), TI);
 
-    // Nuke the unwind instruction.
-    TI->eraseFromParent();
-    ++NumUnwindsLowered;
+    // ...followed by an UnreachableInst, if it was an unwind.
+    // Calls to llvm.eh.resume are typically already followed by this.
+    if (isa<UnwindInst>(RI))
+      new UnreachableInst(RI->getContext(), RI);
+
+    if (isa<UnwindInst>(RI))
+      ++NumUnwindsLowered;
+    else
+      ++NumResumesLowered;
+
+    // Nuke the resume instruction.
+    RI->eraseFromParent();
+
     Changed = true;
   }
 
@@ -657,8 +672,8 @@ bool DwarfEHPrepare::runOnFunction(Function &Fn) {
   // basic block where an invoke unwind edge ends).
   Changed |= NormalizeLandingPads();
 
-  // Turn unwind instructions into libcalls.
-  Changed |= LowerUnwinds();
+  // Turn unwind instructions and eh.resume calls into libcalls.
+  Changed |= LowerUnwindsAndResumes();
 
   // TODO: Move eh.selector calls to landing pads and combine them.
 
diff --git a/lib/CodeGen/ELF.h b/lib/CodeGen/ELF.h
index e08feeb..5b63468 100644
--- a/lib/CodeGen/ELF.h
+++ b/lib/CodeGen/ELF.h
@@ -173,7 +173,7 @@ namespace llvm {
     unsigned Offset;    // sh_offset - Offset from the file start
     unsigned Size;      // sh_size - The section size.
     unsigned Link;      // sh_link - Section header table index link.
-    unsigned Info;      // sh_info - Auxillary information.
+    unsigned Info;      // sh_info - Auxiliary information.
     unsigned Align;     // sh_addralign - Alignment of section.
     unsigned EntSize;   // sh_entsize - Size of entries in the section e
 
diff --git a/lib/CodeGen/ELFWriter.cpp b/lib/CodeGen/ELFWriter.cpp
index 25c2e02..fa2319b 100644
--- a/lib/CodeGen/ELFWriter.cpp
+++ b/lib/CodeGen/ELFWriter.cpp
@@ -77,7 +77,7 @@ ELFWriter::ELFWriter(raw_ostream &o, TargetMachine &tm)
   // Create the object code emitter object for this target.
   ElfCE = new ELFCodeEmitter(*this);
 
-  // Inital number of sections
+  // Initial number of sections
   NumSections = 0;
 }
 
@@ -662,12 +662,16 @@ bool ELFWriter::EmitSpecialLLVMGlobal(const GlobalVariable *GV) {
 void ELFWriter::EmitXXStructorList(Constant *List, ELFSection &Xtor) {
   // Should be an array of '{ i32, void ()* }' structs.  The first value is the
   // init priority, which we ignore.
+  if (List->isNullValue()) return;
   ConstantArray *InitList = cast<ConstantArray>(List);
   for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) {
+    if (InitList->getOperand(i)->isNullValue())
+      continue;
     ConstantStruct *CS = cast<ConstantStruct>(InitList->getOperand(i));
 
     if (CS->getOperand(1)->isNullValue())
-      return;  // Found a null terminator, exit printing.
+      continue;
+
     // Emit the function pointer.
     EmitGlobalConstant(CS->getOperand(1), Xtor);
   }
diff --git a/lib/CodeGen/EdgeBundles.cpp b/lib/CodeGen/EdgeBundles.cpp
index aed8bc9..a7aba89 100644
--- a/lib/CodeGen/EdgeBundles.cpp
+++ b/lib/CodeGen/EdgeBundles.cpp
@@ -39,7 +39,7 @@ void EdgeBundles::getAnalysisUsage(AnalysisUsage &AU) const {
 bool EdgeBundles::runOnMachineFunction(MachineFunction &mf) {
   MF = &mf;
   EC.clear();
-  EC.grow(2 * MF->size());
+  EC.grow(2 * MF->getNumBlockIDs());
 
   for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); I != E;
        ++I) {
@@ -53,6 +53,19 @@ bool EdgeBundles::runOnMachineFunction(MachineFunction &mf) {
   EC.compress();
   if (ViewEdgeBundles)
     view();
+
+  // Compute the reverse mapping.
+  Blocks.clear();
+  Blocks.resize(getNumBundles());
+
+  for (unsigned i = 0, e = MF->getNumBlockIDs(); i != e; ++i) {
+    unsigned b0 = getBundle(i, 0);
+    unsigned b1 = getBundle(i, 1);
+    Blocks[b0].push_back(i);
+    if (b1 != b0)
+      Blocks[b1].push_back(i);
+  }
+
   return false;
 }
 
@@ -82,5 +95,3 @@ raw_ostream &llvm::WriteGraph(raw_ostream &O, const EdgeBundles &G,
   O << "}\n";
   return O;
 }
-
-
diff --git a/lib/CodeGen/ExpandISelPseudos.cpp b/lib/CodeGen/ExpandISelPseudos.cpp
index b5ec303..ebc2fc9 100644
--- a/lib/CodeGen/ExpandISelPseudos.cpp
+++ b/lib/CodeGen/ExpandISelPseudos.cpp
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Expand Psuedo-instructions produced by ISel. These are usually to allow
+// Expand Pseudo-instructions produced by ISel. These are usually to allow
 // the expansion to contain control flow, such as a conditional move
 // implemented with a conditional branch and a phi, or an atomic operation
 // implemented with a loop.
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index db53b04..8b2c981 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -27,7 +27,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/STLExtras.h"
@@ -146,10 +145,6 @@ namespace {
         : BBI(b), Kind(k), NeedSubsumption(s), NumDups(d), NumDups2(d2) {}
     };
 
-    /// Roots - Basic blocks that do not have successors. These are the starting
-    /// points of Graph traversal.
-    std::vector<MachineBasicBlock*> Roots;
-
     /// BBAnalysis - Results of if-conversion feasibility analysis indexed by
     /// basic block number.
     std::vector<BBInfo> BBAnalysis;
@@ -270,7 +265,7 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
   if (!TII) return false;
 
   // Tail merge tend to expose more if-conversion opportunities.
-  BranchFolder BF(true);
+  BranchFolder BF(true, false);
   bool BFChange = BF.OptimizeFunction(MF, TII,
                                    MF.getTarget().getRegisterInfo(),
                                    getAnalysisIfAvailable<MachineModuleInfo>());
@@ -287,11 +282,6 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
   MF.RenumberBlocks();
   BBAnalysis.resize(MF.getNumBlockIDs());
 
-  // Look for root nodes, i.e. blocks without successors.
-  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
-    if (I->succ_empty())
-      Roots.push_back(I);
-
   std::vector<IfcvtToken*> Tokens;
   MadeChange = false;
   unsigned NumIfCvts = NumSimple + NumSimpleFalse + NumTriangle +
@@ -406,11 +396,10 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
   }
 
   Tokens.clear();
-  Roots.clear();
   BBAnalysis.clear();
 
   if (MadeChange && IfCvtBranchFold) {
-    BranchFolder BF(false);
+    BranchFolder BF(false, false);
     BF.OptimizeFunction(MF, TII,
                         MF.getTarget().getRegisterInfo(),
                         getAnalysisIfAvailable<MachineModuleInfo>());
@@ -924,13 +913,9 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB,
 /// candidates.
 void IfConverter::AnalyzeBlocks(MachineFunction &MF,
                                 std::vector<IfcvtToken*> &Tokens) {
-  std::set<MachineBasicBlock*> Visited;
-  for (unsigned i = 0, e = Roots.size(); i != e; ++i) {
-    for (idf_ext_iterator<MachineBasicBlock*> I=idf_ext_begin(Roots[i],Visited),
-           E = idf_ext_end(Roots[i], Visited); I != E; ++I) {
-      MachineBasicBlock *BB = *I;
-      AnalyzeBlock(BB, Tokens);
-    }
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+    MachineBasicBlock *BB = I;
+    AnalyzeBlock(BB, Tokens);
   }
 
   // Sort to favor more complex ifcvt scheme.
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index 86f4cfc..19ae333 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -16,6 +16,7 @@
 #include "Spiller.h"
 #include "LiveRangeEdit.h"
 #include "VirtRegMap.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/LiveStackAnalysis.h"
@@ -31,6 +32,18 @@
 
 using namespace llvm;
 
+STATISTIC(NumSpilledRanges,   "Number of spilled live ranges");
+STATISTIC(NumSnippets,        "Number of snippets included in spills");
+STATISTIC(NumSpills,          "Number of spills inserted");
+STATISTIC(NumReloads,         "Number of reloads inserted");
+STATISTIC(NumFolded,          "Number of folded stack accesses");
+STATISTIC(NumFoldedLoads,     "Number of folded loads");
+STATISTIC(NumRemats,          "Number of rematerialized defs for spilling");
+STATISTIC(NumOmitReloadSpill, "Number of omitted spills after reloads");
+STATISTIC(NumHoistLocal,      "Number of locally hoisted spills");
+STATISTIC(NumHoistGlobal,     "Number of globally hoisted spills");
+STATISTIC(NumRedundantSpills, "Number of redundant spills identified");
+
 namespace {
 class InlineSpiller : public Spiller {
   MachineFunctionPass &Pass;
@@ -134,9 +147,10 @@ private:
   bool foldMemoryOperand(MachineBasicBlock::iterator MI,
                          const SmallVectorImpl<unsigned> &Ops,
                          MachineInstr *LoadMI = 0);
-  void insertReload(LiveInterval &NewLI, MachineBasicBlock::iterator MI);
+  void insertReload(LiveInterval &NewLI, SlotIndex,
+                    MachineBasicBlock::iterator MI);
   void insertSpill(LiveInterval &NewLI, const LiveInterval &OldLI,
-                   MachineBasicBlock::iterator MI);
+                   SlotIndex, MachineBasicBlock::iterator MI);
 
   void spillAroundUses(unsigned Reg);
   void spillAll();
@@ -246,10 +260,11 @@ void InlineSpiller::collectRegsToSpill() {
     if (!isSnippet(SnipLI))
       continue;
     SnippetCopies.insert(MI);
-    if (!isRegToSpill(SnipReg))
-      RegsToSpill.push_back(SnipReg);
-
+    if (isRegToSpill(SnipReg))
+      continue;
+    RegsToSpill.push_back(SnipReg);
     DEBUG(dbgs() << "\talso spill snippet " << SnipLI << '\n');
+    ++NumSnippets;
   }
 }
 
@@ -419,8 +434,10 @@ void InlineSpiller::analyzeSiblingValues() {
       }
       if (!DefMI && !VNI->isPHIDef())
         DefMI = LIS.getInstructionFromIndex(VNI->def);
-      if (DefMI)
-        Edit->checkRematerializable(VNI, DefMI, TII, AA);
+      if (DefMI && Edit->checkRematerializable(VNI, DefMI, TII, AA)) {
+        DEBUG(dbgs() << "Value " << PrintReg(Reg) << ':' << VNI->id << '@'
+                     << VNI->def << " may remat from " << *DefMI);
+      }
     }
   }
 }
@@ -431,7 +448,7 @@ bool InlineSpiller::hoistSpill(LiveInterval &SpillLI, MachineInstr *CopyMI) {
   SlotIndex Idx = LIS.getInstructionIndex(CopyMI);
   VNInfo *VNI = SpillLI.getVNInfoAt(Idx.getDefIndex());
   assert(VNI && VNI->def == Idx.getDefIndex() && "Not defined by copy");
-  SibValueMap::const_iterator I = SibValues.find(VNI);
+  SibValueMap::iterator I = SibValues.find(VNI);
   if (I == SibValues.end())
     return false;
 
@@ -441,6 +458,20 @@ bool InlineSpiller::hoistSpill(LiveInterval &SpillLI, MachineInstr *CopyMI) {
   if (!SVI.AllDefsAreReloads && SVI.SpillVNI == VNI)
     return false;
 
+  // SpillReg may have been deleted by remat and DCE.
+  if (!LIS.hasInterval(SVI.SpillReg)) {
+    DEBUG(dbgs() << "Stale interval: " << PrintReg(SVI.SpillReg) << '\n');
+    SibValues.erase(I);
+    return false;
+  }
+
+  LiveInterval &SibLI = LIS.getInterval(SVI.SpillReg);
+  if (!SibLI.containsValue(SVI.SpillVNI)) {
+    DEBUG(dbgs() << "Stale value: " << PrintReg(SVI.SpillReg) << '\n');
+    SibValues.erase(I);
+    return false;
+  }
+
   // Conservatively extend the stack slot range to the range of the original
   // value. We may be able to do better with stack slot coloring by being more
   // careful here.
@@ -452,19 +483,22 @@ bool InlineSpiller::hoistSpill(LiveInterval &SpillLI, MachineInstr *CopyMI) {
                << *StackInt << '\n');
 
   // Already spilled everywhere.
-  if (SVI.AllDefsAreReloads)
+  if (SVI.AllDefsAreReloads) {
+    ++NumOmitReloadSpill;
     return true;
-
+  }
   // We are going to spill SVI.SpillVNI immediately after its def, so clear out
   // any later spills of the same value.
-  eliminateRedundantSpills(LIS.getInterval(SVI.SpillReg), SVI.SpillVNI);
+  eliminateRedundantSpills(SibLI, SVI.SpillVNI);
 
   MachineBasicBlock *MBB = LIS.getMBBFromIndex(SVI.SpillVNI->def);
   MachineBasicBlock::iterator MII;
   if (SVI.SpillVNI->isPHIDef())
     MII = MBB->SkipPHIsAndLabels(MBB->begin());
   else {
-    MII = LIS.getInstructionFromIndex(SVI.SpillVNI->def);
+    MachineInstr *DefMI = LIS.getInstructionFromIndex(SVI.SpillVNI->def);
+    assert(DefMI && "Defining instruction disappeared");
+    MII = DefMI;
     ++MII;
   }
   // Insert spill without kill flag immediately after def.
@@ -474,6 +508,11 @@ bool InlineSpiller::hoistSpill(LiveInterval &SpillLI, MachineInstr *CopyMI) {
   LIS.InsertMachineInstrInMaps(MII);
   VRM.addSpillSlotUse(StackSlot, MII);
   DEBUG(dbgs() << "\thoisted: " << SVI.SpillVNI->def << '\t' << *MII);
+
+  if (MBB == CopyMI->getParent())
+    ++NumHoistLocal;
+  else
+    ++NumHoistGlobal;
   return true;
 }
 
@@ -489,8 +528,8 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
     LiveInterval *LI;
     tie(LI, VNI) = WorkList.pop_back_val();
     unsigned Reg = LI->reg;
-    DEBUG(dbgs() << "Checking redundant spills for " << PrintReg(Reg) << ':'
-                 << VNI->id << '@' << VNI->def << '\n');
+    DEBUG(dbgs() << "Checking redundant spills for "
+                 << VNI->id << '@' << VNI->def << " in " << *LI << '\n');
 
     // Regs to spill are taken care of.
     if (isRegToSpill(Reg))
@@ -528,6 +567,7 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
         // eliminateDeadDefs won't normally remove stores, so switch opcode.
         MI->setDesc(TII.get(TargetOpcode::KILL));
         DeadDefs.push_back(MI);
+        ++NumRedundantSpills;
       }
     }
   } while (!WorkList.empty());
@@ -623,6 +663,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
   if (RM.OrigMI->getDesc().canFoldAsLoad() &&
       foldMemoryOperand(MI, Ops, RM.OrigMI)) {
     Edit->markRematerialized(RM.ParentVNI);
+    ++NumFoldedLoads;
     return true;
   }
 
@@ -649,6 +690,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
   VNInfo *DefVNI = NewLI.getNextValue(DefIdx, 0, LIS.getVNInfoAllocator());
   NewLI.addRange(LiveRange(DefIdx, UseIdx.getDefIndex(), DefVNI));
   DEBUG(dbgs() << "\tinterval: " << NewLI << '\n');
+  ++NumRemats;
   return true;
 }
 
@@ -775,14 +817,15 @@ bool InlineSpiller::foldMemoryOperand(MachineBasicBlock::iterator MI,
     VRM.addSpillSlotUse(StackSlot, FoldMI);
   MI->eraseFromParent();
   DEBUG(dbgs() << "\tfolded: " << *FoldMI);
+  ++NumFolded;
   return true;
 }
 
 /// insertReload - Insert a reload of NewLI.reg before MI.
 void InlineSpiller::insertReload(LiveInterval &NewLI,
+                                 SlotIndex Idx,
                                  MachineBasicBlock::iterator MI) {
   MachineBasicBlock &MBB = *MI->getParent();
-  SlotIndex Idx = LIS.getInstructionIndex(MI).getDefIndex();
   TII.loadRegFromStackSlot(MBB, MI, NewLI.reg, StackSlot,
                            MRI.getRegClass(NewLI.reg), &TRI);
   --MI; // Point to load instruction.
@@ -792,19 +835,13 @@ void InlineSpiller::insertReload(LiveInterval &NewLI,
   VNInfo *LoadVNI = NewLI.getNextValue(LoadIdx, 0,
                                        LIS.getVNInfoAllocator());
   NewLI.addRange(LiveRange(LoadIdx, Idx, LoadVNI));
+  ++NumReloads;
 }
 
 /// insertSpill - Insert a spill of NewLI.reg after MI.
 void InlineSpiller::insertSpill(LiveInterval &NewLI, const LiveInterval &OldLI,
-                                MachineBasicBlock::iterator MI) {
+                                SlotIndex Idx, MachineBasicBlock::iterator MI) {
   MachineBasicBlock &MBB = *MI->getParent();
-
-  // Get the defined value. It could be an early clobber so keep the def index.
-  SlotIndex Idx = LIS.getInstructionIndex(MI).getDefIndex();
-  VNInfo *VNI = OldLI.getVNInfoAt(Idx);
-  assert(VNI && VNI->def.getDefIndex() == Idx && "Inconsistent VNInfo");
-  Idx = VNI->def;
-
   TII.storeRegToStackSlot(MBB, ++MI, NewLI.reg, true, StackSlot,
                           MRI.getRegClass(NewLI.reg), &TRI);
   --MI; // Point to store instruction.
@@ -813,10 +850,12 @@ void InlineSpiller::insertSpill(LiveInterval &NewLI, const LiveInterval &OldLI,
   DEBUG(dbgs() << "\tspilled: " << StoreIdx << '\t' << *MI);
   VNInfo *StoreVNI = NewLI.getNextValue(Idx, 0, LIS.getVNInfoAllocator());
   NewLI.addRange(LiveRange(Idx, StoreIdx, StoreVNI));
+  ++NumSpills;
 }
 
 /// spillAroundUses - insert spill code around each use of Reg.
 void InlineSpiller::spillAroundUses(unsigned Reg) {
+  DEBUG(dbgs() << "spillAroundUses " << PrintReg(Reg) << '\n');
   LiveInterval &OldLI = LIS.getInterval(Reg);
 
   // Iterate over instructions using Reg.
@@ -854,9 +893,22 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
     SmallVector<unsigned, 8> Ops;
     tie(Reads, Writes) = MI->readsWritesVirtualRegister(Reg, &Ops);
 
+    // Find the slot index where this instruction reads and writes OldLI.
+    // This is usually the def slot, except for tied early clobbers.
+    SlotIndex Idx = LIS.getInstructionIndex(MI).getDefIndex();
+    if (VNInfo *VNI = OldLI.getVNInfoAt(Idx.getUseIndex()))
+      if (SlotIndex::isSameInstr(Idx, VNI->def))
+        Idx = VNI->def;
+
     // Check for a sibling copy.
     unsigned SibReg = isFullCopyOf(MI, Reg);
     if (SibReg && isSibling(SibReg)) {
+      // This may actually be a copy between snippets.
+      if (isRegToSpill(SibReg)) {
+        DEBUG(dbgs() << "Found new snippet copy: " << *MI);
+        SnippetCopies.insert(MI);
+        continue;
+      }
       if (Writes) {
         // Hoist the spill of a sib-reg copy.
         if (hoistSpill(OldLI, MI)) {
@@ -867,7 +919,6 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
         }
       } else {
         // This is a reload for a sib-reg copy. Drop spills downstream.
-        SlotIndex Idx = LIS.getInstructionIndex(MI).getDefIndex();
         LiveInterval &SibLI = LIS.getInterval(SibReg);
         eliminateRedundantSpills(SibLI, SibLI.getVNInfoAt(Idx));
         // The COPY will fold to a reload below.
@@ -884,7 +935,7 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
     NewLI.markNotSpillable();
 
     if (Reads)
-      insertReload(NewLI, MI);
+      insertReload(NewLI, Idx, MI);
 
     // Rewrite instruction operands.
     bool hasLiveDef = false;
@@ -899,10 +950,11 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
           hasLiveDef = true;
       }
     }
+    DEBUG(dbgs() << "\trewrite: " << Idx << '\t' << *MI);
 
     // FIXME: Use a second vreg if instruction has no tied ops.
     if (Writes && hasLiveDef)
-      insertSpill(NewLI, OldLI, MI);
+      insertSpill(NewLI, OldLI, Idx, MI);
 
     DEBUG(dbgs() << "\tinterval: " << NewLI << '\n');
   }
@@ -938,13 +990,15 @@ void InlineSpiller::spillAll() {
   }
 
   // Finally delete the SnippetCopies.
-  for (MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(Edit->getReg());
-       MachineInstr *MI = RI.skipInstruction();) {
-    assert(SnippetCopies.count(MI) && "Remaining use wasn't a snippet copy");
-    // FIXME: Do this with a LiveRangeEdit callback.
-    VRM.RemoveMachineInstrFromMaps(MI);
-    LIS.RemoveMachineInstrFromMaps(MI);
-    MI->eraseFromParent();
+  for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) {
+    for (MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(RegsToSpill[i]);
+         MachineInstr *MI = RI.skipInstruction();) {
+      assert(SnippetCopies.count(MI) && "Remaining use wasn't a snippet copy");
+      // FIXME: Do this with a LiveRangeEdit callback.
+      VRM.RemoveMachineInstrFromMaps(MI);
+      LIS.RemoveMachineInstrFromMaps(MI);
+      MI->eraseFromParent();
+    }
   }
 
   // Delete all spilled registers.
@@ -953,6 +1007,7 @@ void InlineSpiller::spillAll() {
 }
 
 void InlineSpiller::spill(LiveRangeEdit &edit) {
+  ++NumSpilledRanges;
   Edit = &edit;
   assert(!TargetRegisterInfo::isStackSlot(edit.getReg())
          && "Trying to spill a stack slot.");
diff --git a/lib/CodeGen/InterferenceCache.cpp b/lib/CodeGen/InterferenceCache.cpp
index 0aff128..b1014a9 100644
--- a/lib/CodeGen/InterferenceCache.cpp
+++ b/lib/CodeGen/InterferenceCache.cpp
@@ -26,7 +26,7 @@ void InterferenceCache::init(MachineFunction *mf,
   TRI = tri;
   PhysRegEntries.assign(TRI->getNumRegs(), 0);
   for (unsigned i = 0; i != CacheEntries; ++i)
-    Entries[i].clear(indexes);
+    Entries[i].clear(mf, indexes);
 }
 
 InterferenceCache::Entry *InterferenceCache::get(unsigned PhysReg) {
@@ -91,10 +91,6 @@ bool InterferenceCache::Entry::valid(LiveIntervalUnion *LIUArray,
 }
 
 void InterferenceCache::Entry::update(unsigned MBBNum) {
-  BlockInterference *BI = &Blocks[MBBNum];
-  BI->Tag = Tag;
-  BI->First = BI->Last = SlotIndex();
-
   SlotIndex Start, Stop;
   tie(Start, Stop) = Indexes->getMBBRange(MBBNum);
 
@@ -109,23 +105,39 @@ void InterferenceCache::Entry::update(unsigned MBBNum) {
     PrevPos = Start;
   }
 
-  // Check for first interference.
-  for (unsigned i = 0, e = Iters.size(); i != e; ++i) {
-    Iter &I = Iters[i];
-    if (!I.valid())
-      continue;
-    SlotIndex StartI = I.start();
-    if (StartI >= Stop)
-      continue;
-    if (!BI->First.isValid() || StartI < BI->First)
-      BI->First = StartI;
-  }
+  MachineFunction::const_iterator MFI = MF->getBlockNumbered(MBBNum);
+  BlockInterference *BI = &Blocks[MBBNum];
+  for (;;) {
+    BI->Tag = Tag;
+    BI->First = BI->Last = SlotIndex();
+
+    // Check for first interference.
+    for (unsigned i = 0, e = Iters.size(); i != e; ++i) {
+      Iter &I = Iters[i];
+      if (!I.valid())
+        continue;
+      SlotIndex StartI = I.start();
+      if (StartI >= Stop)
+        continue;
+      if (!BI->First.isValid() || StartI < BI->First)
+        BI->First = StartI;
+    }
 
-  // No interference in block.
-  if (!BI->First.isValid())
-    return;
+    PrevPos = Stop;
+    if (BI->First.isValid())
+      break;
+
+    // No interference in this block? Go ahead and precompute the next block.
+    if (++MFI == MF->end())
+      return;
+    MBBNum = MFI->getNumber();
+    BI = &Blocks[MBBNum];
+    if (BI->Tag == Tag)
+      return;
+    tie(Start, Stop) = Indexes->getMBBRange(MBBNum);
+  }
 
-  // Check for last interference.
+  // Check for last interference in block.
   for (unsigned i = 0, e = Iters.size(); i != e; ++i) {
     Iter &I = Iters[i];
     if (!I.valid() || I.start() >= Stop)
@@ -140,5 +152,4 @@ void InterferenceCache::Entry::update(unsigned MBBNum) {
     if (Backup)
       ++I;
   }
-  PrevPos = Stop;
 }
diff --git a/lib/CodeGen/InterferenceCache.h b/lib/CodeGen/InterferenceCache.h
index 2e613ae..6c36fa4 100644
--- a/lib/CodeGen/InterferenceCache.h
+++ b/lib/CodeGen/InterferenceCache.h
@@ -43,6 +43,9 @@ class InterferenceCache {
     /// change.
     unsigned Tag;
 
+    /// MF - The current function.
+    MachineFunction *MF;
+
     /// Indexes - Mapping block numbers to SlotIndex ranges.
     SlotIndexes *Indexes;
 
@@ -67,8 +70,9 @@ class InterferenceCache {
   public:
     Entry() : PhysReg(0), Tag(0), Indexes(0) {}
 
-    void clear(SlotIndexes *indexes) {
+    void clear(MachineFunction *mf, SlotIndexes *indexes) {
       PhysReg = 0;
+      MF = mf;
       Indexes = indexes;
     }
 
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index 8c2794a..b98fbed 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -13,6 +13,7 @@
 
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/PassManager.h"
+#include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/Verifier.h"
 #include "llvm/Assembly/PrintModulePass.h"
 #include "llvm/CodeGen/AsmPrinter.h"
@@ -32,7 +33,6 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/StandardPasses.h"
 using namespace llvm;
 
 namespace llvm {
@@ -149,6 +149,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
     MCStreamer *S = getTarget().createAsmStreamer(*Context, Out,
                                                   getVerboseAsm(),
                                                   hasMCUseLoc(),
+                                                  hasMCUseCFI(),
                                                   InstPrinter,
                                                   MCE, TAB,
                                                   ShowMCInst);
@@ -291,13 +292,21 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM,
   // Standard LLVM-Level Passes.
 
   // Basic AliasAnalysis support.
-  createStandardAliasAnalysisPasses(&PM);
+  // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
+  // BasicAliasAnalysis wins if they disagree. This is intended to help
+  // support "obvious" type-punning idioms.
+  PM.add(createTypeBasedAliasAnalysisPass());
+  PM.add(createBasicAliasAnalysisPass());
 
   // Before running any passes, run the verifier to determine if the input
   // coming from the front-end and/or optimizer is valid.
   if (!DisableVerify)
     PM.add(createVerifierPass());
 
+  // Simplify ObjC ARC code. This is done late because it makes re-optimization
+  // difficult.
+  PM.add(createObjCARCContractPass());
+
   // Run loop strength reduction before anything else.
   if (OptLevel != CodeGenOpt::None && !DisableLSR) {
     PM.add(createLoopStrengthReducePass(getTargetLowering()));
@@ -323,8 +332,8 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM,
     PM.add(createSjLjEHPass(getTargetLowering()));
     // FALLTHROUGH
   case ExceptionHandling::DwarfCFI:
-  case ExceptionHandling::DwarfTable:
   case ExceptionHandling::ARM:
+  case ExceptionHandling::Win64:
     PM.add(createDwarfEHPass(this));
     break;
   case ExceptionHandling::None:
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp
index 333d15f..292928f 100644
--- a/lib/CodeGen/LiveDebugVariables.cpp
+++ b/lib/CodeGen/LiveDebugVariables.cpp
@@ -101,9 +101,13 @@ class UserValue {
   void insertDebugValue(MachineBasicBlock *MBB, SlotIndex Idx, unsigned LocNo,
                         LiveIntervals &LIS, const TargetInstrInfo &TII);
 
+  /// splitLocation - Replace OldLocNo ranges with NewRegs ranges where NewRegs
+  /// is live. Returns true if any changes were made.
+  bool splitLocation(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs);
+
 public:
   /// UserValue - Create a new UserValue.
-  UserValue(const MDNode *var, unsigned o, DebugLoc L, 
+  UserValue(const MDNode *var, unsigned o, DebugLoc L,
             LocMap::Allocator &alloc)
     : variable(var), offset(o), dl(L), leader(this), next(0), locInts(alloc)
   {}
@@ -215,6 +219,10 @@ public:
   void renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx,
                       const TargetRegisterInfo *TRI);
 
+  /// splitRegister - Replace OldReg ranges with NewRegs ranges where NewRegs is
+  /// live. Returns true if any changes were made.
+  bool splitRegister(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs);
+
   /// rewriteLocations - Rewrite virtual register locations according to the
   /// provided virtual register map.
   void rewriteLocations(VirtRegMap &VRM, const TargetRegisterInfo &TRI);
@@ -228,7 +236,7 @@ public:
   /// Only first one needs DebugLoc to identify variable's lexical scope
   /// in source file.
   DebugLoc findDebugLoc();
-  void print(raw_ostream&, const TargetRegisterInfo*);
+  void print(raw_ostream&, const TargetMachine*);
 };
 } // namespace
 
@@ -290,9 +298,12 @@ public:
   /// mapVirtReg - Map virtual register to an equivalence class.
   void mapVirtReg(unsigned VirtReg, UserValue *EC);
 
-  /// renameRegister - Replace all references to OldReg wiht NewReg:SubIdx.
+  /// renameRegister - Replace all references to OldReg with NewReg:SubIdx.
   void renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx);
 
+  /// splitRegister -  Replace all references to OldReg with NewRegs.
+  void splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs);
+
   /// emitDebugVariables - Recreate DBG_VALUE instruction from data structures.
   void emitDebugValues(VirtRegMap *VRM);
 
@@ -300,7 +311,7 @@ public:
 };
 } // namespace
 
-void UserValue::print(raw_ostream &OS, const TargetRegisterInfo *TRI) {
+void UserValue::print(raw_ostream &OS, const TargetMachine *TM) {
   if (const MDString *MDS = dyn_cast<MDString>(variable->getOperand(2)))
     OS << "!\"" << MDS->getString() << "\"\t";
   if (offset)
@@ -312,15 +323,17 @@ void UserValue::print(raw_ostream &OS, const TargetRegisterInfo *TRI) {
     else
       OS << I.value();
   }
-  for (unsigned i = 0, e = locations.size(); i != e; ++i)
-    OS << " Loc" << i << '=' << locations[i];
+  for (unsigned i = 0, e = locations.size(); i != e; ++i) {
+    OS << " Loc" << i << '=';
+    locations[i].print(OS, TM);
+  }
   OS << '\n';
 }
 
 void LDVImpl::print(raw_ostream &OS) {
   OS << "********** DEBUG VARIABLES **********\n";
   for (unsigned i = 0, e = userValues.size(); i != e; ++i)
-    userValues[i]->print(OS, TRI);
+    userValues[i]->print(OS, &MF->getTarget());
 }
 
 void UserValue::coalesceLocation(unsigned LocNo) {
@@ -677,6 +690,143 @@ renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx) {
     static_cast<LDVImpl*>(pImpl)->renameRegister(OldReg, NewReg, SubIdx);
 }
 
+//===----------------------------------------------------------------------===//
+//                           Live Range Splitting
+//===----------------------------------------------------------------------===//
+
+bool
+UserValue::splitLocation(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs) {
+  DEBUG({
+    dbgs() << "Splitting Loc" << OldLocNo << '\t';
+    print(dbgs(), 0);
+  });
+  bool DidChange = false;
+  LocMap::iterator LocMapI;
+  LocMapI.setMap(locInts);
+  for (unsigned i = 0; i != NewRegs.size(); ++i) {
+    LiveInterval *LI = NewRegs[i];
+    if (LI->empty())
+      continue;
+
+    // Don't allocate the new LocNo until it is needed.
+    unsigned NewLocNo = ~0u;
+
+    // Iterate over the overlaps between locInts and LI.
+    LocMapI.find(LI->beginIndex());
+    if (!LocMapI.valid())
+      continue;
+    LiveInterval::iterator LII = LI->advanceTo(LI->begin(), LocMapI.start());
+    LiveInterval::iterator LIE = LI->end();
+    while (LocMapI.valid() && LII != LIE) {
+      // At this point, we know that LocMapI.stop() > LII->start.
+      LII = LI->advanceTo(LII, LocMapI.start());
+      if (LII == LIE)
+        break;
+
+      // Now LII->end > LocMapI.start(). Do we have an overlap?
+      if (LocMapI.value() == OldLocNo && LII->start < LocMapI.stop()) {
+        // Overlapping correct location. Allocate NewLocNo now.
+        if (NewLocNo == ~0u) {
+          MachineOperand MO = MachineOperand::CreateReg(LI->reg, false);
+          MO.setSubReg(locations[OldLocNo].getSubReg());
+          NewLocNo = getLocationNo(MO);
+          DidChange = true;
+        }
+
+        SlotIndex LStart = LocMapI.start();
+        SlotIndex LStop  = LocMapI.stop();
+
+        // Trim LocMapI down to the LII overlap.
+        if (LStart < LII->start)
+          LocMapI.setStartUnchecked(LII->start);
+        if (LStop > LII->end)
+          LocMapI.setStopUnchecked(LII->end);
+
+        // Change the value in the overlap. This may trigger coalescing.
+        LocMapI.setValue(NewLocNo);
+
+        // Re-insert any removed OldLocNo ranges.
+        if (LStart < LocMapI.start()) {
+          LocMapI.insert(LStart, LocMapI.start(), OldLocNo);
+          ++LocMapI;
+          assert(LocMapI.valid() && "Unexpected coalescing");
+        }
+        if (LStop > LocMapI.stop()) {
+          ++LocMapI;
+          LocMapI.insert(LII->end, LStop, OldLocNo);
+          --LocMapI;
+        }
+      }
+
+      // Advance to the next overlap.
+      if (LII->end < LocMapI.stop()) {
+        if (++LII == LIE)
+          break;
+        LocMapI.advanceTo(LII->start);
+      } else {
+        ++LocMapI;
+        if (!LocMapI.valid())
+          break;
+        LII = LI->advanceTo(LII, LocMapI.start());
+      }
+    }
+  }
+
+  // Finally, remove any remaining OldLocNo intervals and OldLocNo itself.
+  locations.erase(locations.begin() + OldLocNo);
+  LocMapI.goToBegin();
+  while (LocMapI.valid()) {
+    unsigned v = LocMapI.value();
+    if (v == OldLocNo) {
+      DEBUG(dbgs() << "Erasing [" << LocMapI.start() << ';'
+                   << LocMapI.stop() << ")\n");
+      LocMapI.erase();
+    } else {
+      if (v > OldLocNo)
+        LocMapI.setValueUnchecked(v-1);
+      ++LocMapI;
+    }
+  }
+
+  DEBUG({dbgs() << "Split result: \t"; print(dbgs(), 0);});
+  return DidChange;
+}
+
+bool
+UserValue::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
+  bool DidChange = false;
+  // Split locations referring to OldReg. Iterate backwards so splitLocation can
+  // safely erase unuused locations.
+  for (unsigned i = locations.size(); i ; --i) {
+    unsigned LocNo = i-1;
+    const MachineOperand *Loc = &locations[LocNo];
+    if (!Loc->isReg() || Loc->getReg() != OldReg)
+      continue;
+    DidChange |= splitLocation(LocNo, NewRegs);
+  }
+  return DidChange;
+}
+
+void LDVImpl::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
+  bool DidChange = false;
+  for (UserValue *UV = lookupVirtReg(OldReg); UV; UV = UV->getNext())
+    DidChange |= UV->splitRegister(OldReg, NewRegs);
+
+  if (!DidChange)
+    return;
+
+  // Map all of the new virtual registers.
+  UserValue *UV = lookupVirtReg(OldReg);
+  for (unsigned i = 0; i != NewRegs.size(); ++i)
+    mapVirtReg(NewRegs[i]->reg, UV);
+}
+
+void LiveDebugVariables::
+splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
+  if (pImpl)
+    static_cast<LDVImpl*>(pImpl)->splitRegister(OldReg, NewRegs);
+}
+
 void
 UserValue::rewriteLocations(VirtRegMap &VRM, const TargetRegisterInfo &TRI) {
   // Iterate over locations in reverse makes it easier to handle coalescing.
@@ -690,6 +840,9 @@ UserValue::rewriteLocations(VirtRegMap &VRM, const TargetRegisterInfo &TRI) {
     unsigned VirtReg = Loc.getReg();
     if (VRM.isAssignedReg(VirtReg) &&
         TargetRegisterInfo::isPhysicalRegister(VRM.getPhys(VirtReg))) {
+      // This can create a %noreg operand in rare cases when the sub-register
+      // index is no longer available. That means the user value is in a
+      // non-existent sub-register, and %noreg is exactly what we want.
       Loc.substPhysReg(VRM.getPhys(VirtReg), TRI);
     } else if (VRM.getStackSlot(VirtReg) != VirtRegMap::NO_STACK_SLOT &&
                VRM.isSpillSlotUsed(VRM.getStackSlot(VirtReg))) {
@@ -701,7 +854,6 @@ UserValue::rewriteLocations(VirtRegMap &VRM, const TargetRegisterInfo &TRI) {
     }
     coalesceLocation(LocNo);
   }
-  DEBUG(print(dbgs(), &TRI));
 }
 
 /// findInsertLocation - Find an iterator for inserting a DBG_VALUE
@@ -793,6 +945,7 @@ void LDVImpl::emitDebugValues(VirtRegMap *VRM) {
   DEBUG(dbgs() << "********** EMITTING LIVE DEBUG VARIABLES **********\n");
   const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
   for (unsigned i = 0, e = userValues.size(); i != e; ++i) {
+    DEBUG(userValues[i]->print(dbgs(), &MF->getTarget()));
     userValues[i]->rewriteLocations(*VRM, *TRI);
     userValues[i]->emitDebugValues(VRM, *LIS, *TII);
   }
diff --git a/lib/CodeGen/LiveDebugVariables.h b/lib/CodeGen/LiveDebugVariables.h
index a6e40a1..3ce3c39 100644
--- a/lib/CodeGen/LiveDebugVariables.h
+++ b/lib/CodeGen/LiveDebugVariables.h
@@ -21,10 +21,12 @@
 #ifndef LLVM_CODEGEN_LIVEDEBUGVARIABLES_H
 #define LLVM_CODEGEN_LIVEDEBUGVARIABLES_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 
 namespace llvm {
 
+class LiveInterval;
 class VirtRegMap;
 
 class LiveDebugVariables : public MachineFunctionPass {
@@ -42,6 +44,11 @@ public:
   ///               register.
   void renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx);
 
+  /// splitRegister - Move any user variables in OldReg to the live ranges in
+  /// NewRegs where they are live. Mark the values as unavailable where no new
+  /// register is live.
+  void splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs);
+
   /// emitDebugValues - Emit new DBG_VALUE instructions reflecting the changes
   /// that happened during register allocation.
   /// @param VRM Rename virtual registers according to map.
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index c77ae1b..9257191 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -578,13 +578,6 @@ void LiveIntervals::handleRegisterDef(MachineBasicBlock *MBB,
       CopyMI = MI;
     handlePhysicalRegisterDef(MBB, MI, MIIdx, MO,
                               getOrCreateInterval(MO.getReg()), CopyMI);
-    // Def of a register also defines its sub-registers.
-    for (const unsigned* AS = tri_->getSubRegisters(MO.getReg()); *AS; ++AS)
-      // If MI also modifies the sub-register explicitly, avoid processing it
-      // more than once. Do not pass in TRI here so it checks for exact match.
-      if (!MI->definesRegister(*AS))
-        handlePhysicalRegisterDef(MBB, MI, MIIdx, MO,
-                                  getOrCreateInterval(*AS), 0);
   }
 }
 
@@ -645,7 +638,7 @@ void LiveIntervals::handleLiveInRegister(MachineBasicBlock *MBB,
       end = MIIdx.getStoreIndex();
     } else {
       DEBUG(dbgs() << " live through");
-      end = baseIndex;
+      end = getMBBEndIdx(MBB);
     }
   }
 
@@ -1514,7 +1507,7 @@ rewriteInstructionsForSpills(const LiveInterval &li, bool TrySplit,
         // ...
         // def = ...
         //     = use
-        // It's better to start a new interval to avoid artifically
+        // It's better to start a new interval to avoid artificially
         // extend the new interval.
         if (MI->readsWritesVirtualRegister(li.reg) ==
             std::make_pair(false,true)) {
diff --git a/lib/CodeGen/LiveIntervalUnion.cpp b/lib/CodeGen/LiveIntervalUnion.cpp
index 205f28a..b67f966 100644
--- a/lib/CodeGen/LiveIntervalUnion.cpp
+++ b/lib/CodeGen/LiveIntervalUnion.cpp
@@ -35,12 +35,20 @@ void LiveIntervalUnion::unify(LiveInterval &VirtReg) {
   LiveInterval::iterator RegEnd = VirtReg.end();
   SegmentIter SegPos = Segments.find(RegPos->start);
 
-  for (;;) {
+  while (SegPos.valid()) {
     SegPos.insert(RegPos->start, RegPos->end, &VirtReg);
     if (++RegPos == RegEnd)
       return;
     SegPos.advanceTo(RegPos->start);
   }
+
+  // We have reached the end of Segments, so it is no longer necessary to search
+  // for the insertion position.
+  // It is faster to insert the end first.
+  --RegEnd;
+  SegPos.insert(RegEnd->start, RegEnd->end, &VirtReg);
+  for (; RegPos != RegEnd; ++RegPos, ++SegPos)
+    SegPos.insert(RegPos->start, RegPos->end, &VirtReg);
 }
 
 // Remove a live virtual register's segments from this union.
@@ -168,6 +176,7 @@ LiveIntervalUnion::Query::firstInterference() {
     return FirstInterference;
   CheckedFirstInterference = true;
   InterferenceResult &IR = FirstInterference;
+  IR.LiveUnionI.setMap(LiveUnion->getMap());
 
   // Quickly skip interference check for empty sets.
   if (VirtReg->empty() || LiveUnion->empty()) {
@@ -176,10 +185,10 @@ LiveIntervalUnion::Query::firstInterference() {
     // VirtReg starts first, perform double binary search.
     IR.VirtRegI = VirtReg->find(LiveUnion->startIndex());
     if (IR.VirtRegI != VirtReg->end())
-      IR.LiveUnionI = LiveUnion->find(IR.VirtRegI->start);
+      IR.LiveUnionI.find(IR.VirtRegI->start);
   } else {
     // LiveUnion starts first, perform double binary search.
-    IR.LiveUnionI = LiveUnion->find(VirtReg->beginIndex());
+    IR.LiveUnionI.find(VirtReg->beginIndex());
     if (IR.LiveUnionI.valid())
       IR.VirtRegI = VirtReg->find(IR.LiveUnionI.start());
     else
@@ -235,7 +244,7 @@ bool LiveIntervalUnion::Query::isSeenInterference(LiveInterval *VirtReg) const {
 //
 // For comments on how to speed it up, see Query::findIntersection().
 unsigned LiveIntervalUnion::Query::
-collectInterferingVRegs(unsigned MaxInterferingRegs) {
+collectInterferingVRegs(unsigned MaxInterferingRegs, float MaxWeight) {
   InterferenceResult IR = firstInterference();
   LiveInterval::iterator VirtRegEnd = VirtReg->end();
   LiveInterval *RecentInterferingVReg = NULL;
@@ -277,6 +286,11 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) {
       // Cache the most recent interfering vreg to bypass isSeenInterference.
       RecentInterferingVReg = IR.LiveUnionI.value();
       ++IR.LiveUnionI;
+
+      // Stop collecting when the max weight is exceeded.
+      if (RecentInterferingVReg->weight >= MaxWeight)
+        return InterferingVRegs.size();
+
       continue;
     }
     // VirtRegI may have advanced far beyond LiveUnionI,
diff --git a/lib/CodeGen/LiveIntervalUnion.h b/lib/CodeGen/LiveIntervalUnion.h
index 0964ecd..c83578e 100644
--- a/lib/CodeGen/LiveIntervalUnion.h
+++ b/lib/CodeGen/LiveIntervalUnion.h
@@ -95,6 +95,9 @@ public:
   // Remove a live virtual register's segments from this union.
   void extract(LiveInterval &VirtReg);
 
+  // Remove all inserted virtual registers.
+  void clear() { Segments.clear(); ++Tag; }
+
   // Print union, using TRI to translate register names
   void print(raw_ostream &OS, const TargetRegisterInfo *TRI) const;
 
@@ -226,7 +229,8 @@ public:
 
     // Count the virtual registers in this union that interfere with this
     // query's live virtual register, up to maxInterferingRegs.
-    unsigned collectInterferingVRegs(unsigned MaxInterferingRegs = UINT_MAX);
+    unsigned collectInterferingVRegs(unsigned MaxInterferingRegs = UINT_MAX,
+                                     float MaxWeight = HUGE_VALF);
 
     // Was this virtual register visited during collectInterferingVRegs?
     bool isSeenInterference(LiveInterval *VReg) const;
diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index b96575e..052abad 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@@ -15,6 +15,7 @@
 #include "LiveRangeEdit.h"
 #include "VirtRegMap.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/CalcSpillWeights.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -24,6 +25,10 @@
 
 using namespace llvm;
 
+STATISTIC(NumDCEDeleted,     "Number of instructions deleted by DCE");
+STATISTIC(NumDCEFoldedLoads, "Number of single use loads folded after DCE");
+STATISTIC(NumFracRanges,     "Number of live ranges fractured by DCE");
+
 LiveInterval &LiveRangeEdit::createFrom(unsigned OldReg,
                                         LiveIntervals &LIS,
                                         VirtRegMap &VRM) {
@@ -36,14 +41,16 @@ LiveInterval &LiveRangeEdit::createFrom(unsigned OldReg,
   return LI;
 }
 
-void LiveRangeEdit::checkRematerializable(VNInfo *VNI,
+bool LiveRangeEdit::checkRematerializable(VNInfo *VNI,
                                           const MachineInstr *DefMI,
                                           const TargetInstrInfo &tii,
                                           AliasAnalysis *aa) {
   assert(DefMI && "Missing instruction");
-  if (tii.isTriviallyReMaterializable(DefMI, aa))
-    remattable_.insert(VNI);
   scannedRemattable_ = true;
+  if (!tii.isTriviallyReMaterializable(DefMI, aa))
+    return false;
+  remattable_.insert(VNI);
+  return true;
 }
 
 void LiveRangeEdit::scanRemattable(LiveIntervals &lis,
@@ -59,6 +66,7 @@ void LiveRangeEdit::scanRemattable(LiveIntervals &lis,
       continue;
     checkRematerializable(VNI, DefMI, tii, aa);
   }
+  scannedRemattable_ = true;
 }
 
 bool LiveRangeEdit::anyRematerializable(LiveIntervals &lis,
@@ -137,11 +145,13 @@ SlotIndex LiveRangeEdit::rematerializeAt(MachineBasicBlock &MBB,
                                          const Remat &RM,
                                          LiveIntervals &lis,
                                          const TargetInstrInfo &tii,
-                                         const TargetRegisterInfo &tri) {
+                                         const TargetRegisterInfo &tri,
+                                         bool Late) {
   assert(RM.OrigMI && "Invalid remat");
   tii.reMaterialize(MBB, MI, DestReg, 0, RM.OrigMI, tri);
   rematted_.insert(RM.ParentVNI);
-  return lis.InsertMachineInstrInMaps(--MI).getDefIndex();
+  return lis.getSlotIndexes()->insertMachineInstrInMaps(--MI, Late)
+           .getDefIndex();
 }
 
 void LiveRangeEdit::eraseVirtReg(unsigned Reg, LiveIntervals &LIS) {
@@ -194,6 +204,7 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
   UseMI->eraseFromParent();
   DefMI->addRegisterDead(LI->reg, 0);
   Dead.push_back(DefMI);
+  ++NumDCEFoldedLoads;
   return true;
 }
 
@@ -203,6 +214,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
   SetVector<LiveInterval*,
             SmallVector<LiveInterval*, 8>,
             SmallPtrSet<LiveInterval*, 8> > ToShrink;
+  MachineRegisterInfo &MRI = VRM.getRegInfo();
 
   for (;;) {
     // Erase all dead defs.
@@ -236,8 +248,13 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
           continue;
         LiveInterval &LI = LIS.getInterval(Reg);
 
-        // Shrink read registers.
-        if (MI->readsVirtualRegister(Reg))
+        // Shrink read registers, unless it is likely to be expensive and
+        // unlikely to change anything. We typically don't want to shrink the
+        // PIC base register that has lots of uses everywhere.
+        // Always shrink COPY uses that probably come from live range splitting.
+        if (MI->readsVirtualRegister(Reg) &&
+            (MI->isCopy() || MOI->isDef() || MRI.hasOneNonDBGUse(Reg) ||
+             LI.killedAt(Idx)))
           ToShrink.insert(&LI);
 
         // Remove defined value.
@@ -258,6 +275,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
         delegate_->LRE_WillEraseInstruction(MI);
       LIS.RemoveMachineInstrFromMaps(MI);
       MI->eraseFromParent();
+      ++NumDCEDeleted;
     }
 
     if (ToShrink.empty())
@@ -266,7 +284,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
     // Shrink just one live interval. Then delete new dead defs.
     LiveInterval *LI = ToShrink.back();
     ToShrink.pop_back();
-    if (foldAsLoad(LI, Dead, VRM.getRegInfo(), LIS, TII))
+    if (foldAsLoad(LI, Dead, MRI, LIS, TII))
       continue;
     if (delegate_)
       delegate_->LRE_WillShrinkVirtReg(LI->reg);
@@ -279,6 +297,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
     unsigned NumComp = ConEQ.Classify(LI);
     if (NumComp <= 1)
       continue;
+    ++NumFracRanges;
     DEBUG(dbgs() << NumComp << " components: " << *LI << '\n');
     SmallVector<LiveInterval*, 8> Dups(1, LI);
     for (unsigned i = 1; i != NumComp; ++i) {
@@ -286,7 +305,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
       if (delegate_)
         delegate_->LRE_DidCloneVirtReg(Dups.back()->reg, LI->reg);
     }
-    ConEQ.Distribute(&Dups[0], VRM.getRegInfo());
+    ConEQ.Distribute(&Dups[0], MRI);
   }
 }
 
diff --git a/lib/CodeGen/LiveRangeEdit.h b/lib/CodeGen/LiveRangeEdit.h
index 54a2c83..db6740c 100644
--- a/lib/CodeGen/LiveRangeEdit.h
+++ b/lib/CodeGen/LiveRangeEdit.h
@@ -18,8 +18,9 @@
 #ifndef LLVM_CODEGEN_LIVERANGEEDIT_H
 #define LLVM_CODEGEN_LIVERANGEEDIT_H
 
-#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/LiveInterval.h"
 
 namespace llvm {
 
@@ -113,6 +114,10 @@ public:
   bool empty() const { return size() == 0; }
   LiveInterval *get(unsigned idx) const { return newRegs_[idx+firstNew_]; }
 
+  ArrayRef<LiveInterval*> regs() const {
+    return ArrayRef<LiveInterval*>(newRegs_).slice(firstNew_);
+  }
+
   /// FIXME: Temporary accessors until we can get rid of
   /// LiveIntervals::AddIntervalsForSpills
   SmallVectorImpl<LiveInterval*> *getNewVRegs() { return &newRegs_; }
@@ -137,7 +142,7 @@ public:
 
   /// checkRematerializable - Manually add VNI to the list of rematerializable
   /// values if DefMI may be rematerializable.
-  void checkRematerializable(VNInfo *VNI, const MachineInstr *DefMI,
+  bool checkRematerializable(VNInfo *VNI, const MachineInstr *DefMI,
                              const TargetInstrInfo&, AliasAnalysis*);
 
   /// Remat - Information needed to rematerialize at a specific location.
@@ -165,7 +170,8 @@ public:
                             const Remat &RM,
                             LiveIntervals&,
                             const TargetInstrInfo&,
-                            const TargetRegisterInfo&);
+                            const TargetRegisterInfo&,
+                            bool Late = false);
 
   /// markRematerialized - explicitly mark a value as rematerialized after doing
   /// it manually.
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index ccbff0a..613f0c4 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -61,7 +61,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineBasicBlock &MBB) {
   return OS;
 }
 
-/// addNodeToList (MBB) - When an MBB is added to an MF, we need to update the 
+/// addNodeToList (MBB) - When an MBB is added to an MF, we need to update the
 /// parent pointer of the MBB, the MBB numbering, and any instructions in the
 /// MBB to be on the right operand list for registers.
 ///
@@ -93,7 +93,7 @@ void ilist_traits<MachineBasicBlock>::removeNodeFromList(MachineBasicBlock *N) {
 void ilist_traits<MachineInstr>::addNodeToList(MachineInstr *N) {
   assert(N->getParent() == 0 && "machine instruction already in a basic block");
   N->setParent(Parent);
-  
+
   // Add the instruction's register operands to their corresponding
   // use/def lists.
   MachineFunction *MF = Parent->getParent();
@@ -110,7 +110,7 @@ void ilist_traits<MachineInstr>::removeNodeFromList(MachineInstr *N) {
 
   // Remove from the use/def lists.
   N->RemoveRegOperandsFromUseLists();
-  
+
   N->setParent(0);
 
   LeakDetector::addGarbageObject(N);
@@ -339,32 +339,70 @@ void MachineBasicBlock::updateTerminator() {
   }
 }
 
-void MachineBasicBlock::addSuccessor(MachineBasicBlock *succ) {
-  Successors.push_back(succ);
-  succ->addPredecessor(this);
-}
+void MachineBasicBlock::addSuccessor(MachineBasicBlock *succ, uint32_t weight) {
+
+  // If we see non-zero value for the first time it means we actually use Weight
+  // list, so we fill all Weights with 0's.
+  if (weight != 0 && Weights.empty())
+    Weights.resize(Successors.size());
+
+  if (weight != 0 || !Weights.empty())
+    Weights.push_back(weight);
+
+   Successors.push_back(succ);
+   succ->addPredecessor(this);
+ }
 
 void MachineBasicBlock::removeSuccessor(MachineBasicBlock *succ) {
   succ->removePredecessor(this);
   succ_iterator I = std::find(Successors.begin(), Successors.end(), succ);
   assert(I != Successors.end() && "Not a current successor!");
+
+  // If Weight list is empty it means we don't use it (disabled optimization).
+  if (!Weights.empty()) {
+    weight_iterator WI = getWeightIterator(I);
+    Weights.erase(WI);
+  }
+
   Successors.erase(I);
 }
 
-MachineBasicBlock::succ_iterator 
+MachineBasicBlock::succ_iterator
 MachineBasicBlock::removeSuccessor(succ_iterator I) {
   assert(I != Successors.end() && "Not a current successor!");
+
+  // If Weight list is empty it means we don't use it (disabled optimization).
+  if (!Weights.empty()) {
+    weight_iterator WI = getWeightIterator(I);
+    Weights.erase(WI);
+  }
+
   (*I)->removePredecessor(this);
   return Successors.erase(I);
 }
 
+void MachineBasicBlock::replaceSuccessor(MachineBasicBlock *Old,
+                                         MachineBasicBlock *New) {
+  uint32_t weight = 0;
+  succ_iterator SI = std::find(Successors.begin(), Successors.end(), Old);
+
+  // If Weight list is empty it means we don't use it (disabled optimization).
+  if (!Weights.empty()) {
+    weight_iterator WI = getWeightIterator(SI);
+    weight = *WI;
+  }
+
+  // Update the successor information.
+  removeSuccessor(SI);
+  addSuccessor(New, weight);
+}
+
 void MachineBasicBlock::addPredecessor(MachineBasicBlock *pred) {
   Predecessors.push_back(pred);
 }
 
 void MachineBasicBlock::removePredecessor(MachineBasicBlock *pred) {
-  std::vector<MachineBasicBlock *>::iterator I =
-    std::find(Predecessors.begin(), Predecessors.end(), pred);
+  pred_iterator I = std::find(Predecessors.begin(), Predecessors.end(), pred);
   assert(I != Predecessors.end() && "Pred is not a predecessor of this block!");
   Predecessors.erase(I);
 }
@@ -372,10 +410,17 @@ void MachineBasicBlock::removePredecessor(MachineBasicBlock *pred) {
 void MachineBasicBlock::transferSuccessors(MachineBasicBlock *fromMBB) {
   if (this == fromMBB)
     return;
-  
+
   while (!fromMBB->succ_empty()) {
     MachineBasicBlock *Succ = *fromMBB->succ_begin();
-    addSuccessor(Succ);
+    uint32_t weight = 0;
+
+
+    // If Weight list is empty it means we don't use it (disabled optimization).
+    if (!fromMBB->Weights.empty())
+      weight = *fromMBB->Weights.begin();
+
+    addSuccessor(Succ, weight);
     fromMBB->removeSuccessor(Succ);
   }
 }
@@ -384,7 +429,7 @@ void
 MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *fromMBB) {
   if (this == fromMBB)
     return;
-  
+
   while (!fromMBB->succ_empty()) {
     MachineBasicBlock *Succ = *fromMBB->succ_begin();
     addSuccessor(Succ);
@@ -402,8 +447,7 @@ MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *fromMBB) {
 }
 
 bool MachineBasicBlock::isSuccessor(const MachineBasicBlock *MBB) const {
-  std::vector<MachineBasicBlock *>::const_iterator I =
-    std::find(Successors.begin(), Successors.end(), MBB);
+  const_succ_iterator I = std::find(Successors.begin(), Successors.end(), MBB);
   return I != Successors.end();
 }
 
@@ -487,6 +531,30 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
         << " -- BB#" << NMBB->getNumber()
         << " -- BB#" << Succ->getNumber() << '\n');
 
+  // On some targets like Mips, branches may kill virtual registers. Make sure
+  // that LiveVariables is properly updated after updateTerminator replaces the
+  // terminators.
+  LiveVariables *LV = P->getAnalysisIfAvailable<LiveVariables>();
+
+  // Collect a list of virtual registers killed by the terminators.
+  SmallVector<unsigned, 4> KilledRegs;
+  if (LV)
+    for (iterator I = getFirstTerminator(), E = end(); I != E; ++I) {
+      MachineInstr *MI = I;
+      for (MachineInstr::mop_iterator OI = MI->operands_begin(),
+           OE = MI->operands_end(); OI != OE; ++OI) {
+        if (!OI->isReg() || !OI->isUse() || !OI->isKill() || OI->isUndef())
+          continue;
+        unsigned Reg = OI->getReg();
+        if (TargetRegisterInfo::isVirtualRegister(Reg) &&
+            LV->getVarInfo(Reg).removeKill(MI)) {
+          KilledRegs.push_back(Reg);
+          DEBUG(dbgs() << "Removing terminator kill: " << *MI);
+          OI->setIsKill(false);
+        }
+      }
+    }
+
   ReplaceUsesOfBlockWith(Succ, NMBB);
   updateTerminator();
 
@@ -504,9 +572,22 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
       if (i->getOperand(ni+1).getMBB() == this)
         i->getOperand(ni+1).setMBB(NMBB);
 
-  if (LiveVariables *LV =
-        P->getAnalysisIfAvailable<LiveVariables>())
+  // Update LiveVariables.
+  if (LV) {
+    // Restore kills of virtual registers that were killed by the terminators.
+    while (!KilledRegs.empty()) {
+      unsigned Reg = KilledRegs.pop_back_val();
+      for (iterator I = end(), E = begin(); I != E;) {
+        if (!(--I)->addRegisterKilled(Reg, NULL, /* addIfNotFound= */ false))
+          continue;
+        LV->getVarInfo(Reg).Kills.push_back(I);
+        DEBUG(dbgs() << "Restored terminator kill: " << *I);
+        break;
+      }
+    }
+    // Update relevant live-through information.
     LV->addNewBlock(NMBB, this, Succ);
+  }
 
   if (MachineDominatorTree *MDT =
       P->getAnalysisIfAvailable<MachineDominatorTree>()) {
@@ -602,15 +683,14 @@ void MachineBasicBlock::ReplaceUsesOfBlockWith(MachineBasicBlock *Old,
   }
 
   // Update the successor information.
-  removeSuccessor(Old);
-  addSuccessor(New);
+  replaceSuccessor(Old, New);
 }
 
 /// CorrectExtraCFGEdges - Various pieces of code can cause excess edges in the
 /// CFG to be inserted.  If we have proven that MBB can only branch to DestA and
 /// DestB, remove any other MBB successors from the CFG.  DestA and DestB can be
 /// null.
-/// 
+///
 /// Besides DestA and DestB, retain other edges leading to LandingPads
 /// (currently there can be only one; we don't check or require that here).
 /// Note it is possible that DestA and/or DestB are LandingPads.
@@ -685,6 +765,23 @@ MachineBasicBlock::findDebugLoc(MachineBasicBlock::iterator &MBBI) {
   return DL;
 }
 
+/// getSuccWeight - Return weight of the edge from this block to MBB.
+///
+uint32_t MachineBasicBlock::getSuccWeight(MachineBasicBlock *succ) {
+  succ_iterator I = std::find(Successors.begin(), Successors.end(), succ);
+  return *getWeightIterator(I);
+}
+
+/// getWeightIterator - Return wight iterator corresonding to the I successor
+/// iterator
+MachineBasicBlock::weight_iterator MachineBasicBlock::
+getWeightIterator(MachineBasicBlock::succ_iterator I) {
+  assert(Weights.size() == Successors.size() && "Async weight list!");
+  size_t index = std::distance(Successors.begin(), I);
+  assert(index < Weights.size() && "Not a current successor!");
+  return Weights.begin() + index;
+}
+
 void llvm::WriteAsOperand(raw_ostream &OS, const MachineBasicBlock *MBB,
                           bool t) {
   OS << "BB#" << MBB->getNumber();
diff --git a/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/lib/CodeGen/MachineBranchProbabilityInfo.cpp
new file mode 100644
index 0000000..c13fa6b
--- /dev/null
+++ b/lib/CodeGen/MachineBranchProbabilityInfo.cpp
@@ -0,0 +1,113 @@
+//===- MachineBranchProbabilityInfo.cpp - Machine Branch Probability Info -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This analysis uses probability info stored in Machine Basic Blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Instructions.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+INITIALIZE_PASS_BEGIN(MachineBranchProbabilityInfo, "machine-branch-prob",
+                      "Machine Branch Probability Analysis", false, true)
+INITIALIZE_PASS_END(MachineBranchProbabilityInfo, "machine-branch-prob",
+                    "Machine Branch Probability Analysis", false, true)
+
+char MachineBranchProbabilityInfo::ID = 0;
+
+uint32_t MachineBranchProbabilityInfo::
+getSumForBlock(MachineBasicBlock *MBB) const {
+  uint32_t Sum = 0;
+
+  for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I) {
+    MachineBasicBlock *Succ = *I;
+    uint32_t Weight = getEdgeWeight(MBB, Succ);
+    uint32_t PrevSum = Sum;
+
+    Sum += Weight;
+    assert(Sum > PrevSum); (void) PrevSum;
+  }
+
+  return Sum;
+}
+
+uint32_t
+MachineBranchProbabilityInfo::getEdgeWeight(MachineBasicBlock *Src,
+                                            MachineBasicBlock *Dst) const {
+  uint32_t Weight = Src->getSuccWeight(Dst);
+  if (!Weight)
+    return DEFAULT_WEIGHT;
+  return Weight;
+}
+
+bool MachineBranchProbabilityInfo::isEdgeHot(MachineBasicBlock *Src,
+                                             MachineBasicBlock *Dst) const {
+  // Hot probability is at least 4/5 = 80%
+  uint32_t Weight = getEdgeWeight(Src, Dst);
+  uint32_t Sum = getSumForBlock(Src);
+
+  // FIXME: Implement BranchProbability::compare then change this code to
+  // compare this BranchProbability against a static "hot" BranchProbability.
+  return (uint64_t)Weight * 5 > (uint64_t)Sum * 4;
+}
+
+MachineBasicBlock *
+MachineBranchProbabilityInfo::getHotSucc(MachineBasicBlock *MBB) const {
+  uint32_t Sum = 0;
+  uint32_t MaxWeight = 0;
+  MachineBasicBlock *MaxSucc = 0;
+
+  for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I) {
+    MachineBasicBlock *Succ = *I;
+    uint32_t Weight = getEdgeWeight(MBB, Succ);
+    uint32_t PrevSum = Sum;
+
+    Sum += Weight;
+    assert(Sum > PrevSum); (void) PrevSum;
+
+    if (Weight > MaxWeight) {
+      MaxWeight = Weight;
+      MaxSucc = Succ;
+    }
+  }
+
+  // FIXME: Use BranchProbability::compare.
+  if ((uint64_t)MaxWeight * 5 >= (uint64_t)Sum * 4)
+    return MaxSucc;
+
+  return 0;
+}
+
+BranchProbability
+MachineBranchProbabilityInfo::getEdgeProbability(MachineBasicBlock *Src,
+                                                 MachineBasicBlock *Dst) const {
+  uint32_t N = getEdgeWeight(Src, Dst);
+  uint32_t D = getSumForBlock(Src);
+
+  return BranchProbability(N, D);
+}
+
+raw_ostream &MachineBranchProbabilityInfo::
+printEdgeProbability(raw_ostream &OS, MachineBasicBlock *Src,
+                     MachineBasicBlock *Dst) const {
+
+  const BranchProbability Prob = getEdgeProbability(Src, Dst);
+  OS << "edge MBB#" << Src->getNumber() << " -> MBB#" << Dst->getNumber()
+     << " probability is "  << Prob 
+     << (isEdgeHot(Src, Dst) ? " [HOT edge]\n" : "\n");
+
+  return OS;
+}
diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp
index 07a7d27..f97ccf6 100644
--- a/lib/CodeGen/MachineCSE.cpp
+++ b/lib/CodeGen/MachineCSE.cpp
@@ -365,6 +365,8 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
     if (!FoundCSE) {
       // Look for trivial copy coalescing opportunities.
       if (PerformTrivialCoalescing(MI, MBB)) {
+        Changed = true;
+
         // After coalescing MI itself may become a copy.
         if (MI->isCopyLike())
           continue;
@@ -379,10 +381,11 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
       if (NewMI) {
         Commuted = true;
         FoundCSE = VNT.count(NewMI);
-        if (NewMI != MI)
+        if (NewMI != MI) {
           // New instruction. It doesn't need to be kept.
           NewMI->eraseFromParent();
-        else if (!FoundCSE)
+          Changed = true;
+        } else if (!FoundCSE)
           // MI was changed but it didn't help, commute it back!
           (void)TII->commuteInstruction(MI);
       }
@@ -450,6 +453,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
         ++NumPhysCSEs;
       if (Commuted)
         ++NumCommutes;
+      Changed = true;
     } else {
       DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n");
       VNT.insert(MI, CurrVN++);
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index d81e4a1..50750a5 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -65,7 +65,11 @@ MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM,
     FrameInfo->setMaxAlignment(Attribute::getStackAlignmentFromAttrs(
         Fn->getAttributes().getFnAttributes()));
   ConstantPool = new (Allocator) MachineConstantPool(TM.getTargetData());
-  Alignment = TM.getTargetLowering()->getFunctionAlignment(F);
+  Alignment = TM.getTargetLowering()->getMinFunctionAlignment();
+  // FIXME: Shouldn't use pref alignment if explicit alignment is set on Fn.
+  if (!Fn->hasFnAttr(Attribute::OptimizeForSize))
+    Alignment = std::max(Alignment,
+                         TM.getTargetLowering()->getPrefFunctionAlignment());
   FunctionNumber = FunctionNum;
   JumpTableInfo = 0;
 }
@@ -300,31 +304,19 @@ void MachineFunction::print(raw_ostream &OS, SlotIndexes *Indexes) const {
     OS << "Function Live Ins: ";
     for (MachineRegisterInfo::livein_iterator
          I = RegInfo->livein_begin(), E = RegInfo->livein_end(); I != E; ++I) {
-      if (TRI)
-        OS << "%" << TRI->getName(I->first);
-      else
-        OS << " %physreg" << I->first;
-      
+      OS << PrintReg(I->first, TRI);
       if (I->second)
-        OS << " in reg%" << I->second;
-
+        OS << " in " << PrintReg(I->second, TRI);
       if (llvm::next(I) != E)
         OS << ", ";
     }
     OS << '\n';
   }
   if (RegInfo && !RegInfo->liveout_empty()) {
-    OS << "Function Live Outs: ";
+    OS << "Function Live Outs:";
     for (MachineRegisterInfo::liveout_iterator
-         I = RegInfo->liveout_begin(), E = RegInfo->liveout_end(); I != E; ++I){
-      if (TRI)
-        OS << '%' << TRI->getName(*I);
-      else
-        OS << "%physreg" << *I;
-
-      if (llvm::next(I) != E)
-        OS << " ";
-    }
+         I = RegInfo->liveout_begin(), E = RegInfo->liveout_end(); I != E; ++I)
+      OS << ' ' << PrintReg(*I, TRI);
     OS << '\n';
   }
   
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index 0d137eb..36b0b83 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -125,7 +125,8 @@ void MachineOperand::substPhysReg(unsigned Reg, const TargetRegisterInfo &TRI) {
   assert(TargetRegisterInfo::isPhysicalRegister(Reg));
   if (getSubReg()) {
     Reg = TRI.getSubReg(Reg, getSubReg());
-    assert(Reg && "Invalid SubReg for physical register");
+    // Note that getSubReg() may return 0 if the sub-register doesn't exist.
+    // That won't happen in legal code.
     setSubReg(0);
   }
   setReg(Reg);
@@ -441,6 +442,10 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineMemOperand &MMO) {
     OS << ")";
   }
 
+  // Print nontemporal info.
+  if (MMO.isNonTemporal())
+    OS << "(nontemporal)";
+
   return OS;
 }
 
@@ -759,19 +764,35 @@ bool MachineInstr::isIdenticalTo(const MachineInstr *Other,
   for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = getOperand(i);
     const MachineOperand &OMO = Other->getOperand(i);
+    if (!MO.isReg()) {
+      if (!MO.isIdenticalTo(OMO))
+        return false;
+      continue;
+    }
+
     // Clients may or may not want to ignore defs when testing for equality.
     // For example, machine CSE pass only cares about finding common
     // subexpressions, so it's safe to ignore virtual register defs.
-    if (Check != CheckDefs && MO.isReg() && MO.isDef()) {
+    if (MO.isDef()) {
       if (Check == IgnoreDefs)
         continue;
-      // Check == IgnoreVRegDefs
-      if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) ||
-          TargetRegisterInfo::isPhysicalRegister(OMO.getReg()))
-        if (MO.getReg() != OMO.getReg())
+      else if (Check == IgnoreVRegDefs) {
+        if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) ||
+            TargetRegisterInfo::isPhysicalRegister(OMO.getReg()))
+          if (MO.getReg() != OMO.getReg())
+            return false;
+      } else {
+        if (!MO.isIdenticalTo(OMO))
           return false;
-    } else if (!MO.isIdenticalTo(OMO))
-      return false;
+        if (Check == CheckKillDead && MO.isDead() != OMO.isDead())
+          return false;
+      }
+    } else {
+      if (!MO.isIdenticalTo(OMO))
+        return false;
+      if (Check == CheckKillDead && MO.isKill() != OMO.isKill())
+        return false;
+    }
   }
   return true;
 }
diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
index 1c0f6ad..b315702 100644
--- a/lib/CodeGen/MachineLICM.cpp
+++ b/lib/CodeGen/MachineLICM.cpp
@@ -39,7 +39,6 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-
 using namespace llvm;
 
 STATISTIC(NumHoisted,
@@ -169,6 +168,10 @@ namespace {
     /// 
     bool IsLoopInvariantInst(MachineInstr &I);
 
+    /// HasAnyPHIUse - Return true if the specified register is used by any
+    /// phi node.
+    bool HasAnyPHIUse(unsigned Reg) const;
+
     /// HasHighOperandLatency - Compute operand latency between a def of 'Reg'
     /// and an use in the current loop, return true if the target considered
     /// it 'high'.
@@ -758,18 +761,25 @@ bool MachineLICM::IsLoopInvariantInst(MachineInstr &I) {
 }
 
 
-/// HasPHIUses - Return true if the specified register has any PHI use.
-static bool HasPHIUses(unsigned Reg, MachineRegisterInfo *MRI) {
+/// HasAnyPHIUse - Return true if the specified register is used by any
+/// phi node.
+bool MachineLICM::HasAnyPHIUse(unsigned Reg) const {
   for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(Reg),
          UE = MRI->use_end(); UI != UE; ++UI) {
     MachineInstr *UseMI = &*UI;
     if (UseMI->isPHI())
       return true;
+    // Look pass copies as well.
+    if (UseMI->isCopy()) {
+      unsigned Def = UseMI->getOperand(0).getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Def) &&
+          HasAnyPHIUse(Def))
+        return true;
+    }
   }
   return false;
 }
 
-
 /// HasHighOperandLatency - Compute operand latency between a def of 'Reg'
 /// and an use in the current loop, return true if the target considered
 /// it 'high'.
@@ -976,14 +986,13 @@ bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) {
       return false;
   }
 
-  // If result(s) of this instruction is used by PHIs, then don't hoist it.
-  // The presence of joins makes it difficult for current register allocator
-  // implementation to perform remat.
+  // If result(s) of this instruction is used by PHIs outside of the loop, then
+  // don't hoist it if the instruction because it will introduce an extra copy.
   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI.getOperand(i);
     if (!MO.isReg() || !MO.isDef())
       continue;
-    if (HasPHIUses(MO.getReg(), MRI))
+    if (HasAnyPHIUse(MO.getReg()))
       return false;
   }
 
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index 7244d5f..08ff5bb 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -79,6 +79,8 @@ MachineRegisterInfo::constrainRegClass(unsigned Reg,
 unsigned
 MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass){
   assert(RegClass && "Cannot create register without RegClass!");
+  assert(RegClass->isAllocatable() &&
+         "Virtual register RegClass must be allocatable.");
 
   // New virtual register number.
   unsigned Reg = TargetRegisterInfo::index2VirtReg(getNumVirtRegs());
diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp
index 8a93a24..916dff7 100644
--- a/lib/CodeGen/MachineSink.cpp
+++ b/lib/CodeGen/MachineSink.cpp
@@ -265,8 +265,11 @@ bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) {
     if (MI->isDebugValue())
       continue;
 
-    if (PerformTrivialForwardCoalescing(MI, &MBB))
+    bool Joined = PerformTrivialForwardCoalescing(MI, &MBB);
+    if (Joined) {
+      MadeChange = true;
       continue;
+    }
 
     if (SinkInstruction(MI, SawStore))
       ++NumSunk, MadeChange = true;
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index f95f411..471463b 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -23,6 +23,7 @@
 // the verifier errors.
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Instructions.h"
 #include "llvm/Function.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/LiveVariables.h"
@@ -32,6 +33,7 @@
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetInstrInfo.h"
@@ -394,7 +396,13 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
     if ((*I)->isLandingPad())
       LandingPadSuccs.insert(*I);
   }
-  if (LandingPadSuccs.size() > 1)
+
+  const MCAsmInfo *AsmInfo = TM->getMCAsmInfo();
+  const BasicBlock *BB = MBB->getBasicBlock();
+  if (LandingPadSuccs.size() > 1 &&
+      !(AsmInfo &&
+        AsmInfo->getExceptionHandlingType() == ExceptionHandling::SjLj &&
+        BB && isa<SwitchInst>(BB->getTerminator())))
     report("MBB has more than one landing pad successor", MBB);
 
   // Call AnalyzeBranch. If it succeeds, there several more conditions to check.
@@ -402,11 +410,6 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
   SmallVector<MachineOperand, 4> Cond;
   if (!TII->AnalyzeBranch(*const_cast<MachineBasicBlock *>(MBB),
                           TBB, FBB, Cond)) {
-    // If the block branches directly to a landing pad successor, pretend that
-    // the landing pad is a normal block.
-    LandingPadSuccs.erase(TBB);
-    LandingPadSuccs.erase(FBB);
-
     // Ok, AnalyzeBranch thinks it knows what's going on with this block. Let's
     // check whether its answers match up with reality.
     if (!TBB && !FBB) {
@@ -741,7 +744,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
           RC = SRC;
         }
         if (const TargetRegisterClass *DRC = TOI.getRegClass(TRI)) {
-          if (RC != DRC && !RC->hasSuperClass(DRC)) {
+          if (!RC->hasSuperClassEq(DRC)) {
             report("Illegal virtual register for instruction", MO, MONum);
             *OS << "Expected a " << DRC->getName() << " register, but got a "
                 << RC->getName() << " register\n";
diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp
index 9fd5b0e..af65f13 100644
--- a/lib/CodeGen/PHIElimination.cpp
+++ b/lib/CodeGen/PHIElimination.cpp
@@ -32,7 +32,6 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include <algorithm>
-#include <map>
 using namespace llvm;
 
 static cl::opt<bool>
diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp
index 3489db2..315aedd 100644
--- a/lib/CodeGen/Passes.cpp
+++ b/lib/CodeGen/Passes.cpp
@@ -55,6 +55,11 @@ FunctionPass *llvm::createRegisterAllocator(CodeGenOpt::Level OptLevel) {
     RegisterRegAlloc::setDefault(RegAlloc);
   }
 
+  // This forces linking of the linear scan register allocator,
+  // so -regalloc=linearscan still works in clang.
+  if (Ctor == createLinearScanRegisterAllocator)
+    return createLinearScanRegisterAllocator();
+
   if (Ctor != createDefaultRegisterAllocator)
     return Ctor();
 
@@ -63,6 +68,6 @@ FunctionPass *llvm::createRegisterAllocator(CodeGenOpt::Level OptLevel) {
   case CodeGenOpt::None:
     return createFastRegisterAllocator();
   default:
-    return createLinearScanRegisterAllocator();
+    return createGreedyRegisterAllocator();
   }
 }
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index 60c24b7..982a2a5 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -22,6 +22,7 @@
 #include "AntiDepBreaker.h"
 #include "AggressiveAntiDepBreaker.h"
 #include "CriticalAntiDepBreaker.h"
+#include "RegisterClassInfo.h"
 #include "ScheduleDAGInstrs.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/LatencyPriorityQueue.h"
@@ -80,6 +81,7 @@ namespace {
   class PostRAScheduler : public MachineFunctionPass {
     AliasAnalysis *AA;
     const TargetInstrInfo *TII;
+    RegisterClassInfo RegClassInfo;
     CodeGenOpt::Level OptLevel;
 
   public:
@@ -135,7 +137,8 @@ namespace {
   public:
     SchedulePostRATDList(
       MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT,
-      AliasAnalysis *AA, TargetSubtarget::AntiDepBreakMode AntiDepMode,
+      AliasAnalysis *AA, const RegisterClassInfo&,
+      TargetSubtarget::AntiDepBreakMode AntiDepMode,
       SmallVectorImpl<TargetRegisterClass*> &CriticalPathRCs);
 
     ~SchedulePostRATDList();
@@ -179,7 +182,8 @@ namespace {
 
 SchedulePostRATDList::SchedulePostRATDList(
   MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT,
-  AliasAnalysis *AA, TargetSubtarget::AntiDepBreakMode AntiDepMode,
+  AliasAnalysis *AA, const RegisterClassInfo &RCI,
+  TargetSubtarget::AntiDepBreakMode AntiDepMode,
   SmallVectorImpl<TargetRegisterClass*> &CriticalPathRCs)
   : ScheduleDAGInstrs(MF, MLI, MDT), Topo(SUnits), AA(AA),
     KillIndices(TRI->getNumRegs())
@@ -190,9 +194,9 @@ SchedulePostRATDList::SchedulePostRATDList(
     TM.getInstrInfo()->CreateTargetPostRAHazardRecognizer(InstrItins, this);
   AntiDepBreak =
     ((AntiDepMode == TargetSubtarget::ANTIDEP_ALL) ?
-     (AntiDepBreaker *)new AggressiveAntiDepBreaker(MF, CriticalPathRCs) :
+     (AntiDepBreaker *)new AggressiveAntiDepBreaker(MF, RCI, CriticalPathRCs) :
      ((AntiDepMode == TargetSubtarget::ANTIDEP_CRITICAL) ?
-      (AntiDepBreaker *)new CriticalAntiDepBreaker(MF) : NULL));
+      (AntiDepBreaker *)new CriticalAntiDepBreaker(MF, RCI) : NULL));
 }
 
 SchedulePostRATDList::~SchedulePostRATDList() {
@@ -205,6 +209,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
   MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
   MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
   AliasAnalysis *AA = &getAnalysis<AliasAnalysis>();
+  RegClassInfo.runOnMachineFunction(Fn);
 
   // Check for explicit enable/disable of post-ra scheduling.
   TargetSubtarget::AntiDepBreakMode AntiDepMode = TargetSubtarget::ANTIDEP_NONE;
@@ -230,7 +235,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
 
   DEBUG(dbgs() << "PostRAScheduler\n");
 
-  SchedulePostRATDList Scheduler(Fn, MLI, MDT, AA, AntiDepMode,
+  SchedulePostRATDList Scheduler(Fn, MLI, MDT, AA, RegClassInfo, AntiDepMode,
                                  CriticalPathRCs);
 
   // Loop over all of the basic blocks
@@ -304,7 +309,7 @@ void SchedulePostRATDList::Schedule() {
   if (AntiDepBreak != NULL) {
     unsigned Broken =
       AntiDepBreak->BreakAntiDependencies(SUnits, Begin, InsertPos,
-                                          InsertPosIndex);
+                                          InsertPosIndex, DbgValues);
 
     if (Broken != 0) {
       // We made changes. Update the dependency graph.
@@ -540,10 +545,16 @@ void SchedulePostRATDList::ReleaseSucc(SUnit *SU, SDep *SuccEdge) {
 #endif
   --SuccSU->NumPredsLeft;
 
-  // Compute how many cycles it will be before this actually becomes
-  // available.  This is the max of the start time of all predecessors plus
-  // their latencies.
-  SuccSU->setDepthToAtLeast(SU->getDepth() + SuccEdge->getLatency());
+  // Standard scheduler algorithms will recompute the depth of the successor
+  // here as such:
+  //   SuccSU->setDepthToAtLeast(SU->getDepth() + SuccEdge->getLatency());
+  //
+  // However, we lazily compute node depth instead. Note that
+  // ScheduleNodeTopDown has already updated the depth of this node which causes
+  // all descendents to be marked dirty. Setting the successor depth explicitly
+  // here would cause depth to be recomputed for all its ancestors. If the
+  // successor is not yet ready (because of a transitively redundant edge) then
+  // this causes depth computation to be quadratic in the size of the DAG.
 
   // If all the node's predecessors are scheduled, this node is ready
   // to be scheduled. Ignore the special ExitSU node.
@@ -655,6 +666,12 @@ void SchedulePostRATDList::ListScheduleTopDown() {
       ScheduleNodeTopDown(FoundSUnit, CurCycle);
       HazardRec->EmitInstruction(FoundSUnit);
       CycleHasInsts = true;
+      if (HazardRec->atIssueLimit()) {
+        DEBUG(dbgs() << "*** Max instructions per cycle " << CurCycle << '\n');
+        HazardRec->AdvanceCycle();
+        ++CurCycle;
+        CycleHasInsts = false;
+      }
     } else {
       if (CycleHasInsts) {
         DEBUG(dbgs() << "*** Finished cycle " << CurCycle << '\n');
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index 92e25e1..f1f3c99 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -337,7 +337,7 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
         --BeforeI;
 
       // Restore all registers immediately before the return and any
-      // terminators that preceed it.
+      // terminators that precede it.
       if (!TFI->restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) {
         for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
           unsigned Reg = CSI[i].getReg();
@@ -437,7 +437,7 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
       --BeforeI;
 
     // Restore all registers immediately before the return and any
-    // terminators that preceed it.
+    // terminators that precede it.
     for (unsigned i = 0, e = blockCSI.size(); i != e; ++i) {
       unsigned Reg = blockCSI[i].getReg();
       const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
diff --git a/lib/CodeGen/README.txt b/lib/CodeGen/README.txt
index b655dda..7f75f65 100644
--- a/lib/CodeGen/README.txt
+++ b/lib/CodeGen/README.txt
@@ -26,7 +26,7 @@ and then "merge" mul and mov:
         sxth r3, r3
         mla r4, r3, lr, r4
 
-It also increase the likelyhood the store may become dead.
+It also increase the likelihood the store may become dead.
 
 //===---------------------------------------------------------------------===//
 
@@ -162,7 +162,7 @@ synthesize the various copy insertion/inspection methods in TargetInstrInfo.
 
 //===---------------------------------------------------------------------===//
 
-Stack coloring improvments:
+Stack coloring improvements:
 
 1. Do proper LiveStackAnalysis on all stack objects including those which are
    not spill slots.
diff --git a/lib/CodeGen/RegAllocBase.h b/lib/CodeGen/RegAllocBase.h
index f431d5a..0316421 100644
--- a/lib/CodeGen/RegAllocBase.h
+++ b/lib/CodeGen/RegAllocBase.h
@@ -39,6 +39,7 @@
 
 #include "llvm/ADT/OwningPtr.h"
 #include "LiveIntervalUnion.h"
+#include "RegisterClassInfo.h"
 
 namespace llvm {
 
@@ -91,6 +92,7 @@ protected:
   MachineRegisterInfo *MRI;
   VirtRegMap *VRM;
   LiveIntervals *LIS;
+  RegisterClassInfo RegClassInfo;
   LiveUnionArray PhysReg2LiveUnion;
 
   // Current queries, one per physreg. They must be reinitialized each time we
@@ -113,6 +115,10 @@ protected:
     return Queries[PhysReg];
   }
 
+  // Invalidate all cached information about virtual registers - live ranges may
+  // have changed.
+  void invalidateVirtRegs() { ++UserTag; }
+
   // The top-level driver. The output is a VirtRegMap that us updated with
   // physical register assignments.
   //
diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp
index 0e218a7..1d77b29 100644
--- a/lib/CodeGen/RegAllocBasic.cpp
+++ b/lib/CodeGen/RegAllocBasic.cpp
@@ -13,10 +13,10 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "regalloc"
+#include "RegAllocBase.h"
 #include "LiveDebugVariables.h"
 #include "LiveIntervalUnion.h"
 #include "LiveRangeEdit.h"
-#include "RegAllocBase.h"
 #include "RenderMachineFunction.h"
 #include "Spiller.h"
 #include "VirtRegMap.h"
@@ -85,7 +85,6 @@ class RABasic : public MachineFunctionPass, public RegAllocBase
 {
   // context
   MachineFunction *MF;
-  BitVector ReservedRegs;
 
   // analyses
   LiveStacks *LS;
@@ -235,9 +234,14 @@ void RegAllocBase::init(VirtRegMap &vrm, LiveIntervals &lis) {
   MRI = &vrm.getRegInfo();
   VRM = &vrm;
   LIS = &lis;
-  PhysReg2LiveUnion.init(UnionAllocator, TRI->getNumRegs());
-  // Cache an interferece query for each physical reg
-  Queries.reset(new LiveIntervalUnion::Query[PhysReg2LiveUnion.numRegs()]);
+  RegClassInfo.runOnMachineFunction(vrm.getMachineFunction());
+
+  const unsigned NumRegs = TRI->getNumRegs();
+  if (NumRegs != PhysReg2LiveUnion.numRegs()) {
+    PhysReg2LiveUnion.init(UnionAllocator, NumRegs);
+    // Cache an interferece query for each physical reg
+    Queries.reset(new LiveIntervalUnion::Query[PhysReg2LiveUnion.numRegs()]);
+  }
 }
 
 void RegAllocBase::LiveUnionArray::clear() {
@@ -251,13 +255,15 @@ void RegAllocBase::LiveUnionArray::clear() {
 }
 
 void RegAllocBase::releaseMemory() {
-  PhysReg2LiveUnion.clear();
+  for (unsigned r = 0, e = PhysReg2LiveUnion.numRegs(); r != e; ++r)
+    PhysReg2LiveUnion[r].clear();
 }
 
 // Visit all the live registers. If they are already assigned to a physical
 // register, unify them with the corresponding LiveIntervalUnion, otherwise push
 // them on the priority queue for later assignment.
 void RegAllocBase::seedLiveRegs() {
+  NamedRegionTimer T("Seed Live Regs", TimerGroupName, TimePassesIsEnabled);
   for (LiveIntervals::iterator I = LIS->begin(), E = LIS->end(); I != E; ++I) {
     unsigned RegNum = I->first;
     LiveInterval &VirtReg = *I->second;
@@ -273,6 +279,7 @@ void RegAllocBase::assign(LiveInterval &VirtReg, unsigned PhysReg) {
                << " to " << PrintReg(PhysReg, TRI) << '\n');
   assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment");
   VRM->assignVirt2Phys(VirtReg.reg, PhysReg);
+  MRI->setPhysRegUsed(PhysReg);
   PhysReg2LiveUnion[PhysReg].unify(VirtReg);
   ++NumAssigned;
 }
@@ -303,7 +310,7 @@ void RegAllocBase::allocatePhysRegs() {
     }
 
     // Invalidate all interference queries, live ranges could have changed.
-    ++UserTag;
+    invalidateVirtRegs();
 
     // selectOrSplit requests the allocator to return an available physical
     // register if possible and populate a list of new live intervals that
@@ -315,6 +322,23 @@ void RegAllocBase::allocatePhysRegs() {
     VirtRegVec SplitVRegs;
     unsigned AvailablePhysReg = selectOrSplit(*VirtReg, SplitVRegs);
 
+    if (AvailablePhysReg == ~0u) {
+      // selectOrSplit failed to find a register!
+      std::string msg;
+      raw_string_ostream Msg(msg);
+      Msg << "Ran out of registers during register allocation!"
+             "\nCannot allocate: " << *VirtReg;
+      for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(VirtReg->reg);
+      MachineInstr *MI = I.skipInstruction();) {
+        if (!MI->isInlineAsm())
+          continue;
+        Msg << "\nPlease check your inline asm statement for "
+          "invalid constraints:\n";
+        MI->print(Msg, &VRM->getMachineFunction().getTarget());
+      }
+      report_fatal_error(Msg.str());
+    }
+
     if (AvailablePhysReg)
       assign(*VirtReg, AvailablePhysReg);
 
@@ -404,29 +428,31 @@ RegAllocBase::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
 // Add newly allocated physical registers to the MBB live in sets.
 void RegAllocBase::addMBBLiveIns(MachineFunction *MF) {
   NamedRegionTimer T("MBB Live Ins", TimerGroupName, TimePassesIsEnabled);
-  typedef SmallVector<MachineBasicBlock*, 8> MBBVec;
-  MBBVec liveInMBBs;
-  MachineBasicBlock &entryMBB = *MF->begin();
+  SlotIndexes *Indexes = LIS->getSlotIndexes();
+  if (MF->size() <= 1)
+    return;
 
+  LiveIntervalUnion::SegmentIter SI;
   for (unsigned PhysReg = 0; PhysReg < PhysReg2LiveUnion.numRegs(); ++PhysReg) {
     LiveIntervalUnion &LiveUnion = PhysReg2LiveUnion[PhysReg];
     if (LiveUnion.empty())
       continue;
-    for (LiveIntervalUnion::SegmentIter SI = LiveUnion.begin(); SI.valid();
-         ++SI) {
-
-      // Find the set of basic blocks which this range is live into...
-      liveInMBBs.clear();
-      if (!LIS->findLiveInMBBs(SI.start(), SI.stop(), liveInMBBs)) continue;
-
-      // And add the physreg for this interval to their live-in sets.
-      for (MBBVec::iterator I = liveInMBBs.begin(), E = liveInMBBs.end();
-           I != E; ++I) {
-        MachineBasicBlock *MBB = *I;
-        if (MBB == &entryMBB) continue;
-        if (MBB->isLiveIn(PhysReg)) continue;
-        MBB->addLiveIn(PhysReg);
-      }
+    MachineFunction::iterator MBB = llvm::next(MF->begin());
+    MachineFunction::iterator MFE = MF->end();
+    SlotIndex Start, Stop;
+    tie(Start, Stop) = Indexes->getMBBRange(MBB);
+    SI.setMap(LiveUnion.getMap());
+    SI.find(Start);
+    while (SI.valid()) {
+      if (SI.start() <= Start) {
+        if (!MBB->isLiveIn(PhysReg))
+          MBB->addLiveIn(PhysReg);
+      } else if (SI.start() > Stop)
+        MBB = Indexes->getMBBFromIndex(SI.start().getPrevIndex());
+      if (++MBB == MFE)
+        break;
+      tie(Start, Stop) = Indexes->getMBBRange(MBB);
+      SI.advanceTo(Start);
     }
   }
 }
@@ -454,14 +480,11 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
   SmallVector<unsigned, 8> PhysRegSpillCands;
 
   // Check for an available register in this class.
-  const TargetRegisterClass *TRC = MRI->getRegClass(VirtReg.reg);
-
-  for (TargetRegisterClass::iterator I = TRC->allocation_order_begin(*MF),
-         E = TRC->allocation_order_end(*MF);
-       I != E; ++I) {
-
+  ArrayRef<unsigned> Order =
+    RegClassInfo.getOrder(MRI->getRegClass(VirtReg.reg));
+  for (ArrayRef<unsigned>::iterator I = Order.begin(), E = Order.end(); I != E;
+       ++I) {
     unsigned PhysReg = *I;
-    if (ReservedRegs.test(PhysReg)) continue;
 
     // Check interference and as a side effect, intialize queries for this
     // VirtReg and its aliases.
@@ -490,8 +513,11 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
     // Tell the caller to allocate to this newly freed physical register.
     return *PhysRegI;
   }
+
   // No other spill candidates were found, so spill the current VirtReg.
   DEBUG(dbgs() << "spilling: " << VirtReg << '\n');
+  if (!VirtReg.isSpillable())
+    return ~0u;
   LiveRangeEdit LRE(VirtReg, SplitVRegs);
   spiller().spill(LRE);
 
@@ -509,9 +535,6 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) {
   DEBUG(RMF = &getAnalysis<RenderMachineFunction>());
 
   RegAllocBase::init(getAnalysis<VirtRegMap>(), getAnalysis<LiveIntervals>());
-
-  ReservedRegs = TRI->getReservedRegs(*MF);
-
   SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
 
   allocatePhysRegs();
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index 15036e3..65ebdf8 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "regalloc"
+#include "RegisterClassInfo.h"
 #include "llvm/BasicBlock.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -58,6 +59,7 @@ namespace {
     MachineRegisterInfo *MRI;
     const TargetRegisterInfo *TRI;
     const TargetInstrInfo *TII;
+    RegisterClassInfo RegClassInfo;
 
     // Basic block currently being allocated.
     MachineBasicBlock *MBB;
@@ -97,7 +99,7 @@ namespace {
       // immediately without checking aliases.
       regFree,
 
-      // A reserved register has been assigned expolicitly (e.g., setting up a
+      // A reserved register has been assigned explicitly (e.g., setting up a
       // call parameter), and it remains reserved until it is used.
       regReserved
 
@@ -113,9 +115,6 @@ namespace {
     // instruction, and so cannot be allocated.
     BitVector UsedInInstr;
 
-    // Allocatable - vector of allocatable physical registers.
-    BitVector Allocatable;
-
     // SkippedInstrs - Descriptors of instructions whose clobber list was
     // ignored because all registers were spilled. It is still necessary to
     // mark all the clobbered registers as used by the function.
@@ -396,7 +395,6 @@ void RAFast::definePhysReg(MachineInstr *MI, unsigned PhysReg,
   PhysRegState[PhysReg] = NewState;
   for (const unsigned *AS = TRI->getAliasSet(PhysReg);
        unsigned Alias = *AS; ++AS) {
-    UsedInInstr.set(Alias);
     switch (unsigned VirtReg = PhysRegState[Alias]) {
     case regDisabled:
       break;
@@ -420,20 +418,25 @@ void RAFast::definePhysReg(MachineInstr *MI, unsigned PhysReg,
 // can be allocated directly.
 // Returns spillImpossible when PhysReg or an alias can't be spilled.
 unsigned RAFast::calcSpillCost(unsigned PhysReg) const {
-  if (UsedInInstr.test(PhysReg))
+  if (UsedInInstr.test(PhysReg)) {
+    DEBUG(dbgs() << "PhysReg: " << PhysReg << " is already used in instr.\n");
     return spillImpossible;
+  }
   switch (unsigned VirtReg = PhysRegState[PhysReg]) {
   case regDisabled:
     break;
   case regFree:
     return 0;
   case regReserved:
+    DEBUG(dbgs() << "VirtReg: " << VirtReg << " corresponding to PhysReg: "
+          << PhysReg << " is reserved already.\n");
     return spillImpossible;
   default:
     return LiveVirtRegs.lookup(VirtReg).Dirty ? spillDirty : spillClean;
   }
 
-  // This is a disabled register, add up const of aliases.
+  // This is a disabled register, add up cost of aliases.
+  DEBUG(dbgs() << "\tRegister: " << PhysReg << " is disabled.\n");
   unsigned Cost = 0;
   for (const unsigned *AS = TRI->getAliasSet(PhysReg);
        unsigned Alias = *AS; ++AS) {
@@ -479,30 +482,26 @@ void RAFast::allocVirtReg(MachineInstr *MI, LiveRegEntry &LRE, unsigned Hint) {
 
   // Ignore invalid hints.
   if (Hint && (!TargetRegisterInfo::isPhysicalRegister(Hint) ||
-               !RC->contains(Hint) || !Allocatable.test(Hint)))
+               !RC->contains(Hint) || !RegClassInfo.isAllocatable(Hint)))
     Hint = 0;
 
   // Take hint when possible.
   if (Hint) {
-    switch(calcSpillCost(Hint)) {
-    default:
-      definePhysReg(MI, Hint, regFree);
-      // Fall through.
-    case 0:
+    // Ignore the hint if we would have to spill a dirty register.
+    unsigned Cost = calcSpillCost(Hint);
+    if (Cost < spillDirty) {
+      if (Cost)
+        definePhysReg(MI, Hint, regFree);
       return assignVirtToPhysReg(LRE, Hint);
-    case spillImpossible:
-      break;
     }
   }
 
-  TargetRegisterClass::iterator AOB = RC->allocation_order_begin(*MF);
-  TargetRegisterClass::iterator AOE = RC->allocation_order_end(*MF);
+  ArrayRef<unsigned> AO = RegClassInfo.getOrder(RC);
 
   // First try to find a completely free register.
-  for (TargetRegisterClass::iterator I = AOB; I != AOE; ++I) {
+  for (ArrayRef<unsigned>::iterator I = AO.begin(), E = AO.end(); I != E; ++I) {
     unsigned PhysReg = *I;
-    if (PhysRegState[PhysReg] == regFree && !UsedInInstr.test(PhysReg) &&
-        Allocatable.test(PhysReg))
+    if (PhysRegState[PhysReg] == regFree && !UsedInInstr.test(PhysReg))
       return assignVirtToPhysReg(LRE, PhysReg);
   }
 
@@ -510,10 +509,11 @@ void RAFast::allocVirtReg(MachineInstr *MI, LiveRegEntry &LRE, unsigned Hint) {
                << RC->getName() << "\n");
 
   unsigned BestReg = 0, BestCost = spillImpossible;
-  for (TargetRegisterClass::iterator I = AOB; I != AOE; ++I) {
-    if (!Allocatable.test(*I))
-      continue;
+  for (ArrayRef<unsigned>::iterator I = AO.begin(), E = AO.end(); I != E; ++I) {
     unsigned Cost = calcSpillCost(*I);
+    DEBUG(dbgs() << "\tRegister: " << *I << "\n");
+    DEBUG(dbgs() << "\tCost: " << Cost << "\n");
+    DEBUG(dbgs() << "\tBestCost: " << BestCost << "\n");
     // Cost is 0 when all aliases are already disabled.
     if (Cost == 0)
       return assignVirtToPhysReg(LRE, *I);
@@ -722,9 +722,8 @@ void RAFast::handleThroughOperands(MachineInstr *MI,
     if (!MO.isReg() || (MO.isDef() && !MO.isEarlyClobber())) continue;
     unsigned Reg = MO.getReg();
     if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+    DEBUG(dbgs() << "\tSetting reg " << Reg << " as used in instr\n");
     UsedInInstr.set(Reg);
-    for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS)
-      UsedInInstr.set(*AS);
   }
 
   // Also mark PartialDefs as used to avoid reallocation.
@@ -764,7 +763,7 @@ void RAFast::AllocateBasicBlock() {
   // Add live-in registers as live.
   for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(),
          E = MBB->livein_end(); I != E; ++I)
-    if (Allocatable.test(*I))
+    if (RegClassInfo.isAllocatable(*I))
       definePhysReg(MII, *I, regReserved);
 
   SmallVector<unsigned, 8> VirtDead;
@@ -895,7 +894,7 @@ void RAFast::AllocateBasicBlock() {
         }
         continue;
       }
-      if (!Allocatable.test(Reg)) continue;
+      if (!RegClassInfo.isAllocatable(Reg)) continue;
       if (MO.isUse()) {
         usePhysReg(MO);
       } else if (MO.isEarlyClobber()) {
@@ -984,7 +983,7 @@ void RAFast::AllocateBasicBlock() {
       unsigned Reg = MO.getReg();
 
       if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
-        if (!Allocatable.test(Reg)) continue;
+        if (!RegClassInfo.isAllocatable(Reg)) continue;
         definePhysReg(MI, Reg, (MO.isImplicit() || MO.isDead()) ?
                                regFree : regReserved);
         continue;
@@ -1040,9 +1039,8 @@ bool RAFast::runOnMachineFunction(MachineFunction &Fn) {
   TM = &Fn.getTarget();
   TRI = TM->getRegisterInfo();
   TII = TM->getInstrInfo();
-
+  RegClassInfo.runOnMachineFunction(Fn);
   UsedInInstr.resize(TRI->getNumRegs());
-  Allocatable = TRI->getAllocatableSet(*MF);
 
   // initialize the virtual->physical register map to have a 'null'
   // mapping for all virtual registers
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index 889bca3..8d06325 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -62,7 +62,6 @@ class RAGreedy : public MachineFunctionPass,
 
   // context
   MachineFunction *MF;
-  BitVector ReservedRegs;
 
   // analyses
   SlotIndexes *Indexes;
@@ -72,6 +71,7 @@ class RAGreedy : public MachineFunctionPass,
   MachineLoopRanges *LoopRanges;
   EdgeBundles *Bundles;
   SpillPlacement *SpillPlacer;
+  LiveDebugVariables *DebugVars;
 
   // state
   std::auto_ptr<Spiller> SpillerInstance;
@@ -94,12 +94,13 @@ class RAGreedy : public MachineFunctionPass,
     RS_New,      ///< Never seen before.
     RS_First,    ///< First time in the queue.
     RS_Second,   ///< Second time in the queue.
-    RS_Region,   ///< Produced by region splitting.
-    RS_Block,    ///< Produced by per-block splitting.
+    RS_Global,   ///< Produced by global splitting.
     RS_Local,    ///< Produced by local splitting.
     RS_Spill     ///< Produced by spilling.
   };
 
+  static const char *const StageName[];
+
   IndexedMap<unsigned char, VirtReg2IndexFunctor> LRStage;
 
   LiveRangeStage getStage(const LiveInterval &VirtReg) const {
@@ -116,6 +117,15 @@ class RAGreedy : public MachineFunctionPass,
     }
   }
 
+  // Eviction. Sometimes an assigned live range can be evicted without
+  // conditions, but other times it must be split after being evicted to avoid
+  // infinite loops.
+  enum CanEvict {
+    CE_Never,    ///< Can never evict.
+    CE_Always,   ///< Can always evict.
+    CE_WithSplit ///< Can evict only if range is also split or spilled.
+  };
+
   // splitting state.
   std::auto_ptr<SplitAnalysis> SA;
   std::auto_ptr<SplitEditor> SE;
@@ -126,14 +136,17 @@ class RAGreedy : public MachineFunctionPass,
   /// All basic blocks where the current register has uses.
   SmallVector<SpillPlacement::BlockConstraint, 8> SplitConstraints;
 
-  /// All basic blocks where the current register is live-through and
-  /// interference free.
-  SmallVector<unsigned, 8> TransparentBlocks;
-
   /// Global live range splitting candidate info.
   struct GlobalSplitCandidate {
     unsigned PhysReg;
     BitVector LiveBundles;
+    SmallVector<unsigned, 8> ActiveBlocks;
+
+    void reset(unsigned Reg) {
+      PhysReg = Reg;
+      LiveBundles.clear();
+      ActiveBlocks.clear();
+    }
   };
 
   /// Candidate info for for each PhysReg in AllocationOrder.
@@ -141,10 +154,6 @@ class RAGreedy : public MachineFunctionPass,
   /// class.
   SmallVector<GlobalSplitCandidate, 32> GlobalCand;
 
-  /// For every instruction in SA->UseSlots, store the previous non-copy
-  /// instruction.
-  SmallVector<SlotIndex, 8> PrevSlot;
-
 public:
   RAGreedy();
 
@@ -173,18 +182,21 @@ private:
   void LRE_WillShrinkVirtReg(unsigned);
   void LRE_DidCloneVirtReg(unsigned, unsigned);
 
-  bool addSplitConstraints(unsigned, float&);
-  float calcGlobalSplitCost(unsigned, const BitVector&);
-  void splitAroundRegion(LiveInterval&, unsigned, const BitVector&,
+  float calcSpillCost();
+  bool addSplitConstraints(InterferenceCache::Cursor, float&);
+  void addThroughConstraints(InterferenceCache::Cursor, ArrayRef<unsigned>);
+  void growRegion(GlobalSplitCandidate &Cand, InterferenceCache::Cursor);
+  float calcGlobalSplitCost(GlobalSplitCandidate&, InterferenceCache::Cursor);
+  void splitAroundRegion(LiveInterval&, GlobalSplitCandidate&,
                          SmallVectorImpl<LiveInterval*>&);
   void calcGapWeights(unsigned, SmallVectorImpl<float>&);
-  SlotIndex getPrevMappedIndex(const MachineInstr*);
-  void calcPrevSlots();
-  unsigned nextSplitPoint(unsigned);
+  CanEvict canEvict(LiveInterval &A, LiveInterval &B);
   bool canEvictInterference(LiveInterval&, unsigned, float&);
 
+  unsigned tryAssign(LiveInterval&, AllocationOrder&,
+                     SmallVectorImpl<LiveInterval*>&);
   unsigned tryEvict(LiveInterval&, AllocationOrder&,
-                    SmallVectorImpl<LiveInterval*>&);
+                    SmallVectorImpl<LiveInterval*>&, unsigned = ~0u);
   unsigned tryRegionSplit(LiveInterval&, AllocationOrder&,
                           SmallVectorImpl<LiveInterval*>&);
   unsigned tryLocalSplit(LiveInterval&, AllocationOrder&,
@@ -196,6 +208,22 @@ private:
 
 char RAGreedy::ID = 0;
 
+#ifndef NDEBUG
+const char *const RAGreedy::StageName[] = {
+  "RS_New",
+  "RS_First",
+  "RS_Second",
+  "RS_Global",
+  "RS_Local",
+  "RS_Spill"
+};
+#endif
+
+// Hysteresis to use when comparing floats.
+// This helps stabilize decisions based on float comparisons.
+const float Hysteresis = 0.98f;
+
+
 FunctionPass* llvm::createGreedyRegisterAllocator() {
   return new RAGreedy();
 }
@@ -287,6 +315,7 @@ void RAGreedy::LRE_DidCloneVirtReg(unsigned New, unsigned Old) {
 void RAGreedy::releaseMemory() {
   SpillerInstance.reset(0);
   LRStage.clear();
+  GlobalCand.clear();
   RegAllocBase::releaseMemory();
 }
 
@@ -329,28 +358,85 @@ LiveInterval *RAGreedy::dequeue() {
   return LI;
 }
 
+
+//===----------------------------------------------------------------------===//
+//                            Direct Assignment
+//===----------------------------------------------------------------------===//
+
+/// tryAssign - Try to assign VirtReg to an available register.
+unsigned RAGreedy::tryAssign(LiveInterval &VirtReg,
+                             AllocationOrder &Order,
+                             SmallVectorImpl<LiveInterval*> &NewVRegs) {
+  Order.rewind();
+  unsigned PhysReg;
+  while ((PhysReg = Order.next()))
+    if (!checkPhysRegInterference(VirtReg, PhysReg))
+      break;
+  if (!PhysReg || Order.isHint(PhysReg))
+    return PhysReg;
+
+  // PhysReg is available. Try to evict interference from a cheaper alternative.
+  unsigned Cost = TRI->getCostPerUse(PhysReg);
+
+  // Most registers have 0 additional cost.
+  if (!Cost)
+    return PhysReg;
+
+  DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " is available at cost " << Cost
+               << '\n');
+  unsigned CheapReg = tryEvict(VirtReg, Order, NewVRegs, Cost);
+  return CheapReg ? CheapReg : PhysReg;
+}
+
+
 //===----------------------------------------------------------------------===//
 //                         Interference eviction
 //===----------------------------------------------------------------------===//
 
+/// canEvict - determine if A can evict the assigned live range B. The eviction
+/// policy defined by this function together with the allocation order defined
+/// by enqueue() decides which registers ultimately end up being split and
+/// spilled.
+///
+/// This function must define a non-circular relation when it returns CE_Always,
+/// otherwise infinite eviction loops are possible. When evicting a <= RS_Second
+/// range, it is possible to return CE_WithSplit which forces the evicted
+/// register to be split or spilled before it can evict anything again. That
+/// guarantees progress.
+RAGreedy::CanEvict RAGreedy::canEvict(LiveInterval &A, LiveInterval &B) {
+  return A.weight > B.weight ? CE_Always : CE_Never;
+}
+
 /// canEvict - Return true if all interferences between VirtReg and PhysReg can
-/// be evicted. Set maxWeight to the maximal spill weight of an interference.
+/// be evicted.
+/// Return false if any interference is heavier than MaxWeight.
+/// On return, set MaxWeight to the maximal spill weight of an interference.
 bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg,
                                     float &MaxWeight) {
   float Weight = 0;
   for (const unsigned *AliasI = TRI->getOverlaps(PhysReg); *AliasI; ++AliasI) {
     LiveIntervalUnion::Query &Q = query(VirtReg, *AliasI);
-    // If there is 10 or more interferences, chances are one is smaller.
-    if (Q.collectInterferingVRegs(10) >= 10)
+    // If there is 10 or more interferences, chances are one is heavier.
+    if (Q.collectInterferingVRegs(10, MaxWeight) >= 10)
       return false;
 
-    // Check if any interfering live range is heavier than VirtReg.
-    for (unsigned i = 0, e = Q.interferingVRegs().size(); i != e; ++i) {
-      LiveInterval *Intf = Q.interferingVRegs()[i];
+    // Check if any interfering live range is heavier than MaxWeight.
+    for (unsigned i = Q.interferingVRegs().size(); i; --i) {
+      LiveInterval *Intf = Q.interferingVRegs()[i - 1];
       if (TargetRegisterInfo::isPhysicalRegister(Intf->reg))
         return false;
-      if (Intf->weight >= VirtReg.weight)
+      if (Intf->weight >= MaxWeight)
         return false;
+      switch (canEvict(VirtReg, *Intf)) {
+      case CE_Always:
+        break;
+      case CE_Never:
+        return false;
+      case CE_WithSplit:
+        if (getStage(*Intf) > RS_Second)
+          return false;
+        break;
+      }
       Weight = std::max(Weight, Intf->weight);
     }
   }
@@ -364,21 +450,28 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg,
 /// @return         Physreg to assign VirtReg, or 0.
 unsigned RAGreedy::tryEvict(LiveInterval &VirtReg,
                             AllocationOrder &Order,
-                            SmallVectorImpl<LiveInterval*> &NewVRegs){
+                            SmallVectorImpl<LiveInterval*> &NewVRegs,
+                            unsigned CostPerUseLimit) {
   NamedRegionTimer T("Evict", TimerGroupName, TimePassesIsEnabled);
 
   // Keep track of the lightest single interference seen so far.
-  float BestWeight = 0;
+  float BestWeight = HUGE_VALF;
   unsigned BestPhys = 0;
 
   Order.rewind();
   while (unsigned PhysReg = Order.next()) {
-    float Weight = 0;
+    if (TRI->getCostPerUse(PhysReg) >= CostPerUseLimit)
+      continue;
+    // The first use of a register in a function has cost 1.
+    if (CostPerUseLimit == 1 && !MRI->isPhysRegUsed(PhysReg))
+      continue;
+
+    float Weight = BestWeight;
     if (!canEvictInterference(VirtReg, PhysReg, Weight))
       continue;
 
     // This is an eviction candidate.
-    DEBUG(dbgs() << "max " << PrintReg(PhysReg, TRI) << " interference = "
+    DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " interference = "
                  << Weight << '\n');
     if (BestPhys && Weight >= BestWeight)
       continue;
@@ -403,6 +496,11 @@ unsigned RAGreedy::tryEvict(LiveInterval &VirtReg,
       unassign(*Intf, VRM->getPhys(Intf->reg));
       ++NumEvicted;
       NewVRegs.push_back(Intf);
+      // Prevent looping by forcing the evicted ranges to be split before they
+      // can evict anything else.
+      if (getStage(*Intf) < RS_Second &&
+          canEvict(VirtReg, *Intf) == CE_WithSplit)
+        LRStage[Intf->reg] = RS_Second;
     }
   }
   return BestPhys;
@@ -417,9 +515,9 @@ unsigned RAGreedy::tryEvict(LiveInterval &VirtReg,
 /// interference pattern in Physreg and its aliases. Add the constraints to
 /// SpillPlacement and return the static cost of this split in Cost, assuming
 /// that all preferences in SplitConstraints are met.
-/// If it is evident that no bundles will be live, abort early and return false.
-bool RAGreedy::addSplitConstraints(unsigned PhysReg, float &Cost) {
-  InterferenceCache::Cursor Intf(IntfCache, PhysReg);
+/// Return false if there are no bundles with positive bias.
+bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
+                                   float &Cost) {
   ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
 
   // Reset interference dependent info.
@@ -446,7 +544,7 @@ bool RAGreedy::addSplitConstraints(unsigned PhysReg, float &Cost) {
         BC.Entry = SpillPlacement::MustSpill, ++Ins;
       else if (Intf.first() < BI.FirstUse)
         BC.Entry = SpillPlacement::PrefSpill, ++Ins;
-      else if (Intf.first() < (BI.LiveThrough ? BI.LastUse : BI.Kill))
+      else if (Intf.first() < BI.LastUse)
         ++Ins;
     }
 
@@ -456,7 +554,7 @@ bool RAGreedy::addSplitConstraints(unsigned PhysReg, float &Cost) {
         BC.Exit = SpillPlacement::MustSpill, ++Ins;
       else if (Intf.last() > BI.LastUse)
         BC.Exit = SpillPlacement::PrefSpill, ++Ins;
-      else if (Intf.last() > (BI.LiveThrough ? BI.FirstUse : BI.Def))
+      else if (Intf.last() > BI.FirstUse)
         ++Ins;
     }
 
@@ -464,35 +562,41 @@ bool RAGreedy::addSplitConstraints(unsigned PhysReg, float &Cost) {
     if (Ins)
       StaticCost += Ins * SpillPlacer->getBlockFrequency(BC.Number);
   }
+  Cost = StaticCost;
 
   // Add constraints for use-blocks. Note that these are the only constraints
   // that may add a positive bias, it is downhill from here.
   SpillPlacer->addConstraints(SplitConstraints);
-  if (SpillPlacer->getPositiveNodes() == 0)
-    return false;
+  return SpillPlacer->scanActiveBundles();
+}
 
-  Cost = StaticCost;
 
-  // Now handle the live-through blocks without uses. These can only add
-  // negative bias, so we can abort whenever there are no more positive nodes.
-  // Compute constraints for a group of 8 blocks at a time.
+/// addThroughConstraints - Add constraints and links to SpillPlacer from the
+/// live-through blocks in Blocks.
+void RAGreedy::addThroughConstraints(InterferenceCache::Cursor Intf,
+                                     ArrayRef<unsigned> Blocks) {
   const unsigned GroupSize = 8;
   SpillPlacement::BlockConstraint BCS[GroupSize];
-  unsigned B = 0;
-  TransparentBlocks.clear();
+  unsigned TBS[GroupSize];
+  unsigned B = 0, T = 0;
 
-  ArrayRef<unsigned> ThroughBlocks = SA->getThroughBlocks();
-  for (unsigned i = 0; i != ThroughBlocks.size(); ++i) {
-    unsigned Number = ThroughBlocks[i];
-    assert(B < GroupSize && "Array overflow");
-    BCS[B].Number = Number;
+  for (unsigned i = 0; i != Blocks.size(); ++i) {
+    unsigned Number = Blocks[i];
     Intf.moveToBlock(Number);
 
     if (!Intf.hasInterference()) {
-      TransparentBlocks.push_back(Number);
+      assert(T < GroupSize && "Array overflow");
+      TBS[T] = Number;
+      if (++T == GroupSize) {
+        SpillPlacer->addLinks(ArrayRef<unsigned>(TBS, T));
+        T = 0;
+      }
       continue;
     }
 
+    assert(B < GroupSize && "Array overflow");
+    BCS[B].Number = Number;
+
     // Interference for the live-in value.
     if (Intf.first() <= Indexes->getMBBStartIdx(Number))
       BCS[B].Entry = SpillPlacement::MustSpill;
@@ -509,30 +613,94 @@ bool RAGreedy::addSplitConstraints(unsigned PhysReg, float &Cost) {
       ArrayRef<SpillPlacement::BlockConstraint> Array(BCS, B);
       SpillPlacer->addConstraints(Array);
       B = 0;
-      // Abort early when all hope is lost.
-      if (SpillPlacer->getPositiveNodes() == 0)
-        return false;
     }
   }
 
   ArrayRef<SpillPlacement::BlockConstraint> Array(BCS, B);
   SpillPlacer->addConstraints(Array);
-  if (SpillPlacer->getPositiveNodes() == 0)
-    return false;
+  SpillPlacer->addLinks(ArrayRef<unsigned>(TBS, T));
+}
 
-  // There is still some positive bias. Add all the links.
-  SpillPlacer->addLinks(TransparentBlocks);
-  return true;
+void RAGreedy::growRegion(GlobalSplitCandidate &Cand,
+                          InterferenceCache::Cursor Intf) {
+  // Keep track of through blocks that have not been added to SpillPlacer.
+  BitVector Todo = SA->getThroughBlocks();
+  SmallVectorImpl<unsigned> &ActiveBlocks = Cand.ActiveBlocks;
+  unsigned AddedTo = 0;
+#ifndef NDEBUG
+  unsigned Visited = 0;
+#endif
+
+  for (;;) {
+    ArrayRef<unsigned> NewBundles = SpillPlacer->getRecentPositive();
+    if (NewBundles.empty())
+      break;
+    // Find new through blocks in the periphery of PrefRegBundles.
+    for (int i = 0, e = NewBundles.size(); i != e; ++i) {
+      unsigned Bundle = NewBundles[i];
+      // Look at all blocks connected to Bundle in the full graph.
+      ArrayRef<unsigned> Blocks = Bundles->getBlocks(Bundle);
+      for (ArrayRef<unsigned>::iterator I = Blocks.begin(), E = Blocks.end();
+           I != E; ++I) {
+        unsigned Block = *I;
+        if (!Todo.test(Block))
+          continue;
+        Todo.reset(Block);
+        // This is a new through block. Add it to SpillPlacer later.
+        ActiveBlocks.push_back(Block);
+#ifndef NDEBUG
+        ++Visited;
+#endif
+      }
+    }
+    // Any new blocks to add?
+    if (ActiveBlocks.size() > AddedTo) {
+      ArrayRef<unsigned> Add(&ActiveBlocks[AddedTo],
+                             ActiveBlocks.size() - AddedTo);
+      addThroughConstraints(Intf, Add);
+      AddedTo = ActiveBlocks.size();
+    }
+    // Perhaps iterating can enable more bundles?
+    SpillPlacer->iterate();
+  }
+  DEBUG(dbgs() << ", v=" << Visited);
 }
 
+/// calcSpillCost - Compute how expensive it would be to split the live range in
+/// SA around all use blocks instead of forming bundle regions.
+float RAGreedy::calcSpillCost() {
+  float Cost = 0;
+  const LiveInterval &LI = SA->getParent();
+  ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
+  for (unsigned i = 0; i != UseBlocks.size(); ++i) {
+    const SplitAnalysis::BlockInfo &BI = UseBlocks[i];
+    unsigned Number = BI.MBB->getNumber();
+    // We normally only need one spill instruction - a load or a store.
+    Cost += SpillPlacer->getBlockFrequency(Number);
+
+    // Unless the value is redefined in the block.
+    if (BI.LiveIn && BI.LiveOut) {
+      SlotIndex Start, Stop;
+      tie(Start, Stop) = Indexes->getMBBRange(Number);
+      LiveInterval::const_iterator I = LI.find(Start);
+      assert(I != LI.end() && "Expected live-in value");
+      // Is there a different live-out value? If so, we need an extra spill
+      // instruction.
+      if (I->end < Stop)
+        Cost += SpillPlacer->getBlockFrequency(Number);
+    }
+  }
+  return Cost;
+}
 
 /// calcGlobalSplitCost - Return the global split cost of following the split
 /// pattern in LiveBundles. This cost should be added to the local cost of the
 /// interference pattern in SplitConstraints.
 ///
-float RAGreedy::calcGlobalSplitCost(unsigned PhysReg,
-                                    const BitVector &LiveBundles) {
+float RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand,
+                                    InterferenceCache::Cursor Intf) {
   float GlobalCost = 0;
+  const BitVector &LiveBundles = Cand.LiveBundles;
   ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
   for (unsigned i = 0; i != UseBlocks.size(); ++i) {
     const SplitAnalysis::BlockInfo &BI = UseBlocks[i];
@@ -549,11 +717,8 @@ float RAGreedy::calcGlobalSplitCost(unsigned PhysReg,
       GlobalCost += Ins * SpillPlacer->getBlockFrequency(BC.Number);
   }
 
-  InterferenceCache::Cursor Intf(IntfCache, PhysReg);
-  ArrayRef<unsigned> ThroughBlocks = SA->getThroughBlocks();
-  SplitConstraints.resize(UseBlocks.size() + ThroughBlocks.size());
-  for (unsigned i = 0; i != ThroughBlocks.size(); ++i) {
-    unsigned Number = ThroughBlocks[i];
+  for (unsigned i = 0, e = Cand.ActiveBlocks.size(); i != e; ++i) {
+    unsigned Number = Cand.ActiveBlocks[i];
     bool RegIn  = LiveBundles[Bundles->getBundle(Number, 0)];
     bool RegOut = LiveBundles[Bundles->getBundle(Number, 1)];
     if (!RegIn && !RegOut)
@@ -578,23 +743,25 @@ float RAGreedy::calcGlobalSplitCost(unsigned PhysReg,
 /// avoiding interference. The 'stack' interval is the complement constructed by
 /// SplitEditor. It will contain the rest.
 ///
-void RAGreedy::splitAroundRegion(LiveInterval &VirtReg, unsigned PhysReg,
-                                 const BitVector &LiveBundles,
+void RAGreedy::splitAroundRegion(LiveInterval &VirtReg,
+                                 GlobalSplitCandidate &Cand,
                                  SmallVectorImpl<LiveInterval*> &NewVRegs) {
+  const BitVector &LiveBundles = Cand.LiveBundles;
+
   DEBUG({
-    dbgs() << "Splitting around region for " << PrintReg(PhysReg, TRI)
+    dbgs() << "Splitting around region for " << PrintReg(Cand.PhysReg, TRI)
            << " with bundles";
     for (int i = LiveBundles.find_first(); i>=0; i = LiveBundles.find_next(i))
       dbgs() << " EB#" << i;
     dbgs() << ".\n";
   });
 
-  InterferenceCache::Cursor Intf(IntfCache, PhysReg);
+  InterferenceCache::Cursor Intf(IntfCache, Cand.PhysReg);
   LiveRangeEdit LREdit(VirtReg, NewVRegs, this);
   SE->reset(LREdit);
 
   // Create the main cross-block interval.
-  SE->openIntv();
+  const unsigned MainIntv = SE->openIntv();
 
   // First add all defs that are live out of a block.
   ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
@@ -603,6 +770,14 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg, unsigned PhysReg,
     bool RegIn  = LiveBundles[Bundles->getBundle(BI.MBB->getNumber(), 0)];
     bool RegOut = LiveBundles[Bundles->getBundle(BI.MBB->getNumber(), 1)];
 
+    // Create separate intervals for isolated blocks with multiple uses.
+    if (!RegIn && !RegOut && BI.FirstUse != BI.LastUse) {
+      DEBUG(dbgs() << "BB#" << BI.MBB->getNumber() << " isolated.\n");
+      SE->splitSingleBlock(BI);
+      SE->selectIntv(MainIntv);
+      continue;
+    }
+
     // Should the register be live out?
     if (!BI.LiveOut || !RegOut)
       continue;
@@ -628,7 +803,7 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg, unsigned PhysReg,
       DEBUG(dbgs() << ", no interference");
       if (!BI.LiveThrough) {
         DEBUG(dbgs() << ", not live-through.\n");
-        SE->useIntv(SE->enterIntvBefore(BI.Def), Stop);
+        SE->useIntv(SE->enterIntvBefore(BI.FirstUse), Stop);
         continue;
       }
       if (!RegIn) {
@@ -645,10 +820,10 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg, unsigned PhysReg,
     // Block has interference.
     DEBUG(dbgs() << ", interference to " << Intf.last());
 
-    if (!BI.LiveThrough && Intf.last() <= BI.Def) {
+    if (!BI.LiveThrough && Intf.last() <= BI.FirstUse) {
       // The interference doesn't reach the outgoing segment.
-      DEBUG(dbgs() << " doesn't affect def from " << BI.Def << '\n');
-      SE->useIntv(BI.Def, Stop);
+      DEBUG(dbgs() << " doesn't affect def from " << BI.FirstUse << '\n');
+      SE->useIntv(BI.FirstUse, Stop);
       continue;
     }
 
@@ -704,7 +879,7 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg, unsigned PhysReg,
       DEBUG(dbgs() << ", no interference");
       if (!BI.LiveThrough) {
         DEBUG(dbgs() << ", killed in block.\n");
-        SE->useIntv(Start, SE->leaveIntvAfter(BI.Kill));
+        SE->useIntv(Start, SE->leaveIntvAfter(BI.LastUse));
         continue;
       }
       if (!RegOut) {
@@ -737,10 +912,10 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg, unsigned PhysReg,
     // Block has interference.
     DEBUG(dbgs() << ", interference from " << Intf.first());
 
-    if (!BI.LiveThrough && Intf.first() >= BI.Kill) {
+    if (!BI.LiveThrough && Intf.first() >= BI.LastUse) {
       // The interference doesn't reach the outgoing segment.
-      DEBUG(dbgs() << " doesn't affect kill at " << BI.Kill << '\n');
-      SE->useIntv(Start, BI.Kill);
+      DEBUG(dbgs() << " doesn't affect kill at " << BI.LastUse << '\n');
+      SE->useIntv(Start, BI.LastUse);
       continue;
     }
 
@@ -766,9 +941,8 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg, unsigned PhysReg,
   }
 
   // Handle live-through blocks.
-  ArrayRef<unsigned> ThroughBlocks = SA->getThroughBlocks();
-  for (unsigned i = 0; i != ThroughBlocks.size(); ++i) {
-    unsigned Number = ThroughBlocks[i];
+  for (unsigned i = 0, e = Cand.ActiveBlocks.size(); i != e; ++i) {
+    unsigned Number = Cand.ActiveBlocks[i];
     bool RegIn  = LiveBundles[Bundles->getBundle(Number, 0)];
     bool RegOut = LiveBundles[Bundles->getBundle(Number, 1)];
     DEBUG(dbgs() << "Live through BB#" << Number << '\n');
@@ -787,70 +961,113 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg, unsigned PhysReg,
       SE->enterIntvAtEnd(*MBB);
   }
 
-  SE->closeIntv();
-
-  // FIXME: Should we be more aggressive about splitting the stack region into
-  // per-block segments? The current approach allows the stack region to
-  // separate into connected components. Some components may be allocatable.
-  SE->finish();
   ++NumGlobalSplits;
 
+  SmallVector<unsigned, 8> IntvMap;
+  SE->finish(&IntvMap);
+  DebugVars->splitRegister(VirtReg.reg, LREdit.regs());
+
+  LRStage.resize(MRI->getNumVirtRegs());
+  unsigned OrigBlocks = SA->getNumLiveBlocks();
+
+  // Sort out the new intervals created by splitting. We get four kinds:
+  // - Remainder intervals should not be split again.
+  // - Candidate intervals can be assigned to Cand.PhysReg.
+  // - Block-local splits are candidates for local splitting.
+  // - DCE leftovers should go back on the queue.
+  for (unsigned i = 0, e = LREdit.size(); i != e; ++i) {
+    unsigned Reg = LREdit.get(i)->reg;
+
+    // Ignore old intervals from DCE.
+    if (LRStage[Reg] != RS_New)
+      continue;
+
+    // Remainder interval. Don't try splitting again, spill if it doesn't
+    // allocate.
+    if (IntvMap[i] == 0) {
+      LRStage[Reg] = RS_Global;
+      continue;
+    }
+
+    // Main interval. Allow repeated splitting as long as the number of live
+    // blocks is strictly decreasing.
+    if (IntvMap[i] == MainIntv) {
+      if (SA->countLiveBlocks(LREdit.get(i)) >= OrigBlocks) {
+        DEBUG(dbgs() << "Main interval covers the same " << OrigBlocks
+                     << " blocks as original.\n");
+        // Don't allow repeated splitting as a safe guard against looping.
+        LRStage[Reg] = RS_Global;
+      }
+      continue;
+    }
+
+    // Other intervals are treated as new. This includes local intervals created
+    // for blocks with multiple uses, and anything created by DCE.
+  }
+
   if (VerifyEnabled)
     MF->verify(this, "After splitting live range around region");
 }
 
 unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
                                   SmallVectorImpl<LiveInterval*> &NewVRegs) {
-  BitVector LiveBundles, BestBundles;
-  float BestCost = 0;
-  unsigned BestReg = 0;
+  float BestCost = Hysteresis * calcSpillCost();
+  DEBUG(dbgs() << "Cost of isolating all blocks = " << BestCost << '\n');
+  const unsigned NoCand = ~0u;
+  unsigned BestCand = NoCand;
 
   Order.rewind();
   for (unsigned Cand = 0; unsigned PhysReg = Order.next(); ++Cand) {
     if (GlobalCand.size() <= Cand)
       GlobalCand.resize(Cand+1);
-    GlobalCand[Cand].PhysReg = PhysReg;
+    GlobalCand[Cand].reset(PhysReg);
 
-    SpillPlacer->prepare(LiveBundles);
+    SpillPlacer->prepare(GlobalCand[Cand].LiveBundles);
     float Cost;
-    if (!addSplitConstraints(PhysReg, Cost)) {
-      DEBUG(dbgs() << PrintReg(PhysReg, TRI) << "\tno positive bias\n");
+    InterferenceCache::Cursor Intf(IntfCache, PhysReg);
+    if (!addSplitConstraints(Intf, Cost)) {
+      DEBUG(dbgs() << PrintReg(PhysReg, TRI) << "\tno positive bundles\n");
       continue;
     }
-    DEBUG(dbgs() << PrintReg(PhysReg, TRI) << "\tbiased = "
-                 << SpillPlacer->getPositiveNodes() << ", static = " << Cost);
-    if (BestReg && Cost >= BestCost) {
-      DEBUG(dbgs() << " worse than " << PrintReg(BestReg, TRI) << '\n');
+    DEBUG(dbgs() << PrintReg(PhysReg, TRI) << "\tstatic = " << Cost);
+    if (Cost >= BestCost) {
+      DEBUG({
+        if (BestCand == NoCand)
+          dbgs() << " worse than no bundles\n";
+        else
+          dbgs() << " worse than "
+                 << PrintReg(GlobalCand[BestCand].PhysReg, TRI) << '\n';
+      });
       continue;
     }
+    growRegion(GlobalCand[Cand], Intf);
 
     SpillPlacer->finish();
 
     // No live bundles, defer to splitSingleBlocks().
-    if (!LiveBundles.any()) {
+    if (!GlobalCand[Cand].LiveBundles.any()) {
       DEBUG(dbgs() << " no bundles.\n");
       continue;
     }
 
-    Cost += calcGlobalSplitCost(PhysReg, LiveBundles);
+    Cost += calcGlobalSplitCost(GlobalCand[Cand], Intf);
     DEBUG({
       dbgs() << ", total = " << Cost << " with bundles";
-      for (int i = LiveBundles.find_first(); i>=0; i = LiveBundles.find_next(i))
+      for (int i = GlobalCand[Cand].LiveBundles.find_first(); i>=0;
+           i = GlobalCand[Cand].LiveBundles.find_next(i))
         dbgs() << " EB#" << i;
       dbgs() << ".\n";
     });
-    if (!BestReg || Cost < BestCost) {
-      BestReg = PhysReg;
-      BestCost = 0.98f * Cost; // Prevent rounding effects.
-      BestBundles.swap(LiveBundles);
+    if (Cost < BestCost) {
+      BestCand = Cand;
+      BestCost = Hysteresis * Cost; // Prevent rounding effects.
     }
   }
 
-  if (!BestReg)
+  if (BestCand == NoCand)
     return 0;
 
-  splitAroundRegion(VirtReg, BestReg, BestBundles, NewVRegs);
-  setStage(NewVRegs.begin(), NewVRegs.end(), RS_Region);
+  splitAroundRegion(VirtReg, GlobalCand[BestCand], NewVRegs);
   return 0;
 }
 
@@ -913,47 +1130,6 @@ void RAGreedy::calcGapWeights(unsigned PhysReg,
   }
 }
 
-/// getPrevMappedIndex - Return the slot index of the last non-copy instruction
-/// before MI that has a slot index. If MI is the first mapped instruction in
-/// its block, return the block start index instead.
-///
-SlotIndex RAGreedy::getPrevMappedIndex(const MachineInstr *MI) {
-  assert(MI && "Missing MachineInstr");
-  const MachineBasicBlock *MBB = MI->getParent();
-  MachineBasicBlock::const_iterator B = MBB->begin(), I = MI;
-  while (I != B)
-    if (!(--I)->isDebugValue() && !I->isCopy())
-      return Indexes->getInstructionIndex(I);
-  return Indexes->getMBBStartIdx(MBB);
-}
-
-/// calcPrevSlots - Fill in the PrevSlot array with the index of the previous
-/// real non-copy instruction for each instruction in SA->UseSlots.
-///
-void RAGreedy::calcPrevSlots() {
-  const SmallVectorImpl<SlotIndex> &Uses = SA->UseSlots;
-  PrevSlot.clear();
-  PrevSlot.reserve(Uses.size());
-  for (unsigned i = 0, e = Uses.size(); i != e; ++i) {
-    const MachineInstr *MI = Indexes->getInstructionFromIndex(Uses[i]);
-    PrevSlot.push_back(getPrevMappedIndex(MI).getDefIndex());
-  }
-}
-
-/// nextSplitPoint - Find the next index into SA->UseSlots > i such that it may
-/// be beneficial to split before UseSlots[i].
-///
-/// 0 is always a valid split point
-unsigned RAGreedy::nextSplitPoint(unsigned i) {
-  const SmallVectorImpl<SlotIndex> &Uses = SA->UseSlots;
-  const unsigned Size = Uses.size();
-  assert(i != Size && "No split points after the end");
-  // Allow split before i when Uses[i] is not adjacent to the previous use.
-  while (++i != Size && PrevSlot[i].getBaseIndex() <= Uses[i-1].getBaseIndex())
-    ;
-  return i;
-}
-
 /// tryLocalSplit - Try to split VirtReg into smaller intervals inside its only
 /// basic block.
 ///
@@ -981,11 +1157,27 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
     dbgs() << '\n';
   });
 
-  // For every use, find the previous mapped non-copy instruction.
-  // We use this to detect valid split points, and to estimate new interval
-  // sizes.
-  calcPrevSlots();
+  // Since we allow local split results to be split again, there is a risk of
+  // creating infinite loops. It is tempting to require that the new live
+  // ranges have less instructions than the original. That would guarantee
+  // convergence, but it is too strict. A live range with 3 instructions can be
+  // split 2+3 (including the COPY), and we want to allow that.
+  //
+  // Instead we use these rules:
+  //
+  // 1. Allow any split for ranges with getStage() < RS_Local. (Except for the
+  //    noop split, of course).
+  // 2. Require progress be made for ranges with getStage() >= RS_Local. All
+  //    the new ranges must have fewer instructions than before the split.
+  // 3. New ranges with the same number of instructions are marked RS_Local,
+  //    smaller ranges are marked RS_New.
+  //
+  // These rules allow a 3 -> 2+3 split once, which we need. They also prevent
+  // excessive splitting and infinite loops.
+  //
+  bool ProgressRequired = getStage(VirtReg) >= RS_Local;
 
+  // Best split candidate.
   unsigned BestBefore = NumGaps;
   unsigned BestAfter = 0;
   float BestDiff = 0;
@@ -1003,13 +1195,11 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
     // The new spill weight must be larger than any gap interference.
 
     // We will split before Uses[SplitBefore] and after Uses[SplitAfter].
-    unsigned SplitBefore = 0, SplitAfter = nextSplitPoint(1) - 1;
+    unsigned SplitBefore = 0, SplitAfter = 1;
 
     // MaxGap should always be max(GapWeight[SplitBefore..SplitAfter-1]).
     // It is the spill weight that needs to be evicted.
     float MaxGap = GapWeight[0];
-    for (unsigned i = 1; i != SplitAfter; ++i)
-      MaxGap = std::max(MaxGap, GapWeight[i]);
 
     for (;;) {
       // Live before/after split?
@@ -1027,41 +1217,31 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
       }
       // Should the interval be extended or shrunk?
       bool Shrink = true;
-      if (MaxGap < HUGE_VALF) {
-        // Estimate the new spill weight.
-        //
-        // Each instruction reads and writes the register, except the first
-        // instr doesn't read when !FirstLive, and the last instr doesn't write
-        // when !LastLive.
-        //
-        // We will be inserting copies before and after, so the total number of
-        // reads and writes is 2 * EstUses.
-        //
-        const unsigned EstUses = 2*(SplitAfter - SplitBefore) +
-                                 2*(LiveBefore + LiveAfter);
 
-        // Try to guess the size of the new interval. This should be trivial,
-        // but the slot index of an inserted copy can be a lot smaller than the
-        // instruction it is inserted before if there are many dead indexes
-        // between them.
-        //
-        // We measure the distance from the instruction before SplitBefore to
-        // get a conservative estimate.
-        //
-        // The final distance can still be different if inserting copies
-        // triggers a slot index renumbering.
+      // How many gaps would the new range have?
+      unsigned NewGaps = LiveBefore + SplitAfter - SplitBefore + LiveAfter;
+
+      // Legally, without causing looping?
+      bool Legal = !ProgressRequired || NewGaps < NumGaps;
+
+      if (Legal && MaxGap < HUGE_VALF) {
+        // Estimate the new spill weight. Each instruction reads or writes the
+        // register. Conservatively assume there are no read-modify-write
+        // instructions.
         //
-        const float EstWeight = normalizeSpillWeight(blockFreq * EstUses,
-                              PrevSlot[SplitBefore].distance(Uses[SplitAfter]));
+        // Try to guess the size of the new interval.
+        const float EstWeight = normalizeSpillWeight(blockFreq * (NewGaps + 1),
+                                 Uses[SplitBefore].distance(Uses[SplitAfter]) +
+                                 (LiveBefore + LiveAfter)*SlotIndex::InstrDist);
         // Would this split be possible to allocate?
         // Never allocate all gaps, we wouldn't be making progress.
-        float Diff = EstWeight - MaxGap;
-        DEBUG(dbgs() << " w=" << EstWeight << " d=" << Diff);
-        if (Diff > 0) {
+        DEBUG(dbgs() << " w=" << EstWeight);
+        if (EstWeight * Hysteresis >= MaxGap) {
           Shrink = false;
+          float Diff = EstWeight - MaxGap;
           if (Diff > BestDiff) {
             DEBUG(dbgs() << " (best)");
-            BestDiff = Diff;
+            BestDiff = Hysteresis * Diff;
             BestBefore = SplitBefore;
             BestAfter = SplitAfter;
           }
@@ -1070,8 +1250,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
 
       // Try to shrink.
       if (Shrink) {
-        SplitBefore = nextSplitPoint(SplitBefore);
-        if (SplitBefore < SplitAfter) {
+        if (++SplitBefore < SplitAfter) {
           DEBUG(dbgs() << " shrink\n");
           // Recompute the max when necessary.
           if (GapWeight[SplitBefore - 1] >= MaxGap) {
@@ -1091,10 +1270,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
       }
 
       DEBUG(dbgs() << " extend\n");
-      for (unsigned e = nextSplitPoint(SplitAfter + 1) - 1;
-           SplitAfter != e; ++SplitAfter)
-        MaxGap = std::max(MaxGap, GapWeight[SplitAfter]);
-          continue;
+      MaxGap = std::max(MaxGap, GapWeight[SplitAfter++]);
     }
   }
 
@@ -1113,9 +1289,27 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
   SlotIndex SegStart = SE->enterIntvBefore(Uses[BestBefore]);
   SlotIndex SegStop  = SE->leaveIntvAfter(Uses[BestAfter]);
   SE->useIntv(SegStart, SegStop);
-  SE->closeIntv();
-  SE->finish();
-  setStage(NewVRegs.begin(), NewVRegs.end(), RS_Local);
+  SmallVector<unsigned, 8> IntvMap;
+  SE->finish(&IntvMap);
+  DebugVars->splitRegister(VirtReg.reg, LREdit.regs());
+
+  // If the new range has the same number of instructions as before, mark it as
+  // RS_Local so the next split will be forced to make progress. Otherwise,
+  // leave the new intervals as RS_New so they can compete.
+  bool LiveBefore = BestBefore != 0 || BI.LiveIn;
+  bool LiveAfter = BestAfter != NumGaps || BI.LiveOut;
+  unsigned NewGaps = LiveBefore + BestAfter - BestBefore + LiveAfter;
+  if (NewGaps >= NumGaps) {
+    DEBUG(dbgs() << "Tagging non-progress ranges: ");
+    assert(!ProgressRequired && "Didn't make progress when it was required.");
+    LRStage.resize(MRI->getNumVirtRegs());
+    for (unsigned i = 0, e = IntvMap.size(); i != e; ++i)
+      if (IntvMap[i] == 1) {
+        LRStage[LREdit.get(i)->reg] = RS_Local;
+        DEBUG(dbgs() << PrintReg(LREdit.get(i)->reg));
+      }
+    DEBUG(dbgs() << '\n');
+  }
   ++NumLocalSplits;
 
   return 0;
@@ -1141,30 +1335,36 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
 
   // Don't iterate global splitting.
   // Move straight to spilling if this range was produced by a global split.
-  LiveRangeStage Stage = getStage(VirtReg);
-  if (Stage >= RS_Block)
+  if (getStage(VirtReg) >= RS_Global)
     return 0;
 
   SA->analyze(&VirtReg);
 
-  // First try to split around a region spanning multiple blocks.
-  if (Stage < RS_Region) {
-    unsigned PhysReg = tryRegionSplit(VirtReg, Order, NewVRegs);
-    if (PhysReg || !NewVRegs.empty())
+  // FIXME: SplitAnalysis may repair broken live ranges coming from the
+  // coalescer. That may cause the range to become allocatable which means that
+  // tryRegionSplit won't be making progress. This check should be replaced with
+  // an assertion when the coalescer is fixed.
+  if (SA->didRepairRange()) {
+    // VirtReg has changed, so all cached queries are invalid.
+    invalidateVirtRegs();
+    if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs))
       return PhysReg;
   }
 
+  // First try to split around a region spanning multiple blocks.
+  unsigned PhysReg = tryRegionSplit(VirtReg, Order, NewVRegs);
+  if (PhysReg || !NewVRegs.empty())
+    return PhysReg;
+
   // Then isolate blocks with multiple uses.
-  if (Stage < RS_Block) {
-    SplitAnalysis::BlockPtrSet Blocks;
-    if (SA->getMultiUseBlocks(Blocks)) {
-      LiveRangeEdit LREdit(VirtReg, NewVRegs, this);
-      SE->reset(LREdit);
-      SE->splitSingleBlocks(Blocks);
-      setStage(NewVRegs.begin(), NewVRegs.end(), RS_Block);
-      if (VerifyEnabled)
-        MF->verify(this, "After splitting live range around basic blocks");
-    }
+  SplitAnalysis::BlockPtrSet Blocks;
+  if (SA->getMultiUseBlocks(Blocks)) {
+    LiveRangeEdit LREdit(VirtReg, NewVRegs, this);
+    SE->reset(LREdit);
+    SE->splitSingleBlocks(Blocks);
+    setStage(NewVRegs.begin(), NewVRegs.end(), RS_Global);
+    if (VerifyEnabled)
+      MF->verify(this, "After splitting live range around basic blocks");
   }
 
   // Don't assign any physregs.
@@ -1179,21 +1379,25 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
 unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
                                  SmallVectorImpl<LiveInterval*> &NewVRegs) {
   // First try assigning a free register.
-  AllocationOrder Order(VirtReg.reg, *VRM, ReservedRegs);
-  while (unsigned PhysReg = Order.next()) {
-    if (!checkPhysRegInterference(VirtReg, PhysReg))
-      return PhysReg;
-  }
-
-  if (unsigned PhysReg = tryEvict(VirtReg, Order, NewVRegs))
+  AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo);
+  if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs))
     return PhysReg;
 
+  LiveRangeStage Stage = getStage(VirtReg);
+  DEBUG(dbgs() << StageName[Stage] << '\n');
+
+  // Try to evict a less worthy live range, but only for ranges from the primary
+  // queue. The RS_Second ranges already failed to do this, and they should not
+  // get a second chance until they have been split.
+  if (Stage != RS_Second)
+    if (unsigned PhysReg = tryEvict(VirtReg, Order, NewVRegs))
+      return PhysReg;
+
   assert(NewVRegs.empty() && "Cannot append to existing NewVRegs");
 
   // The first time we see a live range, don't try to split or spill.
   // Wait until the second time, when all smaller ranges have been allocated.
   // This gives a better picture of the interference to split around.
-  LiveRangeStage Stage = getStage(VirtReg);
   if (Stage == RS_First) {
     LRStage[VirtReg.reg] = RS_Second;
     DEBUG(dbgs() << "wait for second round\n");
@@ -1201,7 +1405,10 @@ unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
     return 0;
   }
 
-  assert(Stage < RS_Spill && "Cannot allocate after spilling");
+  // If we couldn't allocate a register from spilling, there is probably some
+  // invalid inline assembly. The base class wil report it.
+  if (Stage >= RS_Spill)
+    return ~0u;
 
   // Try splitting VirtReg or interferences.
   unsigned PhysReg = trySplit(VirtReg, Order, NewVRegs);
@@ -1234,12 +1441,12 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
   RegAllocBase::init(getAnalysis<VirtRegMap>(), getAnalysis<LiveIntervals>());
   Indexes = &getAnalysis<SlotIndexes>();
   DomTree = &getAnalysis<MachineDominatorTree>();
-  ReservedRegs = TRI->getReservedRegs(*MF);
   SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
   Loops = &getAnalysis<MachineLoopInfo>();
   LoopRanges = &getAnalysis<MachineLoopRanges>();
   Bundles = &getAnalysis<EdgeBundles>();
   SpillPlacer = &getAnalysis<SpillPlacement>();
+  DebugVars = &getAnalysis<LiveDebugVariables>();
 
   SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops));
   SE.reset(new SplitEditor(*SA, *LIS, *VRM, *DomTree));
@@ -1258,7 +1465,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
   }
 
   // Write out new DBG_VALUE instructions.
-  getAnalysis<LiveDebugVariables>().emitDebugValues(VRM);
+  DebugVars->emitDebugValues(VRM);
 
   // The pass output is in VirtRegMap. Release all the transient data.
   releaseMemory();
diff --git a/lib/CodeGen/RegAllocLinearScan.cpp b/lib/CodeGen/RegAllocLinearScan.cpp
index ef78949..0818034 100644
--- a/lib/CodeGen/RegAllocLinearScan.cpp
+++ b/lib/CodeGen/RegAllocLinearScan.cpp
@@ -16,6 +16,7 @@
 #include "LiveRangeEdit.h"
 #include "VirtRegMap.h"
 #include "VirtRegRewriter.h"
+#include "RegisterClassInfo.h"
 #include "Spiller.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Function.h"
@@ -40,7 +41,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
-#include <set>
 #include <queue>
 #include <memory>
 #include <cmath>
@@ -67,6 +67,11 @@ TrivCoalesceEnds("trivial-coalesce-ends",
                   cl::desc("Attempt trivial coalescing of interval ends"),
                   cl::init(false), cl::Hidden);
 
+static cl::opt<bool>
+AvoidWAWHazard("avoid-waw-hazard",
+               cl::desc("Avoid write-write hazards for some register classes"),
+               cl::init(false), cl::Hidden);
+
 static RegisterRegAlloc
 linearscanRegAlloc("linearscan", "linear scan register allocator",
                    createLinearScanRegisterAllocator);
@@ -110,6 +115,7 @@ namespace {
       if (NumRecentlyUsedRegs > 0)
         RecentRegs.resize(NumRecentlyUsedRegs, 0);
       RecentNext = RecentRegs.begin();
+      avoidWAW_ = 0;
     }
 
     typedef std::pair<LiveInterval*, LiveInterval::iterator> IntervalPtr;
@@ -143,6 +149,7 @@ namespace {
     BitVector reservedRegs_;
     LiveIntervals* li_;
     MachineLoopInfo *loopInfo;
+    RegisterClassInfo RegClassInfo;
 
     /// handled_ - Intervals are added to the handled_ set in the order of their
     /// start value.  This is uses for backtracking.
@@ -180,6 +187,9 @@ namespace {
     SmallVector<unsigned, 4> RecentRegs;
     SmallVector<unsigned, 4>::iterator RecentNext;
 
+    // Last write-after-write register written.
+    unsigned avoidWAW_;
+
     // Record that we just picked this register.
     void recordRecentlyUsed(unsigned reg) {
       assert(reg != 0 && "Recently used register is NOREG!");
@@ -227,8 +237,8 @@ namespace {
 
     // Determine if we skip this register due to its being recently used.
     bool isRecentlyUsed(unsigned reg) const {
-      return std::find(RecentRegs.begin(), RecentRegs.end(), reg) !=
-             RecentRegs.end();
+      return reg == avoidWAW_ ||
+       std::find(RecentRegs.begin(), RecentRegs.end(), reg) != RecentRegs.end();
     }
 
   private:
@@ -358,13 +368,10 @@ namespace {
     /// getFirstNonReservedPhysReg - return the first non-reserved physical
     /// register in the register class.
     unsigned getFirstNonReservedPhysReg(const TargetRegisterClass *RC) {
-        TargetRegisterClass::iterator aoe = RC->allocation_order_end(*mf_);
-        TargetRegisterClass::iterator i = RC->allocation_order_begin(*mf_);
-        while (i != aoe && reservedRegs_.test(*i))
-          ++i;
-        assert(i != aoe && "All registers reserved?!");
-        return *i;
-      }
+      ArrayRef<unsigned> O = RegClassInfo.getOrder(RC);
+      assert(!O.empty() && "All registers reserved?!");
+      return O.front();
+    }
 
     void ComputeRelatedRegClasses();
 
@@ -516,6 +523,7 @@ bool RALinScan::runOnMachineFunction(MachineFunction &fn) {
   reservedRegs_ = tri_->getReservedRegs(fn);
   li_ = &getAnalysis<LiveIntervals>();
   loopInfo = &getAnalysis<MachineLoopInfo>();
+  RegClassInfo.runOnMachineFunction(fn);
 
   // We don't run the coalescer here because we have no reason to
   // interact with it.  If the coalescer requires interaction, it
@@ -792,7 +800,7 @@ void RALinScan::updateSpillWeights(std::vector<float> &Weights,
   // register class we are trying to allocate. Then add the weight to all
   // sub-registers of the super-register even if they are not aliases.
   // e.g. allocating for GR32, bh is not used, updating bl spill weight.
-  //      bl should get the same spill weight otherwise it will be choosen
+  //      bl should get the same spill weight otherwise it will be chosen
   //      as a spill candidate since spilling bh doesn't make ebx available.
   for (unsigned i = 0, e = Supers.size(); i != e; ++i) {
     for (const unsigned *sr = tri_->getSubRegisters(Supers[i]); *sr; ++sr)
@@ -1116,6 +1124,12 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) {
     active_.push_back(std::make_pair(cur, cur->begin()));
     handled_.push_back(cur);
 
+    // Remember physReg for avoiding a write-after-write hazard in the next
+    // instruction.
+    if (AvoidWAWHazard &&
+        tri_->avoidWriteAfterWrite(mri_->getRegClass(cur->reg)))
+      avoidWAW_ = physReg;
+
     // "Upgrade" the physical register since it has been allocated.
     UpgradeRegister(physReg);
     if (LiveInterval *NextReloadLI = hasNextReloadInterval(cur)) {
@@ -1152,14 +1166,11 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) {
 
   bool Found = false;
   std::vector<std::pair<unsigned,float> > RegsWeights;
+  ArrayRef<unsigned> Order = RegClassInfo.getOrder(RC);
   if (!minReg || SpillWeights[minReg] == HUGE_VALF)
-    for (TargetRegisterClass::iterator i = RC->allocation_order_begin(*mf_),
-           e = RC->allocation_order_end(*mf_); i != e; ++i) {
-      unsigned reg = *i;
+    for (unsigned i = 0; i != Order.size(); ++i) {
+      unsigned reg = Order[i];
       float regWeight = SpillWeights[reg];
-      // Don't even consider reserved regs.
-      if (reservedRegs_.test(reg))
-        continue;
       // Skip recently allocated registers and reserved registers.
       if (minWeight > regWeight && !isRecentlyUsed(reg))
         Found = true;
@@ -1168,11 +1179,8 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) {
 
   // If we didn't find a register that is spillable, try aliases?
   if (!Found) {
-    for (TargetRegisterClass::iterator i = RC->allocation_order_begin(*mf_),
-           e = RC->allocation_order_end(*mf_); i != e; ++i) {
-      unsigned reg = *i;
-      if (reservedRegs_.test(reg))
-        continue;
+    for (unsigned i = 0; i != Order.size(); ++i) {
+      unsigned reg = Order[i];
       // No need to worry about if the alias register size < regsize of RC.
       // We are going to spill all registers that alias it anyway.
       for (const unsigned* as = tri_->getAliasSet(reg); *as; ++as)
@@ -1432,13 +1440,13 @@ unsigned RALinScan::getFreePhysReg(LiveInterval* cur,
   if (TargetRegisterInfo::isVirtualRegister(physReg) && vrm_->hasPhys(physReg))
     physReg = vrm_->getPhys(physReg);
 
-  TargetRegisterClass::iterator I, E;
-  tie(I, E) = tri_->getAllocationOrder(RC, Hint.first, physReg, *mf_);
-  assert(I != E && "No allocatable register in this register class!");
+  ArrayRef<unsigned> Order = tri_->getRawAllocationOrder(RC, Hint.first,
+                                                         physReg, *mf_);
+  assert(!Order.empty() && "No allocatable register in this register class!");
 
   // Scan for the first available register.
-  for (; I != E; ++I) {
-    unsigned Reg = *I;
+  for (unsigned i = 0; i != Order.size(); ++i) {
+    unsigned Reg = Order[i];
     // Ignore "downgraded" registers.
     if (SkipDGRegs && DowngradedRegs.count(Reg))
       continue;
@@ -1446,7 +1454,7 @@ unsigned RALinScan::getFreePhysReg(LiveInterval* cur,
     if (reservedRegs_.test(Reg))
       continue;
     // Skip recently allocated registers.
-    if (isRegAvail(Reg) && !isRecentlyUsed(Reg)) {
+    if (isRegAvail(Reg) && (!SkipDGRegs || !isRecentlyUsed(Reg))) {
       FreeReg = Reg;
       if (FreeReg < inactiveCounts.size())
         FreeRegInactiveCount = inactiveCounts[FreeReg];
@@ -1468,8 +1476,8 @@ unsigned RALinScan::getFreePhysReg(LiveInterval* cur,
   // inactive count.  Alkis found that this reduced register pressure very
   // slightly on X86 (in rev 1.94 of this file), though this should probably be
   // reevaluated now.
-  for (; I != E; ++I) {
-    unsigned Reg = *I;
+  for (unsigned i = 0; i != Order.size(); ++i) {
+    unsigned Reg = Order[i];
     // Ignore "downgraded" registers.
     if (SkipDGRegs && DowngradedRegs.count(Reg))
       continue;
@@ -1477,7 +1485,8 @@ unsigned RALinScan::getFreePhysReg(LiveInterval* cur,
     if (reservedRegs_.test(Reg))
       continue;
     if (isRegAvail(Reg) && Reg < inactiveCounts.size() &&
-        FreeRegInactiveCount < inactiveCounts[Reg] && !isRecentlyUsed(Reg)) {
+        FreeRegInactiveCount < inactiveCounts[Reg] &&
+        (!SkipDGRegs || !isRecentlyUsed(Reg))) {
       FreeReg = Reg;
       FreeRegInactiveCount = inactiveCounts[Reg];
       if (FreeRegInactiveCount == MaxInactiveCount)
@@ -1528,12 +1537,10 @@ unsigned RALinScan::getFreePhysReg(LiveInterval *cur) {
       return Preference;
   }
 
-  if (!DowngradedRegs.empty()) {
-    unsigned FreeReg = getFreePhysReg(cur, RC, MaxInactiveCount, inactiveCounts,
-                                      true);
-    if (FreeReg)
-      return FreeReg;
-  }
+  unsigned FreeReg = getFreePhysReg(cur, RC, MaxInactiveCount, inactiveCounts,
+                                    true);
+  if (FreeReg)
+    return FreeReg;
   return getFreePhysReg(cur, RC, MaxInactiveCount, inactiveCounts, false);
 }
 
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index 1e1f1e0..605507f 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -222,10 +222,9 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf,
     // Compute an initial allowed set for the current vreg.
     typedef std::vector<unsigned> VRAllowed;
     VRAllowed vrAllowed;
-    for (TargetRegisterClass::iterator aoItr = trc->allocation_order_begin(*mf),
-                                       aoEnd = trc->allocation_order_end(*mf);
-         aoItr != aoEnd; ++aoItr) {
-      unsigned preg = *aoItr;
+    ArrayRef<unsigned> rawOrder = trc->getRawAllocationOrder(*mf);
+    for (unsigned i = 0; i != rawOrder.size(); ++i) {
+      unsigned preg = rawOrder[i];
       if (!reservedRegs.test(preg)) {
         vrAllowed.push_back(preg);
       }
@@ -581,7 +580,7 @@ void RegAllocPBQP::finalizeAlloc() const {
 
     if (physReg == 0) {
       const TargetRegisterClass *liRC = mri->getRegClass(li->reg);
-      physReg = *liRC->allocation_order_begin(*mf);
+      physReg = liRC->getRawAllocationOrder(*mf).front();
     }
 
     vrm->assignVirt2Phys(li->reg, physReg);
diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp
new file mode 100644
index 0000000..5a77e47
--- /dev/null
+++ b/lib/CodeGen/RegisterClassInfo.cpp
@@ -0,0 +1,112 @@
+//===-- RegisterClassInfo.cpp - Dynamic Register Class Info ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the RegisterClassInfo class which provides dynamic
+// information about target register classes. Callee saved and reserved
+// registers depends on calling conventions and other dynamic information, so
+// some things cannot be determined statically.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "RegisterClassInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+RegisterClassInfo::RegisterClassInfo() : Tag(0), MF(0), TRI(0), CalleeSaved(0)
+{}
+
+void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
+  bool Update = false;
+  MF = &mf;
+
+  // Allocate new array the first time we see a new target.
+  if (MF->getTarget().getRegisterInfo() != TRI) {
+    TRI = MF->getTarget().getRegisterInfo();
+    RegClass.reset(new RCInfo[TRI->getNumRegClasses()]);
+    Update = true;
+  }
+
+  // Does this MF have different CSRs?
+  const unsigned *CSR = TRI->getCalleeSavedRegs(MF);
+  if (Update || CSR != CalleeSaved) {
+    // Build a CSRNum map. Every CSR alias gets an entry pointing to the last
+    // overlapping CSR.
+    CSRNum.clear();
+    CSRNum.resize(TRI->getNumRegs(), 0);
+    for (unsigned N = 0; unsigned Reg = CSR[N]; ++N)
+      for (const unsigned *AS = TRI->getOverlaps(Reg);
+           unsigned Alias = *AS; ++AS)
+        CSRNum[Alias] = N + 1; // 0 means no CSR, 1 means CalleeSaved[0], ...
+    Update = true;
+  }
+  CalleeSaved = CSR;
+
+  // Different reserved registers?
+  BitVector RR = TRI->getReservedRegs(*MF);
+  if (RR != Reserved)
+    Update = true;
+  Reserved = RR;
+
+  // Invalidate cached information from previous function.
+  if (Update)
+    ++Tag;
+}
+
+/// compute - Compute the preferred allocation order for RC with reserved
+/// registers filtered out. Volatile registers come first followed by CSR
+/// aliases ordered according to the CSR order specified by the target.
+void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
+  RCInfo &RCI = RegClass[RC->getID()];
+
+  // Raw register count, including all reserved regs.
+  unsigned NumRegs = RC->getNumRegs();
+
+  if (!RCI.Order)
+    RCI.Order.reset(new unsigned[NumRegs]);
+
+  unsigned N = 0;
+  SmallVector<unsigned, 16> CSRAlias;
+
+  // FIXME: Once targets reserve registers instead of removing them from the
+  // allocation order, we can simply use begin/end here.
+  ArrayRef<unsigned> RawOrder = RC->getRawAllocationOrder(*MF);
+  for (unsigned i = 0; i != RawOrder.size(); ++i) {
+    unsigned PhysReg = RawOrder[i];
+    // Remove reserved registers from the allocation order.
+    if (Reserved.test(PhysReg))
+      continue;
+    if (CSRNum[PhysReg])
+      // PhysReg aliases a CSR, save it for later.
+      CSRAlias.push_back(PhysReg);
+    else
+      RCI.Order[N++] = PhysReg;
+  }
+  RCI.NumRegs = N + CSRAlias.size();
+  assert (RCI.NumRegs <= NumRegs && "Allocation order larger than regclass");
+
+  // CSR aliases go after the volatile registers, preserve the target's order.
+  std::copy(CSRAlias.begin(), CSRAlias.end(), &RCI.Order[N]);
+
+  DEBUG({
+    dbgs() << "AllocationOrder(" << RC->getName() << ") = [";
+    for (unsigned I = 0; I != RCI.NumRegs; ++I)
+      dbgs() << ' ' << PrintReg(RCI.Order[I], TRI);
+    dbgs() << " ]\n";
+  });
+
+  // RCI is now up-to-date.
+  RCI.Tag = Tag;
+}
+
diff --git a/lib/CodeGen/RegisterClassInfo.h b/lib/CodeGen/RegisterClassInfo.h
new file mode 100644
index 0000000..6f7d9c9
--- /dev/null
+++ b/lib/CodeGen/RegisterClassInfo.h
@@ -0,0 +1,121 @@
+//===-- RegisterClassInfo.h - Dynamic Register Class Info -*- C++ -*-------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the RegisterClassInfo class which provides dynamic
+// information about target register classes. Callee saved and reserved
+// registers depends on calling conventions and other dynamic information, so
+// some things cannot be determined statically.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_REGISTERCLASSINFO_H
+#define LLVM_CODEGEN_REGISTERCLASSINFO_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+
+class RegisterClassInfo {
+  struct RCInfo {
+    unsigned Tag;
+    unsigned NumRegs;
+    OwningArrayPtr<unsigned> Order;
+
+    RCInfo() : Tag(0), NumRegs(0) {}
+    operator ArrayRef<unsigned>() const {
+      return ArrayRef<unsigned>(Order.get(), NumRegs);
+    }
+  };
+
+  // Brief cached information for each register class.
+  OwningArrayPtr<RCInfo> RegClass;
+
+  // Tag changes whenever cached information needs to be recomputed. An RCInfo
+  // entry is valid when its tag matches.
+  unsigned Tag;
+
+  const MachineFunction *MF;
+  const TargetRegisterInfo *TRI;
+
+  // Callee saved registers of last MF. Assumed to be valid until the next
+  // runOnFunction() call.
+  const unsigned *CalleeSaved;
+
+  // Map register number to CalleeSaved index + 1;
+  SmallVector<uint8_t, 4> CSRNum;
+
+  // Reserved registers in the current MF.
+  BitVector Reserved;
+
+  // Compute all information about RC.
+  void compute(const TargetRegisterClass *RC) const;
+
+  // Return an up-to-date RCInfo for RC.
+  const RCInfo &get(const TargetRegisterClass *RC) const {
+    const RCInfo &RCI = RegClass[RC->getID()];
+    if (Tag != RCI.Tag)
+      compute(RC);
+    return RCI;
+  }
+
+public:
+  RegisterClassInfo();
+
+  /// runOnFunction - Prepare to answer questions about MF. This must be called
+  /// before any other methods are used.
+  void runOnMachineFunction(const MachineFunction &MF);
+
+  /// getNumAllocatableRegs - Returns the number of actually allocatable
+  /// registers in RC in the current function.
+  unsigned getNumAllocatableRegs(const TargetRegisterClass *RC) const {
+    return get(RC).NumRegs;
+  }
+
+  /// getOrder - Returns the preferred allocation order for RC. The order
+  /// contains no reserved registers, and registers that alias callee saved
+  /// registers come last.
+  ArrayRef<unsigned> getOrder(const TargetRegisterClass *RC) const {
+    return get(RC);
+  }
+
+  /// getLastCalleeSavedAlias - Returns the last callee saved register that
+  /// overlaps PhysReg, or 0 if Reg doesn't overlap a CSR.
+  unsigned getLastCalleeSavedAlias(unsigned PhysReg) const {
+    assert(TargetRegisterInfo::isPhysicalRegister(PhysReg));
+    if (unsigned N = CSRNum[PhysReg])
+      return CalleeSaved[N-1];
+    return 0;
+  }
+
+  /// isReserved - Returns true when PhysReg is a reserved register.
+  ///
+  /// Reserved registers may belong to an allocatable register class, but the
+  /// target has explicitly requested that they are not used.
+  ///
+  bool isReserved(unsigned PhysReg) const {
+    return Reserved.test(PhysReg);
+  }
+
+  /// isAllocatable - Returns true when PhysReg belongs to an allocatable
+  /// register class and it hasn't been reserved.
+  ///
+  /// Allocatable registers may show up in the allocation order of some virtual
+  /// register, so a register allocator needs to track its liveness and
+  /// availability.
+  bool isAllocatable(unsigned PhysReg) const {
+    return TRI->get(PhysReg).inAllocatableClass && !isReserved(PhysReg);
+  }
+};
+} // end namespace llvm
+
+#endif
+
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index ebfe533..9e9a145 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -154,13 +154,16 @@ void RegScavenger::forward() {
   BitVector DeadRegs(NumPhysRegs);
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
-    if (!MO.isReg() || MO.isUndef())
+    if (!MO.isReg())
       continue;
     unsigned Reg = MO.getReg();
     if (!Reg || isReserved(Reg))
       continue;
 
     if (MO.isUse()) {
+      // Ignore undef uses.
+      if (MO.isUndef())
+        continue;
       // Two-address operands implicitly kill.
       if (!isPred && (MO.isKill() || MI->isRegTiedToDefOperand(i)))
         addRegWithSubRegs(KillRegs, Reg);
@@ -178,12 +181,14 @@ void RegScavenger::forward() {
   // Verify uses and defs.
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
-    if (!MO.isReg() || MO.isUndef())
+    if (!MO.isReg())
       continue;
     unsigned Reg = MO.getReg();
     if (!Reg || isReserved(Reg))
       continue;
     if (MO.isUse()) {
+      if (MO.isUndef())
+        continue;
       if (!isUsed(Reg)) {
         // Check if it's partial live: e.g.
         // D0 = insert_subreg D0<undef>, S0
diff --git a/lib/CodeGen/RenderMachineFunction.cpp b/lib/CodeGen/RenderMachineFunction.cpp
index cbfd5a2..8b02ec4 100644
--- a/lib/CodeGen/RenderMachineFunction.cpp
+++ b/lib/CodeGen/RenderMachineFunction.cpp
@@ -47,7 +47,7 @@ outputFileSuffix("rmf-file-suffix",
 
 static cl::opt<std::string>
 machineFuncsToRender("rmf-funcs",
-                     cl::desc("Coma seperated list of functions to render"
+                     cl::desc("Comma separated list of functions to render"
                               ", or \"*\"."),
                      cl::init(""), cl::Hidden);
 
@@ -434,8 +434,7 @@ namespace llvm {
            rcEnd = tri->regclass_end();
          rcItr != rcEnd; ++rcItr) {
       const TargetRegisterClass *trc = *rcItr;
-      unsigned capacity = std::distance(trc->allocation_order_begin(*mf),
-                                        trc->allocation_order_end(*mf));
+      unsigned capacity = trc->getRawAllocationOrder(*mf).size();
 
       if (capacity != 0)
         capacityMap[trc] = capacity;
@@ -482,8 +481,7 @@ namespace llvm {
                rcItr != rcEnd; ++rcItr) {
             const TargetRegisterClass *trc = *rcItr;
 
-            if (trc->allocation_order_begin(*mf) ==
-                trc->allocation_order_end(*mf))
+            if (trc->getRawAllocationOrder(*mf).empty())
               continue;
 
             unsigned worstAtI = getWorst(li->reg, trc);
diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
index 3388889..f328493 100644
--- a/lib/CodeGen/ScheduleDAG.cpp
+++ b/lib/CodeGen/ScheduleDAG.cpp
@@ -19,17 +19,27 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include <climits>
 using namespace llvm;
 
+#ifndef NDEBUG
+cl::opt<bool> StressSchedOpt(
+  "stress-sched", cl::Hidden, cl::init(false),
+  cl::desc("Stress test instruction scheduling"));
+#endif
+
 ScheduleDAG::ScheduleDAG(MachineFunction &mf)
   : TM(mf.getTarget()),
     TII(TM.getInstrInfo()),
     TRI(TM.getRegisterInfo()),
     MF(mf), MRI(mf.getRegInfo()),
     EntrySU(), ExitSU() {
+#ifndef NDEBUG
+  StressSched = StressSchedOpt;
+#endif
 }
 
 ScheduleDAG::~ScheduleDAG() {}
@@ -307,6 +317,8 @@ void SUnit::dumpAll(const ScheduleDAG *G) const {
       if (I->isArtificial())
         dbgs() << " *";
       dbgs() << ": Latency=" << I->getLatency();
+      if (I->isAssignedRegDep())
+        dbgs() << " Reg=" << G->TRI->getName(I->getReg());
       dbgs() << "\n";
     }
   }
@@ -472,7 +484,7 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {
 #endif
 }
 
-/// AddPred - Updates the topological ordering to accomodate an edge
+/// AddPred - Updates the topological ordering to accommodate an edge
 /// to be added from SUnit X to SUnit Y.
 void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) {
   int UpperBound, LowerBound;
@@ -490,7 +502,7 @@ void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) {
   }
 }
 
-/// RemovePred - Updates the topological ordering to accomodate an
+/// RemovePred - Updates the topological ordering to accommodate an
 /// an edge to be removed from the specified node N from the predecessors
 /// of the current node M.
 void ScheduleDAGTopologicalSort::RemovePred(SUnit *M, SUnit *N) {
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index f17023e..2363df4 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -35,8 +35,9 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
                                      const MachineDominatorTree &mdt)
   : ScheduleDAG(mf), MLI(mli), MDT(mdt), MFI(mf.getFrameInfo()),
     InstrItins(mf.getTarget().getInstrItineraryData()),
-    Defs(TRI->getNumRegs()), Uses(TRI->getNumRegs()), LoopRegs(MLI, MDT) {
-  DbgValueVec.clear();
+    Defs(TRI->getNumRegs()), Uses(TRI->getNumRegs()), 
+    LoopRegs(MLI, MDT), FirstDbgValue(0) {
+  DbgValues.clear();
 }
 
 /// Run - perform scheduling.
@@ -120,7 +121,7 @@ static const Value *getUnderlyingObjectForInstr(const MachineInstr *MI,
     // such aliases.
     if (PSV->isAliased(MFI))
       return 0;
-    
+
     MayAlias = PSV->mayAlias(MFI);
     return V;
   }
@@ -174,7 +175,7 @@ void ScheduleDAGInstrs::AddSchedBarrierDeps() {
     for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
            SE = BB->succ_end(); SI != SE; ++SI)
       for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(),
-             E = (*SI)->livein_end(); I != E; ++I) {    
+             E = (*SI)->livein_end(); I != E; ++I) {
         unsigned Reg = *I;
         if (Seen.insert(Reg))
           Uses[Reg].push_back(&ExitSU);
@@ -200,11 +201,6 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
   std::map<const Value *, SUnit *> AliasMemDefs, NonAliasMemDefs;
   std::map<const Value *, std::vector<SUnit *> > AliasMemUses, NonAliasMemUses;
 
-  // Keep track of dangling debug references to registers.
-  std::vector<std::pair<MachineInstr*, unsigned> >
-    DanglingDebugValue(TRI->getNumRegs(),
-    std::make_pair(static_cast<MachineInstr*>(0), 0));
-
   // Check to see if the scheduler cares about latencies.
   bool UnitLatencies = ForceUnitLatencies();
 
@@ -214,26 +210,32 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
 
   // Remove any stale debug info; sometimes BuildSchedGraph is called again
   // without emitting the info from the previous call.
-  DbgValueVec.clear();
+  DbgValues.clear();
+  FirstDbgValue = NULL;
 
   // Model data dependencies between instructions being scheduled and the
   // ExitSU.
   AddSchedBarrierDeps();
 
+  for (int i = 0, e = TRI->getNumRegs(); i != e; ++i) {
+    assert(Defs[i].empty() && "Only BuildGraph should push/pop Defs");
+  }
+
   // Walk the list of instructions, from bottom moving up.
+  MachineInstr *PrevMI = NULL;
   for (MachineBasicBlock::iterator MII = InsertPos, MIE = Begin;
        MII != MIE; --MII) {
     MachineInstr *MI = prior(MII);
-    // DBG_VALUE does not have SUnit's built, so just remember these for later
-    // reinsertion.
+    if (MI && PrevMI) {
+      DbgValues.push_back(std::make_pair(PrevMI, MI));
+      PrevMI = NULL;
+    }
+
     if (MI->isDebugValue()) {
-      if (MI->getNumOperands()==3 && MI->getOperand(0).isReg() &&
-          MI->getOperand(0).getReg())
-        DanglingDebugValue[MI->getOperand(0).getReg()] =
-             std::make_pair(MI, DbgValueVec.size());
-      DbgValueVec.push_back(MI);
+      PrevMI = MI;
       continue;
     }
+
     const TargetInstrDesc &TID = MI->getDesc();
     assert(!TID.isTerminator() && !MI->isLabel() &&
            "Cannot schedule terminators or labels!");
@@ -257,13 +259,8 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
 
       assert(TRI->isPhysicalRegister(Reg) && "Virtual register encountered!");
 
-      if (MO.isDef() && DanglingDebugValue[Reg].first!=0) {
-        SU->DbgInstrList.push_back(DanglingDebugValue[Reg].first);
-        DbgValueVec[DanglingDebugValue[Reg].second] = 0;
-        DanglingDebugValue[Reg] = std::make_pair((MachineInstr*)0, 0);
-      }
-
       std::vector<SUnit *> &UseList = Uses[Reg];
+      // Defs are push in the order they are visited and never reordered.
       std::vector<SUnit *> &DefList = Defs[Reg];
       // Optionally add output and anti dependencies. For anti
       // dependencies we use a latency of 0 because for a multi-issue
@@ -283,9 +280,9 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
           DefSU->addPred(SDep(SU, Kind, AOLatency, /*Reg=*/Reg));
       }
       for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
-        std::vector<SUnit *> &DefList = Defs[*Alias];
-        for (unsigned i = 0, e = DefList.size(); i != e; ++i) {
-          SUnit *DefSU = DefList[i];
+        std::vector<SUnit *> &MemDefList = Defs[*Alias];
+        for (unsigned i = 0, e = MemDefList.size(); i != e; ++i) {
+          SUnit *DefSU = MemDefList[i];
           if (DefSU == &ExitSU)
             continue;
           if (DefSU != SU &&
@@ -371,7 +368,7 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
                 // will be overlapped by work done outside the current
                 // scheduling region.
                 Latency -= std::min(Latency, Count);
-                // Add the artifical edge.
+                // Add the artificial edge.
                 ExitSU.addPred(SDep(SU, SDep::Order, Latency,
                                     /*Reg=*/0, /*isNormalMemory=*/false,
                                     /*isMustAlias=*/false,
@@ -393,6 +390,16 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
         UseList.clear();
         if (!MO.isDead())
           DefList.clear();
+
+        // Calls will not be reordered because of chain dependencies (see
+        // below). Since call operands are dead, calls may continue to be added
+        // to the DefList making dependence checking quadratic in the size of
+        // the block. Instead, we leave only one call at the back of the
+        // DefList.
+        if (SU->isCall) {
+          while (!DefList.empty() && DefList.back()->isCall)
+            DefList.pop_back();
+        }
         DefList.push_back(SU);
       } else {
         UseList.push_back(SU);
@@ -411,11 +418,11 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
 #define STORE_LOAD_LATENCY 1
     unsigned TrueMemOrderLatency = 0;
     if (TID.isCall() || MI->hasUnmodeledSideEffects() ||
-        (MI->hasVolatileMemoryRef() && 
+        (MI->hasVolatileMemoryRef() &&
          (!TID.mayLoad() || !MI->isInvariantLoad(AA)))) {
       // Be conservative with these and add dependencies on all memory
       // references, even those that are known to not alias.
-      for (std::map<const Value *, SUnit *>::iterator I = 
+      for (std::map<const Value *, SUnit *>::iterator I =
              NonAliasMemDefs.begin(), E = NonAliasMemDefs.end(); I != E; ++I) {
         I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
       }
@@ -458,9 +465,9 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
         // A store to a specific PseudoSourceValue. Add precise dependencies.
         // Record the def in MemDefs, first adding a dep if there is
         // an existing def.
-        std::map<const Value *, SUnit *>::iterator I = 
+        std::map<const Value *, SUnit *>::iterator I =
           ((MayAlias) ? AliasMemDefs.find(V) : NonAliasMemDefs.find(V));
-        std::map<const Value *, SUnit *>::iterator IE = 
+        std::map<const Value *, SUnit *>::iterator IE =
           ((MayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
         if (I != IE) {
           I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0, /*Reg=*/0,
@@ -513,39 +520,41 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
       if (MI->isInvariantLoad(AA)) {
         // Invariant load, no chain dependencies needed!
       } else {
-        if (const Value *V = 
+        if (const Value *V =
             getUnderlyingObjectForInstr(MI, MFI, MayAlias)) {
           // A load from a specific PseudoSourceValue. Add precise dependencies.
-          std::map<const Value *, SUnit *>::iterator I = 
+          std::map<const Value *, SUnit *>::iterator I =
             ((MayAlias) ? AliasMemDefs.find(V) : NonAliasMemDefs.find(V));
-          std::map<const Value *, SUnit *>::iterator IE = 
+          std::map<const Value *, SUnit *>::iterator IE =
             ((MayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
           if (I != IE)
             I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0, /*Reg=*/0,
                                     /*isNormalMemory=*/true));
           if (MayAlias)
             AliasMemUses[V].push_back(SU);
-          else 
+          else
             NonAliasMemUses[V].push_back(SU);
         } else {
           // A load with no underlying object. Depend on all
           // potentially aliasing stores.
-          for (std::map<const Value *, SUnit *>::iterator I = 
+          for (std::map<const Value *, SUnit *>::iterator I =
                  AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I)
             I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
-          
+
           PendingLoads.push_back(SU);
           MayAlias = true;
         }
-        
+
         // Add dependencies on alias and barrier chains, if needed.
         if (MayAlias && AliasChain)
           AliasChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
         if (BarrierChain)
           BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
-      } 
+      }
     }
   }
+  if (PrevMI)
+    FirstDbgValue = PrevMI;
 
   for (int i = 0, e = TRI->getNumRegs(); i != e; ++i) {
     Defs[i].clear();
@@ -572,11 +581,11 @@ void ScheduleDAGInstrs::ComputeLatency(SUnit *SU) {
   }
 }
 
-void ScheduleDAGInstrs::ComputeOperandLatency(SUnit *Def, SUnit *Use, 
+void ScheduleDAGInstrs::ComputeOperandLatency(SUnit *Def, SUnit *Use,
                                               SDep& dep) const {
   if (!InstrItins || InstrItins->isEmpty())
     return;
-  
+
   // For a data dependency with a known register...
   if ((dep.getKind() != SDep::Data) || (dep.getReg() == 0))
     return;
@@ -655,39 +664,33 @@ MachineBasicBlock *ScheduleDAGInstrs::EmitSchedule() {
     BB->remove(I);
   }
 
-  // First reinsert any remaining debug_values; these are either constants,
-  // or refer to live-in registers.  The beginning of the block is the right
-  // place for the latter.  The former might reasonably be placed elsewhere
-  // using some kind of ordering algorithm, but right now it doesn't matter.
-  for (int i = DbgValueVec.size()-1; i>=0; --i)
-    if (DbgValueVec[i])
-      BB->insert(InsertPos, DbgValueVec[i]);
+  // If first instruction was a DBG_VALUE then put it back.
+  if (FirstDbgValue)
+    BB->insert(InsertPos, FirstDbgValue);
 
   // Then re-insert them according to the given schedule.
   for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
-    SUnit *SU = Sequence[i];
-    if (!SU) {
+    if (SUnit *SU = Sequence[i])
+      BB->insert(InsertPos, SU->getInstr());
+    else
       // Null SUnit* is a noop.
       EmitNoop();
-      continue;
-    }
-
-    BB->insert(InsertPos, SU->getInstr());
-    for (unsigned i = 0, e = SU->DbgInstrList.size() ; i < e ; ++i)
-      BB->insert(InsertPos, SU->DbgInstrList[i]);
   }
 
   // Update the Begin iterator, as the first instruction in the block
   // may have been scheduled later.
-  if (!DbgValueVec.empty()) {
-    for (int i = DbgValueVec.size()-1; i>=0; --i)
-      if (DbgValueVec[i]!=0) {
-        Begin = DbgValueVec[DbgValueVec.size()-1];
-        break;
-      }
-  } else if (!Sequence.empty())
+  if (!Sequence.empty())
     Begin = Sequence[0]->getInstr();
 
-  DbgValueVec.clear();
+  // Reinsert any remaining debug_values.
+  for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator
+         DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
+    std::pair<MachineInstr *, MachineInstr *> P = *prior(DI);
+    MachineInstr *DbgValue = P.first;
+    MachineInstr *OrigPrivMI = P.second;
+    BB->insertAfter(OrigPrivMI, DbgValue);
+  }
+  DbgValues.clear();
+  FirstDbgValue = NULL;
   return BB;
 }
diff --git a/lib/CodeGen/ScheduleDAGInstrs.h b/lib/CodeGen/ScheduleDAGInstrs.h
index c878287..8a4ea85 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.h
+++ b/lib/CodeGen/ScheduleDAGInstrs.h
@@ -110,10 +110,6 @@ namespace llvm {
     std::vector<std::vector<SUnit *> > Defs;
     std::vector<std::vector<SUnit *> > Uses;
  
-    /// DbgValueVec - Remember DBG_VALUEs that refer to a particular
-    /// register.
-    std::vector<MachineInstr *>DbgValueVec;
-
     /// PendingLoads - Remember where unknown loads are after the most recent
     /// unknown store, as we iterate. As with Defs and Uses, this is here
     /// to minimize construction/destruction.
@@ -128,6 +124,14 @@ namespace llvm {
     ///
     SmallSet<unsigned, 8> LoopLiveInRegs;
 
+  protected:
+
+    /// DbgValues - Remember instruction that preceeds DBG_VALUE.
+    typedef std::vector<std::pair<MachineInstr *, MachineInstr *> > 
+      DbgValueVector;
+    DbgValueVector DbgValues;
+    MachineInstr *FirstDbgValue;
+
   public:
     MachineBasicBlock::iterator Begin;    // The beginning of the range to
                                           // be scheduled. The range extends
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a9afec8..4ac590a 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -138,6 +138,10 @@ namespace {
     SDValue PromoteExtend(SDValue Op);
     bool PromoteLoad(SDValue Op);
 
+    void ExtendSetCCUses(SmallVector<SDNode*, 4> SetCCs,
+                         SDValue Trunc, SDValue ExtLoad, DebugLoc DL,
+                         ISD::NodeType ExtType);
+
     /// combine - call the node-specific routine that knows how to fold each
     /// particular type of node. If that doesn't do anything, try the
     /// target-specific DAG combines.
@@ -165,6 +169,8 @@ namespace {
     SDValue visitMULHS(SDNode *N);
     SDValue visitSMUL_LOHI(SDNode *N);
     SDValue visitUMUL_LOHI(SDNode *N);
+    SDValue visitSMULO(SDNode *N);
+    SDValue visitUMULO(SDNode *N);
     SDValue visitSDIVREM(SDNode *N);
     SDValue visitUDIVREM(SDNode *N);
     SDValue visitAND(SDNode *N);
@@ -529,7 +535,8 @@ SDValue DAGCombiner::ReassociateOps(unsigned Opc, DebugLoc DL,
                                    cast<ConstantSDNode>(N0.getOperand(1)),
                                    cast<ConstantSDNode>(N1));
       return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
-    } else if (N0.hasOneUse()) {
+    }
+    if (N0.hasOneUse()) {
       // reassoc. (op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one use
       SDValue OpNode = DAG.getNode(Opc, N0.getDebugLoc(), VT,
                                    N0.getOperand(0), N1);
@@ -546,7 +553,8 @@ SDValue DAGCombiner::ReassociateOps(unsigned Opc, DebugLoc DL,
                                    cast<ConstantSDNode>(N1.getOperand(1)),
                                    cast<ConstantSDNode>(N0));
       return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode);
-    } else if (N1.hasOneUse()) {
+    }
+    if (N1.hasOneUse()) {
       // reassoc. (op y, (op x, c1)) -> (op (op x, y), c1) iff x+c1 has one use
       SDValue OpNode = DAG.getNode(Opc, N0.getDebugLoc(), VT,
                                    N1.getOperand(0), N0);
@@ -990,6 +998,9 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
           dbgs() << "\nWith: ";
           RV.getNode()->dump(&DAG);
           dbgs() << '\n');
+    
+    // Transfer debug value.
+    DAG.TransferDbgValues(SDValue(N, 0), RV);
     WorkListRemover DeadNodes(*this);
     if (N->getNumValues() == RV.getNode()->getNumValues())
       DAG.ReplaceAllUsesWith(N, RV.getNode(), &DeadNodes);
@@ -1045,6 +1056,8 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::MULHS:              return visitMULHS(N);
   case ISD::SMUL_LOHI:          return visitSMUL_LOHI(N);
   case ISD::UMUL_LOHI:          return visitUMUL_LOHI(N);
+  case ISD::SMULO:              return visitSMULO(N);
+  case ISD::UMULO:              return visitUMULO(N);
   case ISD::SDIVREM:            return visitSDIVREM(N);
   case ISD::UDIVREM:            return visitUDIVREM(N);
   case ISD::AND:                return visitAND(N);
@@ -1566,7 +1579,8 @@ static SDValue tryFoldToZero(DebugLoc DL, const TargetLowering &TLI, EVT VT,
                              SelectionDAG &DAG, bool LegalOperations) {
   if (!VT.isVector()) {
     return DAG.getConstant(0, VT);
-  } else if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) {
+  }
+  if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) {
     // Produce a vector of zeros.
     SDValue El = DAG.getConstant(0, VT.getVectorElementType());
     std::vector<SDValue> Ops(VT.getVectorNumElements(), El);
@@ -2174,6 +2188,26 @@ SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
   return SDValue();
 }
 
+SDValue DAGCombiner::visitSMULO(SDNode *N) {
+  // (smulo x, 2) -> (saddo x, x)
+  if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+    if (C2->getAPIntValue() == 2)
+      return DAG.getNode(ISD::SADDO, N->getDebugLoc(), N->getVTList(),
+                         N->getOperand(0), N->getOperand(0));
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitUMULO(SDNode *N) {
+  // (umulo x, 2) -> (uaddo x, x)
+  if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+    if (C2->getAPIntValue() == 2)
+      return DAG.getNode(ISD::UADDO, N->getDebugLoc(), N->getVTList(),
+                         N->getOperand(0), N->getOperand(0));
+
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitSDIVREM(SDNode *N) {
   SDValue Res = SimplifyNodeWithTwoResults(N, ISD::SDIV, ISD::SREM);
   if (Res.getNode()) return Res;
@@ -3000,6 +3034,9 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
   // fold (shl x, 0) -> x
   if (N1C && N1C->isNullValue())
     return N0;
+  // fold (shl undef, x) -> 0
+  if (N0.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
   // if (shl x, c) is known to be zero, return 0
   if (DAG.MaskedValueIsZero(SDValue(N, 0),
                             APInt::getAllOnesValue(OpSizeInBits)))
@@ -3062,26 +3099,27 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
     }
   }
 
-  // fold (shl (srl x, c1), c2) -> (shl (and x, (shl -1, c1)), (sub c2, c1)) or
-  //                               (srl (and x, (shl -1, c1)), (sub c1, c2))
+  // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or
+  //                               (and (srl x, (sub c1, c2), MASK)
   if (N1C && N0.getOpcode() == ISD::SRL &&
       N0.getOperand(1).getOpcode() == ISD::Constant) {
     uint64_t c1 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
     if (c1 < VT.getSizeInBits()) {
       uint64_t c2 = N1C->getZExtValue();
-      SDValue HiBitsMask =
-        DAG.getConstant(APInt::getHighBitsSet(VT.getSizeInBits(),
-                                              VT.getSizeInBits() - c1),
-                        VT);
-      SDValue Mask = DAG.getNode(ISD::AND, N0.getDebugLoc(), VT,
-                                 N0.getOperand(0),
-                                 HiBitsMask);
-      if (c2 > c1)
-        return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, Mask,
-                           DAG.getConstant(c2-c1, N1.getValueType()));
-      else
-        return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, Mask,
-                           DAG.getConstant(c1-c2, N1.getValueType()));
+      APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
+                                         VT.getSizeInBits() - c1);
+      SDValue Shift;
+      if (c2 > c1) {
+        Mask = Mask.shl(c2-c1);
+        Shift = DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0.getOperand(0),
+                            DAG.getConstant(c2-c1, N1.getValueType()));
+      } else {
+        Mask = Mask.lshr(c1-c2);
+        Shift = DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0.getOperand(0),
+                            DAG.getConstant(c1-c2, N1.getValueType()));
+      }
+      return DAG.getNode(ISD::AND, N0.getDebugLoc(), VT, Shift,
+                         DAG.getConstant(Mask, VT));
     }
   }
   // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
@@ -3323,8 +3361,10 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
       return DAG.getUNDEF(VT);
 
     if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) {
+      uint64_t ShiftAmt = N1C->getZExtValue();
       SDValue SmallShift = DAG.getNode(ISD::SRL, N0.getDebugLoc(), SmallVT,
-                                       N0.getOperand(0), N1);
+                                       N0.getOperand(0),
+                          DAG.getConstant(ShiftAmt, getShiftAmountTy(SmallVT)));
       AddToWorkList(SmallShift.getNode());
       return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, SmallShift);
     }
@@ -3663,6 +3703,28 @@ static bool ExtendUsesToFormExtLoad(SDNode *N, SDValue N0,
   return true;
 }
 
+void DAGCombiner::ExtendSetCCUses(SmallVector<SDNode*, 4> SetCCs,
+                                  SDValue Trunc, SDValue ExtLoad, DebugLoc DL,
+                                  ISD::NodeType ExtType) {
+  // Extend SetCC uses if necessary.
+  for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) {
+    SDNode *SetCC = SetCCs[i];
+    SmallVector<SDValue, 4> Ops;
+
+    for (unsigned j = 0; j != 2; ++j) {
+      SDValue SOp = SetCC->getOperand(j);
+      if (SOp == Trunc)
+        Ops.push_back(ExtLoad);
+      else
+        Ops.push_back(DAG.getNode(ExtType, DL, ExtLoad->getValueType(0), SOp));
+    }
+
+    Ops.push_back(SetCC->getOperand(2));
+    CombineTo(SetCC, DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0),
+                                 &Ops[0], Ops.size()));
+  }
+}
+
 SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
@@ -3751,27 +3813,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
                                   N0.getValueType(), ExtLoad);
       CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
-
-      // Extend SetCC uses if necessary.
-      for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) {
-        SDNode *SetCC = SetCCs[i];
-        SmallVector<SDValue, 4> Ops;
-
-        for (unsigned j = 0; j != 2; ++j) {
-          SDValue SOp = SetCC->getOperand(j);
-          if (SOp == Trunc)
-            Ops.push_back(ExtLoad);
-          else
-            Ops.push_back(DAG.getNode(ISD::SIGN_EXTEND,
-                                      N->getDebugLoc(), VT, SOp));
-        }
-
-        Ops.push_back(SetCC->getOperand(2));
-        CombineTo(SetCC, DAG.getNode(ISD::SETCC, N->getDebugLoc(),
-                                     SetCC->getValueType(0),
-                                     &Ops[0], Ops.size()));
-      }
-
+      ExtendSetCCUses(SetCCs, Trunc, ExtLoad, N->getDebugLoc(),
+                      ISD::SIGN_EXTEND);
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
   }
@@ -3799,6 +3842,45 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
     }
   }
 
+  // fold (sext (and/or/xor (load x), cst)) ->
+  //      (and/or/xor (sextload x), (sext cst))
+  if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
+       N0.getOpcode() == ISD::XOR) &&
+      isa<LoadSDNode>(N0.getOperand(0)) &&
+      N0.getOperand(1).getOpcode() == ISD::Constant &&
+      TLI.isLoadExtLegal(ISD::SEXTLOAD, N0.getValueType()) &&
+      (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0.getOperand(0));
+    if (LN0->getExtensionType() != ISD::ZEXTLOAD) {
+      bool DoXform = true;
+      SmallVector<SDNode*, 4> SetCCs;
+      if (!N0.hasOneUse())
+        DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0), ISD::SIGN_EXTEND,
+                                          SetCCs, TLI);
+      if (DoXform) {
+        SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, LN0->getDebugLoc(), VT,
+                                         LN0->getChain(), LN0->getBasePtr(),
+                                         LN0->getPointerInfo(),
+                                         LN0->getMemoryVT(),
+                                         LN0->isVolatile(),
+                                         LN0->isNonTemporal(),
+                                         LN0->getAlignment());
+        APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+        Mask = Mask.sext(VT.getSizeInBits());
+        SDValue And = DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT,
+                                  ExtLoad, DAG.getConstant(Mask, VT));
+        SDValue Trunc = DAG.getNode(ISD::TRUNCATE,
+                                    N0.getOperand(0).getDebugLoc(),
+                                    N0.getOperand(0).getValueType(), ExtLoad);
+        CombineTo(N, And);
+        CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1));
+        ExtendSetCCUses(SetCCs, Trunc, ExtLoad, N->getDebugLoc(),
+                        ISD::SIGN_EXTEND);
+        return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+      }
+    }
+  }
+
   if (N0.getOpcode() == ISD::SETCC) {
     // sext(setcc) -> sext_in_reg(vsetcc) for vectors.
     // Only do this before legalize for now.
@@ -3882,7 +3964,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
         // CombineTo deleted the truncate, if needed, but not what's under it.
         AddToWorkList(oye);
       }
-      return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, NarrowLoad);
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
   }
 
@@ -3957,27 +4039,48 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
                                   N0.getValueType(), ExtLoad);
       CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
 
-      // Extend SetCC uses if necessary.
-      for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) {
-        SDNode *SetCC = SetCCs[i];
-        SmallVector<SDValue, 4> Ops;
-
-        for (unsigned j = 0; j != 2; ++j) {
-          SDValue SOp = SetCC->getOperand(j);
-          if (SOp == Trunc)
-            Ops.push_back(ExtLoad);
-          else
-            Ops.push_back(DAG.getNode(ISD::ZERO_EXTEND,
-                                      N->getDebugLoc(), VT, SOp));
-        }
+      ExtendSetCCUses(SetCCs, Trunc, ExtLoad, N->getDebugLoc(),
+                      ISD::ZERO_EXTEND);
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+  }
 
-        Ops.push_back(SetCC->getOperand(2));
-        CombineTo(SetCC, DAG.getNode(ISD::SETCC, N->getDebugLoc(),
-                                     SetCC->getValueType(0),
-                                     &Ops[0], Ops.size()));
+  // fold (zext (and/or/xor (load x), cst)) ->
+  //      (and/or/xor (zextload x), (zext cst))
+  if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
+       N0.getOpcode() == ISD::XOR) &&
+      isa<LoadSDNode>(N0.getOperand(0)) &&
+      N0.getOperand(1).getOpcode() == ISD::Constant &&
+      TLI.isLoadExtLegal(ISD::ZEXTLOAD, N0.getValueType()) &&
+      (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0.getOperand(0));
+    if (LN0->getExtensionType() != ISD::SEXTLOAD) {
+      bool DoXform = true;
+      SmallVector<SDNode*, 4> SetCCs;
+      if (!N0.hasOneUse())
+        DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0), ISD::ZERO_EXTEND,
+                                          SetCCs, TLI);
+      if (DoXform) {
+        SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, LN0->getDebugLoc(), VT,
+                                         LN0->getChain(), LN0->getBasePtr(),
+                                         LN0->getPointerInfo(),
+                                         LN0->getMemoryVT(),
+                                         LN0->isVolatile(),
+                                         LN0->isNonTemporal(),
+                                         LN0->getAlignment());
+        APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+        Mask = Mask.zext(VT.getSizeInBits());
+        SDValue And = DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT,
+                                  ExtLoad, DAG.getConstant(Mask, VT));
+        SDValue Trunc = DAG.getNode(ISD::TRUNCATE,
+                                    N0.getOperand(0).getDebugLoc(),
+                                    N0.getOperand(0).getValueType(), ExtLoad);
+        CombineTo(N, And);
+        CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1));
+        ExtendSetCCUses(SetCCs, Trunc, ExtLoad, N->getDebugLoc(),
+                        ISD::ZERO_EXTEND);
+        return SDValue(N, 0);   // Return N so it doesn't get rechecked!
       }
-
-      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
   }
 
@@ -4012,7 +4115,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
       EVT EltVT = VT.getVectorElementType();
       SmallVector<SDValue,8> OneOps(VT.getVectorNumElements(),
                                     DAG.getConstant(1, EltVT));
-      if (VT.getSizeInBits() == N0VT.getSizeInBits()) {
+      if (VT.getSizeInBits() == N0VT.getSizeInBits())
         // We know that the # elements of the results is the same as the
         // # elements of the compare (and the # elements of the compare result
         // for that matter).  Check to see that they are the same size.  If so,
@@ -4024,25 +4127,24 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
                                  cast<CondCodeSDNode>(N0.getOperand(2))->get()),
                            DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT,
                                        &OneOps[0], OneOps.size()));
-      } else {
-        // If the desired elements are smaller or larger than the source
-        // elements we can use a matching integer vector type and then
-        // truncate/sign extend
-        EVT MatchingElementType =
-          EVT::getIntegerVT(*DAG.getContext(),
-                            N0VT.getScalarType().getSizeInBits());
-        EVT MatchingVectorType =
-          EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
-                           N0VT.getVectorNumElements());
-        SDValue VsetCC =
-          DAG.getVSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0),
-                        N0.getOperand(1),
-                        cast<CondCodeSDNode>(N0.getOperand(2))->get());
-        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
-                           DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT),
-                           DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT,
-                                       &OneOps[0], OneOps.size()));
-      }
+
+      // If the desired elements are smaller or larger than the source
+      // elements we can use a matching integer vector type and then
+      // truncate/sign extend
+      EVT MatchingElementType =
+        EVT::getIntegerVT(*DAG.getContext(),
+                          N0VT.getScalarType().getSizeInBits());
+      EVT MatchingVectorType =
+        EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
+                         N0VT.getVectorNumElements());
+      SDValue VsetCC =
+        DAG.getVSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0),
+                      N0.getOperand(1),
+                      cast<CondCodeSDNode>(N0.getOperand(2))->get());
+      return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+                         DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT),
+                         DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT,
+                                     &OneOps[0], OneOps.size()));
     }
 
     // zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
@@ -4110,7 +4212,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
         // CombineTo deleted the truncate, if needed, but not what's under it.
         AddToWorkList(oye);
       }
-      return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, NarrowLoad);
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
   }
 
@@ -4166,27 +4268,8 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
                                   N0.getValueType(), ExtLoad);
       CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
-
-      // Extend SetCC uses if necessary.
-      for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) {
-        SDNode *SetCC = SetCCs[i];
-        SmallVector<SDValue, 4> Ops;
-
-        for (unsigned j = 0; j != 2; ++j) {
-          SDValue SOp = SetCC->getOperand(j);
-          if (SOp == Trunc)
-            Ops.push_back(ExtLoad);
-          else
-            Ops.push_back(DAG.getNode(ISD::ANY_EXTEND,
-                                      N->getDebugLoc(), VT, SOp));
-        }
-
-        Ops.push_back(SetCC->getOperand(2));
-        CombineTo(SetCC, DAG.getNode(ISD::SETCC, N->getDebugLoc(),
-                                     SetCC->getValueType(0),
-                                     &Ops[0], Ops.size()));
-      }
-
+      ExtendSetCCUses(SetCCs, Trunc, ExtLoad, N->getDebugLoc(),
+                      ISD::ANY_EXTEND);
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
   }
@@ -6265,6 +6348,10 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
                           ST->isNonTemporal(), OrigAlign);
   }
 
+  // Turn 'store undef, Ptr' -> nothing.
+  if (Value.getOpcode() == ISD::UNDEF && ST->isUnindexed())
+    return Chain;
+
   // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Value)) {
     // NOTE: If the original store is volatile, this transform must not increase
@@ -6298,8 +6385,10 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
           return DAG.getStore(Chain, N->getDebugLoc(), Tmp,
                               Ptr, ST->getPointerInfo(), ST->isVolatile(),
                               ST->isNonTemporal(), ST->getAlignment());
-        } else if (!ST->isVolatile() &&
-                   TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
+        }
+
+        if (!ST->isVolatile() &&
+            TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
           // Many FP stores are not made apparent until after legalize, e.g. for
           // argument passing.  Since this is so common, custom legalize the
           // 64-bit integer store into two 32-bit stores.
@@ -6393,8 +6482,9 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
     // "truncstore (or (shl x, 8), y), i8"  -> "truncstore y, i8"
     SDValue Shorter =
       GetDemandedBits(Value,
-                      APInt::getLowBitsSet(Value.getValueSizeInBits(),
-                                           ST->getMemoryVT().getSizeInBits()));
+                      APInt::getLowBitsSet(
+                        Value.getValueType().getScalarType().getSizeInBits(),
+                        ST->getMemoryVT().getScalarType().getSizeInBits()));
     AddToWorkList(Value.getNode());
     if (Shorter.getNode())
       return DAG.getTruncStore(Chain, N->getDebugLoc(), Shorter,
@@ -6486,18 +6576,18 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
   // (vextract (scalar_to_vector val, 0) -> val
   SDValue InVec = N->getOperand(0);
 
- if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
-   // Check if the result type doesn't match the inserted element type. A
-   // SCALAR_TO_VECTOR may truncate the inserted element and the
-   // EXTRACT_VECTOR_ELT may widen the extracted vector.
-   SDValue InOp = InVec.getOperand(0);
-   EVT NVT = N->getValueType(0);
-   if (InOp.getValueType() != NVT) {
-     assert(InOp.getValueType().isInteger() && NVT.isInteger());
-     return DAG.getSExtOrTrunc(InOp, InVec.getDebugLoc(), NVT);
-   }
-   return InOp;
- }
+  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    // Check if the result type doesn't match the inserted element type. A
+    // SCALAR_TO_VECTOR may truncate the inserted element and the
+    // EXTRACT_VECTOR_ELT may widen the extracted vector.
+    SDValue InOp = InVec.getOperand(0);
+    EVT NVT = N->getValueType(0);
+    if (InOp.getValueType() != NVT) {
+      assert(InOp.getValueType().isInteger() && NVT.isInteger());
+      return DAG.getSExtOrTrunc(InOp, InVec.getDebugLoc(), NVT);
+    }
+    return InOp;
+  }
 
   // Perform only after legalization to ensure build_vector / vector_shuffle
   // optimizations have already been done.
@@ -6558,7 +6648,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
       }
     }
 
-    if (!LN0 || !LN0->hasOneUse() || LN0->isVolatile())
+    if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile())
       return SDValue();
 
     // If Idx was -1 above, Elt is going to be -1, so just return undef.
@@ -7497,18 +7587,17 @@ bool DAGCombiner::FindAliasInfo(SDNode *N,
     SrcValueAlign = LD->getOriginalAlignment();
     TBAAInfo = LD->getTBAAInfo();
     return true;
-  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+  }
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
     Ptr = ST->getBasePtr();
     Size = ST->getMemoryVT().getSizeInBits() >> 3;
     SrcValue = ST->getSrcValue();
     SrcValueOffset = ST->getSrcValueOffset();
     SrcValueAlign = ST->getOriginalAlignment();
     TBAAInfo = ST->getTBAAInfo();
-  } else {
-    llvm_unreachable("FindAliasInfo expected a memory operand");
+    return false;
   }
-
-  return false;
+  llvm_unreachable("FindAliasInfo expected a memory operand");
 }
 
 /// GatherAllAliases - Walk up chain skipping non-aliasing memory nodes,
@@ -7621,13 +7710,13 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
   // Accumulate all the aliases to this node.
   GatherAllAliases(N, OldChain, Aliases);
 
-  if (Aliases.size() == 0) {
-    // If no operands then chain to entry token.
+  // If no operands then chain to entry token.
+  if (Aliases.size() == 0)
     return DAG.getEntryNode();
-  } else if (Aliases.size() == 1) {
-    // If a single operand then chain to it.  We don't need to revisit it.
+
+  // If a single operand then chain to it.  We don't need to revisit it.
+  if (Aliases.size() == 1)
     return Aliases[0];
-  }
 
   // Construct a custom tailored token factor.
   return DAG.getNode(ISD::TokenFactor, N->getDebugLoc(), MVT::Other,
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index ea8ace3..797f174 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -43,6 +43,8 @@
 #include "llvm/GlobalVariable.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
+#include "llvm/Operator.h"
+#include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FastISel.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -109,8 +111,8 @@ unsigned FastISel::getRegForValue(const Value *V) {
   // of whether FastISel can handle them.
   MVT VT = RealVT.getSimpleVT();
   if (!TLI.isTypeLegal(VT)) {
-    // Promote MVT::i1 to a legal type though, because it's common and easy.
-    if (VT == MVT::i1)
+    // Handle integer promotions, though, because they're common and easy.
+    if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
       VT = TLI.getTypeToTransformTo(V->getContext(), VT).getSimpleVT();
     else
       return 0;
@@ -121,10 +123,9 @@ unsigned FastISel::getRegForValue(const Value *V) {
   // only locally. This is because Instructions already have the SSA
   // def-dominates-use requirement enforced.
   DenseMap<const Value *, unsigned>::iterator I = FuncInfo.ValueMap.find(V);
-  if (I != FuncInfo.ValueMap.end()) {
-    unsigned Reg = I->second;
-    return Reg;
-  }
+  if (I != FuncInfo.ValueMap.end())
+    return I->second;
+
   unsigned Reg = LocalValueMap[V];
   if (Reg != 0)
     return Reg;
@@ -164,8 +165,12 @@ unsigned FastISel::materializeRegForValue(const Value *V, MVT VT) {
     Reg =
       getRegForValue(Constant::getNullValue(TD.getIntPtrType(V->getContext())));
   } else if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) {
-    // Try to emit the constant directly.
-    Reg = FastEmit_f(VT, VT, ISD::ConstantFP, CF);
+    if (CF->isNullValue()) {
+      Reg = TargetMaterializeFloatZero(CF);
+    } else {
+      // Try to emit the constant directly.
+      Reg = FastEmit_f(VT, VT, ISD::ConstantFP, CF);
+    }
 
     if (!Reg) {
       // Try to emit the constant by using an integer constant with a cast.
@@ -230,10 +235,10 @@ unsigned FastISel::lookUpRegForValue(const Value *V) {
 /// NOTE: This is only necessary because we might select a block that uses
 /// a value before we select the block that defines the value.  It might be
 /// possible to fix this by selecting blocks in reverse postorder.
-unsigned FastISel::UpdateValueMap(const Value *I, unsigned Reg) {
+void FastISel::UpdateValueMap(const Value *I, unsigned Reg, unsigned NumRegs) {
   if (!isa<Instruction>(I)) {
     LocalValueMap[I] = Reg;
-    return Reg;
+    return;
   }
 
   unsigned &AssignedReg = FuncInfo.ValueMap[I];
@@ -242,12 +247,11 @@ unsigned FastISel::UpdateValueMap(const Value *I, unsigned Reg) {
     AssignedReg = Reg;
   else if (Reg != AssignedReg) {
     // Arrange for uses of AssignedReg to be replaced by uses of Reg.
-    FuncInfo.RegFixups[AssignedReg] = Reg;
+    for (unsigned i = 0; i < NumRegs; i++)
+      FuncInfo.RegFixups[AssignedReg+i] = Reg+i;
 
     AssignedReg = Reg;
   }
-
-  return AssignedReg;
 }
 
 std::pair<unsigned, bool> FastISel::getRegForGEPIndex(const Value *Idx) {
@@ -330,23 +334,51 @@ bool FastISel::SelectBinaryOp(const User *I, unsigned ISDOpcode) {
       return false;
   }
 
+  // Check if the first operand is a constant, and handle it as "ri".  At -O0,
+  // we don't have anything that canonicalizes operand order.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(0)))
+    if (isa<Instruction>(I) && cast<Instruction>(I)->isCommutative()) {
+      unsigned Op1 = getRegForValue(I->getOperand(1));
+      if (Op1 == 0) return false;
+
+      bool Op1IsKill = hasTrivialKill(I->getOperand(1));
+
+      unsigned ResultReg = FastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op1,
+                                        Op1IsKill, CI->getZExtValue(),
+                                        VT.getSimpleVT());
+      if (ResultReg == 0) return false;
+
+      // We successfully emitted code for the given LLVM Instruction.
+      UpdateValueMap(I, ResultReg);
+      return true;
+    }
+
+
   unsigned Op0 = getRegForValue(I->getOperand(0));
-  if (Op0 == 0)
-    // Unhandled operand. Halt "fast" selection and bail.
+  if (Op0 == 0)   // Unhandled operand. Halt "fast" selection and bail.
     return false;
 
   bool Op0IsKill = hasTrivialKill(I->getOperand(0));
 
   // Check if the second operand is a constant and handle it appropriately.
   if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
-    unsigned ResultReg = FastEmit_ri(VT.getSimpleVT(), VT.getSimpleVT(),
-                                     ISDOpcode, Op0, Op0IsKill,
-                                     CI->getZExtValue());
-    if (ResultReg != 0) {
-      // We successfully emitted code for the given LLVM Instruction.
-      UpdateValueMap(I, ResultReg);
-      return true;
+    uint64_t Imm = CI->getZExtValue();
+
+    // Transform "sdiv exact X, 8" -> "sra X, 3".
+    if (ISDOpcode == ISD::SDIV && isa<BinaryOperator>(I) &&
+        cast<BinaryOperator>(I)->isExact() &&
+        isPowerOf2_64(Imm)) {
+      Imm = Log2_64(Imm);
+      ISDOpcode = ISD::SRA;
     }
+
+    unsigned ResultReg = FastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op0,
+                                      Op0IsKill, Imm, VT.getSimpleVT());
+    if (ResultReg == 0) return false;
+
+    // We successfully emitted code for the given LLVM Instruction.
+    UpdateValueMap(I, ResultReg);
+    return true;
   }
 
   // Check if the second operand is a constant float.
@@ -454,15 +486,35 @@ bool FastISel::SelectGetElementPtr(const User *I) {
 }
 
 bool FastISel::SelectCall(const User *I) {
-  const Function *F = cast<CallInst>(I)->getCalledFunction();
+  const CallInst *Call = cast<CallInst>(I);
+
+  // Handle simple inline asms.
+  if (const InlineAsm *IA = dyn_cast<InlineAsm>(Call->getArgOperand(0))) {
+    // Don't attempt to handle constraints.
+    if (!IA->getConstraintString().empty())
+      return false;
+
+    unsigned ExtraInfo = 0;
+    if (IA->hasSideEffects())
+      ExtraInfo |= InlineAsm::Extra_HasSideEffects;
+    if (IA->isAlignStack())
+      ExtraInfo |= InlineAsm::Extra_IsAlignStack;
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+            TII.get(TargetOpcode::INLINEASM))
+      .addExternalSymbol(IA->getAsmString().c_str())
+      .addImm(ExtraInfo);
+    return true;
+  }
+
+  const Function *F = Call->getCalledFunction();
   if (!F) return false;
 
   // Handle selected intrinsic function calls.
-  unsigned IID = F->getIntrinsicID();
-  switch (IID) {
+  switch (F->getIntrinsicID()) {
   default: break;
   case Intrinsic::dbg_declare: {
-    const DbgDeclareInst *DI = cast<DbgDeclareInst>(I);
+    const DbgDeclareInst *DI = cast<DbgDeclareInst>(Call);
     if (!DIVariable(DI->getVariable()).Verify() ||
         !FuncInfo.MF->getMMI().hasDebugInfo())
       return true;
@@ -494,7 +546,7 @@ bool FastISel::SelectCall(const User *I) {
   }
   case Intrinsic::dbg_value: {
     // This form of DBG_VALUE is target-independent.
-    const DbgValueInst *DI = cast<DbgValueInst>(I);
+    const DbgValueInst *DI = cast<DbgValueInst>(Call);
     const TargetInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
     const Value *V = DI->getValue();
     if (!V) {
@@ -523,65 +575,68 @@ bool FastISel::SelectCall(const User *I) {
     return true;
   }
   case Intrinsic::eh_exception: {
-    EVT VT = TLI.getValueType(I->getType());
-    switch (TLI.getOperationAction(ISD::EXCEPTIONADDR, VT)) {
-    default: break;
-    case TargetLowering::Expand: {
-      assert(FuncInfo.MBB->isLandingPad() &&
-             "Call to eh.exception not in landing pad!");
-      unsigned Reg = TLI.getExceptionAddressRegister();
-      const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
-      unsigned ResultReg = createResultReg(RC);
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
-              ResultReg).addReg(Reg);
-      UpdateValueMap(I, ResultReg);
-      return true;
-    }
-    }
-    break;
+    EVT VT = TLI.getValueType(Call->getType());
+    if (TLI.getOperationAction(ISD::EXCEPTIONADDR, VT)!=TargetLowering::Expand)
+      break;
+
+    assert(FuncInfo.MBB->isLandingPad() &&
+           "Call to eh.exception not in landing pad!");
+    unsigned Reg = TLI.getExceptionAddressRegister();
+    const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
+    unsigned ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(Reg);
+    UpdateValueMap(Call, ResultReg);
+    return true;
   }
   case Intrinsic::eh_selector: {
-    EVT VT = TLI.getValueType(I->getType());
-    switch (TLI.getOperationAction(ISD::EHSELECTION, VT)) {
-    default: break;
-    case TargetLowering::Expand: {
-      if (FuncInfo.MBB->isLandingPad())
-        AddCatchInfo(*cast<CallInst>(I), &FuncInfo.MF->getMMI(), FuncInfo.MBB);
-      else {
+    EVT VT = TLI.getValueType(Call->getType());
+    if (TLI.getOperationAction(ISD::EHSELECTION, VT) != TargetLowering::Expand)
+      break;
+    if (FuncInfo.MBB->isLandingPad())
+      AddCatchInfo(*Call, &FuncInfo.MF->getMMI(), FuncInfo.MBB);
+    else {
 #ifndef NDEBUG
-        FuncInfo.CatchInfoLost.insert(cast<CallInst>(I));
+      FuncInfo.CatchInfoLost.insert(Call);
 #endif
-        // FIXME: Mark exception selector register as live in.  Hack for PR1508.
-        unsigned Reg = TLI.getExceptionSelectorRegister();
-        if (Reg) FuncInfo.MBB->addLiveIn(Reg);
-      }
-
+      // FIXME: Mark exception selector register as live in.  Hack for PR1508.
       unsigned Reg = TLI.getExceptionSelectorRegister();
-      EVT SrcVT = TLI.getPointerTy();
-      const TargetRegisterClass *RC = TLI.getRegClassFor(SrcVT);
-      unsigned ResultReg = createResultReg(RC);
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
-              ResultReg).addReg(Reg);
-
-      bool ResultRegIsKill = hasTrivialKill(I);
-
-      // Cast the register to the type of the selector.
-      if (SrcVT.bitsGT(MVT::i32))
-        ResultReg = FastEmit_r(SrcVT.getSimpleVT(), MVT::i32, ISD::TRUNCATE,
-                               ResultReg, ResultRegIsKill);
-      else if (SrcVT.bitsLT(MVT::i32))
-        ResultReg = FastEmit_r(SrcVT.getSimpleVT(), MVT::i32,
-                               ISD::SIGN_EXTEND, ResultReg, ResultRegIsKill);
-      if (ResultReg == 0)
-        // Unhandled operand. Halt "fast" selection and bail.
-        return false;
+      if (Reg) FuncInfo.MBB->addLiveIn(Reg);
+    }
 
-      UpdateValueMap(I, ResultReg);
+    unsigned Reg = TLI.getExceptionSelectorRegister();
+    EVT SrcVT = TLI.getPointerTy();
+    const TargetRegisterClass *RC = TLI.getRegClassFor(SrcVT);
+    unsigned ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(Reg);
+
+    bool ResultRegIsKill = hasTrivialKill(Call);
+
+    // Cast the register to the type of the selector.
+    if (SrcVT.bitsGT(MVT::i32))
+      ResultReg = FastEmit_r(SrcVT.getSimpleVT(), MVT::i32, ISD::TRUNCATE,
+                             ResultReg, ResultRegIsKill);
+    else if (SrcVT.bitsLT(MVT::i32))
+      ResultReg = FastEmit_r(SrcVT.getSimpleVT(), MVT::i32,
+                             ISD::SIGN_EXTEND, ResultReg, ResultRegIsKill);
+    if (ResultReg == 0)
+      // Unhandled operand. Halt "fast" selection and bail.
+      return false;
 
-      return true;
-    }
-    }
-    break;
+    UpdateValueMap(Call, ResultReg);
+
+    return true;
+  }
+  case Intrinsic::objectsize: {
+    ConstantInt *CI = cast<ConstantInt>(Call->getArgOperand(1));
+    unsigned long long Res = CI->isZero() ? -1ULL : 0;
+    Constant *ResCI = ConstantInt::get(Call->getType(), Res);
+    unsigned ResultReg = getRegForValue(ResCI);
+    if (ResultReg == 0)
+      return false;
+    UpdateValueMap(Call, ResultReg);
+    return true;
   }
   }
 
@@ -598,21 +653,13 @@ bool FastISel::SelectCast(const User *I, unsigned Opcode) {
     // Unhandled type. Halt "fast" selection and bail.
     return false;
 
-  // Check if the destination type is legal. Or as a special case,
-  // it may be i1 if we're doing a truncate because that's
-  // easy and somewhat common.
+  // Check if the destination type is legal.
   if (!TLI.isTypeLegal(DstVT))
-    if (DstVT != MVT::i1 || Opcode != ISD::TRUNCATE)
-      // Unhandled type. Halt "fast" selection and bail.
-      return false;
+    return false;
 
-  // Check if the source operand is legal. Or as a special case,
-  // it may be i1 if we're doing zero-extension because that's
-  // easy and somewhat common.
+  // Check if the source operand is legal.
   if (!TLI.isTypeLegal(SrcVT))
-    if (SrcVT != MVT::i1 || Opcode != ISD::ZERO_EXTEND)
-      // Unhandled type. Halt "fast" selection and bail.
-      return false;
+    return false;
 
   unsigned InputReg = getRegForValue(I->getOperand(0));
   if (!InputReg)
@@ -621,18 +668,6 @@ bool FastISel::SelectCast(const User *I, unsigned Opcode) {
 
   bool InputRegIsKill = hasTrivialKill(I->getOperand(0));
 
-  // If the operand is i1, arrange for the high bits in the register to be zero.
-  if (SrcVT == MVT::i1) {
-   SrcVT = TLI.getTypeToTransformTo(I->getContext(), SrcVT);
-   InputReg = FastEmitZExtFromI1(SrcVT.getSimpleVT(), InputReg, InputRegIsKill);
-   if (!InputReg)
-     return false;
-   InputRegIsKill = true;
-  }
-  // If the result is i1, truncate to the target's type for i1 first.
-  if (DstVT == MVT::i1)
-    DstVT = TLI.getTypeToTransformTo(I->getContext(), DstVT);
-
   unsigned ResultReg = FastEmit_r(SrcVT.getSimpleVT(),
                                   DstVT.getSimpleVT(),
                                   Opcode,
@@ -784,6 +819,47 @@ FastISel::SelectFNeg(const User *I) {
 }
 
 bool
+FastISel::SelectExtractValue(const User *U) {
+  const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(U);
+  if (!EVI)
+    return false;
+
+  // Make sure we only try to handle extracts with a legal result.  But also
+  // allow i1 because it's easy.
+  EVT RealVT = TLI.getValueType(EVI->getType(), /*AllowUnknown=*/true);
+  if (!RealVT.isSimple())
+    return false;
+  MVT VT = RealVT.getSimpleVT();
+  if (!TLI.isTypeLegal(VT) && VT != MVT::i1)
+    return false;
+
+  const Value *Op0 = EVI->getOperand(0);
+  const Type *AggTy = Op0->getType();
+
+  // Get the base result register.
+  unsigned ResultReg;
+  DenseMap<const Value *, unsigned>::iterator I = FuncInfo.ValueMap.find(Op0);
+  if (I != FuncInfo.ValueMap.end())
+    ResultReg = I->second;
+  else if (isa<Instruction>(Op0))
+    ResultReg = FuncInfo.InitializeRegForValue(Op0);
+  else
+    return false; // fast-isel can't handle aggregate constants at the moment
+
+  // Get the actual result register, which is an offset from the base register.
+  unsigned VTIndex = ComputeLinearIndex(AggTy, EVI->idx_begin(), EVI->idx_end());
+
+  SmallVector<EVT, 4> AggValueVTs;
+  ComputeValueVTs(TLI, AggTy, AggValueVTs);
+
+  for (unsigned i = 0; i < VTIndex; i++)
+    ResultReg += TLI.getNumRegisters(FuncInfo.Fn->getContext(), AggValueVTs[i]);
+
+  UpdateValueMap(EVI, ResultReg);
+  return true;
+}
+
+bool
 FastISel::SelectOperator(const User *I, unsigned Opcode) {
   switch (Opcode) {
   case Instruction::Add:
@@ -887,6 +963,9 @@ FastISel::SelectOperator(const User *I, unsigned Opcode) {
     return true;
   }
 
+  case Instruction::ExtractValue:
+    return SelectExtractValue(I);
+
   case Instruction::PHI:
     llvm_unreachable("FastISel shouldn't visit PHI nodes!");
 
@@ -966,59 +1045,33 @@ unsigned FastISel::FastEmit_rri(MVT, MVT,
 unsigned FastISel::FastEmit_ri_(MVT VT, unsigned Opcode,
                                 unsigned Op0, bool Op0IsKill,
                                 uint64_t Imm, MVT ImmType) {
+  // If this is a multiply by a power of two, emit this as a shift left.
+  if (Opcode == ISD::MUL && isPowerOf2_64(Imm)) {
+    Opcode = ISD::SHL;
+    Imm = Log2_64(Imm);
+  } else if (Opcode == ISD::UDIV && isPowerOf2_64(Imm)) {
+    // div x, 8 -> srl x, 3
+    Opcode = ISD::SRL;
+    Imm = Log2_64(Imm);
+  }
+
+  // Horrible hack (to be removed), check to make sure shift amounts are
+  // in-range.
+  if ((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
+      Imm >= VT.getSizeInBits())
+    return 0;
+
   // First check if immediate type is legal. If not, we can't use the ri form.
   unsigned ResultReg = FastEmit_ri(VT, VT, Opcode, Op0, Op0IsKill, Imm);
   if (ResultReg != 0)
     return ResultReg;
   unsigned MaterialReg = FastEmit_i(ImmType, ImmType, ISD::Constant, Imm);
-  if (MaterialReg == 0)
-    return 0;
-  return FastEmit_rr(VT, VT, Opcode,
-                     Op0, Op0IsKill,
-                     MaterialReg, /*Kill=*/true);
-}
-
-/// FastEmit_rf_ - This method is a wrapper of FastEmit_ri. It first tries
-/// to emit an instruction with a floating-point immediate operand using
-/// FastEmit_rf. If that fails, it materializes the immediate into a register
-/// and try FastEmit_rr instead.
-unsigned FastISel::FastEmit_rf_(MVT VT, unsigned Opcode,
-                                unsigned Op0, bool Op0IsKill,
-                                const ConstantFP *FPImm, MVT ImmType) {
-  // First check if immediate type is legal. If not, we can't use the rf form.
-  unsigned ResultReg = FastEmit_rf(VT, VT, Opcode, Op0, Op0IsKill, FPImm);
-  if (ResultReg != 0)
-    return ResultReg;
-
-  // Materialize the constant in a register.
-  unsigned MaterialReg = FastEmit_f(ImmType, ImmType, ISD::ConstantFP, FPImm);
   if (MaterialReg == 0) {
-    // If the target doesn't have a way to directly enter a floating-point
-    // value into a register, use an alternate approach.
-    // TODO: The current approach only supports floating-point constants
-    // that can be constructed by conversion from integer values. This should
-    // be replaced by code that creates a load from a constant-pool entry,
-    // which will require some target-specific work.
-    const APFloat &Flt = FPImm->getValueAPF();
-    EVT IntVT = TLI.getPointerTy();
-
-    uint64_t x[2];
-    uint32_t IntBitWidth = IntVT.getSizeInBits();
-    bool isExact;
-    (void) Flt.convertToInteger(x, IntBitWidth, /*isSigned=*/true,
-                             APFloat::rmTowardZero, &isExact);
-    if (!isExact)
-      return 0;
-    APInt IntVal(IntBitWidth, 2, x);
-
-    unsigned IntegerReg = FastEmit_i(IntVT.getSimpleVT(), IntVT.getSimpleVT(),
-                                     ISD::Constant, IntVal.getZExtValue());
-    if (IntegerReg == 0)
-      return 0;
-    MaterialReg = FastEmit_r(IntVT.getSimpleVT(), VT,
-                             ISD::SINT_TO_FP, IntegerReg, /*Kill=*/true);
-    if (MaterialReg == 0)
-      return 0;
+    // This is a bit ugly/slow, but failing here means falling out of
+    // fast-isel, which would be very slow.
+    const IntegerType *ITy = IntegerType::get(FuncInfo.Fn->getContext(),
+                                              VT.getSizeInBits());
+    MaterialReg = getRegForValue(ConstantInt::get(ITy, Imm));
   }
   return FastEmit_rr(VT, VT, Opcode,
                      Op0, Op0IsKill,
@@ -1078,6 +1131,30 @@ unsigned FastISel::FastEmitInst_rr(unsigned MachineInstOpcode,
   return ResultReg;
 }
 
+unsigned FastISel::FastEmitInst_rrr(unsigned MachineInstOpcode,
+                                   const TargetRegisterClass *RC,
+                                   unsigned Op0, bool Op0IsKill,
+                                   unsigned Op1, bool Op1IsKill,
+                                   unsigned Op2, bool Op2IsKill) {
+  unsigned ResultReg = createResultReg(RC);
+  const TargetInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+      .addReg(Op0, Op0IsKill * RegState::Kill)
+      .addReg(Op1, Op1IsKill * RegState::Kill)
+      .addReg(Op2, Op2IsKill * RegState::Kill);
+  else {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+      .addReg(Op0, Op0IsKill * RegState::Kill)
+      .addReg(Op1, Op1IsKill * RegState::Kill)
+      .addReg(Op2, Op2IsKill * RegState::Kill);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(II.ImplicitDefs[0]);
+  }
+  return ResultReg;
+}
+
 unsigned FastISel::FastEmitInst_ri(unsigned MachineInstOpcode,
                                    const TargetRegisterClass *RC,
                                    unsigned Op0, bool Op0IsKill,
@@ -1183,6 +1260,23 @@ unsigned FastISel::FastEmitInst_i(unsigned MachineInstOpcode,
   return ResultReg;
 }
 
+unsigned FastISel::FastEmitInst_ii(unsigned MachineInstOpcode,
+                                  const TargetRegisterClass *RC,
+                                  uint64_t Imm1, uint64_t Imm2) {
+  unsigned ResultReg = createResultReg(RC);
+  const TargetInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+      .addImm(Imm1).addImm(Imm2);
+  else {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II).addImm(Imm1).addImm(Imm2);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(II.ImplicitDefs[0]);
+  }
+  return ResultReg;
+}
+
 unsigned FastISel::FastEmitInst_extractsubreg(MVT RetVT,
                                               unsigned Op0, bool Op0IsKill,
                                               uint32_t Idx) {
@@ -1238,7 +1332,7 @@ bool FastISel::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
       // Only handle legal types. Two interesting things to note here. First,
       // by bailing out early, we may leave behind some dead instructions,
       // since SelectionDAG's HandlePHINodesInSuccessorBlocks will insert its
-      // own moves. Second, this check is necessary becuase FastISel doesn't
+      // own moves. Second, this check is necessary because FastISel doesn't
       // use CreateRegs to create registers, so it always creates
       // exactly one register for each non-void instruction.
       EVT VT = TLI.getValueType(PN->getType(), /*AllowUnknown=*/true);
diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index d8a5770..d518b5d 100644
--- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -54,25 +54,6 @@ static bool isUsedOutsideOfDefiningBlock(const Instruction *I) {
   return false;
 }
 
-/// isOnlyUsedInEntryBlock - If the specified argument is only used in the
-/// entry block, return true.  This includes arguments used by switches, since
-/// the switch may expand into multiple basic blocks.
-static bool isOnlyUsedInEntryBlock(const Argument *A, bool EnableFastISel) {
-  // With FastISel active, we may be splitting blocks, so force creation
-  // of virtual registers for all non-dead arguments.
-  if (EnableFastISel)
-    return A->use_empty();
-
-  const BasicBlock *Entry = A->getParent()->begin();
-  for (Value::const_use_iterator UI = A->use_begin(), E = A->use_end();
-       UI != E; ++UI) {
-    const User *U = *UI;
-    if (cast<Instruction>(U)->getParent() != Entry || isa<SwitchInst>(U))
-      return false;  // Use not in entry block.
-  }
-  return true;
-}
-
 FunctionLoweringInfo::FunctionLoweringInfo(const TargetLowering &tli)
   : TLI(tli) {
 }
@@ -86,16 +67,10 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf) {
   SmallVector<ISD::OutputArg, 4> Outs;
   GetReturnInfo(Fn->getReturnType(),
                 Fn->getAttributes().getRetAttributes(), Outs, TLI);
-  CanLowerReturn = TLI.CanLowerReturn(Fn->getCallingConv(), Fn->isVarArg(),
+  CanLowerReturn = TLI.CanLowerReturn(Fn->getCallingConv(), *MF,
+				      Fn->isVarArg(),
                                       Outs, Fn->getContext());
 
-  // Create a vreg for each argument register that is not dead and is used
-  // outside of the entry block for the function.
-  for (Function::const_arg_iterator AI = Fn->arg_begin(), E = Fn->arg_end();
-       AI != E; ++AI)
-    if (!isOnlyUsedInEntryBlock(AI, EnableFastISel))
-      InitializeRegForValue(AI);
-
   // Initialize the mapping of values to registers.  This is only set up for
   // instruction values that are used outside of the block that defines
   // them.
@@ -181,6 +156,10 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf) {
          const PHINode *PN = dyn_cast<PHINode>(I); ++I) {
       if (PN->use_empty()) continue;
 
+      // Skip empty types
+      if (PN->getType()->isEmptyTy())
+        continue;
+
       DebugLoc DL = PN->getDebugLoc();
       unsigned PHIReg = ValueMap[PN];
       assert(PHIReg && "PHI node does not have an assigned virtual register!");
@@ -343,7 +322,7 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
       APInt Zero(BitWidth, 0);
       DestLOI.KnownZero = Zero;
       DestLOI.KnownOne = Zero;
-      return;      
+      return;
     }
 
     if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
@@ -375,18 +354,18 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
 /// setByValArgumentFrameIndex - Record frame index for the byval
 /// argument. This overrides previous frame index entry for this argument,
 /// if any.
-void FunctionLoweringInfo::setByValArgumentFrameIndex(const Argument *A, 
+void FunctionLoweringInfo::setByValArgumentFrameIndex(const Argument *A,
                                                       int FI) {
   assert (A->hasByValAttr() && "Argument does not have byval attribute!");
   ByValArgFrameIndexMap[A] = FI;
 }
-  
+
 /// getByValArgumentFrameIndex - Get frame index for the byval argument.
 /// If the argument does not have any assigned frame index then 0 is
 /// returned.
 int FunctionLoweringInfo::getByValArgumentFrameIndex(const Argument *A) {
   assert (A->hasByValAttr() && "Argument does not have byval attribute!");
-  DenseMap<const Argument *, int>::iterator I = 
+  DenseMap<const Argument *, int>::iterator I =
     ByValArgFrameIndexMap.find(A);
   if (I != ByValArgFrameIndexMap.end())
     return I->second;
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index e309def..2a65d65 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -76,6 +76,12 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
   // the CopyToReg'd destination register instead of creating a new vreg.
   bool MatchReg = true;
   const TargetRegisterClass *UseRC = NULL;
+  EVT VT = Node->getValueType(ResNo);
+
+  // Stick to the preferred register classes for legal types.
+  if (TLI->isTypeLegal(VT))
+    UseRC = TLI->getRegClassFor(VT);
+
   if (!IsClone && !IsCloned)
     for (SDNode::use_iterator UI = Node->use_begin(), E = Node->use_end();
          UI != E; ++UI) {
@@ -121,10 +127,9 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
         break;
     }
 
-  EVT VT = Node->getValueType(ResNo);
   const TargetRegisterClass *SrcRC = 0, *DstRC = 0;
   SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT);
-  
+
   // Figure out the register class to create for the destreg.
   if (VRBase) {
     DstRC = MRI->getRegClass(VRBase);
@@ -283,7 +288,7 @@ InstrEmitter::AddRegisterOperand(MachineInstr *MI, SDValue Op,
       DstRC = II->OpInfo[IIOpNum].getRegClass(TRI);
     assert((DstRC || (TID.isVariadic() && IIOpNum >= TID.getNumOperands())) &&
            "Don't have operand info for this instruction!");
-    if (DstRC && SrcRC != DstRC && !SrcRC->hasSuperClass(DstRC)) {
+    if (DstRC && !SrcRC->hasSuperClassEq(DstRC)) {
       unsigned NewVReg = MRI->createVirtualRegister(DstRC);
       BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(),
               TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg);
@@ -543,17 +548,18 @@ InstrEmitter::EmitCopyToRegClassNode(SDNode *Node,
 void InstrEmitter::EmitRegSequence(SDNode *Node,
                                   DenseMap<SDValue, unsigned> &VRBaseMap,
                                   bool IsClone, bool IsCloned) {
-  const TargetRegisterClass *RC = TLI->getRegClassFor(Node->getValueType(0));
+  unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+  const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx);
   unsigned NewVReg = MRI->createVirtualRegister(RC);
   MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(),
                              TII->get(TargetOpcode::REG_SEQUENCE), NewVReg);
   unsigned NumOps = Node->getNumOperands();
-  assert((NumOps & 1) == 0 &&
-         "REG_SEQUENCE must have an even number of operands!");
+  assert((NumOps & 1) == 1 &&
+         "REG_SEQUENCE must have an odd number of operands!");
   const TargetInstrDesc &II = TII->get(TargetOpcode::REG_SEQUENCE);
-  for (unsigned i = 0; i != NumOps; ++i) {
+  for (unsigned i = 1; i != NumOps; ++i) {
     SDValue Op = Node->getOperand(i);
-    if (i & 1) {
+    if ((i & 1) == 0) {
       unsigned SubIdx = cast<ConstantSDNode>(Op)->getZExtValue();
       unsigned SubReg = getVR(Node->getOperand(i-1), VRBaseMap);
       const TargetRegisterClass *TRC = MRI->getRegClass(SubReg);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index b837261..4b6d3ef 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -14,23 +14,16 @@
 #include "llvm/Analysis/DebugInfo.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
 #include "llvm/CallingConv.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/GlobalVariable.h"
 #include "llvm/LLVMContext.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
@@ -57,19 +50,13 @@ class SelectionDAGLegalize {
   const TargetMachine &TM;
   const TargetLowering &TLI;
   SelectionDAG &DAG;
-  CodeGenOpt::Level OptLevel;
 
   // Libcall insertion helpers.
 
-  /// LastCALLSEQ_END - This keeps track of the CALLSEQ_END node that has been
+  /// LastCALLSEQ - This keeps track of the CALLSEQ_END node that has been
   /// legalized.  We use this to ensure that calls are properly serialized
   /// against each other, including inserted libcalls.
-  SDValue LastCALLSEQ_END;
-
-  /// IsLegalizingCall - This member is used *only* for purposes of providing
-  /// helpful assertions that a libcall isn't created while another call is
-  /// being legalized (which could lead to non-serialized call sequences).
-  bool IsLegalizingCall;
+  SmallVector<SDValue, 8> LastCALLSEQ;
 
   enum LegalizeAction {
     Legal,      // The target natively supports this operation.
@@ -98,13 +85,13 @@ class SelectionDAGLegalize {
   }
 
 public:
-  SelectionDAGLegalize(SelectionDAG &DAG, CodeGenOpt::Level ol);
+  explicit SelectionDAGLegalize(SelectionDAG &DAG);
 
   /// getTypeAction - Return how we should legalize values of this type, either
   /// it is already legal or we need to expand it into multiple registers of
   /// smaller integer type, or we need to promote it to a larger type.
   LegalizeAction getTypeAction(EVT VT) const {
-    return (LegalizeAction)ValueTypeActions.getTypeAction(VT);
+    return (LegalizeAction)TLI.getTypeAction(*DAG.getContext(), VT);
   }
 
   /// isTypeLegal - Return true if this type is legal on this target.
@@ -147,6 +134,9 @@ private:
                              DebugLoc dl);
 
   SDValue ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned);
+  SDValue ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, const SDValue *Ops,
+                        unsigned NumOps, bool isSigned, DebugLoc dl);
+
   std::pair<SDValue, SDValue> ExpandChainLibCall(RTLIB::Libcall LC,
                                                  SDNode *Node, bool isSigned);
   SDValue ExpandFPLibCall(SDNode *Node, RTLIB::Libcall Call_F32,
@@ -158,7 +148,7 @@ private:
                            RTLIB::Libcall Call_I32,
                            RTLIB::Libcall Call_I64,
                            RTLIB::Libcall Call_I128);
-  SDValue ExpandDivRemLibCall(SDNode *Node, bool isSigned, bool isDIV);
+  void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
 
   SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, DebugLoc dl);
   SDValue ExpandBUILD_VECTOR(SDNode *Node);
@@ -184,6 +174,15 @@ private:
 
   void ExpandNode(SDNode *Node, SmallVectorImpl<SDValue> &Results);
   void PromoteNode(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+
+  SDValue getLastCALLSEQ() { return LastCALLSEQ.back();  }
+  void setLastCALLSEQ(const SDValue s) { LastCALLSEQ.back() = s; }
+  void pushLastCALLSEQ(SDValue s) {
+    LastCALLSEQ.push_back(s);
+  }
+  void popLastCALLSEQ() {
+    LastCALLSEQ.pop_back();
+  }
 };
 }
 
@@ -219,18 +218,16 @@ SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT,  DebugLoc dl,
   return DAG.getVectorShuffle(NVT, dl, N1, N2, &NewMask[0]);
 }
 
-SelectionDAGLegalize::SelectionDAGLegalize(SelectionDAG &dag,
-                                           CodeGenOpt::Level ol)
+SelectionDAGLegalize::SelectionDAGLegalize(SelectionDAG &dag)
   : TM(dag.getTarget()), TLI(dag.getTargetLoweringInfo()),
-    DAG(dag), OptLevel(ol),
+    DAG(dag),
     ValueTypeActions(TLI.getValueTypeActions()) {
   assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE &&
          "Too many value types for ValueTypeActions to hold!");
 }
 
 void SelectionDAGLegalize::LegalizeDAG() {
-  LastCALLSEQ_END = DAG.getEntryNode();
-  IsLegalizingCall = false;
+  pushLastCALLSEQ(DAG.getEntryNode());
 
   // The legalize process is inherently a bottom-up recursive process (users
   // legalize their uses before themselves).  Given infinite stack space, we
@@ -258,14 +255,15 @@ void SelectionDAGLegalize::LegalizeDAG() {
 /// FindCallEndFromCallStart - Given a chained node that is part of a call
 /// sequence, find the CALLSEQ_END node that terminates the call sequence.
 static SDNode *FindCallEndFromCallStart(SDNode *Node, int depth = 0) {
-  // Nested CALLSEQ_START/END constructs aren't yet legal,
-  // but we can DTRT and handle them correctly here.
+  int next_depth = depth;
   if (Node->getOpcode() == ISD::CALLSEQ_START)
-    depth++;
-  else if (Node->getOpcode() == ISD::CALLSEQ_END) {
-    depth--;
-    if (depth == 0)
+    next_depth = depth + 1;
+  if (Node->getOpcode() == ISD::CALLSEQ_END) {
+    assert(depth > 0 && "negative depth!");
+    if (depth == 1)
       return Node;
+    else
+      next_depth = depth - 1;
   }
   if (Node->use_empty())
     return 0;   // No CallSeqEnd
@@ -296,7 +294,7 @@ static SDNode *FindCallEndFromCallStart(SDNode *Node, int depth = 0) {
     SDNode *User = *UI;
     for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i)
       if (User->getOperand(i) == TheChain)
-        if (SDNode *Result = FindCallEndFromCallStart(User, depth))
+        if (SDNode *Result = FindCallEndFromCallStart(User, next_depth))
           return Result;
   }
   return 0;
@@ -317,6 +315,7 @@ static SDNode *FindCallStartFromCallEnd(SDNode *Node) {
     case ISD::CALLSEQ_START:
       if (!nested)
         return Node;
+      Node = Node->getOperand(0).getNode();
       nested--;
       break;
     case ISD::CALLSEQ_END:
@@ -324,7 +323,7 @@ static SDNode *FindCallStartFromCallEnd(SDNode *Node) {
       break;
     }
   }
-  return 0;
+  return (Node->getOpcode() == ISD::CALLSEQ_START) ? Node : 0;
 }
 
 /// LegalizeAllNodesNotLeadingTo - Recursively walk the uses of N, looking to
@@ -433,68 +432,67 @@ SDValue ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
       SDValue Result = DAG.getNode(ISD::BITCAST, dl, intVT, Val);
       return DAG.getStore(Chain, dl, Result, Ptr, ST->getPointerInfo(),
                           ST->isVolatile(), ST->isNonTemporal(), Alignment);
-    } else {
-      // Do a (aligned) store to a stack slot, then copy from the stack slot
-      // to the final destination using (unaligned) integer loads and stores.
-      EVT StoredVT = ST->getMemoryVT();
-      EVT RegVT =
-        TLI.getRegisterType(*DAG.getContext(),
-                            EVT::getIntegerVT(*DAG.getContext(),
-                                              StoredVT.getSizeInBits()));
-      unsigned StoredBytes = StoredVT.getSizeInBits() / 8;
-      unsigned RegBytes = RegVT.getSizeInBits() / 8;
-      unsigned NumRegs = (StoredBytes + RegBytes - 1) / RegBytes;
-
-      // Make sure the stack slot is also aligned for the register type.
-      SDValue StackPtr = DAG.CreateStackTemporary(StoredVT, RegVT);
-
-      // Perform the original store, only redirected to the stack slot.
-      SDValue Store = DAG.getTruncStore(Chain, dl,
-                                        Val, StackPtr, MachinePointerInfo(),
-                                        StoredVT, false, false, 0);
-      SDValue Increment = DAG.getConstant(RegBytes, TLI.getPointerTy());
-      SmallVector<SDValue, 8> Stores;
-      unsigned Offset = 0;
-
-      // Do all but one copies using the full register width.
-      for (unsigned i = 1; i < NumRegs; i++) {
-        // Load one integer register's worth from the stack slot.
-        SDValue Load = DAG.getLoad(RegVT, dl, Store, StackPtr,
-                                   MachinePointerInfo(),
-                                   false, false, 0);
-        // Store it to the final location.  Remember the store.
-        Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, Ptr,
-                                    ST->getPointerInfo().getWithOffset(Offset),
-                                      ST->isVolatile(), ST->isNonTemporal(),
-                                      MinAlign(ST->getAlignment(), Offset)));
-        // Increment the pointers.
-        Offset += RegBytes;
-        StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
-                               Increment);
-        Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
-      }
+    }
+    // Do a (aligned) store to a stack slot, then copy from the stack slot
+    // to the final destination using (unaligned) integer loads and stores.
+    EVT StoredVT = ST->getMemoryVT();
+    EVT RegVT =
+      TLI.getRegisterType(*DAG.getContext(),
+                          EVT::getIntegerVT(*DAG.getContext(),
+                                            StoredVT.getSizeInBits()));
+    unsigned StoredBytes = StoredVT.getSizeInBits() / 8;
+    unsigned RegBytes = RegVT.getSizeInBits() / 8;
+    unsigned NumRegs = (StoredBytes + RegBytes - 1) / RegBytes;
 
-      // The last store may be partial.  Do a truncating store.  On big-endian
-      // machines this requires an extending load from the stack slot to ensure
-      // that the bits are in the right place.
-      EVT MemVT = EVT::getIntegerVT(*DAG.getContext(),
-                                    8 * (StoredBytes - Offset));
+    // Make sure the stack slot is also aligned for the register type.
+    SDValue StackPtr = DAG.CreateStackTemporary(StoredVT, RegVT);
 
-      // Load from the stack slot.
-      SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Store, StackPtr,
-                                    MachinePointerInfo(),
-                                    MemVT, false, false, 0);
+    // Perform the original store, only redirected to the stack slot.
+    SDValue Store = DAG.getTruncStore(Chain, dl,
+                                      Val, StackPtr, MachinePointerInfo(),
+                                      StoredVT, false, false, 0);
+    SDValue Increment = DAG.getConstant(RegBytes, TLI.getPointerTy());
+    SmallVector<SDValue, 8> Stores;
+    unsigned Offset = 0;
 
-      Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr,
-                                         ST->getPointerInfo()
-                                           .getWithOffset(Offset),
-                                         MemVT, ST->isVolatile(),
-                                         ST->isNonTemporal(),
-                                         MinAlign(ST->getAlignment(), Offset)));
-      // The order of the stores doesn't matter - say it with a TokenFactor.
-      return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Stores[0],
-                         Stores.size());
+    // Do all but one copies using the full register width.
+    for (unsigned i = 1; i < NumRegs; i++) {
+      // Load one integer register's worth from the stack slot.
+      SDValue Load = DAG.getLoad(RegVT, dl, Store, StackPtr,
+                                 MachinePointerInfo(),
+                                 false, false, 0);
+      // Store it to the final location.  Remember the store.
+      Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, Ptr,
+                                  ST->getPointerInfo().getWithOffset(Offset),
+                                    ST->isVolatile(), ST->isNonTemporal(),
+                                    MinAlign(ST->getAlignment(), Offset)));
+      // Increment the pointers.
+      Offset += RegBytes;
+      StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+                             Increment);
+      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
     }
+
+    // The last store may be partial.  Do a truncating store.  On big-endian
+    // machines this requires an extending load from the stack slot to ensure
+    // that the bits are in the right place.
+    EVT MemVT = EVT::getIntegerVT(*DAG.getContext(),
+                                  8 * (StoredBytes - Offset));
+
+    // Load from the stack slot.
+    SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Store, StackPtr,
+                                  MachinePointerInfo(),
+                                  MemVT, false, false, 0);
+
+    Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr,
+                                       ST->getPointerInfo()
+                                         .getWithOffset(Offset),
+                                       MemVT, ST->isVolatile(),
+                                       ST->isNonTemporal(),
+                                       MinAlign(ST->getAlignment(), Offset)));
+    // The order of the stores doesn't matter - say it with a TokenFactor.
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Stores[0],
+                       Stores.size());
   }
   assert(ST->getMemoryVT().isInteger() &&
          !ST->getMemoryVT().isVector() &&
@@ -941,11 +939,12 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
     case ISD::BR_JT:
     case ISD::BR_CC:
     case ISD::BRCOND:
-      // Branches tweak the chain to include LastCALLSEQ_END
+      assert(LastCALLSEQ.size() == 1 && "branch inside CALLSEQ_BEGIN/END?");
+      // Branches tweak the chain to include LastCALLSEQ
       Ops[0] = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ops[0],
-                            LastCALLSEQ_END);
+                           getLastCALLSEQ());
       Ops[0] = LegalizeOp(Ops[0]);
-      LastCALLSEQ_END = DAG.getEntryNode();
+      setLastCALLSEQ(DAG.getEntryNode());
       break;
     case ISD::SHL:
     case ISD::SRL:
@@ -1034,6 +1033,7 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
     break;
   case ISD::CALLSEQ_START: {
     SDNode *CallEnd = FindCallEndFromCallStart(Node);
+    assert(CallEnd && "didn't find CALLSEQ_END!");
 
     // Recursively Legalize all of the inputs of the call end that do not lead
     // to this call start.  This ensures that any libcalls that need be inserted
@@ -1050,9 +1050,9 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
 
     // Merge in the last call to ensure that this call starts after the last
     // call ended.
-    if (LastCALLSEQ_END.getOpcode() != ISD::EntryToken) {
+    if (getLastCALLSEQ().getOpcode() != ISD::EntryToken) {
       Tmp1 = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                         Tmp1, LastCALLSEQ_END);
+                         Tmp1, getLastCALLSEQ());
       Tmp1 = LegalizeOp(Tmp1);
     }
 
@@ -1073,25 +1073,29 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
     // sequence have been legalized, legalize the call itself.  During this
     // process, no libcalls can/will be inserted, guaranteeing that no calls
     // can overlap.
-    assert(!IsLegalizingCall && "Inconsistent sequentialization of calls!");
     // Note that we are selecting this call!
-    LastCALLSEQ_END = SDValue(CallEnd, 0);
-    IsLegalizingCall = true;
+    setLastCALLSEQ(SDValue(CallEnd, 0));
 
     // Legalize the call, starting from the CALLSEQ_END.
-    LegalizeOp(LastCALLSEQ_END);
-    assert(!IsLegalizingCall && "CALLSEQ_END should have cleared this!");
+    LegalizeOp(getLastCALLSEQ());
     return Result;
   }
   case ISD::CALLSEQ_END:
-    // If the CALLSEQ_START node hasn't been legalized first, legalize it.  This
-    // will cause this node to be legalized as well as handling libcalls right.
-    if (LastCALLSEQ_END.getNode() != Node) {
-      LegalizeOp(SDValue(FindCallStartFromCallEnd(Node), 0));
-      DenseMap<SDValue, SDValue>::iterator I = LegalizedNodes.find(Op);
-      assert(I != LegalizedNodes.end() &&
-             "Legalizing the call start should have legalized this node!");
-      return I->second;
+    {
+      SDNode *myCALLSEQ_BEGIN = FindCallStartFromCallEnd(Node);
+
+      // If the CALLSEQ_START node hasn't been legalized first, legalize it.
+      // This will cause this node to be legalized as well as handling libcalls
+      // right.
+      if (getLastCALLSEQ().getNode() != Node) {
+        LegalizeOp(SDValue(myCALLSEQ_BEGIN, 0));
+        DenseMap<SDValue, SDValue>::iterator I = LegalizedNodes.find(Op);
+        assert(I != LegalizedNodes.end() &&
+               "Legalizing the call start should have legalized this node!");
+        return I->second;
+      }
+
+      pushLastCALLSEQ(SDValue(myCALLSEQ_BEGIN, 0));
     }
 
     // Otherwise, the call start has been legalized and everything is going
@@ -1119,9 +1123,8 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
                          Result.getResNo());
       }
     }
-    assert(IsLegalizingCall && "Call sequence imbalance between start/end?");
     // This finishes up call legalization.
-    IsLegalizingCall = false;
+    popLastCALLSEQ();
 
     // If the CALLSEQ_END node has a flag, remember that we legalized it.
     AddLegalizedOperand(SDValue(Node, 0), Result.getValue(0));
@@ -1371,6 +1374,91 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
           Tmp2 = LegalizeOp(Load.getValue(1));
           break;
         }
+
+        // If this is a promoted vector load, and the vector element types are
+        // legal, then scalarize it.
+        if (ExtType == ISD::EXTLOAD && SrcVT.isVector() &&
+          isTypeLegal(Node->getValueType(0).getScalarType())) {
+          SmallVector<SDValue, 8> LoadVals;
+          SmallVector<SDValue, 8> LoadChains;
+          unsigned NumElem = SrcVT.getVectorNumElements();
+          unsigned Stride = SrcVT.getScalarType().getSizeInBits()/8;
+
+          for (unsigned Idx=0; Idx<NumElem; Idx++) {
+            Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
+                                DAG.getIntPtrConstant(Stride));
+            SDValue ScalarLoad = DAG.getExtLoad(ISD::EXTLOAD, dl,
+                  Node->getValueType(0).getScalarType(),
+                  Tmp1, Tmp2, LD->getPointerInfo().getWithOffset(Idx * Stride),
+                  SrcVT.getScalarType(),
+                  LD->isVolatile(), LD->isNonTemporal(),
+                  LD->getAlignment());
+
+            LoadVals.push_back(ScalarLoad.getValue(0));
+            LoadChains.push_back(ScalarLoad.getValue(1));
+          }
+          Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+            &LoadChains[0], LoadChains.size());
+          SDValue ValRes = DAG.getNode(ISD::BUILD_VECTOR, dl,
+            Node->getValueType(0), &LoadVals[0], LoadVals.size());
+
+          Tmp1 = LegalizeOp(ValRes);  // Relegalize new nodes.
+          Tmp2 = LegalizeOp(Result.getValue(0));  // Relegalize new nodes.
+          break;
+        }
+
+        // If this is a promoted vector load, and the vector element types are
+        // illegal, create the promoted vector from bitcasted segments.
+        if (ExtType == ISD::EXTLOAD && SrcVT.isVector()) {
+          EVT MemElemTy = Node->getValueType(0).getScalarType();
+          EVT SrcSclrTy = SrcVT.getScalarType();
+          unsigned SizeRatio =
+            (MemElemTy.getSizeInBits() / SrcSclrTy.getSizeInBits());
+
+          SmallVector<SDValue, 8> LoadVals;
+          SmallVector<SDValue, 8> LoadChains;
+          unsigned NumElem = SrcVT.getVectorNumElements();
+          unsigned Stride = SrcVT.getScalarType().getSizeInBits()/8;
+
+          for (unsigned Idx=0; Idx<NumElem; Idx++) {
+            Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
+                                DAG.getIntPtrConstant(Stride));
+            SDValue ScalarLoad = DAG.getExtLoad(ISD::EXTLOAD, dl,
+                  SrcVT.getScalarType(),
+                  Tmp1, Tmp2, LD->getPointerInfo().getWithOffset(Idx * Stride),
+                  SrcVT.getScalarType(),
+                  LD->isVolatile(), LD->isNonTemporal(),
+                  LD->getAlignment());
+            if (TLI.isBigEndian()) {
+              // MSB (which is garbage, comes first)
+              LoadVals.push_back(ScalarLoad.getValue(0));
+              for (unsigned i = 0; i<SizeRatio-1; ++i)
+                LoadVals.push_back(DAG.getUNDEF(SrcVT.getScalarType()));
+            } else {
+              // LSB (which is data, comes first)
+              for (unsigned i = 0; i<SizeRatio-1; ++i)
+                LoadVals.push_back(DAG.getUNDEF(SrcVT.getScalarType()));
+              LoadVals.push_back(ScalarLoad.getValue(0));
+            }
+            LoadChains.push_back(ScalarLoad.getValue(1));
+          }
+
+          Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+            &LoadChains[0], LoadChains.size());
+          EVT TempWideVector = EVT::getVectorVT(*DAG.getContext(),
+            SrcVT.getScalarType(), NumElem*SizeRatio);
+          SDValue ValRes = DAG.getNode(ISD::BUILD_VECTOR, dl, 
+            TempWideVector, &LoadVals[0], LoadVals.size());
+
+          // Cast to the correct type
+          ValRes = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0), ValRes);
+
+          Tmp1 = LegalizeOp(ValRes);  // Relegalize new nodes.
+          Tmp2 = LegalizeOp(Result.getValue(0));  // Relegalize new nodes.
+          break;
+
+        }
+
         // FIXME: This does not work for vectors on most targets.  Sign- and
         // zero-extend operations are currently folded into extending loads,
         // whether they are legal or not, and then we end up here without any
@@ -1546,6 +1634,88 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
           Result = TLI.LowerOperation(Result, DAG);
           break;
         case Expand:
+
+          EVT WideScalarVT = Tmp3.getValueType().getScalarType();
+          EVT NarrowScalarVT = StVT.getScalarType();
+
+          // The Store type is illegal, must scalarize the vector store.
+          SmallVector<SDValue, 8> Stores;
+          bool ScalarLegal = isTypeLegal(WideScalarVT);
+          if (!isTypeLegal(StVT) && StVT.isVector() && ScalarLegal) {
+            unsigned NumElem = StVT.getVectorNumElements();
+
+            unsigned ScalarSize = StVT.getScalarType().getSizeInBits();
+            // Round odd types to the next pow of two.
+            if (!isPowerOf2_32(ScalarSize))
+              ScalarSize = NextPowerOf2(ScalarSize);
+            // Types smaller than 8 bits are promoted to 8 bits.
+            ScalarSize = std::max<unsigned>(ScalarSize, 8);
+            // Store stride
+            unsigned Stride = ScalarSize/8;
+            assert(isPowerOf2_32(Stride) && "Stride must be a power of two");
+
+            for (unsigned Idx=0; Idx<NumElem; Idx++) {
+              SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                                       WideScalarVT, Tmp3, DAG.getIntPtrConstant(Idx));
+
+
+              EVT NVT = EVT::getIntegerVT(*DAG.getContext(), ScalarSize);
+
+              Ex = DAG.getNode(ISD::TRUNCATE, dl, NVT, Ex);
+              Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
+                                 DAG.getIntPtrConstant(Stride));
+              SDValue Store = DAG.getStore(Tmp1, dl, Ex, Tmp2,
+                                           ST->getPointerInfo().getWithOffset(Idx*Stride),
+                                           isVolatile, isNonTemporal, Alignment);
+              Stores.push_back(Store);
+            }
+            Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                                 &Stores[0], Stores.size());
+            break;
+          }
+
+          // The Store type is illegal, must scalarize the vector store.
+          // However, the scalar type is illegal. Must bitcast the result
+          // and store it in smaller parts.
+          if (!isTypeLegal(StVT) && StVT.isVector()) {
+            unsigned WideNumElem = StVT.getVectorNumElements();
+            unsigned Stride = NarrowScalarVT.getSizeInBits()/8;
+
+            unsigned SizeRatio =
+              (WideScalarVT.getSizeInBits() / NarrowScalarVT.getSizeInBits());
+
+            EVT CastValueVT = EVT::getVectorVT(*DAG.getContext(), NarrowScalarVT,
+                                               SizeRatio*WideNumElem);
+
+            // Cast the wide elem vector to wider vec with smaller elem type.
+            // Example <2 x i64> -> <4 x i32>
+            Tmp3 = DAG.getNode(ISD::BITCAST, dl, CastValueVT, Tmp3);
+
+            for (unsigned Idx=0; Idx<WideNumElem*SizeRatio; Idx++) {
+              // Extract elment i
+              SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                                       NarrowScalarVT, Tmp3, DAG.getIntPtrConstant(Idx));
+              // bump pointer.
+              Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
+                                 DAG.getIntPtrConstant(Stride));
+
+              // Store if, this element is:
+              //  - First element on big endian, or
+              //  - Last element on little endian
+              if (( TLI.isBigEndian() && (Idx%SizeRatio == 0)) ||
+                  ((!TLI.isBigEndian() && (Idx%SizeRatio == SizeRatio-1)))) {
+                SDValue Store = DAG.getStore(Tmp1, dl, Ex, Tmp2,
+                                             ST->getPointerInfo().getWithOffset(Idx*Stride),
+                                             isVolatile, isNonTemporal, Alignment);
+                Stores.push_back(Store);
+              }
+            }
+            Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                                 &Stores[0], Stores.size());
+            break;
+          }
+
+
           // TRUNCSTORE:i16 i32 -> STORE i16
           assert(isTypeLegal(StVT) && "Do not know how to expand this store!");
           Tmp3 = DAG.getNode(ISD::TRUNCATE, dl, StVT, Tmp3);
@@ -2007,7 +2177,6 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
 // and leave the Hi part unset.
 SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
                                             bool isSigned) {
-  assert(!IsLegalizingCall && "Cannot overlap legalization of calls!");
   // The input chain to this libcall is the entry node of the function.
   // Legalizing the call will automatically add the previous call to the
   // dependence.
@@ -2043,9 +2212,43 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
     return DAG.getRoot();
 
   // Legalize the call sequence, starting with the chain.  This will advance
+  // the LastCALLSEQ to the legalized version of the CALLSEQ_END node that
+  // was added by LowerCallTo (guaranteeing proper serialization of calls).
+  LegalizeOp(CallInfo.second);
+  return CallInfo.first;
+}
+
+/// ExpandLibCall - Generate a libcall taking the given operands as arguments
+/// and returning a result of type RetVT.
+SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
+                                            const SDValue *Ops, unsigned NumOps,
+                                            bool isSigned, DebugLoc dl) {
+  TargetLowering::ArgListTy Args;
+  Args.reserve(NumOps);
+
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    Entry.Node = Ops[i];
+    Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
+    Entry.isSExt = isSigned;
+    Entry.isZExt = !isSigned;
+    Args.push_back(Entry);
+  }
+  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+                                         TLI.getPointerTy());
+
+  const Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+  std::pair<SDValue,SDValue> CallInfo =
+  TLI.LowerCallTo(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false,
+                  false, 0, TLI.getLibcallCallingConv(LC), false,
+                  /*isReturnValueUsed=*/true,
+                  Callee, Args, DAG, dl);
+
+  // Legalize the call sequence, starting with the chain.  This will advance
   // the LastCALLSEQ_END to the legalized version of the CALLSEQ_END node that
   // was added by LowerCallTo (guaranteeing proper serialization of calls).
   LegalizeOp(CallInfo.second);
+
   return CallInfo.first;
 }
 
@@ -2055,7 +2258,6 @@ std::pair<SDValue, SDValue>
 SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
                                          SDNode *Node,
                                          bool isSigned) {
-  assert(!IsLegalizingCall && "Cannot overlap legalization of calls!");
   SDValue InChain = Node->getOperand(0);
 
   TargetLowering::ArgListTy Args;
@@ -2081,7 +2283,7 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
                     Callee, Args, DAG, Node->getDebugLoc());
 
   // Legalize the call sequence, starting with the chain.  This will advance
-  // the LastCALLSEQ_END to the legalized version of the CALLSEQ_END node that
+  // the LastCALLSEQ to the legalized version of the CALLSEQ_END node that
   // was added by LowerCallTo (guaranteeing proper serialization of calls).
   LegalizeOp(CallInfo.second);
   return CallInfo;
@@ -2121,10 +2323,9 @@ SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
   return ExpandLibCall(LC, Node, isSigned);
 }
 
-/// ExpandDivRemLibCall - Issue libcalls to __{u}divmod to compute div / rem
-/// pairs.
-SDValue SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, bool isSigned,
-                                                  bool isDIV) {
+/// isDivRemLibcallAvailable - Return true if divmod libcall is available.
+static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
+                                     const TargetLowering &TLI) {
   RTLIB::Libcall LC;
   switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
   default: assert(0 && "Unexpected request for libcall!");
@@ -2135,17 +2336,18 @@ SDValue SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, bool isSigned,
   case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break;
   }
 
-  if (!TLI.getLibcallName(LC))
-    return SDValue();
+  return TLI.getLibcallName(LC) != 0;
+}
 
-  // Only issue divrem libcall if both quotient and remainder are needed.
+/// UseDivRem - Only issue divrem libcall if both quotient and remainder are
+/// needed.
+static bool UseDivRem(SDNode *Node, bool isSigned, bool isDIV) {
   unsigned OtherOpcode = 0;
-  if (isSigned) {
+  if (isSigned)
     OtherOpcode = isDIV ? ISD::SREM : ISD::SDIV;
-  } else {
+  else
     OtherOpcode = isDIV ? ISD::UREM : ISD::UDIV;
-  }
-  SDNode *OtherNode = 0;
+
   SDValue Op0 = Node->getOperand(0);
   SDValue Op1 = Node->getOperand(1);
   for (SDNode::use_iterator UI = Op0.getNode()->use_begin(),
@@ -2155,32 +2357,28 @@ SDValue SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, bool isSigned,
       continue;
     if (User->getOpcode() == OtherOpcode &&
         User->getOperand(0) == Op0 &&
-        User->getOperand(1) == Op1) {
-      OtherNode = User;
-      break;
-    }
+        User->getOperand(1) == Op1)
+      return true;
   }
-  if (!OtherNode)
-    return SDValue();
+  return false;
+}
 
-  // If the libcall is already generated, no need to issue it again.
-  DenseMap<SDValue, SDValue>::iterator I
-    = LegalizedNodes.find(SDValue(OtherNode,0));
-  if (I != LegalizedNodes.end()) {
-    OtherNode = I->second.getNode();
-    SDNode *Chain = OtherNode->getOperand(0).getNode();
-    for (SDNode::use_iterator UI = Chain->use_begin(), UE = Chain->use_end();
-         UI != UE; ++UI) {
-      SDNode *User = *UI;
-      if (User == OtherNode)
-        continue;
-      if (isDIV) {
-        assert(User->getOpcode() == ISD::CopyFromReg);
-      } else {
-        assert(User->getOpcode() == ISD::LOAD);
-      }
-      return SDValue(User, 0);
-    }
+/// ExpandDivRemLibCall - Issue libcalls to __{u}divmod to compute div / rem
+/// pairs.
+void
+SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
+                                          SmallVectorImpl<SDValue> &Results) {
+  unsigned Opcode = Node->getOpcode();
+  bool isSigned = Opcode == ISD::SDIVREM;
+
+  RTLIB::Libcall LC;
+  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  default: assert(0 && "Unexpected request for libcall!");
+  case MVT::i8:   LC= isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
+  case MVT::i16:  LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
+  case MVT::i32:  LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
+  case MVT::i64:  LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
+  case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break;
   }
 
   // The input chain to this libcall is the entry node of the function.
@@ -2221,14 +2419,15 @@ SDValue SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, bool isSigned,
                     /*isReturnValueUsed=*/true, Callee, Args, DAG, dl);
 
   // Legalize the call sequence, starting with the chain.  This will advance
-  // the LastCALLSEQ_END to the legalized version of the CALLSEQ_END node that
+  // the LastCALLSEQ to the legalized version of the CALLSEQ_END node that
   // was added by LowerCallTo (guaranteeing proper serialization of calls).
   LegalizeOp(CallInfo.second);
 
   // Remainder is loaded back from the stack frame.
-  SDValue Rem = DAG.getLoad(RetVT, dl, LastCALLSEQ_END, FIPtr,
+  SDValue Rem = DAG.getLoad(RetVT, dl, getLastCALLSEQ(), FIPtr,
                             MachinePointerInfo(), false, false, 0);
-  return isDIV ? CallInfo.first : Rem;
+  Results.push_back(CallInfo.first);
+  Results.push_back(Rem);
 }
 
 /// ExpandLegalINT_TO_FP - This function is responsible for legalizing a
@@ -2878,7 +3077,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node,
   }
   case ISD::FP_ROUND_INREG: {
     // The only way we can lower this is to turn it into a TRUNCSTORE,
-    // EXTLOAD pair, targetting a temporary location (a stack slot).
+    // EXTLOAD pair, targeting a temporary location (a stack slot).
 
     // NOTE: there is a choice here between constantly creating new stack
     // slots and always reusing the same one.  We currently always create
@@ -3204,28 +3403,25 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node,
     unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
     Tmp2 = Node->getOperand(0);
     Tmp3 = Node->getOperand(1);
-    if (TLI.isOperationLegalOrCustom(DivRemOpc, VT)) {
+    if (TLI.isOperationLegalOrCustom(DivRemOpc, VT) ||
+        (isDivRemLibcallAvailable(Node, isSigned, TLI) &&
+         UseDivRem(Node, isSigned, false))) {
       Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Tmp2, Tmp3).getValue(1);
     } else if (TLI.isOperationLegalOrCustom(DivOpc, VT)) {
       // X % Y -> X-X/Y*Y
       Tmp1 = DAG.getNode(DivOpc, dl, VT, Tmp2, Tmp3);
       Tmp1 = DAG.getNode(ISD::MUL, dl, VT, Tmp1, Tmp3);
       Tmp1 = DAG.getNode(ISD::SUB, dl, VT, Tmp2, Tmp1);
-    } else if (isSigned) {
-      Tmp1 = ExpandDivRemLibCall(Node, true, false);
-      if (!Tmp1.getNode())
-        Tmp1 = ExpandIntLibCall(Node, true,
-                                RTLIB::SREM_I8,
-                                RTLIB::SREM_I16, RTLIB::SREM_I32,
-                                RTLIB::SREM_I64, RTLIB::SREM_I128);
-    } else {
-      Tmp1 = ExpandDivRemLibCall(Node, false, false);
-      if (!Tmp1.getNode())
-        Tmp1 = ExpandIntLibCall(Node, false,
-                                RTLIB::UREM_I8,
-                                RTLIB::UREM_I16, RTLIB::UREM_I32,
-                                RTLIB::UREM_I64, RTLIB::UREM_I128);
-    }
+    } else if (isSigned)
+      Tmp1 = ExpandIntLibCall(Node, true,
+                              RTLIB::SREM_I8,
+                              RTLIB::SREM_I16, RTLIB::SREM_I32,
+                              RTLIB::SREM_I64, RTLIB::SREM_I128);
+    else
+      Tmp1 = ExpandIntLibCall(Node, false,
+                              RTLIB::UREM_I8,
+                              RTLIB::UREM_I16, RTLIB::UREM_I32,
+                              RTLIB::UREM_I64, RTLIB::UREM_I128);
     Results.push_back(Tmp1);
     break;
   }
@@ -3235,26 +3431,21 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node,
     unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
     EVT VT = Node->getValueType(0);
     SDVTList VTs = DAG.getVTList(VT, VT);
-    if (TLI.isOperationLegalOrCustom(DivRemOpc, VT))
+    if (TLI.isOperationLegalOrCustom(DivRemOpc, VT) ||
+        (isDivRemLibcallAvailable(Node, isSigned, TLI) &&
+         UseDivRem(Node, isSigned, true)))
       Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Node->getOperand(0),
                          Node->getOperand(1));
-    else if (isSigned) {
-      Tmp1 = ExpandDivRemLibCall(Node, true, true);
-      if (!Tmp1.getNode()) {
-        Tmp1 = ExpandIntLibCall(Node, true,
-                                RTLIB::SDIV_I8,
-                                RTLIB::SDIV_I16, RTLIB::SDIV_I32,
-                                RTLIB::SDIV_I64, RTLIB::SDIV_I128);
-      }
-    } else {
-      Tmp1 = ExpandDivRemLibCall(Node, false, true);
-      if (!Tmp1.getNode()) {
-        Tmp1 = ExpandIntLibCall(Node, false,
-                                RTLIB::UDIV_I8,
-                                RTLIB::UDIV_I16, RTLIB::UDIV_I32,
-                                RTLIB::UDIV_I64, RTLIB::UDIV_I128);
-      }
-    }
+    else if (isSigned)
+      Tmp1 = ExpandIntLibCall(Node, true,
+                              RTLIB::SDIV_I8,
+                              RTLIB::SDIV_I16, RTLIB::SDIV_I32,
+                              RTLIB::SDIV_I64, RTLIB::SDIV_I128);
+    else
+      Tmp1 = ExpandIntLibCall(Node, false,
+                              RTLIB::UDIV_I8,
+                              RTLIB::UDIV_I16, RTLIB::UDIV_I32,
+                              RTLIB::UDIV_I64, RTLIB::UDIV_I128);
     Results.push_back(Tmp1);
     break;
   }
@@ -3271,6 +3462,11 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node,
     Results.push_back(Tmp1.getValue(1));
     break;
   }
+  case ISD::SDIVREM:
+  case ISD::UDIVREM:
+    // Expand into divrem libcall
+    ExpandDivRemLibCall(Node, Results);
+    break;
   case ISD::MUL: {
     EVT VT = Node->getValueType(0);
     SDVTList VTs = DAG.getVTList(VT, VT);
@@ -3355,6 +3551,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node,
   case ISD::UMULO:
   case ISD::SMULO: {
     EVT VT = Node->getValueType(0);
+    EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() * 2);
     SDValue LHS = Node->getOperand(0);
     SDValue RHS = Node->getOperand(1);
     SDValue BottomHalf;
@@ -3372,7 +3569,6 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node,
       TopHalf = BottomHalf.getValue(1);
     } else if (TLI.isTypeLegal(EVT::getIntegerVT(*DAG.getContext(),
                                                  VT.getSizeInBits() * 2))) {
-      EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() * 2);
       LHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, LHS);
       RHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, RHS);
       Tmp1 = DAG.getNode(ISD::MUL, dl, WideVT, LHS, RHS);
@@ -3385,7 +3581,6 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node,
       // have a libcall big enough.
       // Also, we can fall back to a division in some cases, but that's a big
       // performance hit in the general case.
-      EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() * 2);
       RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
       if (WideVT == MVT::i16)
         LC = RTLIB::MUL_I16;
@@ -3396,15 +3591,27 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node,
       else if (WideVT == MVT::i128)
         LC = RTLIB::MUL_I128;
       assert(LC != RTLIB::UNKNOWN_LIBCALL && "Cannot expand this operation!");
-      LHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, LHS);
-      RHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, RHS);
 
-      SDValue Ret = ExpandLibCall(LC, Node, isSigned);
-      BottomHalf = DAG.getNode(ISD::TRUNCATE, dl, VT, Ret);
-      TopHalf = DAG.getNode(ISD::SRL, dl, Ret.getValueType(), Ret,
-                       DAG.getConstant(VT.getSizeInBits(), TLI.getPointerTy()));
-      TopHalf = DAG.getNode(ISD::TRUNCATE, dl, VT, TopHalf);
+      // The high part is obtained by SRA'ing all but one of the bits of low
+      // part.
+      unsigned LoSize = VT.getSizeInBits();
+      SDValue HiLHS = DAG.getNode(ISD::SRA, dl, VT, RHS,
+                                DAG.getConstant(LoSize-1, TLI.getPointerTy()));
+      SDValue HiRHS = DAG.getNode(ISD::SRA, dl, VT, LHS,
+                                DAG.getConstant(LoSize-1, TLI.getPointerTy()));
+
+      // Here we're passing the 2 arguments explicitly as 4 arguments that are
+      // pre-lowered to the correct types. This all depends upon WideVT not
+      // being a legal type for the architecture and thus has to be split to
+      // two arguments.
+      SDValue Args[] = { LHS, HiLHS, RHS, HiRHS };
+      SDValue Ret = ExpandLibCall(LC, WideVT, Args, 4, isSigned, dl);
+      BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Ret,
+                               DAG.getIntPtrConstant(0));
+      TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Ret,
+                            DAG.getIntPtrConstant(1));
     }
+
     if (isSigned) {
       Tmp1 = DAG.getConstant(VT.getSizeInBits() - 1,
                              TLI.getShiftAmountTy(BottomHalf.getValueType()));
@@ -3486,9 +3693,13 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node,
                          Tmp2.getOperand(0), Tmp2.getOperand(1),
                          Node->getOperand(2));
     } else {
+      // We test only the i1 bit.  Skip the AND if UNDEF.
+      Tmp3 = (Tmp2.getOpcode() == ISD::UNDEF) ? Tmp2 :
+        DAG.getNode(ISD::AND, dl, Tmp2.getValueType(), Tmp2,
+                    DAG.getConstant(1, Tmp2.getValueType()));
       Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other, Tmp1,
-                         DAG.getCondCode(ISD::SETNE), Tmp2,
-                         DAG.getConstant(0, Tmp2.getValueType()),
+                         DAG.getCondCode(ISD::SETNE), Tmp3,
+                         DAG.getConstant(0, Tmp3.getValueType()),
                          Node->getOperand(2));
     }
     Results.push_back(Tmp1);
@@ -3539,7 +3750,8 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node,
 
     LegalizeSetCCCondCode(TLI.getSetCCResultType(Tmp2.getValueType()),
                           Tmp2, Tmp3, Tmp4, dl);
-    LastCALLSEQ_END = DAG.getEntryNode();
+    assert(LastCALLSEQ.size() == 1 && "branch inside CALLSEQ_BEGIN/END?");
+    setLastCALLSEQ(DAG.getEntryNode());
 
     assert(!Tmp3.getNode() && "Can't legalize BR_CC with legal condition!");
     Tmp3 = DAG.getConstant(0, Tmp2.getValueType());
@@ -3697,9 +3909,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node,
 
 // SelectionDAG::Legalize - This is the entry point for the file.
 //
-void SelectionDAG::Legalize(CodeGenOpt::Level OptLevel) {
+void SelectionDAG::Legalize() {
   /// run - This is the main entry point to this class.
   ///
-  SelectionDAGLegalize(*this, OptLevel).LegalizeDAG();
+  SelectionDAGLegalize(*this).LegalizeDAG();
 }
-
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 935aab0..da75b8a 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -73,6 +73,17 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::UNDEF:       Res = PromoteIntRes_UNDEF(N); break;
   case ISD::VAARG:       Res = PromoteIntRes_VAARG(N); break;
 
+  case ISD::EXTRACT_SUBVECTOR:
+                         Res = PromoteIntRes_EXTRACT_SUBVECTOR(N); break;
+  case ISD::VECTOR_SHUFFLE:
+                         Res = PromoteIntRes_VECTOR_SHUFFLE(N); break;
+  case ISD::INSERT_VECTOR_ELT:
+                         Res = PromoteIntRes_INSERT_VECTOR_ELT(N); break;
+  case ISD::BUILD_VECTOR:
+                         Res = PromoteIntRes_BUILD_VECTOR(N); break;
+  case ISD::SCALAR_TO_VECTOR:
+                         Res = PromoteIntRes_SCALAR_TO_VECTOR(N); break;
+
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
   case ISD::ANY_EXTEND:  Res = PromoteIntRes_INT_EXTEND(N); break;
@@ -174,24 +185,30 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
   default:
     assert(false && "Unknown type action!");
     break;
-  case Legal:
+  case TargetLowering::TypeLegal:
     break;
-  case PromoteInteger:
+  case TargetLowering::TypePromoteInteger:
     if (NOutVT.bitsEq(NInVT))
       // The input promotes to the same size.  Convert the promoted value.
       return DAG.getNode(ISD::BITCAST, dl, NOutVT, GetPromotedInteger(InOp));
+    if (NInVT.isVector())
+      // Promote vector element via memory load/store.
+      return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
+                         CreateStackStoreLoad(InOp, OutVT));
     break;
-  case SoftenFloat:
+  case TargetLowering::TypeSoftenFloat:
     // Promote the integer operand by hand.
     return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, GetSoftenedFloat(InOp));
-  case ExpandInteger:
-  case ExpandFloat:
+  case TargetLowering::TypeExpandInteger:
+  case TargetLowering::TypeExpandFloat:
     break;
-  case ScalarizeVector:
+  case TargetLowering::TypeScalarizeVector:
     // Convert the element to an integer and promote it by hand.
-    return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
-                       BitConvertToInteger(GetScalarizedVector(InOp)));
-  case SplitVector: {
+    if (!NOutVT.isVector())
+      return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
+                         BitConvertToInteger(GetScalarizedVector(InOp)));
+    break;
+  case TargetLowering::TypeSplitVector: {
     // For example, i32 = BITCAST v2i16 on alpha.  Convert the split
     // pieces of the input into integers and reassemble in the final type.
     SDValue Lo, Hi;
@@ -208,7 +225,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
                        JoinIntegers(Lo, Hi));
     return DAG.getNode(ISD::BITCAST, dl, NOutVT, InOp);
   }
-  case WidenVector:
+  case TargetLowering::TypeWidenVector:
     if (OutVT.bitsEq(NInVT))
       // The input is widened to the same size.  Convert to the widened value.
       return DAG.getNode(ISD::BITCAST, dl, OutVT, GetWidenedVector(InOp));
@@ -342,7 +359,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   DebugLoc dl = N->getDebugLoc();
 
-  if (getTypeAction(N->getOperand(0).getValueType()) == PromoteInteger) {
+  if (getTypeAction(N->getOperand(0).getValueType())
+      == TargetLowering::TypePromoteInteger) {
     SDValue Res = GetPromotedInteger(N->getOperand(0));
     assert(Res.getValueType().bitsLE(NVT) && "Extension doesn't make sense!");
 
@@ -507,11 +525,11 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {
 
   switch (getTypeAction(N->getOperand(0).getValueType())) {
   default: llvm_unreachable("Unknown type action!");
-  case Legal:
-  case ExpandInteger:
+  case TargetLowering::TypeLegal:
+  case TargetLowering::TypeExpandInteger:
     Res = N->getOperand(0);
     break;
-  case PromoteInteger:
+  case TargetLowering::TypePromoteInteger:
     Res = GetPromotedInteger(N->getOperand(0));
     break;
   }
@@ -557,9 +575,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) {
   DebugLoc DL = N->getDebugLoc();
   EVT SmallVT = LHS.getValueType();
 
-  // To determine if the result overflowed in a larger type, we extend the input
-  // to the larger type, do the multiply, then check the high bits of the result
-  // to see if the overflow happened.
+  // To determine if the result overflowed in a larger type, we extend the
+  // input to the larger type, do the multiply, then check the high bits of
+  // the result to see if the overflow happened.
   if (N->getOpcode() == ISD::SMULO) {
     LHS = SExtPromotedInteger(LHS);
     RHS = SExtPromotedInteger(RHS);
@@ -569,8 +587,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) {
   }
   SDValue Mul = DAG.getNode(ISD::MUL, DL, LHS.getValueType(), LHS, RHS);
 
-  // Overflow occurred iff the high part of the result does not zero/sign-extend
-  // the low part.
+  // Overflow occurred iff the high part of the result does not
+  // zero/sign-extend the low part.
   SDValue Overflow;
   if (N->getOpcode() == ISD::UMULO) {
     // Unsigned overflow occurred iff the high part is non-zero.
@@ -672,6 +690,8 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
   case ISD::BRCOND:       Res = PromoteIntOp_BRCOND(N, OpNo); break;
   case ISD::BUILD_PAIR:   Res = PromoteIntOp_BUILD_PAIR(N); break;
   case ISD::BUILD_VECTOR: Res = PromoteIntOp_BUILD_VECTOR(N); break;
+  case ISD::CONCAT_VECTORS: Res = PromoteIntOp_CONCAT_VECTORS(N); break;
+  case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntOp_EXTRACT_VECTOR_ELT(N); break;
   case ISD::CONVERT_RNDSAT:
                           Res = PromoteIntOp_CONVERT_RNDSAT(N); break;
   case ISD::INSERT_VECTOR_ELT:
@@ -952,7 +972,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ZERO_EXTEND(SDNode *N) {
   DebugLoc dl = N->getDebugLoc();
   SDValue Op = GetPromotedInteger(N->getOperand(0));
   Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op);
-  return DAG.getZeroExtendInReg(Op, dl, N->getOperand(0).getValueType());
+  return DAG.getZeroExtendInReg(Op, dl,
+                                N->getOperand(0).getValueType().getScalarType());
 }
 
 
@@ -1513,7 +1534,8 @@ void DAGTypeLegalizer::ExpandIntRes_ANY_EXTEND(SDNode *N,
   } else {
     // For example, extension of an i48 to an i64.  The operand type necessarily
     // promotes to the result type, so will end up being expanded too.
-    assert(getTypeAction(Op.getValueType()) == PromoteInteger &&
+    assert(getTypeAction(Op.getValueType()) ==
+           TargetLowering::TypePromoteInteger &&
            "Only know how to promote this result!");
     SDValue Res = GetPromotedInteger(Op);
     assert(Res.getValueType() == N->getValueType(0) &&
@@ -2030,7 +2052,8 @@ void DAGTypeLegalizer::ExpandIntRes_SIGN_EXTEND(SDNode *N,
   } else {
     // For example, extension of an i48 to an i64.  The operand type necessarily
     // promotes to the result type, so will end up being expanded too.
-    assert(getTypeAction(Op.getValueType()) == PromoteInteger &&
+    assert(getTypeAction(Op.getValueType()) ==
+           TargetLowering::TypePromoteInteger &&
            "Only know how to promote this result!");
     SDValue Res = GetPromotedInteger(Op);
     assert(Res.getValueType() == N->getValueType(0) &&
@@ -2178,7 +2201,8 @@ void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N,
   } else {
     // For example, extension of an i48 to an i64.  The operand type necessarily
     // promotes to the result type, so will end up being expanded too.
-    assert(getTypeAction(Op.getValueType()) == PromoteInteger &&
+    assert(getTypeAction(Op.getValueType()) ==
+           TargetLowering::TypePromoteInteger &&
            "Only know how to promote this result!");
     SDValue Res = GetPromotedInteger(Op);
     assert(Res.getValueType() == N->getValueType(0) &&
@@ -2613,3 +2637,158 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) {
          "Don't know how to expand this UINT_TO_FP!");
   return MakeLibCall(LC, DstVT, &Op, 1, true, dl);
 }
+
+SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {
+  SDValue InOp0 = N->getOperand(0);
+  EVT InVT = InOp0.getValueType();
+  EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
+
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+  unsigned OutNumElems = N->getValueType(0).getVectorNumElements();
+  EVT NOutVTElem = NOutVT.getVectorElementType();
+
+  DebugLoc dl = N->getDebugLoc();
+  SDValue BaseIdx = N->getOperand(1);
+
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0; i != OutNumElems; ++i) {
+
+    // Extract the element from the original vector.
+    SDValue Index = DAG.getNode(ISD::ADD, dl, BaseIdx.getValueType(),
+      BaseIdx, DAG.getIntPtrConstant(i));
+    SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+      InVT.getVectorElementType(), N->getOperand(0), Index);
+
+    SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, Ext);
+    // Insert the converted element to the new vector.
+    Ops.push_back(Op);
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, &Ops[0], Ops.size());
+}
+
+
+SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SHUFFLE(SDNode *N) {
+
+  ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N);
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  unsigned NumElts = VT.getVectorNumElements();
+  SmallVector<int, 8> NewMask;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    NewMask.push_back(SV->getMaskElt(i));
+  }
+
+  SDValue V0 = GetPromotedInteger(N->getOperand(0));
+  SDValue V1 = GetPromotedInteger(N->getOperand(1));
+  EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+
+  return DAG.getVectorShuffle(OutVT, dl, V0,V1, &NewMask[0]);
+}
+
+
+SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_VECTOR(SDNode *N) {
+
+  SDValue InOp0 = N->getOperand(0);
+  EVT InVT = InOp0.getValueType();
+  EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
+
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+  unsigned NumElems = N->getNumOperands();
+  EVT NOutVTElem = NOutVT.getVectorElementType();
+
+  DebugLoc dl = N->getDebugLoc();
+
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0; i != NumElems; ++i) {
+    SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(i));
+    Ops.push_back(Op);
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, &Ops[0], Ops.size());
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) {
+
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue InOp0 = N->getOperand(0);
+  EVT InVT = InOp0.getValueType();
+  EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
+  assert(!InVT.isVector() && "Input must not be a scalar");
+
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+  EVT NOutVTElem = NOutVT.getVectorElementType();
+
+  SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(0));
+
+  return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NOutVT, Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) {
+
+  SDValue InOp0 = N->getOperand(0);
+  EVT InVT = InOp0.getValueType();
+  EVT InElVT = InVT.getVectorElementType();
+  EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
+
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+
+  EVT NOutVTElem = NOutVT.getVectorElementType();
+
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue ConvertedVector = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, InOp0);
+
+  SDValue ConvElem = DAG.getNode(ISD::ANY_EXTEND, dl,
+    NOutVTElem, N->getOperand(1));
+  return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,NOutVT,
+    ConvertedVector, ConvElem, N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  SDValue V0 = GetPromotedInteger(N->getOperand(0));
+  SDValue V1 = N->getOperand(1);
+  SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+    V0->getValueType(0).getScalarType(), V0, V1);
+
+  return DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), Ext);
+
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) {
+
+  DebugLoc dl = N->getDebugLoc();
+
+  EVT RetSclrTy = N->getValueType(0).getVectorElementType();
+
+  SmallVector<SDValue, 8> NewOps;
+
+  // For each incoming vector
+  for (unsigned VecIdx = 0, E = N->getNumOperands(); VecIdx!= E; ++VecIdx) {
+    SDValue Incoming = GetPromotedInteger(N->getOperand(VecIdx));
+    EVT SclrTy = Incoming->getValueType(0).getVectorElementType();
+    unsigned NumElem = Incoming->getValueType(0).getVectorNumElements();
+
+    for (unsigned i=0; i<NumElem; ++i) {
+      // Extract element from incoming vector
+      SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SclrTy,
+      Incoming, DAG.getIntPtrConstant(i));
+      SDValue Tr = DAG.getNode(ISD::TRUNCATE, dl, RetSclrTy, Ex);
+      NewOps.push_back(Tr);
+    }
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl,  N->getValueType(0),
+    &NewOps[0], NewOps.size());
+  }
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index cedda7e..ba658b0 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -224,38 +224,38 @@ bool DAGTypeLegalizer::run() {
       switch (getTypeAction(ResultVT)) {
       default:
         assert(false && "Unknown action!");
-      case Legal:
+      case TargetLowering::TypeLegal:
         break;
       // The following calls must take care of *all* of the node's results,
       // not just the illegal result they were passed (this includes results
       // with a legal type).  Results can be remapped using ReplaceValueWith,
       // or their promoted/expanded/etc values registered in PromotedIntegers,
       // ExpandedIntegers etc.
-      case PromoteInteger:
+      case TargetLowering::TypePromoteInteger:
         PromoteIntegerResult(N, i);
         Changed = true;
         goto NodeDone;
-      case ExpandInteger:
+      case TargetLowering::TypeExpandInteger:
         ExpandIntegerResult(N, i);
         Changed = true;
         goto NodeDone;
-      case SoftenFloat:
+      case TargetLowering::TypeSoftenFloat:
         SoftenFloatResult(N, i);
         Changed = true;
         goto NodeDone;
-      case ExpandFloat:
+      case TargetLowering::TypeExpandFloat:
         ExpandFloatResult(N, i);
         Changed = true;
         goto NodeDone;
-      case ScalarizeVector:
+      case TargetLowering::TypeScalarizeVector:
         ScalarizeVectorResult(N, i);
         Changed = true;
         goto NodeDone;
-      case SplitVector:
+      case TargetLowering::TypeSplitVector:
         SplitVectorResult(N, i);
         Changed = true;
         goto NodeDone;
-      case WidenVector:
+      case TargetLowering::TypeWidenVector:
         WidenVectorResult(N, i);
         Changed = true;
         goto NodeDone;
@@ -277,36 +277,36 @@ ScanOperands:
       switch (getTypeAction(OpVT)) {
       default:
         assert(false && "Unknown action!");
-      case Legal:
+      case TargetLowering::TypeLegal:
         continue;
       // The following calls must either replace all of the node's results
       // using ReplaceValueWith, and return "false"; or update the node's
       // operands in place, and return "true".
-      case PromoteInteger:
+      case TargetLowering::TypePromoteInteger:
         NeedsReanalyzing = PromoteIntegerOperand(N, i);
         Changed = true;
         break;
-      case ExpandInteger:
+      case TargetLowering::TypeExpandInteger:
         NeedsReanalyzing = ExpandIntegerOperand(N, i);
         Changed = true;
         break;
-      case SoftenFloat:
+      case TargetLowering::TypeSoftenFloat:
         NeedsReanalyzing = SoftenFloatOperand(N, i);
         Changed = true;
         break;
-      case ExpandFloat:
+      case TargetLowering::TypeExpandFloat:
         NeedsReanalyzing = ExpandFloatOperand(N, i);
         Changed = true;
         break;
-      case ScalarizeVector:
+      case TargetLowering::TypeScalarizeVector:
         NeedsReanalyzing = ScalarizeVectorOperand(N, i);
         Changed = true;
         break;
-      case SplitVector:
+      case TargetLowering::TypeSplitVector:
         NeedsReanalyzing = SplitVectorOperand(N, i);
         Changed = true;
         break;
-      case WidenVector:
+      case TargetLowering::TypeWidenVector:
         NeedsReanalyzing = WidenVectorOperand(N, i);
         Changed = true;
         break;
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 5409b88..06dc40f 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -57,16 +57,6 @@ public:
     // 1+ - This is a node which has this many unprocessed operands.
   };
 private:
-  enum LegalizeAction {
-    Legal,           // The target natively supports this type.
-    PromoteInteger,  // Replace this integer type with a larger one.
-    ExpandInteger,   // Split this integer type into two of half the size.
-    SoftenFloat,     // Convert this float type to a same size integer type.
-    ExpandFloat,     // Split this float type into two of half the size.
-    ScalarizeVector, // Replace this one-element vector with its element type.
-    SplitVector,     // Split this vector type into two of half the size.
-    WidenVector      // This vector type should be widened into a larger vector.
-  };
 
   /// ValueTypeActions - This is a bitvector that contains two bits for each
   /// simple value type, where the two bits correspond to the LegalizeAction
@@ -74,41 +64,13 @@ private:
   TargetLowering::ValueTypeActionImpl ValueTypeActions;
 
   /// getTypeAction - Return how we should legalize values of this type.
-  LegalizeAction getTypeAction(EVT VT) const {
-    switch (ValueTypeActions.getTypeAction(VT)) {
-    default:
-      assert(false && "Unknown legalize action!");
-    case TargetLowering::Legal:
-      return Legal;
-    case TargetLowering::Promote:
-      // Promote can mean
-      //   1) For integers, use a larger integer type (e.g. i8 -> i32).
-      //   2) For vectors, use a wider vector type (e.g. v3i32 -> v4i32).
-      if (!VT.isVector())
-        return PromoteInteger;
-      return WidenVector;
-    case TargetLowering::Expand:
-      // Expand can mean
-      // 1) split scalar in half, 2) convert a float to an integer,
-      // 3) scalarize a single-element vector, 4) split a vector in two.
-      if (!VT.isVector()) {
-        if (VT.isInteger())
-          return ExpandInteger;
-        if (VT.getSizeInBits() ==
-                TLI.getTypeToTransformTo(*DAG.getContext(), VT).getSizeInBits())
-          return SoftenFloat;
-        return ExpandFloat;
-      }
-
-      if (VT.getVectorNumElements() == 1)
-        return ScalarizeVector;
-      return SplitVector;
-    }
+  TargetLowering::LegalizeTypeAction getTypeAction(EVT VT) const {
+    return TLI.getTypeAction(*DAG.getContext(), VT);
   }
 
   /// isTypeLegal - Return true if this type is legal on this target.
   bool isTypeLegal(EVT VT) const {
-    return ValueTypeActions.getTypeAction(VT) == TargetLowering::Legal;
+    return TLI.getTypeAction(*DAG.getContext(), VT) == TargetLowering::TypeLegal;
   }
 
   /// IgnoreNodeResults - Pretend all of this node's results are legal.
@@ -239,7 +201,7 @@ private:
     EVT OldVT = Op.getValueType();
     DebugLoc dl = Op.getDebugLoc();
     Op = GetPromotedInteger(Op);
-    return DAG.getZeroExtendInReg(Op, dl, OldVT);
+    return DAG.getZeroExtendInReg(Op, dl, OldVT.getScalarType());
   }
 
   // Integer Result Promotion.
@@ -248,6 +210,11 @@ private:
   SDValue PromoteIntRes_AssertZext(SDNode *N);
   SDValue PromoteIntRes_Atomic1(AtomicSDNode *N);
   SDValue PromoteIntRes_Atomic2(AtomicSDNode *N);
+  SDValue PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N);
+  SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N);
+  SDValue PromoteIntRes_BUILD_VECTOR(SDNode *N);
+  SDValue PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N);
+  SDValue PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N);
   SDValue PromoteIntRes_BITCAST(SDNode *N);
   SDValue PromoteIntRes_BSWAP(SDNode *N);
   SDValue PromoteIntRes_BUILD_PAIR(SDNode *N);
@@ -289,6 +256,9 @@ private:
   SDValue PromoteIntOp_BUILD_VECTOR(SDNode *N);
   SDValue PromoteIntOp_CONVERT_RNDSAT(SDNode *N);
   SDValue PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_EXTRACT_ELEMENT(SDNode *N);
+  SDValue PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N);
+  SDValue PromoteIntOp_CONCAT_VECTORS(SDNode *N);
   SDValue PromoteIntOp_MEMBARRIER(SDNode *N);
   SDValue PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N);
   SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index a75ae87..85ea6b6 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -43,36 +43,36 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
   switch (getTypeAction(InVT)) {
     default:
       assert(false && "Unknown type action!");
-    case Legal:
-    case PromoteInteger:
+    case TargetLowering::TypeLegal:
+    case TargetLowering::TypePromoteInteger:
       break;
-    case SoftenFloat:
+    case TargetLowering::TypeSoftenFloat:
       // Convert the integer operand instead.
       SplitInteger(GetSoftenedFloat(InOp), Lo, Hi);
       Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
       Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
       return;
-    case ExpandInteger:
-    case ExpandFloat:
+    case TargetLowering::TypeExpandInteger:
+    case TargetLowering::TypeExpandFloat:
       // Convert the expanded pieces of the input.
       GetExpandedOp(InOp, Lo, Hi);
       Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
       Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
       return;
-    case SplitVector:
+    case TargetLowering::TypeSplitVector:
       GetSplitVector(InOp, Lo, Hi);
       if (TLI.isBigEndian())
         std::swap(Lo, Hi);
       Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
       Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
       return;
-    case ScalarizeVector:
+    case TargetLowering::TypeScalarizeVector:
       // Convert the element instead.
       SplitInteger(BitConvertToInteger(GetScalarizedVector(InOp)), Lo, Hi);
       Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
       Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
       return;
-    case WidenVector: {
+    case TargetLowering::TypeWidenVector: {
       assert(!(InVT.getVectorNumElements() & 1) && "Unsupported BITCAST");
       InOp = GetWidenedVector(InOp);
       EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 0b4dd35..b5698f9 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -526,13 +526,13 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
   switch (getTypeAction(InVT)) {
   default:
     assert(false && "Unknown type action!");
-  case Legal:
-  case PromoteInteger:
-  case SoftenFloat:
-  case ScalarizeVector:
+  case TargetLowering::TypeLegal:
+  case TargetLowering::TypePromoteInteger:
+  case TargetLowering::TypeSoftenFloat:
+  case TargetLowering::TypeScalarizeVector:
     break;
-  case ExpandInteger:
-  case ExpandFloat:
+  case TargetLowering::TypeExpandInteger:
+  case TargetLowering::TypeExpandFloat:
     // A scalar to vector conversion, where the scalar needs expansion.
     // If the vector is being split in two then we can just convert the
     // expanded pieces.
@@ -545,7 +545,7 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
       return;
     }
     break;
-  case SplitVector:
+  case TargetLowering::TypeSplitVector:
     // If the input is a vector that needs to be split, convert each split
     // piece of the input now.
     GetSplitVector(InOp, Lo, Hi);
@@ -774,7 +774,7 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
   EVT InVT = N->getOperand(0).getValueType();
   switch (getTypeAction(InVT)) {
   default: llvm_unreachable("Unexpected type action!");
-  case Legal: {
+  case TargetLowering::TypeLegal: {
     EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
                                  LoVT.getVectorNumElements());
     Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, N->getOperand(0),
@@ -783,10 +783,21 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
                      DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
     break;
   }
-  case SplitVector:
+  case TargetLowering::TypePromoteInteger: {
+    SDValue InOp = GetPromotedInteger(N->getOperand(0));
+    EVT InNVT = EVT::getVectorVT(*DAG.getContext(),
+                                 InOp.getValueType().getVectorElementType(),
+                                 LoVT.getVectorNumElements());
+    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
+                     DAG.getIntPtrConstant(0));
+    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
+                     DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+    break;
+  }
+  case TargetLowering::TypeSplitVector:
     GetSplitVector(N->getOperand(0), Lo, Hi);
     break;
-  case WidenVector: {
+  case TargetLowering::TypeWidenVector: {
     // If the result needs to be split and the input needs to be widened,
     // the two types must have different lengths. Use the widened result
     // and extract from it to do the split.
@@ -1439,7 +1450,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
   unsigned Opcode = N->getOpcode();
   unsigned InVTNumElts = InVT.getVectorNumElements();
 
-  if (getTypeAction(InVT) == WidenVector) {
+  if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
     InOp = GetWidenedVector(N->getOperand(0));
     InVT = InOp.getValueType();
     InVTNumElts = InVT.getVectorNumElements();
@@ -1515,7 +1526,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Shift(SDNode *N) {
   SDValue ShOp = N->getOperand(1);
 
   EVT ShVT = ShOp.getValueType();
-  if (getTypeAction(ShVT) == WidenVector) {
+  if (getTypeAction(ShVT) == TargetLowering::TypeWidenVector) {
     ShOp = GetWidenedVector(ShOp);
     ShVT = ShOp.getValueType();
   }
@@ -1557,9 +1568,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
   default:
     assert(false && "Unknown type action!");
     break;
-  case Legal:
+  case TargetLowering::TypeLegal:
     break;
-  case PromoteInteger:
+  case TargetLowering::TypePromoteInteger:
     // If the InOp is promoted to the same size, convert it.  Otherwise,
     // fall out of the switch and widen the promoted input.
     InOp = GetPromotedInteger(InOp);
@@ -1567,13 +1578,13 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
     if (WidenVT.bitsEq(InVT))
       return DAG.getNode(ISD::BITCAST, dl, WidenVT, InOp);
     break;
-  case SoftenFloat:
-  case ExpandInteger:
-  case ExpandFloat:
-  case ScalarizeVector:
-  case SplitVector:
+  case TargetLowering::TypeSoftenFloat:
+  case TargetLowering::TypeExpandInteger:
+  case TargetLowering::TypeExpandFloat:
+  case TargetLowering::TypeScalarizeVector:
+  case TargetLowering::TypeSplitVector:
     break;
-  case WidenVector:
+  case TargetLowering::TypeWidenVector:
     // If the InOp is widened to the same size, convert it.  Otherwise, fall
     // out of the switch and widen the widened input.
     InOp = GetWidenedVector(InOp);
@@ -1653,7 +1664,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
   unsigned NumOperands = N->getNumOperands();
 
   bool InputWidened = false; // Indicates we need to widen the input.
-  if (getTypeAction(InVT) != WidenVector) {
+  if (getTypeAction(InVT) != TargetLowering::TypeWidenVector) {
     if (WidenVT.getVectorNumElements() % InVT.getVectorNumElements() == 0) {
       // Add undef vectors to widen to correct length.
       unsigned NumConcat = WidenVT.getVectorNumElements() /
@@ -1732,7 +1743,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) {
   ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode();
 
   unsigned InVTNumElts = InVT.getVectorNumElements();
-  if (getTypeAction(InVT) == WidenVector) {
+  if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
     InOp = GetWidenedVector(InOp);
     InVT = InOp.getValueType();
     InVTNumElts = InVT.getVectorNumElements();
@@ -1800,7 +1811,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
   SDValue  Idx  = N->getOperand(1);
   DebugLoc dl = N->getDebugLoc();
 
-  if (getTypeAction(InOp.getValueType()) == WidenVector)
+  if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
     InOp = GetWidenedVector(InOp);
 
   EVT InVT = InOp.getValueType();
@@ -1882,7 +1893,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
     EVT CondEltVT = CondVT.getVectorElementType();
     EVT CondWidenVT =  EVT::getVectorVT(*DAG.getContext(),
                                         CondEltVT, WidenNumElts);
-    if (getTypeAction(CondVT) == WidenVector)
+    if (getTypeAction(CondVT) == TargetLowering::TypeWidenVector)
       Cond1 = GetWidenedVector(Cond1);
 
     if (Cond1.getValueType() != CondWidenVT)
@@ -2026,7 +2037,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
   DebugLoc dl = N->getDebugLoc();
   unsigned NumElts = VT.getVectorNumElements();
   SDValue InOp = N->getOperand(0);
-  if (getTypeAction(InOp.getValueType()) == WidenVector)
+  if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
     InOp = GetWidenedVector(InOp);
   EVT InVT = InOp.getValueType();
   EVT InEltVT = InVT.getVectorElementType();
@@ -2081,7 +2092,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) {
   unsigned NumOperands = N->getNumOperands();
   for (unsigned i=0; i < NumOperands; ++i) {
     SDValue InOp = N->getOperand(i);
-    if (getTypeAction(InOp.getValueType()) == WidenVector)
+    if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
       InOp = GetWidenedVector(InOp);
     for (unsigned j=0; j < NumInElts; ++j)
       Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
@@ -2153,6 +2164,7 @@ static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
     if (MemVT.getSizeInBits() <= WidenEltWidth)
       break;
     if (TLI.isTypeLegal(MemVT) && (WidenWidth % MemVTWidth) == 0 &&
+        isPowerOf2_32(WidenWidth / MemVTWidth) &&
         (MemVTWidth <= Width ||
          (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
       RetVT = MemVT;
@@ -2168,6 +2180,7 @@ static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
     unsigned MemVTWidth = MemVT.getSizeInBits();
     if (TLI.isTypeLegal(MemVT) && WidenEltVT == MemVT.getVectorElementType() &&
         (WidenWidth % MemVTWidth) == 0 &&
+        isPowerOf2_32(WidenWidth / MemVTWidth) &&
         (MemVTWidth <= Width ||
          (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
       if (RetVT.getSizeInBits() < MemVTWidth || MemVT == WidenVT)
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index b2e9c15..f09b381 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -71,6 +71,7 @@ static cl::opt<bool> DisableSchedCycles(
   cl::desc("Disable cycle-level precision during preRA scheduling"));
 
 // Temporary sched=list-ilp flags until the heuristics are robust.
+// Some options are also available under sched=list-hybrid.
 static cl::opt<bool> DisableSchedRegPressure(
   "disable-sched-reg-pressure", cl::Hidden, cl::init(false),
   cl::desc("Disable regpressure priority in sched=list-ilp"));
@@ -80,6 +81,9 @@ static cl::opt<bool> DisableSchedLiveUses(
 static cl::opt<bool> DisableSchedVRegCycle(
   "disable-sched-vrcycle", cl::Hidden, cl::init(false),
   cl::desc("Disable virtual register cycle interference checks"));
+static cl::opt<bool> DisableSchedPhysRegJoin(
+  "disable-sched-physreg-join", cl::Hidden, cl::init(false),
+  cl::desc("Disable physreg def-use affinity"));
 static cl::opt<bool> DisableSchedStalls(
   "disable-sched-stalls", cl::Hidden, cl::init(true),
   cl::desc("Disable no-stall priority in sched=list-ilp"));
@@ -102,11 +106,11 @@ static cl::opt<unsigned> AvgIPC(
 #ifndef NDEBUG
 namespace {
   // For sched=list-ilp, Count the number of times each factor comes into play.
-  enum { FactPressureDiff, FactRegUses, FactHeight, FactDepth, FactStatic,
-         FactOther, NumFactors };
+  enum { FactPressureDiff, FactRegUses, FactStall, FactHeight, FactDepth,
+         FactStatic, FactOther, NumFactors };
 }
 static const char *FactorName[NumFactors] =
-{"PressureDiff", "RegUses", "Height", "Depth","Static", "Other"};
+{"PressureDiff", "RegUses", "Stall", "Height", "Depth","Static", "Other"};
 static int FactorCount[NumFactors];
 #endif //!NDEBUG
 
@@ -272,6 +276,33 @@ private:
 };
 }  // end anonymous namespace
 
+/// GetCostForDef - Looks up the register class and cost for a given definition.
+/// Typically this just means looking up the representative register class,
+/// but for untyped values (MVT::untyped) it means inspecting the node's
+/// opcode to determine what register class is being generated.
+static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos,
+                          const TargetLowering *TLI,
+                          const TargetInstrInfo *TII,
+                          const TargetRegisterInfo *TRI,
+                          unsigned &RegClass, unsigned &Cost) {
+  EVT VT = RegDefPos.GetValue();
+
+  // Special handling for untyped values.  These values can only come from
+  // the expansion of custom DAG-to-DAG patterns.
+  if (VT == MVT::untyped) {
+    unsigned Opcode = RegDefPos.GetNode()->getMachineOpcode();
+    unsigned Idx = RegDefPos.GetIdx();
+    const TargetInstrDesc Desc = TII->get(Opcode);
+    const TargetRegisterClass *RC = Desc.getRegClass(Idx, TRI);
+    RegClass = RC->getID();
+    // FIXME: Cost arbitrarily set to 1 because there doesn't seem to be a
+    // better way to determine it.
+    Cost = 1;
+  } else {
+    RegClass = TLI->getRepRegClassFor(VT)->getID();
+    Cost = TLI->getRepRegClassCostFor(VT);
+  }
+}
 
 /// Schedule - Schedule the DAG using list scheduling.
 void ScheduleDAGRRList::Schedule() {
@@ -463,6 +494,13 @@ void ScheduleDAGRRList::AdvancePastStalls(SUnit *SU) {
   if (DisableSchedCycles)
     return;
 
+  // FIXME: Nodes such as CopyFromReg probably should not advance the current
+  // cycle. Otherwise, we can wrongly mask real stalls. If the non-machine node
+  // has predecessors the cycle will be advanced when they are scheduled.
+  // But given the crude nature of modeling latency though such nodes, we
+  // currently need to treat these nodes like real instructions.
+  // if (!SU->getNode() || !SU->getNode()->isMachineOpcode()) return;
+
   unsigned ReadyCycle = isBottomUp ? SU->getHeight() : SU->getDepth();
 
   // Bump CurCycle to account for latency. We assume the latency of other
@@ -533,6 +571,8 @@ void ScheduleDAGRRList::EmitNode(SUnit *SU) {
   }
 }
 
+static void resetVRegCycle(SUnit *SU);
+
 /// ScheduleNodeBottomUp - Add the node to the schedule. Decrement the pending
 /// count of its predecessors. If a predecessor pending count is zero, add it to
 /// the Available queue.
@@ -542,7 +582,8 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
 
 #ifndef NDEBUG
   if (CurCycle < SU->getHeight())
-    DEBUG(dbgs() << "   Height [" << SU->getHeight() << "] pipeline stall!\n");
+    DEBUG(dbgs() << "   Height [" << SU->getHeight()
+          << "] pipeline stall!\n");
 #endif
 
   // FIXME: Do not modify node height. It may interfere with
@@ -559,7 +600,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
   AvailableQueue->ScheduledNode(SU);
 
   // If HazardRec is disabled, and each inst counts as one cycle, then
-  // advance CurCycle before ReleasePredecessors to avoid useles pushed to
+  // advance CurCycle before ReleasePredecessors to avoid useless pushes to
   // PendingQueue for schedulers that implement HasReadyFilter.
   if (!HazardRec->isEnabled() && AvgIPC < 2)
     AdvanceToCycle(CurCycle + 1);
@@ -580,20 +621,25 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
     }
   }
 
+  resetVRegCycle(SU);
+
   SU->isScheduled = true;
 
   // Conditions under which the scheduler should eagerly advance the cycle:
   // (1) No available instructions
   // (2) All pipelines full, so available instructions must have hazards.
   //
-  // If HazardRec is disabled, the cycle was advanced earlier.
+  // If HazardRec is disabled, the cycle was pre-advanced before calling
+  // ReleasePredecessors. In that case, IssueCount should remain 0.
   //
   // Check AvailableQueue after ReleasePredecessors in case of zero latency.
-  ++IssueCount;
-  if ((HazardRec->isEnabled() && HazardRec->atIssueLimit())
-      || (!HazardRec->isEnabled() && AvgIPC > 1 && IssueCount == AvgIPC)
-      || AvailableQueue->empty())
-    AdvanceToCycle(CurCycle + 1);
+  if (HazardRec->isEnabled() || AvgIPC > 1) {
+    if (SU->getNode() && SU->getNode()->isMachineOpcode())
+      ++IssueCount;
+    if ((HazardRec->isEnabled() && HazardRec->atIssueLimit())
+        || (!HazardRec->isEnabled() && IssueCount == AvgIPC))
+      AdvanceToCycle(CurCycle + 1);
+  }
 }
 
 /// CapturePred - This does the opposite of ReleasePred. Since SU is being
@@ -989,14 +1035,15 @@ static void CheckForLiveRegDef(SUnit *SU, unsigned Reg,
   for (const unsigned *AliasI = TRI->getOverlaps(Reg); *AliasI; ++AliasI) {
 
     // Check if Ref is live.
-    if (!LiveRegDefs[Reg]) continue;
+    if (!LiveRegDefs[*AliasI]) continue;
 
     // Allow multiple uses of the same def.
-    if (LiveRegDefs[Reg] == SU) continue;
+    if (LiveRegDefs[*AliasI] == SU) continue;
 
     // Add Reg to the set of interfering live regs.
-    if (RegAdded.insert(Reg))
-      LRegs.push_back(Reg);
+    if (RegAdded.insert(*AliasI)) {
+      LRegs.push_back(*AliasI);
+    }
   }
 }
 
@@ -1220,7 +1267,7 @@ void ScheduleDAGRRList::ListScheduleBottomUp() {
   // priority. If it is not ready put it back.  Schedule the node.
   Sequence.reserve(SUnits.size());
   while (!AvailableQueue->empty()) {
-    DEBUG(dbgs() << "\n*** Examining Available\n";
+    DEBUG(dbgs() << "\nExamining Available:\n";
           AvailableQueue->dump(this));
 
     // Pick the best node to schedule taking all constraints into
@@ -1349,6 +1396,21 @@ struct queue_sort : public std::binary_function<SUnit*, SUnit*, bool> {
   bool isReady(SUnit* SU, unsigned CurCycle) const { return true; }
 };
 
+#ifndef NDEBUG
+template<class SF>
+struct reverse_sort : public queue_sort {
+  SF &SortFunc;
+  reverse_sort(SF &sf) : SortFunc(sf) {}
+  reverse_sort(const reverse_sort &RHS) : SortFunc(RHS.SortFunc) {}
+
+  bool operator()(SUnit* left, SUnit* right) const {
+    // reverse left/right rather than simply !SortFunc(left, right)
+    // to expose different paths in the comparison logic.
+    return SortFunc(right, left);
+  }
+};
+#endif // NDEBUG
+
 /// bu_ls_rr_sort - Priority function for bottom up register pressure
 // reduction scheduler.
 struct bu_ls_rr_sort : public queue_sort {
@@ -1549,20 +1611,33 @@ protected:
 };
 
 template<class SF>
-class RegReductionPriorityQueue : public RegReductionPQBase {
-  static SUnit *popFromQueue(std::vector<SUnit*> &Q, SF &Picker) {
-    std::vector<SUnit *>::iterator Best = Q.begin();
-    for (std::vector<SUnit *>::iterator I = llvm::next(Q.begin()),
-           E = Q.end(); I != E; ++I)
-      if (Picker(*Best, *I))
-        Best = I;
-    SUnit *V = *Best;
-    if (Best != prior(Q.end()))
-      std::swap(*Best, Q.back());
-    Q.pop_back();
-    return V;
+static SUnit *popFromQueueImpl(std::vector<SUnit*> &Q, SF &Picker) {
+  std::vector<SUnit *>::iterator Best = Q.begin();
+  for (std::vector<SUnit *>::iterator I = llvm::next(Q.begin()),
+         E = Q.end(); I != E; ++I)
+    if (Picker(*Best, *I))
+      Best = I;
+  SUnit *V = *Best;
+  if (Best != prior(Q.end()))
+    std::swap(*Best, Q.back());
+  Q.pop_back();
+  return V;
+}
+
+template<class SF>
+SUnit *popFromQueue(std::vector<SUnit*> &Q, SF &Picker, ScheduleDAG *DAG) {
+#ifndef NDEBUG
+  if (DAG->StressSched) {
+    reverse_sort<SF> RPicker(Picker);
+    return popFromQueueImpl(Q, RPicker);
   }
+#endif
+  (void)DAG;
+  return popFromQueueImpl(Q, Picker);
+}
 
+template<class SF>
+class RegReductionPriorityQueue : public RegReductionPQBase {
   SF Picker;
 
 public:
@@ -1583,7 +1658,7 @@ public:
   SUnit *pop() {
     if (Queue.empty()) return NULL;
 
-    SUnit *V = popFromQueue(Queue, Picker);
+    SUnit *V = popFromQueue(Queue, Picker, scheduleDAG);
     V->NodeQueueId = 0;
     return V;
   }
@@ -1593,7 +1668,7 @@ public:
     std::vector<SUnit*> DumpQueue = Queue;
     SF DumpPicker = Picker;
     while (!DumpQueue.empty()) {
-      SUnit *SU = popFromQueue(DumpQueue, DumpPicker);
+      SUnit *SU = popFromQueue(DumpQueue, DumpPicker, scheduleDAG);
       if (isBottomUp())
         dbgs() << "Height " << SU->getHeight() << ": ";
       else
@@ -1623,6 +1698,20 @@ ILPBURRPriorityQueue;
 //           Static Node Priority for Register Pressure Reduction
 //===----------------------------------------------------------------------===//
 
+// Check for special nodes that bypass scheduling heuristics.
+// Currently this pushes TokenFactor nodes down, but may be used for other
+// pseudo-ops as well.
+//
+// Return -1 to schedule right above left, 1 for left above right.
+// Return 0 if no bias exists.
+static int checkSpecialNodes(const SUnit *left, const SUnit *right) {
+  bool LSchedLow = left->isScheduleLow;
+  bool RSchedLow = right->isScheduleLow;
+  if (LSchedLow != RSchedLow)
+    return LSchedLow < RSchedLow ? 1 : -1;
+  return 0;
+}
+
 /// CalcNodeSethiUllmanNumber - Compute Sethi Ullman number.
 /// Smaller number is the higher priority.
 static unsigned
@@ -1661,17 +1750,6 @@ void RegReductionPQBase::CalculateSethiUllmanNumbers() {
     CalcNodeSethiUllmanNumber(&(*SUnits)[i], SethiUllmanNumbers);
 }
 
-void RegReductionPQBase::initNodes(std::vector<SUnit> &sunits) {
-  SUnits = &sunits;
-  // Add pseudo dependency edges for two-address nodes.
-  AddPseudoTwoAddrDeps();
-  // Reroute edges to nodes with multiple uses.
-  if (!TracksRegPressure)
-    PrescheduleNodesWithMultipleUses();
-  // Calculate node priorities.
-  CalculateSethiUllmanNumbers();
-}
-
 void RegReductionPQBase::addNode(const SUnit *SU) {
   unsigned SUSize = SethiUllmanNumbers.size();
   if (SUnits->size() > SUSize)
@@ -1710,7 +1788,17 @@ unsigned RegReductionPQBase::getNodePriority(const SUnit *SU) const {
     // If SU does not have a register def, schedule it close to its uses
     // because it does not lengthen any live ranges.
     return 0;
+#if 1
   return SethiUllmanNumbers[SU->NodeNum];
+#else
+  unsigned Priority = SethiUllmanNumbers[SU->NodeNum];
+  if (SU->isCallOp) {
+    // FIXME: This assumes all of the defs are used as call operands.
+    int NP = (int)Priority - SU->getNode()->getNumValues();
+    return (NP > 0) ? NP : 0;
+  }
+  return Priority;
+#endif
 }
 
 //===----------------------------------------------------------------------===//
@@ -1746,8 +1834,10 @@ bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const {
     for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG);
          RegDefPos.IsValid(); RegDefPos.Advance()) {
       EVT VT = RegDefPos.GetValue();
-      unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
-      unsigned Cost = TLI->getRepRegClassCostFor(VT);
+
+      unsigned RCId, Cost;
+      GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost);
+
       if ((RegPressure[RCId] + Cost) >= RegLimit[RCId])
         return true;
     }
@@ -1858,9 +1948,10 @@ void RegReductionPQBase::ScheduledNode(SUnit *SU) {
          RegDefPos.IsValid(); RegDefPos.Advance(), --SkipRegDefs) {
       if (SkipRegDefs)
         continue;
-      EVT VT = RegDefPos.GetValue();
-      unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
-      RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+
+      unsigned RCId, Cost;
+      GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost);
+      RegPressure[RCId] += Cost;
       break;
     }
   }
@@ -1873,16 +1964,16 @@ void RegReductionPQBase::ScheduledNode(SUnit *SU) {
        RegDefPos.IsValid(); RegDefPos.Advance(), --SkipRegDefs) {
     if (SkipRegDefs > 0)
       continue;
-    EVT VT = RegDefPos.GetValue();
-    unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
-    if (RegPressure[RCId] < TLI->getRepRegClassCostFor(VT)) {
+    unsigned RCId, Cost;
+    GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost);
+    if (RegPressure[RCId] < Cost) {
       // Register pressure tracking is imprecise. This can happen. But we try
       // hard not to let it happen because it likely results in poor scheduling.
       DEBUG(dbgs() << "  SU(" << SU->NodeNum << ") has too many regdefs\n");
       RegPressure[RCId] = 0;
     }
     else {
-      RegPressure[RCId] -= TLI->getRepRegClassCostFor(VT);
+      RegPressure[RCId] -= Cost;
     }
   }
   dumpRegPressure();
@@ -2008,7 +2099,29 @@ static unsigned calcMaxScratches(const SUnit *SU) {
   return Scratches;
 }
 
-/// hasOnlyLiveOutUse - Return true if SU has a single value successor that is a
+/// hasOnlyLiveInOpers - Return true if SU has only value predecessors that are
+/// CopyFromReg from a virtual register.
+static bool hasOnlyLiveInOpers(const SUnit *SU) {
+  bool RetVal = false;
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isCtrl()) continue;
+    const SUnit *PredSU = I->getSUnit();
+    if (PredSU->getNode() &&
+        PredSU->getNode()->getOpcode() == ISD::CopyFromReg) {
+      unsigned Reg =
+        cast<RegisterSDNode>(PredSU->getNode()->getOperand(1))->getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        RetVal = true;
+        continue;
+      }
+    }
+    return false;
+  }
+  return RetVal;
+}
+
+/// hasOnlyLiveOutUses - Return true if SU has only value successors that are
 /// CopyToReg to a virtual register. This SU def is probably a liveout and
 /// it has no other use. It should be scheduled closer to the terminator.
 static bool hasOnlyLiveOutUses(const SUnit *SU) {
@@ -2030,62 +2143,71 @@ static bool hasOnlyLiveOutUses(const SUnit *SU) {
   return RetVal;
 }
 
-/// UnitsSharePred - Return true if the two scheduling units share a common
-/// data predecessor.
-static bool UnitsSharePred(const SUnit *left, const SUnit *right) {
-  SmallSet<const SUnit*, 4> Preds;
-  for (SUnit::const_pred_iterator I = left->Preds.begin(),E = left->Preds.end();
+// Set isVRegCycle for a node with only live in opers and live out uses. Also
+// set isVRegCycle for its CopyFromReg operands.
+//
+// This is only relevant for single-block loops, in which case the VRegCycle
+// node is likely an induction variable in which the operand and target virtual
+// registers should be coalesced (e.g. pre/post increment values). Setting the
+// isVRegCycle flag helps the scheduler prioritize other uses of the same
+// CopyFromReg so that this node becomes the virtual register "kill". This
+// avoids interference between the values live in and out of the block and
+// eliminates a copy inside the loop.
+static void initVRegCycle(SUnit *SU) {
+  if (DisableSchedVRegCycle)
+    return;
+
+  if (!hasOnlyLiveInOpers(SU) || !hasOnlyLiveOutUses(SU))
+    return;
+
+  DEBUG(dbgs() << "VRegCycle: SU(" << SU->NodeNum << ")\n");
+
+  SU->isVRegCycle = true;
+
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
        I != E; ++I) {
-    if (I->isCtrl()) continue;  // ignore chain preds
-    Preds.insert(I->getSUnit());
+    if (I->isCtrl()) continue;
+    I->getSUnit()->isVRegCycle = true;
   }
-  for (SUnit::const_pred_iterator I = right->Preds.begin(),E = right->Preds.end();
+}
+
+// After scheduling the definition of a VRegCycle, clear the isVRegCycle flag of
+// CopyFromReg operands. We should no longer penalize other uses of this VReg.
+static void resetVRegCycle(SUnit *SU) {
+  if (!SU->isVRegCycle)
+    return;
+
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(),E = SU->Preds.end();
        I != E; ++I) {
     if (I->isCtrl()) continue;  // ignore chain preds
-    if (Preds.count(I->getSUnit()))
-      return true;
+    SUnit *PredSU = I->getSUnit();
+    if (PredSU->isVRegCycle) {
+      assert(PredSU->getNode()->getOpcode() == ISD::CopyFromReg &&
+             "VRegCycle def must be CopyFromReg");
+      I->getSUnit()->isVRegCycle = 0;
+    }
   }
-  return false;
 }
 
-// Return true if the virtual register defined by VRCycleSU may interfere with
-// VRUseSU.
-//
-// Note: We may consider two SU's that use the same value live into a loop as
-// interferng even though the value is not an induction variable. This is an
-// unfortunate consequence of scheduling on the selection DAG.
-static bool checkVRegCycleInterference(const SUnit *VRCycleSU,
-                                       const SUnit *VRUseSU) {
-  for (SUnit::const_pred_iterator I = VRCycleSU->Preds.begin(),
-         E = VRCycleSU->Preds.end(); I != E; ++I) {
+// Return true if this SUnit uses a CopyFromReg node marked as a VRegCycle. This
+// means a node that defines the VRegCycle has not been scheduled yet.
+static bool hasVRegCycleUse(const SUnit *SU) {
+  // If this SU also defines the VReg, don't hoist it as a "use".
+  if (SU->isVRegCycle)
+    return false;
+
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(),E = SU->Preds.end();
+       I != E; ++I) {
     if (I->isCtrl()) continue;  // ignore chain preds
-    SDNode *InNode = I->getSUnit()->getNode();
-    if (!InNode || InNode->getOpcode() != ISD::CopyFromReg)
-      continue;
-    for (SUnit::const_pred_iterator II = VRUseSU->Preds.begin(),
-           EE = VRUseSU->Preds.end(); II != EE; ++II) {
-      if (II->getSUnit() == I->getSUnit())
-        return true;
+    if (I->getSUnit()->isVRegCycle &&
+        I->getSUnit()->getNode()->getOpcode() == ISD::CopyFromReg) {
+      DEBUG(dbgs() << "  VReg cycle use: SU (" << SU->NodeNum << ")\n");
+      return true;
     }
   }
   return false;
 }
 
-// Compare the VRegCycle properties of the nodes.
-// Return -1 if left has higher priority, 1 if right has higher priority.
-// Return 0 if priority is equivalent.
-static int BUCompareVRegCycle(const SUnit *left, const SUnit *right) {
-  if (left->isVRegCycle && !right->isVRegCycle) {
-    if (checkVRegCycleInterference(left, right))
-      return -1;
-  }
-  else if (!left->isVRegCycle && right->isVRegCycle) {
-    if (checkVRegCycleInterference(right, left))
-      return 1;
-  }
-  return 0;
-}
-
 // Check for either a dependence (latency) or resource (hazard) stall.
 //
 // Note: The ScheduleHazardRecognizer interface requires a non-const SU.
@@ -2101,23 +2223,12 @@ static bool BUHasStall(SUnit *SU, int Height, RegReductionPQBase *SPQ) {
 // Return 0 if latency-based priority is equivalent.
 static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref,
                             RegReductionPQBase *SPQ) {
-  // If the two nodes share an operand and one of them has a single
-  // use that is a live out copy, favor the one that is live out. Otherwise
-  // it will be difficult to eliminate the copy if the instruction is a
-  // loop induction variable update. e.g.
-  // BB:
-  // sub r1, r3, #1
-  // str r0, [r2, r3]
-  // mov r3, r1
-  // cmp
-  // bne BB
-  bool SharePred = UnitsSharePred(left, right);
-  // FIXME: Only adjust if BB is a loop back edge.
-  // FIXME: What's the cost of a copy?
-  int LBonus = (SharePred && hasOnlyLiveOutUses(left)) ? 1 : 0;
-  int RBonus = (SharePred && hasOnlyLiveOutUses(right)) ? 1 : 0;
-  int LHeight = (int)left->getHeight() - LBonus;
-  int RHeight = (int)right->getHeight() - RBonus;
+  // Scheduling an instruction that uses a VReg whose postincrement has not yet
+  // been scheduled will induce a copy. Model this as an extra cycle of latency.
+  int LPenalty = hasVRegCycleUse(left) ? 1 : 0;
+  int RPenalty = hasVRegCycleUse(right) ? 1 : 0;
+  int LHeight = (int)left->getHeight() + LPenalty;
+  int RHeight = (int)right->getHeight() + RPenalty;
 
   bool LStall = (!checkPref || left->SchedulingPref == Sched::Latency) &&
     BUHasStall(left, LHeight, SPQ);
@@ -2128,48 +2239,102 @@ static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref,
   // If scheduling either one of the node will cause a pipeline stall, sort
   // them according to their height.
   if (LStall) {
-    if (!RStall)
+    if (!RStall) {
+      DEBUG(++FactorCount[FactStall]);
       return 1;
-    if (LHeight != RHeight)
+    }
+    if (LHeight != RHeight) {
+      DEBUG(++FactorCount[FactStall]);
       return LHeight > RHeight ? 1 : -1;
-  } else if (RStall)
+    }
+  } else if (RStall) {
+    DEBUG(++FactorCount[FactStall]);
     return -1;
+  }
 
   // If either node is scheduling for latency, sort them by height/depth
   // and latency.
   if (!checkPref || (left->SchedulingPref == Sched::Latency ||
                      right->SchedulingPref == Sched::Latency)) {
     if (DisableSchedCycles) {
-      if (LHeight != RHeight)
+      if (LHeight != RHeight) {
+        DEBUG(++FactorCount[FactHeight]);
         return LHeight > RHeight ? 1 : -1;
+      }
     }
     else {
       // If neither instruction stalls (!LStall && !RStall) then
       // its height is already covered so only its depth matters. We also reach
       // this if both stall but have the same height.
-      unsigned LDepth = left->getDepth();
-      unsigned RDepth = right->getDepth();
+      int LDepth = left->getDepth() - LPenalty;
+      int RDepth = right->getDepth() - RPenalty;
       if (LDepth != RDepth) {
+        DEBUG(++FactorCount[FactDepth]);
         DEBUG(dbgs() << "  Comparing latency of SU (" << left->NodeNum
               << ") depth " << LDepth << " vs SU (" << right->NodeNum
               << ") depth " << RDepth << "\n");
         return LDepth < RDepth ? 1 : -1;
       }
     }
-    if (left->Latency != right->Latency)
+    if (left->Latency != right->Latency) {
+      DEBUG(++FactorCount[FactOther]);
       return left->Latency > right->Latency ? 1 : -1;
+    }
   }
   return 0;
 }
 
 static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
+  // Schedule physical register definitions close to their use. This is
+  // motivated by microarchitectures that can fuse cmp+jump macro-ops. But as
+  // long as shortening physreg live ranges is generally good, we can defer
+  // creating a subtarget hook.
+  if (!DisableSchedPhysRegJoin) {
+    bool LHasPhysReg = left->hasPhysRegDefs;
+    bool RHasPhysReg = right->hasPhysRegDefs;
+    if (LHasPhysReg != RHasPhysReg) {
+      DEBUG(++FactorCount[FactRegUses]);
+      #ifndef NDEBUG
+      const char *PhysRegMsg[] = {" has no physreg", " defines a physreg"};
+      #endif
+      DEBUG(dbgs() << "  SU (" << left->NodeNum << ") "
+            << PhysRegMsg[LHasPhysReg] << " SU(" << right->NodeNum << ") "
+            << PhysRegMsg[RHasPhysReg] << "\n");
+      return LHasPhysReg < RHasPhysReg;
+    }
+  }
+
+  // Prioritize by Sethi-Ulmann number and push CopyToReg nodes down.
   unsigned LPriority = SPQ->getNodePriority(left);
   unsigned RPriority = SPQ->getNodePriority(right);
+
+  // Be really careful about hoisting call operands above previous calls.
+  // Only allows it if it would reduce register pressure.
+  if (left->isCall && right->isCallOp) {
+    unsigned RNumVals = right->getNode()->getNumValues();
+    RPriority = (RPriority > RNumVals) ? (RPriority - RNumVals) : 0;
+  }
+  if (right->isCall && left->isCallOp) {
+    unsigned LNumVals = left->getNode()->getNumValues();
+    LPriority = (LPriority > LNumVals) ? (LPriority - LNumVals) : 0;
+  }
+
   if (LPriority != RPriority) {
     DEBUG(++FactorCount[FactStatic]);
     return LPriority > RPriority;
   }
-  DEBUG(++FactorCount[FactOther]);
+
+  // One or both of the nodes are calls and their sethi-ullman numbers are the
+  // same, then keep source order.
+  if (left->isCall || right->isCall) {
+    unsigned LOrder = SPQ->getNodeOrdering(left);
+    unsigned ROrder = SPQ->getNodeOrdering(right);
+
+    // Prefer an ordering where the lower the non-zero order number, the higher
+    // the preference.
+    if ((LOrder || ROrder) && LOrder != ROrder)
+      return LOrder != 0 && (LOrder < ROrder || ROrder == 0);
+  }
 
   // Try schedule def + use closer when Sethi-Ullman numbers are the same.
   // e.g.
@@ -2190,40 +2355,62 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
   // This creates more short live intervals.
   unsigned LDist = closestSucc(left);
   unsigned RDist = closestSucc(right);
-  if (LDist != RDist)
+  if (LDist != RDist) {
+    DEBUG(++FactorCount[FactOther]);
     return LDist < RDist;
+  }
 
   // How many registers becomes live when the node is scheduled.
   unsigned LScratch = calcMaxScratches(left);
   unsigned RScratch = calcMaxScratches(right);
-  if (LScratch != RScratch)
+  if (LScratch != RScratch) {
+    DEBUG(++FactorCount[FactOther]);
     return LScratch > RScratch;
+  }
 
-  if (!DisableSchedCycles) {
+  // Comparing latency against a call makes little sense unless the node
+  // is register pressure-neutral.
+  if ((left->isCall && RPriority > 0) || (right->isCall && LPriority > 0))
+    return (left->NodeQueueId > right->NodeQueueId);
+
+  // Do not compare latencies when one or both of the nodes are calls.
+  if (!DisableSchedCycles &&
+      !(left->isCall || right->isCall)) {
     int result = BUCompareLatency(left, right, false /*checkPref*/, SPQ);
     if (result != 0)
       return result > 0;
   }
   else {
-    if (left->getHeight() != right->getHeight())
+    if (left->getHeight() != right->getHeight()) {
+      DEBUG(++FactorCount[FactHeight]);
       return left->getHeight() > right->getHeight();
+    }
 
-    if (left->getDepth() != right->getDepth())
+    if (left->getDepth() != right->getDepth()) {
+      DEBUG(++FactorCount[FactDepth]);
       return left->getDepth() < right->getDepth();
+    }
   }
 
   assert(left->NodeQueueId && right->NodeQueueId &&
          "NodeQueueId cannot be zero");
+  DEBUG(++FactorCount[FactOther]);
   return (left->NodeQueueId > right->NodeQueueId);
 }
 
 // Bottom up
 bool bu_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+  if (int res = checkSpecialNodes(left, right))
+    return res > 0;
+
   return BURRSort(left, right, SPQ);
 }
 
 // Source order, otherwise bottom up.
 bool src_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+  if (int res = checkSpecialNodes(left, right))
+    return res > 0;
+
   unsigned LOrder = SPQ->getNodeOrdering(left);
   unsigned ROrder = SPQ->getNodeOrdering(right);
 
@@ -2255,6 +2442,9 @@ bool hybrid_ls_rr_sort::isReady(SUnit *SU, unsigned CurCycle) const {
 
 // Return true if right should be scheduled with higher priority than left.
 bool hybrid_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+  if (int res = checkSpecialNodes(left, right))
+    return res > 0;
+
   if (left->isCall || right->isCall)
     // No way to compute latency of calls.
     return BURRSort(left, right, SPQ);
@@ -2264,24 +2454,22 @@ bool hybrid_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
   // Avoid causing spills. If register pressure is high, schedule for
   // register pressure reduction.
   if (LHigh && !RHigh) {
+    DEBUG(++FactorCount[FactPressureDiff]);
     DEBUG(dbgs() << "  pressure SU(" << left->NodeNum << ") > SU("
           << right->NodeNum << ")\n");
     return true;
   }
   else if (!LHigh && RHigh) {
+    DEBUG(++FactorCount[FactPressureDiff]);
     DEBUG(dbgs() << "  pressure SU(" << right->NodeNum << ") > SU("
           << left->NodeNum << ")\n");
     return false;
   }
-  int result = 0;
-  if (!DisableSchedVRegCycle) {
-    result = BUCompareVRegCycle(left, right);
-  }
-  if (result == 0 && !LHigh && !RHigh) {
-    result = BUCompareLatency(left, right, true /*checkPref*/, SPQ);
+  if (!LHigh && !RHigh) {
+    int result = BUCompareLatency(left, right, true /*checkPref*/, SPQ);
+    if (result != 0)
+      return result > 0;
   }
-  if (result != 0)
-    return result > 0;
   return BURRSort(left, right, SPQ);
 }
 
@@ -2322,6 +2510,9 @@ static bool canEnableCoalescing(SUnit *SU) {
 // list-ilp is currently an experimental scheduler that allows various
 // heuristics to be enabled prior to the normal register reduction logic.
 bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+  if (int res = checkSpecialNodes(left, right))
+    return res > 0;
+
   if (left->isCall || right->isCall)
     // No way to compute latency of calls.
     return BURRSort(left, right, SPQ);
@@ -2347,12 +2538,6 @@ bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
     if (RReduce && !LReduce) return true;
   }
 
-  if (!DisableSchedVRegCycle) {
-    int result = BUCompareVRegCycle(left, right);
-    if (result != 0)
-      return result > 0;
-  }
-
   if (!DisableSchedLiveUses && (LLiveUses != RLiveUses)) {
     DEBUG(dbgs() << "Live uses SU(" << left->NodeNum << "): " << LLiveUses
           << " != SU(" << right->NodeNum << "): " << RLiveUses << "\n");
@@ -2391,6 +2576,24 @@ bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
   return BURRSort(left, right, SPQ);
 }
 
+void RegReductionPQBase::initNodes(std::vector<SUnit> &sunits) {
+  SUnits = &sunits;
+  // Add pseudo dependency edges for two-address nodes.
+  AddPseudoTwoAddrDeps();
+  // Reroute edges to nodes with multiple uses.
+  if (!TracksRegPressure)
+    PrescheduleNodesWithMultipleUses();
+  // Calculate node priorities.
+  CalculateSethiUllmanNumbers();
+
+  // For single block loops, mark nodes that look like canonical IV increments.
+  if (scheduleDAG->BB->isSuccessor(scheduleDAG->BB)) {
+    for (unsigned i = 0, e = sunits.size(); i != e; ++i) {
+      initVRegCycle(&sunits[i]);
+    }
+  }
+}
+
 //===----------------------------------------------------------------------===//
 //                    Preschedule for Register Pressure
 //===----------------------------------------------------------------------===//
@@ -2668,6 +2871,9 @@ static unsigned LimitedSumOfUnscheduledPredsOfSuccs(const SUnit *SU,
 
 // Top down
 bool td_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const {
+  if (int res = checkSpecialNodes(left, right))
+    return res < 0;
+
   unsigned LPriority = SPQ->getNodePriority(left);
   unsigned RPriority = SPQ->getNodePriority(right);
   bool LIsTarget = left->getNode() && left->getNode()->isMachineOpcode();
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 24a1937..0d656ef 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -83,10 +83,13 @@ SUnit *ScheduleDAGSDNodes::Clone(SUnit *Old) {
   SU->Latency = Old->Latency;
   SU->isVRegCycle = Old->isVRegCycle;
   SU->isCall = Old->isCall;
+  SU->isCallOp = Old->isCallOp;
   SU->isTwoAddress = Old->isTwoAddress;
   SU->isCommutable = Old->isCommutable;
   SU->hasPhysRegDefs = Old->hasPhysRegDefs;
   SU->hasPhysRegClobbers = Old->hasPhysRegClobbers;
+  SU->isScheduleHigh = Old->isScheduleHigh;
+  SU->isScheduleLow = Old->isScheduleLow;
   SU->SchedulingPref = Old->SchedulingPref;
   Old->isCloned = true;
   return SU;
@@ -283,6 +286,7 @@ void ScheduleDAGSDNodes::BuildSchedUnits() {
   Worklist.push_back(DAG->getRoot().getNode());
   Visited.insert(DAG->getRoot().getNode());
 
+  SmallVector<SUnit*, 8> CallSUnits;
   while (!Worklist.empty()) {
     SDNode *NI = Worklist.pop_back_val();
 
@@ -335,6 +339,15 @@ void ScheduleDAGSDNodes::BuildSchedUnits() {
       if (!HasGlueUse) break;
     }
 
+    if (NodeSUnit->isCall)
+      CallSUnits.push_back(NodeSUnit);
+
+    // Schedule zero-latency TokenFactor below any nodes that may increase the
+    // schedule height. Otherwise, ancestors of the TokenFactor may appear to
+    // have false stalls.
+    if (NI->getOpcode() == ISD::TokenFactor)
+      NodeSUnit->isScheduleLow = true;
+
     // If there are glue operands involved, N is now the bottom-most node
     // of the sequence of nodes that are glued together.
     // Update the SUnit.
@@ -342,16 +355,26 @@ void ScheduleDAGSDNodes::BuildSchedUnits() {
     assert(N->getNodeId() == -1 && "Node already inserted!");
     N->setNodeId(NodeSUnit->NodeNum);
 
-    // Set isVRegCycle if the node operands are live into and value is live out
-    // of a single block loop.
-    InitVRegCycleFlag(NodeSUnit);
-
     // Compute NumRegDefsLeft. This must be done before AddSchedEdges.
     InitNumRegDefsLeft(NodeSUnit);
 
     // Assign the Latency field of NodeSUnit using target-provided information.
     ComputeLatency(NodeSUnit);
   }
+
+  // Find all call operands.
+  while (!CallSUnits.empty()) {
+    SUnit *SU = CallSUnits.pop_back_val();
+    for (const SDNode *SUNode = SU->getNode(); SUNode;
+         SUNode = SUNode->getGluedNode()) {
+      if (SUNode->getOpcode() != ISD::CopyToReg)
+        continue;
+      SDNode *SrcN = SUNode->getOperand(2).getNode();
+      if (isPassiveNode(SrcN)) continue;   // Not scheduled.
+      SUnit *SrcSU = &SUnits[SrcN->getNodeId()];
+      SrcSU->isCallOp = true;
+    }
+  }
 }
 
 void ScheduleDAGSDNodes::AddSchedEdges() {
@@ -412,11 +435,15 @@ void ScheduleDAGSDNodes::AddSchedEdges() {
         // it requires a cross class copy (cost < 0). That means we are only
         // treating "expensive to copy" register dependency as physical register
         // dependency. This may change in the future though.
-        if (Cost >= 0)
+        if (Cost >= 0 && !StressSched)
           PhysReg = 0;
 
         // If this is a ctrl dep, latency is 1.
         unsigned OpLatency = isChain ? 1 : OpSU->Latency;
+        // Special-case TokenFactor chains as zero-latency.
+        if(isChain && OpN->getOpcode() == ISD::TokenFactor)
+          OpLatency = 0;
+
         const SDep &dep = SDep(OpSU, isChain ? SDep::Order : SDep::Data,
                                OpLatency, PhysReg);
         if (!isChain && !UnitLatencies) {
@@ -512,47 +539,6 @@ void ScheduleDAGSDNodes::RegDefIter::Advance() {
   }
 }
 
-// Set isVRegCycle if this node's single use is CopyToReg and its only active
-// data operands are CopyFromReg.
-//
-// This is only relevant for single-block loops, in which case the VRegCycle
-// node is likely an induction variable in which the operand and target virtual
-// registers should be coalesced (e.g. pre/post increment values). Setting the
-// isVRegCycle flag helps the scheduler prioritize other uses of the same
-// CopyFromReg so that this node becomes the virtual register "kill". This
-// avoids interference between the values live in and out of the block and
-// eliminates a copy inside the loop.
-void ScheduleDAGSDNodes::InitVRegCycleFlag(SUnit *SU) {
-  if (!BB->isSuccessor(BB))
-    return;
-
-  SDNode *N = SU->getNode();
-  if (N->getGluedNode())
-    return;
-
-  if (!N->hasOneUse() || N->use_begin()->getOpcode() != ISD::CopyToReg)
-    return;
-
-  bool FoundLiveIn = false;
-  for (SDNode::op_iterator OI = N->op_begin(), E = N->op_end(); OI != E; ++OI) {
-    EVT OpVT = OI->getValueType();
-    assert(OpVT != MVT::Glue && "Glued nodes should be in same sunit!");
-
-    if (OpVT == MVT::Other)
-      continue; // ignore chain operands
-
-    if (isPassiveNode(OI->getNode()))
-      continue; // ignore constants and such
-
-    if (OI->getNode()->getOpcode() != ISD::CopyFromReg)
-      return;
-
-    FoundLiveIn = true;
-  }
-  if (FoundLiveIn)
-    SU->isVRegCycle = true;
-}
-
 void ScheduleDAGSDNodes::InitNumRegDefsLeft(SUnit *SU) {
   assert(SU->NumRegDefsLeft == 0 && "expect a new node");
   for (RegDefIter I(SU, this); I.IsValid(); I.Advance()) {
@@ -562,6 +548,16 @@ void ScheduleDAGSDNodes::InitNumRegDefsLeft(SUnit *SU) {
 }
 
 void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) {
+  SDNode *N = SU->getNode();
+
+  // TokenFactor operands are considered zero latency, and some schedulers
+  // (e.g. Top-Down list) may rely on the fact that operand latency is nonzero
+  // whenever node latency is nonzero.
+  if (N && N->getOpcode() == ISD::TokenFactor) {
+    SU->Latency = 0;
+    return;
+  }
+
   // Check to see if the scheduler cares about latencies.
   if (ForceUnitLatencies()) {
     SU->Latency = 1;
@@ -569,7 +565,6 @@ void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) {
   }
 
   if (!InstrItins || InstrItins->isEmpty()) {
-    SDNode *N = SU->getNode();
     if (N && N->isMachineOpcode() &&
         TII->isHighLatencyDef(N->getMachineOpcode()))
       SU->Latency = HighLatencyCycles;
@@ -641,7 +636,7 @@ namespace {
   };
 }
 
-/// ProcessSDDbgValues - Process SDDbgValues assoicated with this node.
+/// ProcessSDDbgValues - Process SDDbgValues associated with this node.
 static void ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG,
                                InstrEmitter &Emitter,
                     SmallVector<std::pair<unsigned, MachineInstr*>, 32> &Orders,
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
index b5f68f3..3ad2bd6 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
@@ -135,6 +135,14 @@ namespace llvm {
         return ValueType;
       }
 
+      const SDNode *GetNode() const {
+        return Node;
+      }
+
+      unsigned GetIdx() const {
+        return DefIdx;
+      }
+
       void Advance();
     private:
       void InitNodeNumDefs();
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index c2711c8..68eeb60 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2050,14 +2050,15 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask,
     break;
 
   default:
-    // Allow the target to implement this method for its nodes.
-    if (Op.getOpcode() >= ISD::BUILTIN_OP_END) {
+    if (Op.getOpcode() < ISD::BUILTIN_OP_END)
+      break;
+    // Fallthrough
   case ISD::INTRINSIC_WO_CHAIN:
   case ISD::INTRINSIC_W_CHAIN:
   case ISD::INTRINSIC_VOID:
-      TLI.computeMaskedBitsForTargetNode(Op, Mask, KnownZero, KnownOne, *this,
-                                         Depth);
-    }
+    // Allow the target to implement this method for its nodes.
+    TLI.computeMaskedBitsForTargetNode(Op, Mask, KnownZero, KnownOne, *this,
+                                       Depth);
     return;
   }
 }
@@ -2322,6 +2323,13 @@ bool SelectionDAG::isKnownNeverZero(SDValue Op) const {
     return !C->isZero();
 
   // TODO: Recognize more cases here.
+  switch (Op.getOpcode()) {
+  default: break;
+  case ISD::OR:
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+      return !C->isNullValue();
+    break;
+  }
 
   return false;
 }
@@ -2339,16 +2347,6 @@ bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const {
   return false;
 }
 
-bool SelectionDAG::isVerifiedDebugInfoDesc(SDValue Op) const {
-  GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op);
-  if (!GA) return false;
-  if (GA->getOffset() != 0) return false;
-  const GlobalVariable *GV = dyn_cast<GlobalVariable>(GA->getGlobal());
-  if (!GV) return false;
-  return MF->getMMI().hasDebugInfo();
-}
-
-
 /// getNode - Gets or creates the specified node.
 ///
 SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT) {
@@ -6304,7 +6302,7 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
         Operands[j] = getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                               OperandEltVT,
                               Operand,
-                              getConstant(i, MVT::i32));
+                              getConstant(i, TLI.getPointerTy()));
       } else {
         // A scalar operand; just use it as is.
         Operands[j] = Operand;
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 24c325e..d79a5ae 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -280,12 +280,35 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, DebugLoc DL,
     }
 
     // Vector/Vector bitcast.
-    return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+    if (ValueVT.getSizeInBits() == PartVT.getSizeInBits())
+      return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+
+    assert(PartVT.getVectorNumElements() == ValueVT.getVectorNumElements() &&
+      "Cannot handle this kind of promotion");
+    // Promoted vector extract
+    bool Smaller = ValueVT.bitsLE(PartVT);
+    return DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND),
+                       DL, ValueVT, Val);
+
   }
 
-  assert(ValueVT.getVectorElementType() == PartVT &&
-         ValueVT.getVectorNumElements() == 1 &&
+  // Trivial bitcast if the types are the same size and the destination
+  // vector type is legal.
+  if (PartVT.getSizeInBits() == ValueVT.getSizeInBits() &&
+      TLI.isTypeLegal(ValueVT))
+    return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+
+  // Handle cases such as i8 -> <1 x i1>
+  assert(ValueVT.getVectorNumElements() == 1 &&
          "Only trivial scalar-to-vector conversions should get here!");
+
+  if (ValueVT.getVectorNumElements() == 1 &&
+      ValueVT.getVectorElementType() != PartVT) {
+    bool Smaller = ValueVT.bitsLE(PartVT);
+    Val = DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND),
+                       DL, ValueVT.getScalarType(), Val);
+  }
+
   return DAG.getNode(ISD::BUILD_VECTOR, DL, ValueVT, Val);
 }
 
@@ -426,7 +449,7 @@ static void getCopyToPartsVector(SelectionDAG &DAG, DebugLoc DL,
       // Bitconvert vector->vector case.
       Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
     } else if (PartVT.isVector() &&
-               PartVT.getVectorElementType() == ValueVT.getVectorElementType()&&
+               PartVT.getVectorElementType() == ValueVT.getVectorElementType() &&
                PartVT.getVectorNumElements() > ValueVT.getVectorNumElements()) {
       EVT ElementVT = PartVT.getVectorElementType();
       // Vector widening case, e.g. <2 x float> -> <4 x float>.  Shuffle in
@@ -446,13 +469,33 @@ static void getCopyToPartsVector(SelectionDAG &DAG, DebugLoc DL,
 
       //SDValue UndefElts = DAG.getUNDEF(VectorTy);
       //Val = DAG.getNode(ISD::CONCAT_VECTORS, DL, PartVT, Val, UndefElts);
-    } else {
+    } else if (PartVT.isVector() &&
+               PartVT.getVectorElementType().bitsGE(
+                 ValueVT.getVectorElementType()) &&
+               PartVT.getVectorNumElements() == ValueVT.getVectorNumElements()) {
+
+      // Promoted vector extract
+      unsigned NumElts = ValueVT.getVectorNumElements();
+      SmallVector<SDValue, 8> NewOps;
+      for (unsigned i = 0; i < NumElts; ++i) {
+        SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+                       ValueVT.getScalarType(), Val ,DAG.getIntPtrConstant(i));
+        SDValue Cast = DAG.getNode(ISD::ANY_EXTEND,
+                       DL, PartVT.getScalarType(), Ext);
+        NewOps.push_back(Cast);
+      }
+      Val = DAG.getNode(ISD::BUILD_VECTOR, DL, PartVT,
+                        &NewOps[0], NewOps.size());
+    } else{
       // Vector -> scalar conversion.
-      assert(ValueVT.getVectorElementType() == PartVT &&
-             ValueVT.getVectorNumElements() == 1 &&
+      assert(ValueVT.getVectorNumElements() == 1 &&
              "Only trivial vector-to-scalar conversions should get here!");
       Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
                         PartVT, Val, DAG.getIntPtrConstant(0));
+
+      bool Smaller = ValueVT.bitsLE(PartVT);
+      Val = DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND),
+                         DL, PartVT, Val);
     }
 
     Parts[0] = Val;
@@ -783,11 +826,20 @@ void SelectionDAGBuilder::clear() {
   UnusedArgNodeMap.clear();
   PendingLoads.clear();
   PendingExports.clear();
-  DanglingDebugInfoMap.clear();
   CurDebugLoc = DebugLoc();
   HasTailCall = false;
 }
 
+/// clearDanglingDebugInfo - Clear the dangling debug information
+/// map. This function is seperated from the clear so that debug
+/// information that is dangling in a basic block can be properly
+/// resolved in a different basic block. This allows the
+/// SelectionDAG to resolve dangling debug information attached
+/// to PHI nodes.
+void SelectionDAGBuilder::clearDanglingDebugInfo() {
+  DanglingDebugInfoMap.clear();
+}
+
 /// getRoot - Return the current virtual root of the Selection DAG,
 /// flushing any PendingLoad items. This must be done before emitting
 /// a store or any other node that may need to be ordered after any
@@ -1175,6 +1227,10 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
 /// created for it, emit nodes to copy the value into the virtual
 /// registers.
 void SelectionDAGBuilder::CopyToExportRegsIfNeeded(const Value *V) {
+  // Skip empty types
+  if (V->getType()->isEmptyTy())
+    return;
+
   DenseMap<const Value *, unsigned>::iterator VMI = FuncInfo.ValueMap.find(V);
   if (VMI != FuncInfo.ValueMap.end()) {
     assert(!V->use_empty() && "Unused value assigned virtual registers!");
@@ -1223,6 +1279,24 @@ bool SelectionDAGBuilder::isExportableFromCurrentBlock(const Value *V,
   return true;
 }
 
+/// Return branch probability calculated by BranchProbabilityInfo for IR blocks.
+uint32_t SelectionDAGBuilder::getEdgeWeight(MachineBasicBlock *Src,
+                                            MachineBasicBlock *Dst) {
+  BranchProbabilityInfo *BPI = FuncInfo.BPI;
+  if (!BPI)
+    return 0;
+  BasicBlock *SrcBB = const_cast<BasicBlock*>(Src->getBasicBlock());
+  BasicBlock *DstBB = const_cast<BasicBlock*>(Dst->getBasicBlock());
+  return BPI->getEdgeWeight(SrcBB, DstBB);
+}
+
+void SelectionDAGBuilder::addSuccessorWithWeight(MachineBasicBlock *Src,
+                                                 MachineBasicBlock *Dst) {
+  uint32_t weight = getEdgeWeight(Src, Dst);
+  Src->addSuccessor(Dst, weight);
+}
+
+
 static bool InBlock(const Value *V, const BasicBlock *BB) {
   if (const Instruction *I = dyn_cast<Instruction>(V))
     return I->getParent() == BB;
@@ -1492,8 +1566,8 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB,
   }
 
   // Update successor info
-  SwitchBB->addSuccessor(CB.TrueBB);
-  SwitchBB->addSuccessor(CB.FalseBB);
+  addSuccessorWithWeight(SwitchBB, CB.TrueBB);
+  addSuccessorWithWeight(SwitchBB, CB.FalseBB);
 
   // Set NextBlock to be the MBB immediately after the current one, if any.
   // This is used to avoid emitting unnecessary branches to the next block.
@@ -1637,8 +1711,8 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
 
   MachineBasicBlock* MBB = B.Cases[0].ThisBB;
 
-  SwitchBB->addSuccessor(B.Default);
-  SwitchBB->addSuccessor(MBB);
+  addSuccessorWithWeight(SwitchBB, B.Default);
+  addSuccessorWithWeight(SwitchBB, MBB);
 
   SDValue BrRange = DAG.getNode(ISD::BRCOND, getCurDebugLoc(),
                                 MVT::Other, CopyTo, RangeCmp,
@@ -1683,8 +1757,8 @@ void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB,
                        ISD::SETNE);
   }
 
-  SwitchBB->addSuccessor(B.TargetBB);
-  SwitchBB->addSuccessor(NextMBB);
+  addSuccessorWithWeight(SwitchBB, B.TargetBB);
+  addSuccessorWithWeight(SwitchBB, NextMBB);
 
   SDValue BrAnd = DAG.getNode(ISD::BRCOND, getCurDebugLoc(),
                               MVT::Other, getControlRoot(),
@@ -1924,8 +1998,9 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec& CR,
   // table.
   MachineBasicBlock *JumpTableBB = CurMF->CreateMachineBasicBlock(LLVMBB);
   CurMF->insert(BBI, JumpTableBB);
-  CR.CaseBB->addSuccessor(Default);
-  CR.CaseBB->addSuccessor(JumpTableBB);
+
+  addSuccessorWithWeight(CR.CaseBB, Default);
+  addSuccessorWithWeight(CR.CaseBB, JumpTableBB);
 
   // Build a vector of destination BBs, corresponding to each target
   // of the jump table. If the value of the jump table slot corresponds to
@@ -1952,7 +2027,7 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec& CR,
          E = DestBBs.end(); I != E; ++I) {
     if (!SuccsHandled[(*I)->getNumber()]) {
       SuccsHandled[(*I)->getNumber()] = true;
-      JumpTableBB->addSuccessor(*I);
+      addSuccessorWithWeight(JumpTableBB, *I);
     }
   }
 
@@ -2019,9 +2094,13 @@ bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR,
     APInt Range = ComputeRange(LEnd, RBegin);
     assert((Range - 2ULL).isNonNegative() &&
            "Invalid case distance");
-    double LDensity = (double)LSize.roundToDouble() /
+    // Use volatile double here to avoid excess precision issues on some hosts,
+    // e.g. that use 80-bit X87 registers.
+    volatile double LDensity =
+       (double)LSize.roundToDouble() /
                            (LEnd - First + 1ULL).roundToDouble();
-    double RDensity = (double)RSize.roundToDouble() /
+    volatile double RDensity =
+      (double)RSize.roundToDouble() /
                            (Last - RBegin + 1ULL).roundToDouble();
     double Metric = Range.logBase2()*(LDensity+RDensity);
     // Should always split in some non-trivial place
@@ -2367,8 +2446,10 @@ void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) {
     succs.push_back(I.getSuccessor(i));
   array_pod_sort(succs.begin(), succs.end());
   succs.erase(std::unique(succs.begin(), succs.end()), succs.end());
-  for (unsigned i = 0, e = succs.size(); i != e; ++i)
-    IndirectBrMBB->addSuccessor(FuncInfo.MBBMap[succs[i]]);
+  for (unsigned i = 0, e = succs.size(); i != e; ++i) {
+    MachineBasicBlock *Succ = FuncInfo.MBBMap[succs[i]];
+    addSuccessorWithWeight(IndirectBrMBB, Succ);
+  }
 
   DAG.setRoot(DAG.getNode(ISD::BRIND, getCurDebugLoc(),
                           MVT::Other, getControlRoot(),
@@ -2806,16 +2887,18 @@ void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) {
   SmallVector<SDValue, 4> Values(NumAggValues);
 
   SDValue Agg = getValue(Op0);
-  SDValue Val = getValue(Op1);
   unsigned i = 0;
   // Copy the beginning value(s) from the original aggregate.
   for (; i != LinearIndex; ++i)
     Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
                 SDValue(Agg.getNode(), Agg.getResNo() + i);
   // Copy values from the inserted value(s).
-  for (; i != LinearIndex + NumValValues; ++i)
-    Values[i] = FromUndef ? DAG.getUNDEF(AggValueVTs[i]) :
-                SDValue(Val.getNode(), Val.getResNo() + i - LinearIndex);
+  if (NumValValues) {
+    SDValue Val = getValue(Op1);
+    for (; i != LinearIndex + NumValValues; ++i)
+      Values[i] = FromUndef ? DAG.getUNDEF(AggValueVTs[i]) :
+                  SDValue(Val.getNode(), Val.getResNo() + i - LinearIndex);
+  }
   // Copy remaining value(s) from the original aggregate.
   for (; i != NumAggValues; ++i)
     Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
@@ -2838,6 +2921,13 @@ void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) {
   ComputeValueVTs(TLI, ValTy, ValValueVTs);
 
   unsigned NumValValues = ValValueVTs.size();
+
+  // Ignore a extractvalue that produces an empty object
+  if (!NumValValues) {
+    setValue(&I, DAG.getUNDEF(MVT(MVT::Other)));
+    return;
+  }
+
   SmallVector<SDValue, 4> Values(NumValValues);
 
   SDValue Agg = getValue(Op0);
@@ -4009,6 +4099,24 @@ static SDValue ExpandPowI(DebugLoc DL, SDValue LHS, SDValue RHS,
   return DAG.getNode(ISD::FPOWI, DL, LHS.getValueType(), LHS, RHS);
 }
 
+// getTruncatedArgReg - Find underlying register used for an truncated
+// argument.
+static unsigned getTruncatedArgReg(const SDValue &N) {
+  if (N.getOpcode() != ISD::TRUNCATE)
+    return 0;
+
+  const SDValue &Ext = N.getOperand(0);
+  if (Ext.getOpcode() == ISD::AssertZext || Ext.getOpcode() == ISD::AssertSext){
+    const SDValue &CFR = Ext.getOperand(0);
+    if (CFR.getOpcode() == ISD::CopyFromReg)
+      return cast<RegisterSDNode>(CFR.getOperand(1))->getReg();
+    else
+      if (CFR.getOpcode() == ISD::TRUNCATE)
+        return getTruncatedArgReg(CFR);
+  }
+  return 0;
+}
+
 /// EmitFuncArgumentDbgValue - If the DbgValueInst is a dbg_value of a function
 /// argument, create the corresponding DBG_VALUE machine instruction for it now.
 /// At the end of instruction selection, they will be inserted to the entry BB.
@@ -4029,10 +4137,6 @@ SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable,
   if (DV.isInlinedFnArgument(MF.getFunction()))
     return false;
 
-  MachineBasicBlock *MBB = FuncInfo.MBB;
-  if (MBB != &MF.front())
-    return false;
-
   unsigned Reg = 0;
   if (Arg->hasByValAttr()) {
     // Byval arguments' frame index is recorded during argument lowering.
@@ -4044,9 +4148,12 @@ SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable,
       Reg = 0;
   }
 
-  if (N.getNode() && N.getOpcode() == ISD::CopyFromReg) {
-    Reg = cast<RegisterSDNode>(N.getOperand(1))->getReg();
-    if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+  if (N.getNode()) {
+    if (N.getOpcode() == ISD::CopyFromReg)
+      Reg = cast<RegisterSDNode>(N.getOperand(1))->getReg();
+    else
+      Reg = getTruncatedArgReg(N);
+    if (Reg && TargetRegisterInfo::isVirtualRegister(Reg)) {
       MachineRegisterInfo &RegInfo = MF.getRegInfo();
       unsigned PR = RegInfo.getLiveInPhysReg(Reg);
       if (PR)
@@ -4208,9 +4315,9 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
           SDV = DAG.getDbgValue(Variable, FINode->getIndex(),
                                 0, dl, SDNodeOrder);
         else {
-          // Can't do anything with other non-AI cases yet.  This might be a
-          // parameter of a callee function that got inlined, for example.
-          DEBUG(dbgs() << "Dropping debug info for " << DI);
+          // Address is an argument, so try to emit its dbg value using
+          // virtual register info from the FuncInfo.ValueMap.
+          EmitFuncArgumentDbgValue(Address, Variable, 0, N);
           return 0;
         }
       } else if (AI)
@@ -4403,7 +4510,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   }
   case Intrinsic::eh_sjlj_dispatch_setup: {
     DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_DISPATCHSETUP, dl, MVT::Other,
-                            getRoot()));
+                            getRoot(), getValue(I.getArgOperand(0))));
     return 0;
   }
 
@@ -4672,9 +4779,22 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::flt_rounds:
     setValue(&I, DAG.getNode(ISD::FLT_ROUNDS_, dl, MVT::i32));
     return 0;
-  case Intrinsic::trap:
-    DAG.setRoot(DAG.getNode(ISD::TRAP, dl,MVT::Other, getRoot()));
+  case Intrinsic::trap: {
+    StringRef TrapFuncName = getTrapFunctionName();
+    if (TrapFuncName.empty()) {
+      DAG.setRoot(DAG.getNode(ISD::TRAP, dl,MVT::Other, getRoot()));
+      return 0;
+    }
+    TargetLowering::ArgListTy Args;
+    std::pair<SDValue, SDValue> Result =
+      TLI.LowerCallTo(getRoot(), I.getType(),
+                 false, false, false, false, 0, CallingConv::C,
+                 /*isTailCall=*/false, /*isReturnValueUsed=*/true,
+                 DAG.getExternalSymbol(TrapFuncName.data(), TLI.getPointerTy()),
+                 Args, DAG, getCurDebugLoc());
+    DAG.setRoot(Result.second);
     return 0;
+  }
   case Intrinsic::uadd_with_overflow:
     return implVisitAluOverflow(I, ISD::UADDO);
   case Intrinsic::sadd_with_overflow:
@@ -4689,15 +4809,16 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     return implVisitAluOverflow(I, ISD::SMULO);
 
   case Intrinsic::prefetch: {
-    SDValue Ops[4];
+    SDValue Ops[5];
     unsigned rw = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
     Ops[0] = getRoot();
     Ops[1] = getValue(I.getArgOperand(0));
     Ops[2] = getValue(I.getArgOperand(1));
     Ops[3] = getValue(I.getArgOperand(2));
+    Ops[4] = getValue(I.getArgOperand(3));
     DAG.setRoot(DAG.getMemIntrinsicNode(ISD::PREFETCH, dl,
                                         DAG.getVTList(MVT::Other),
-                                        &Ops[0], 4,
+                                        &Ops[0], 5,
                                         EVT::getIntegerVT(*Context, 8),
                                         MachinePointerInfo(I.getArgOperand(0)),
                                         0, /* align */
@@ -4784,7 +4905,9 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
                 Outs, TLI, &Offsets);
 
   bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(),
-                        FTy->isVarArg(), Outs, FTy->getContext());
+					   DAG.getMachineFunction(),
+					   FTy->isVarArg(), Outs,
+					   FTy->getContext());
 
   SDValue DemoteStackSlot;
   int DemoteStackIdx = -100;
@@ -4814,8 +4937,14 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
 
   for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
        i != e; ++i) {
-    SDValue ArgNode = getValue(*i);
-    Entry.Node = ArgNode; Entry.Ty = (*i)->getType();
+    const Value *V = *i;
+
+    // Skip empty types
+    if (V->getType()->isEmptyTy())
+      continue;
+
+    SDValue ArgNode = getValue(V);
+    Entry.Node = ArgNode; Entry.Ty = V->getType();
 
     unsigned attrInd = i - CS.arg_begin() + 1;
     Entry.isSExt  = CS.paramHasAttr(attrInd, Attribute::SExt);
@@ -5255,6 +5384,7 @@ public:
 
     const llvm::Type *OpTy = CallOperandVal->getType();
 
+    // FIXME: code duplicated from TargetLowering::ParseConstraints().
     // If this is an indirect operand, the operand is a pointer to the
     // accessed type.
     if (isIndirect) {
@@ -5264,6 +5394,11 @@ public:
       OpTy = PtrTy->getElementType();
     }
 
+    // Look for vector wrapped in a struct. e.g. { <16 x i8> }.
+    if (const StructType *STy = dyn_cast<StructType>(OpTy))
+      if (STy->getNumElements() == 1)
+        OpTy = STy->getElementType(0);
+
     // If OpTy is not a single value, it may be a struct/union that we
     // can tile with integers.
     if (!OpTy->isSingleValueType() && OpTy->isSized()) {
@@ -5315,6 +5450,8 @@ isAllocatableRegister(unsigned Reg, MachineFunction &MF,
     EVT ThisVT = MVT::Other;
 
     const TargetRegisterClass *RC = *RCI;
+    if (!RC->isAllocatable())
+      continue;
     // If none of the value types for this register class are valid, we
     // can't use it.  For example, 64-bit reg classes on 32-bit targets.
     for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end();
@@ -5336,15 +5473,14 @@ isAllocatableRegister(unsigned Reg, MachineFunction &MF,
     // frame pointer in functions that need it (due to them not being taken
     // out of allocation, because a variable sized allocation hasn't been seen
     // yet).  This is a slight code pessimization, but should still work.
-    for (TargetRegisterClass::iterator I = RC->allocation_order_begin(MF),
-         E = RC->allocation_order_end(MF); I != E; ++I)
-      if (*I == Reg) {
-        // We found a matching register class.  Keep looking at others in case
-        // we find one with larger registers that this physreg is also in.
-        FoundRC = RC;
-        FoundVT = ThisVT;
-        break;
-      }
+    ArrayRef<unsigned> RawOrder = RC->getRawAllocationOrder(MF);
+    if (std::find(RawOrder.begin(), RawOrder.end(), Reg) != RawOrder.end()) {
+      // We found a matching register class.  Keep looking at others in case
+      // we find one with larger registers that this physreg is also in.
+      FoundRC = RC;
+      FoundVT = ThisVT;
+      break;
+    }
   }
   return FoundRC;
 }
@@ -5491,9 +5627,15 @@ static void GetRegistersForValue(SelectionDAG &DAG,
                                             OpInfo.ConstraintVT);
 
   const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
+  BitVector Reserved = TRI->getReservedRegs(MF);
   unsigned NumAllocated = 0;
   for (unsigned i = 0, e = RegClassRegs.size(); i != e; ++i) {
     unsigned Reg = RegClassRegs[i];
+    // Filter out the reserved registers, but note that reserved registers are
+    // not fully determined at this point. We may still decide we need a frame
+    // pointer.
+    if (Reserved.test(Reg))
+      continue;
     // See if this register is available.
     if ((isOutReg && OutputRegs.count(Reg)) ||   // Already used.
         (isInReg  && InputRegs.count(Reg))) {    // Already used.
@@ -5542,7 +5684,9 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
 
   std::set<unsigned> OutputRegs, InputRegs;
 
-  TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(CS);
+  TargetLowering::AsmOperandInfoVector
+    TargetConstraints = TLI.ParseConstraints(CS);
+
   bool hasMemory = false;
 
   unsigned ArgNo = 0;   // ArgNo - The argument of the CallInst.
@@ -5601,7 +5745,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
       hasMemory = true;
     else {
       for (unsigned j = 0, ee = OpInfo.Codes.size(); j != ee; ++j) {
-        TargetLowering::ConstraintType CType = TLI.getConstraintType(OpInfo.Codes[j]);
+        TargetLowering::ConstraintType
+          CType = TLI.getConstraintType(OpInfo.Codes[j]);
         if (CType == TargetLowering::C_Memory) {
           hasMemory = true;
           break;
@@ -5651,12 +5796,17 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
     // need to to provide an address for the memory input.
     if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
         !OpInfo.isIndirect) {
-      assert((OpInfo.isMultipleAlternative || (OpInfo.Type == InlineAsm::isInput)) &&
+      assert((OpInfo.isMultipleAlternative ||
+              (OpInfo.Type == InlineAsm::isInput)) &&
              "Can only indirectify direct input operands!");
 
       // Memory operands really want the address of the value.  If we don't have
       // an indirect input, put it in the constpool if we can, otherwise spill
       // it to a stack slot.
+      // TODO: This isn't quite right. We need to handle these according to
+      // the addressing mode that the constraint wants. Also, this may take
+      // an additional register for the computation and we don't want that
+      // either.
 
       // If the operand is a float, integer, or vector constant, spill to a
       // constant pool entry to get its address.
@@ -5858,7 +6008,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
 
       if (OpInfo.ConstraintType == TargetLowering::C_Other) {
         std::vector<SDValue> Ops;
-        TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode[0],
+        TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode,
                                          Ops, DAG);
         if (Ops.empty())
           report_fatal_error("Invalid operand for inline asm constraint '" +
@@ -6067,14 +6217,15 @@ TargetLowering::LowerCallTo(SDValue Chain, const Type *RetTy,
         Flags.setByVal();
         const PointerType *Ty = cast<PointerType>(Args[i].Ty);
         const Type *ElementTy = Ty->getElementType();
-        unsigned FrameAlign = getByValTypeAlignment(ElementTy);
-        unsigned FrameSize  = getTargetData()->getTypeAllocSize(ElementTy);
+        Flags.setByValSize(getTargetData()->getTypeAllocSize(ElementTy));
         // For ByVal, alignment should come from FE.  BE will guess if this
         // info is not there but there are cases it cannot get right.
+        unsigned FrameAlign;
         if (Args[i].Alignment)
           FrameAlign = Args[i].Alignment;
+        else
+          FrameAlign = getByValTypeAlignment(ElementTy);
         Flags.setByValAlign(FrameAlign);
-        Flags.setByValSize(FrameSize);
       }
       if (Args[i].isNest)
         Flags.setNest();
@@ -6180,7 +6331,7 @@ TargetLowering::LowerCallTo(SDValue Chain, const Type *RetTy,
 
   // For a function returning void, there is no return value. We can't create
   // such a node, so we just return a null return value in that case. In
-  // that case, nothing will actualy look at the value.
+  // that case, nothing will actually look at the value.
   if (ReturnValues.empty())
     return std::make_pair(SDValue(), Chain);
 
@@ -6219,6 +6370,25 @@ SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) {
 
 #include "llvm/CodeGen/SelectionDAGISel.h"
 
+/// isOnlyUsedInEntryBlock - If the specified argument is only used in the
+/// entry block, return true.  This includes arguments used by switches, since
+/// the switch may expand into multiple basic blocks.
+static bool isOnlyUsedInEntryBlock(const Argument *A) {
+  // With FastISel active, we may be splitting blocks, so force creation
+  // of virtual registers for all non-dead arguments.
+  if (EnableFastISel)
+    return A->use_empty();
+
+  const BasicBlock *Entry = A->getParent()->begin();
+  for (Value::const_use_iterator UI = A->use_begin(), E = A->use_end();
+       UI != E; ++UI) {
+    const User *U = *UI;
+    if (cast<Instruction>(U)->getParent() != Entry || isa<SwitchInst>(U))
+      return false;  // Use not in entry block.
+  }
+  return true;
+}
+
 void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) {
   // If this is the entry block, emit arguments.
   const Function &F = *LLVMBB->getParent();
@@ -6273,14 +6443,15 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) {
         Flags.setByVal();
         const PointerType *Ty = cast<PointerType>(I->getType());
         const Type *ElementTy = Ty->getElementType();
-        unsigned FrameAlign = TLI.getByValTypeAlignment(ElementTy);
-        unsigned FrameSize  = TD->getTypeAllocSize(ElementTy);
+        Flags.setByValSize(TD->getTypeAllocSize(ElementTy));
         // For ByVal, alignment should be passed from FE.  BE will guess if
         // this info is not there but there are cases it cannot get right.
+        unsigned FrameAlign;
         if (F.getParamAlignment(Idx))
           FrameAlign = F.getParamAlignment(Idx);
+        else
+          FrameAlign = TLI.getByValTypeAlignment(ElementTy);
         Flags.setByValAlign(FrameAlign);
-        Flags.setByValSize(FrameSize);
       }
       if (F.paramHasAttr(Idx, Attribute::Nest))
         Flags.setNest();
@@ -6362,8 +6533,8 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) {
     if (I->use_empty() && NumValues)
       SDB->setUnusedArgValue(I, InVals[i]);
 
-    for (unsigned Value = 0; Value != NumValues; ++Value) {
-      EVT VT = ValueVTs[Value];
+    for (unsigned Val = 0; Val != NumValues; ++Val) {
+      EVT VT = ValueVTs[Val];
       EVT PartVT = TLI.getRegisterType(*CurDAG->getContext(), VT);
       unsigned NumParts = TLI.getNumRegisters(*CurDAG->getContext(), VT);
 
@@ -6382,21 +6553,35 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) {
       i += NumParts;
     }
 
+    // We don't need to do anything else for unused arguments.
+    if (ArgValues.empty())
+      continue;
+
     // Note down frame index for byval arguments.
-    if (I->hasByValAttr() && !ArgValues.empty())
+    if (I->hasByValAttr())
       if (FrameIndexSDNode *FI =
           dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode()))
         FuncInfo->setByValArgumentFrameIndex(I, FI->getIndex());
 
-    if (!I->use_empty()) {
-      SDValue Res;
-      if (!ArgValues.empty())
-        Res = DAG.getMergeValues(&ArgValues[0], NumValues,
-                                 SDB->getCurDebugLoc());
-      SDB->setValue(I, Res);
-
-      // If this argument is live outside of the entry block, insert a copy from
-      // whereever we got it to the vreg that other BB's will reference it as.
+    SDValue Res = DAG.getMergeValues(&ArgValues[0], NumValues,
+                                     SDB->getCurDebugLoc());
+    SDB->setValue(I, Res);
+
+    // If this argument is live outside of the entry block, insert a copy from
+    // wherever we got it to the vreg that other BB's will reference it as.
+    if (!EnableFastISel && Res.getOpcode() == ISD::CopyFromReg) {
+      // If we can, though, try to skip creating an unnecessary vreg.
+      // FIXME: This isn't very clean... it would be nice to make this more
+      // general.  It's also subtly incompatible with the hacks FastISel
+      // uses with vregs.
+      unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        FuncInfo->ValueMap[I] = Reg;
+        continue;
+      }
+    }
+    if (!isOnlyUsedInEntryBlock(I)) {
+      FuncInfo->InitializeRegForValue(I);
       SDB->CopyToExportRegsIfNeeded(I);
     }
   }
@@ -6442,6 +6627,10 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
       // Ignore dead phi's.
       if (PN->use_empty()) continue;
 
+      // Skip empty types
+      if (PN->getType()->isEmptyTy())
+        continue;
+
       unsigned Reg;
       const Value *PHIOp = PN->getIncomingValueForBlock(LLVMBB);
 
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index f546612..a1ca891 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -23,7 +23,6 @@
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <vector>
-#include <set>
 
 namespace llvm {
 
@@ -333,6 +332,14 @@ public:
   /// consumed.
   void clear();
 
+  /// clearDanglingDebugInfo - Clear the dangling debug information
+  /// map. This function is seperated from the clear so that debug
+  /// information that is dangling in a basic block can be properly
+  /// resolved in a different basic block. This allows the
+  /// SelectionDAG to resolve dangling debug information attached
+  /// to PHI nodes.
+  void clearDanglingDebugInfo();
+
   /// getRoot - Return the current virtual root of the Selection DAG,
   /// flushing any PendingLoad items. This must be done before emitting
   /// a store or any other node that may need to be ordered after any
@@ -427,6 +434,9 @@ private:
                                 const Value* SV,
                                 MachineBasicBlock* Default,
                                 MachineBasicBlock *SwitchBB);
+
+  uint32_t getEdgeWeight(MachineBasicBlock *Src, MachineBasicBlock *Dst);
+  void addSuccessorWithWeight(MachineBasicBlock *Src, MachineBasicBlock *Dst);
 public:
   void visitSwitchCase(CaseBlock &CB,
                        MachineBasicBlock *SwitchBB);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 4b6066e..dc8044b 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -17,6 +17,7 @@
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/DebugInfo.h"
 #include "llvm/Constants.h"
 #include "llvm/Function.h"
@@ -55,17 +56,11 @@
 using namespace llvm;
 
 STATISTIC(NumFastIselFailures, "Number of instructions fast isel failed on");
+STATISTIC(NumFastIselSuccess, "Number of instructions fast isel selected");
 STATISTIC(NumFastIselBlocks, "Number of blocks selected entirely by fast isel");
 STATISTIC(NumDAGBlocks, "Number of blocks selected using DAG");
 STATISTIC(NumDAGIselRetries,"Number of times dag isel has to try another path");
 
-#ifndef NDEBUG
-STATISTIC(NumBBWithOutOfOrderLineInfo,
-          "Number of blocks with out of order line number info");
-STATISTIC(NumMBBWithOutOfOrderLineInfo,
-          "Number of machine blocks with out of order line number info");
-#endif
-
 static cl::opt<bool>
 EnableFastISelVerbose("fast-isel-verbose", cl::Hidden,
           cl::desc("Enable verbose messages in the \"fast\" "
@@ -74,6 +69,11 @@ static cl::opt<bool>
 EnableFastISelAbort("fast-isel-abort", cl::Hidden,
           cl::desc("Enable abort calls when \"fast\" instruction fails"));
 
+static cl::opt<bool>
+UseMBPI("use-mbpi",
+        cl::desc("use Machine Branch Probability Info"),
+        cl::init(true), cl::Hidden);
+
 #ifndef NDEBUG
 static cl::opt<bool>
 ViewDAGCombine1("view-dag-combine1-dags", cl::Hidden,
@@ -192,6 +192,7 @@ SelectionDAGISel::SelectionDAGISel(const TargetMachine &tm,
   DAGSize(0) {
     initializeGCModuleInfoPass(*PassRegistry::getPassRegistry());
     initializeAliasAnalysisAnalysisGroup(*PassRegistry::getPassRegistry());
+    initializeBranchProbabilityInfoPass(*PassRegistry::getPassRegistry());
   }
 
 SelectionDAGISel::~SelectionDAGISel() {
@@ -205,43 +206,11 @@ void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<AliasAnalysis>();
   AU.addRequired<GCModuleInfo>();
   AU.addPreserved<GCModuleInfo>();
+  if (UseMBPI && OptLevel != CodeGenOpt::None)
+    AU.addRequired<BranchProbabilityInfo>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
-/// FunctionCallsSetJmp - Return true if the function has a call to setjmp or
-/// other function that gcc recognizes as "returning twice". This is used to
-/// limit code-gen optimizations on the machine function.
-///
-/// FIXME: Remove after <rdar://problem/8031714> is fixed.
-static bool FunctionCallsSetJmp(const Function *F) {
-  const Module *M = F->getParent();
-  static const char *ReturnsTwiceFns[] = {
-    "_setjmp",
-    "setjmp",
-    "sigsetjmp",
-    "setjmp_syscall",
-    "savectx",
-    "qsetjmp",
-    "vfork",
-    "getcontext"
-  };
-#define NUM_RETURNS_TWICE_FNS sizeof(ReturnsTwiceFns) / sizeof(const char *)
-
-  for (unsigned I = 0; I < NUM_RETURNS_TWICE_FNS; ++I)
-    if (const Function *Callee = M->getFunction(ReturnsTwiceFns[I])) {
-      if (!Callee->use_empty())
-        for (Value::const_use_iterator
-               I = Callee->use_begin(), E = Callee->use_end();
-             I != E; ++I)
-          if (const CallInst *CI = dyn_cast<CallInst>(*I))
-            if (CI->getParent()->getParent() == F)
-              return true;
-    }
-
-  return false;
-#undef NUM_RETURNS_TWICE_FNS
-}
-
 /// SplitCriticalSideEffectEdges - Look for critical edges with a PHI value that
 /// may trap on it.  In this case we have to split the edge so that the path
 /// through the predecessor block that doesn't go to the phi block doesn't
@@ -302,6 +271,12 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
 
   CurDAG->init(*MF);
   FuncInfo->set(Fn, *MF);
+
+  if (UseMBPI && OptLevel != CodeGenOpt::None)
+    FuncInfo->BPI = &getAnalysis<BranchProbabilityInfo>();
+  else
+    FuncInfo->BPI = 0;
+
   SDB->init(GFI, *AA);
 
   SelectAllBasicBlocks(Fn);
@@ -392,7 +367,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
   }
 
   // Determine if there is a call to setjmp in the machine function.
-  MF->setCallsSetJmp(FunctionCallsSetJmp(&Fn));
+  MF->setCallsSetJmp(Fn.callsFunctionThatReturnsTwice());
 
   // Replace forward-declared registers with the registers containing
   // the desired value.
@@ -421,10 +396,9 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
   return true;
 }
 
-void
-SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin,
-                                   BasicBlock::const_iterator End,
-                                   bool &HadTailCall) {
+void SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin,
+                                        BasicBlock::const_iterator End,
+                                        bool &HadTailCall) {
   // Lower all of the non-terminator instructions. If a call is emitted
   // as a tail call, cease emitting nodes for this block. Terminators
   // are handled below.
@@ -438,7 +412,6 @@ SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin,
 
   // Final step, emit the lowered DAG as machine code.
   CodeGenAndEmitDAG();
-  return;
 }
 
 void SelectionDAGISel::ComputeLiveOutVRegInfo() {
@@ -572,7 +545,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
 
   {
     NamedRegionTimer T("DAG Legalization", GroupName, TimePassesIsEnabled);
-    CurDAG->Legalize(OptLevel);
+    CurDAG->Legalize();
   }
 
   DEBUG(dbgs() << "Legalized selection DAG: BB#" << BlockNumber
@@ -748,16 +721,49 @@ void SelectionDAGISel::PrepareEHLandingPad() {
 
 
 
-
+/// TryToFoldFastISelLoad - We're checking to see if we can fold the specified
+/// load into the specified FoldInst.  Note that we could have a sequence where
+/// multiple LLVM IR instructions are folded into the same machineinstr.  For
+/// example we could have:
+///   A: x = load i32 *P
+///   B: y = icmp A, 42
+///   C: br y, ...
+///
+/// In this scenario, LI is "A", and FoldInst is "C".  We know about "B" (and
+/// any other folded instructions) because it is between A and C.
+///
+/// If we succeed in folding the load into the operation, return true.
+///
 bool SelectionDAGISel::TryToFoldFastISelLoad(const LoadInst *LI,
+                                             const Instruction *FoldInst,
                                              FastISel *FastIS) {
+  // We know that the load has a single use, but don't know what it is.  If it
+  // isn't one of the folded instructions, then we can't succeed here.  Handle
+  // this by scanning the single-use users of the load until we get to FoldInst.
+  unsigned MaxUsers = 6;  // Don't scan down huge single-use chains of instrs.
+  
+  const Instruction *TheUser = LI->use_back();
+  while (TheUser != FoldInst &&   // Scan up until we find FoldInst.
+         // Stay in the right block.
+         TheUser->getParent() == FoldInst->getParent() &&
+         --MaxUsers) {  // Don't scan too far.
+    // If there are multiple or no uses of this instruction, then bail out.
+    if (!TheUser->hasOneUse())
+      return false;
+    
+    TheUser = TheUser->use_back();
+  }
+  
   // Don't try to fold volatile loads.  Target has to deal with alignment
   // constraints.
   if (LI->isVolatile()) return false;
 
-  // Figure out which vreg this is going into.
+  // Figure out which vreg this is going into.  If there is no assigned vreg yet
+  // then there actually was no reference to it.  Perhaps the load is referenced
+  // by a dead instruction.
   unsigned LoadReg = FastIS->getRegForValue(LI);
-  assert(LoadReg && "Load isn't already assigned a vreg? ");
+  if (LoadReg == 0)
+    return false;
 
   // Check to see what the uses of this vreg are.  If it has no uses, or more
   // than one use (at the machine instr level) then we can't fold it.
@@ -788,48 +794,17 @@ bool SelectionDAGISel::TryToFoldFastISelLoad(const LoadInst *LI,
   return FastIS->TryToFoldLoad(User, RI.getOperandNo(), LI);
 }
 
-#ifndef NDEBUG
-/// CheckLineNumbers - Check if basic block instructions follow source order
-/// or not.
-static void CheckLineNumbers(const BasicBlock *BB) {
-  unsigned Line = 0;
-  unsigned Col = 0;
-  for (BasicBlock::const_iterator BI = BB->begin(),
-         BE = BB->end(); BI != BE; ++BI) {
-    const DebugLoc DL = BI->getDebugLoc();
-    if (DL.isUnknown()) continue;
-    unsigned L = DL.getLine();
-    unsigned C = DL.getCol();
-    if (L < Line || (L == Line && C < Col)) {
-      ++NumBBWithOutOfOrderLineInfo;
-      return;
-    }
-    Line = L;
-    Col = C;
-  }
+/// isFoldedOrDeadInstruction - Return true if the specified instruction is
+/// side-effect free and is either dead or folded into a generated instruction.
+/// Return false if it needs to be emitted.
+static bool isFoldedOrDeadInstruction(const Instruction *I,
+                                      FunctionLoweringInfo *FuncInfo) {
+  return !I->mayWriteToMemory() && // Side-effecting instructions aren't folded.
+         !isa<TerminatorInst>(I) && // Terminators aren't folded.
+         !isa<DbgInfoIntrinsic>(I) &&  // Debug instructions aren't folded.
+         !FuncInfo->isExportedInst(I); // Exported instrs must be computed.
 }
 
-/// CheckLineNumbers - Check if machine basic block instructions follow source
-/// order or not.
-static void CheckLineNumbers(const MachineBasicBlock *MBB) {
-  unsigned Line = 0;
-  unsigned Col = 0;
-  for (MachineBasicBlock::const_iterator MBI = MBB->begin(),
-         MBE = MBB->end(); MBI != MBE; ++MBI) {
-    const DebugLoc DL = MBI->getDebugLoc();
-    if (DL.isUnknown()) continue;
-    unsigned L = DL.getLine();
-    unsigned C = DL.getCol();
-    if (L < Line || (L == Line && C < Col)) {
-      ++NumMBBWithOutOfOrderLineInfo;
-      return;
-    }
-    Line = L;
-    Col = C;
-  }
-}
-#endif
-
 void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
   // Initialize the Fast-ISel state, if needed.
   FastISel *FastIS = 0;
@@ -841,9 +816,6 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
   for (ReversePostOrderTraversal<const Function*>::rpo_iterator
        I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
     const BasicBlock *LLVMBB = *I;
-#ifndef NDEBUG
-    CheckLineNumbers(LLVMBB);
-#endif
 
     if (OptLevel != CodeGenOpt::None) {
       bool AllPredsVisited = true;
@@ -856,15 +828,13 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
       }
 
       if (AllPredsVisited) {
-        for (BasicBlock::const_iterator I = LLVMBB->begin(), E = LLVMBB->end();
-             I != E && isa<PHINode>(I); ++I) {
+        for (BasicBlock::const_iterator I = LLVMBB->begin();
+             isa<PHINode>(I); ++I)
           FuncInfo->ComputePHILiveOutRegInfo(cast<PHINode>(I));
-        }
       } else {
-        for (BasicBlock::const_iterator I = LLVMBB->begin(), E = LLVMBB->end();
-             I != E && isa<PHINode>(I); ++I) {
+        for (BasicBlock::const_iterator I = LLVMBB->begin();
+             isa<PHINode>(I); ++I)
           FuncInfo->InvalidatePHILiveOutRegInfo(cast<PHINode>(I));
-        }
       }
 
       FuncInfo->VisitedBBs.insert(LLVMBB);
@@ -912,10 +882,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
         const Instruction *Inst = llvm::prior(BI);
 
         // If we no longer require this instruction, skip it.
-        if (!Inst->mayWriteToMemory() &&
-            !isa<TerminatorInst>(Inst) &&
-            !isa<DbgInfoIntrinsic>(Inst) &&
-            !FuncInfo->isExportedInst(Inst))
+        if (isFoldedOrDeadInstruction(Inst, FuncInfo))
           continue;
 
         // Bottom-up: reset the insert pos at the top, after any local-value
@@ -924,16 +891,21 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
 
         // Try to select the instruction with FastISel.
         if (FastIS->SelectInstruction(Inst)) {
-          // If fast isel succeeded, check to see if there is a single-use
-          // non-volatile load right before the selected instruction, and see if
-          // the load is used by the instruction.  If so, try to fold it.
-          const Instruction *BeforeInst = 0;
-          if (Inst != Begin)
-            BeforeInst = llvm::prior(llvm::prior(BI));
-          if (BeforeInst && isa<LoadInst>(BeforeInst) &&
-              BeforeInst->hasOneUse() && *BeforeInst->use_begin() == Inst &&
-              TryToFoldFastISelLoad(cast<LoadInst>(BeforeInst), FastIS))
-            --BI; // If we succeeded, don't re-select the load.
+          ++NumFastIselSuccess;
+          // If fast isel succeeded, skip over all the folded instructions, and
+          // then see if there is a load right before the selected instructions.
+          // Try to fold the load if so.
+          const Instruction *BeforeInst = Inst;
+          while (BeforeInst != Begin) {
+            BeforeInst = llvm::prior(BasicBlock::const_iterator(BeforeInst));
+            if (!isFoldedOrDeadInstruction(BeforeInst, FuncInfo))
+              break;
+          }
+          if (BeforeInst != Inst && isa<LoadInst>(BeforeInst) &&
+              BeforeInst->hasOneUse() &&
+              TryToFoldFastISelLoad(cast<LoadInst>(BeforeInst), Inst, FastIS))
+            // If we succeeded, don't re-select the load.
+            BI = llvm::next(BasicBlock::const_iterator(BeforeInst));
           continue;
         }
 
@@ -963,9 +935,14 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
           continue;
         }
 
-        // Otherwise, give up on FastISel for the rest of the block.
-        // For now, be a little lenient about non-branch terminators.
-        if (!isa<TerminatorInst>(Inst) || isa<BranchInst>(Inst)) {
+        if (isa<TerminatorInst>(Inst) && !isa<BranchInst>(Inst)) {
+          // Don't abort, and use a different message for terminator misses.
+          ++NumFastIselFailures;
+          if (EnableFastISelVerbose || EnableFastISelAbort) {
+            dbgs() << "FastISel missed terminator: ";
+            Inst->dump();
+          }
+        } else {
           ++NumFastIselFailures;
           if (EnableFastISelVerbose || EnableFastISelAbort) {
             dbgs() << "FastISel miss: ";
@@ -987,22 +964,20 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
     else
       ++NumFastIselBlocks;
 
-    // Run SelectionDAG instruction selection on the remainder of the block
-    // not handled by FastISel. If FastISel is not run, this is the entire
-    // block.
-    bool HadTailCall;
-    SelectBasicBlock(Begin, BI, HadTailCall);
+    if (Begin != BI) {
+      // Run SelectionDAG instruction selection on the remainder of the block
+      // not handled by FastISel. If FastISel is not run, this is the entire
+      // block.
+      bool HadTailCall;
+      SelectBasicBlock(Begin, BI, HadTailCall);
+    }
 
     FinishBasicBlock();
     FuncInfo->PHINodesToUpdate.clear();
   }
 
   delete FastIS;
-#ifndef NDEBUG
-  for (MachineFunction::const_iterator MBI = MF->begin(), MBE = MF->end();
-       MBI != MBE; ++MBI)
-    CheckLineNumbers(MBI);
-#endif
+  SDB->clearDanglingDebugInfo();
 }
 
 void
@@ -2634,11 +2609,45 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
       // instructions that access memory and for ComplexPatterns that match
       // loads.
       if (EmitNodeInfo & OPFL_MemRefs) {
+        // Only attach load or store memory operands if the generated
+        // instruction may load or store.
+        const TargetInstrDesc &TID = TM.getInstrInfo()->get(TargetOpc);
+        bool mayLoad = TID.mayLoad();
+        bool mayStore = TID.mayStore();
+
+        unsigned NumMemRefs = 0;
+        for (SmallVector<MachineMemOperand*, 2>::const_iterator I =
+             MatchedMemRefs.begin(), E = MatchedMemRefs.end(); I != E; ++I) {
+          if ((*I)->isLoad()) {
+            if (mayLoad)
+              ++NumMemRefs;
+          } else if ((*I)->isStore()) {
+            if (mayStore)
+              ++NumMemRefs;
+          } else {
+            ++NumMemRefs;
+          }
+        }
+
         MachineSDNode::mmo_iterator MemRefs =
-          MF->allocateMemRefsArray(MatchedMemRefs.size());
-        std::copy(MatchedMemRefs.begin(), MatchedMemRefs.end(), MemRefs);
+          MF->allocateMemRefsArray(NumMemRefs);
+
+        MachineSDNode::mmo_iterator MemRefsPos = MemRefs;
+        for (SmallVector<MachineMemOperand*, 2>::const_iterator I =
+             MatchedMemRefs.begin(), E = MatchedMemRefs.end(); I != E; ++I) {
+          if ((*I)->isLoad()) {
+            if (mayLoad)
+              *MemRefsPos++ = *I;
+          } else if ((*I)->isStore()) {
+            if (mayStore)
+              *MemRefsPos++ = *I;
+          } else {
+            *MemRefsPos++ = *I;
+          }
+        }
+
         cast<MachineSDNode>(Res)
-          ->setMemRefs(MemRefs, MemRefs + MatchedMemRefs.size());
+          ->setMemRefs(MemRefs, MemRefs + NumMemRefs);
       }
 
       DEBUG(errs() << "  "
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 4b0822d..efbfaa4 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -26,11 +26,19 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include <cctype>
 using namespace llvm;
 
+/// We are in the process of implementing a new TypeLegalization action
+/// - the promotion of vector elements. This feature is disabled by default
+/// and only enabled using this flag.
+static cl::opt<bool>
+AllowPromoteIntElem("promote-elements", cl::Hidden,
+  cl::desc("Allow promotion of integer vector element types"));
+
 namespace llvm {
 TLSModel::Model getTLSModel(const GlobalValue *GV, Reloc::Model reloc) {
   bool isLocal = GV->hasLocalLinkage();
@@ -528,7 +536,8 @@ static void InitCmpLibcallCCs(ISD::CondCode *CCs) {
 /// NOTE: The constructor takes ownership of TLOF.
 TargetLowering::TargetLowering(const TargetMachine &tm,
                                const TargetLoweringObjectFile *tlof)
-  : TM(tm), TD(TM.getTargetData()), TLOF(*tlof) {
+  : TM(tm), TD(TM.getTargetData()), TLOF(*tlof),
+  mayPromoteElements(AllowPromoteIntElem) {
   // All operations default to being supported.
   memset(OpActions, 0, sizeof(OpActions));
   memset(LoadExtActions, 0, sizeof(LoadExtActions));
@@ -596,6 +605,8 @@ TargetLowering::TargetLowering(const TargetMachine &tm,
   SchedPreferenceInfo = Sched::Latency;
   JumpBufSize = 0;
   JumpBufAlignment = 0;
+  MinFunctionAlignment = 0;
+  PrefFunctionAlignment = 0;
   PrefLoopAlignment = 0;
   MinStackArgumentAlignment = 1;
   ShouldFoldAtomicFences = false;
@@ -662,10 +673,16 @@ static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT,
     NewVT = EltTy;
   IntermediateVT = NewVT;
 
+  unsigned NewVTSize = NewVT.getSizeInBits();
+
+  // Convert sizes such as i33 to i64.
+  if (!isPowerOf2_32(NewVTSize))
+    NewVTSize = NextPowerOf2(NewVTSize);
+
   EVT DestVT = TLI->getRegisterType(NewVT);
   RegisterVT = DestVT;
   if (EVT(DestVT).bitsLT(NewVT))    // Value is expanded, e.g. i64 -> i16.
-    return NumVectorRegs*(NewVT.getSizeInBits()/DestVT.getSizeInBits());
+    return NumVectorRegs*(NewVTSize/DestVT.getSizeInBits());
 
   // Otherwise, promotion or legal types use the same number of registers as
   // the vector decimated to the appropriate level.
@@ -747,7 +764,7 @@ void TargetLowering::computeRegisterProperties() {
     NumRegistersForVT[ExpandedReg] = 2*NumRegistersForVT[ExpandedReg-1];
     RegisterTypeForVT[ExpandedReg] = (MVT::SimpleValueType)LargestIntReg;
     TransformToType[ExpandedReg] = (MVT::SimpleValueType)(ExpandedReg - 1);
-    ValueTypeActions.setTypeAction(ExpandedVT, Expand);
+    ValueTypeActions.setTypeAction(ExpandedVT, TypeExpandInteger);
   }
 
   // Inspect all of the ValueType's smaller than the largest integer
@@ -761,7 +778,7 @@ void TargetLowering::computeRegisterProperties() {
     } else {
       RegisterTypeForVT[IntReg] = TransformToType[IntReg] =
         (MVT::SimpleValueType)LegalIntReg;
-      ValueTypeActions.setTypeAction(IVT, Promote);
+      ValueTypeActions.setTypeAction(IVT, TypePromoteInteger);
     }
   }
 
@@ -770,7 +787,7 @@ void TargetLowering::computeRegisterProperties() {
     NumRegistersForVT[MVT::ppcf128] = 2*NumRegistersForVT[MVT::f64];
     RegisterTypeForVT[MVT::ppcf128] = MVT::f64;
     TransformToType[MVT::ppcf128] = MVT::f64;
-    ValueTypeActions.setTypeAction(MVT::ppcf128, Expand);
+    ValueTypeActions.setTypeAction(MVT::ppcf128, TypeExpandFloat);
   }
 
   // Decide how to handle f64. If the target does not have native f64 support,
@@ -779,7 +796,7 @@ void TargetLowering::computeRegisterProperties() {
     NumRegistersForVT[MVT::f64] = NumRegistersForVT[MVT::i64];
     RegisterTypeForVT[MVT::f64] = RegisterTypeForVT[MVT::i64];
     TransformToType[MVT::f64] = MVT::i64;
-    ValueTypeActions.setTypeAction(MVT::f64, Expand);
+    ValueTypeActions.setTypeAction(MVT::f64, TypeSoftenFloat);
   }
 
   // Decide how to handle f32. If the target does not have native support for
@@ -789,12 +806,12 @@ void TargetLowering::computeRegisterProperties() {
       NumRegistersForVT[MVT::f32] = NumRegistersForVT[MVT::f64];
       RegisterTypeForVT[MVT::f32] = RegisterTypeForVT[MVT::f64];
       TransformToType[MVT::f32] = MVT::f64;
-      ValueTypeActions.setTypeAction(MVT::f32, Promote);
+      ValueTypeActions.setTypeAction(MVT::f32, TypePromoteInteger);
     } else {
       NumRegistersForVT[MVT::f32] = NumRegistersForVT[MVT::i32];
       RegisterTypeForVT[MVT::f32] = RegisterTypeForVT[MVT::i32];
       TransformToType[MVT::f32] = MVT::i32;
-      ValueTypeActions.setTypeAction(MVT::f32, Expand);
+      ValueTypeActions.setTypeAction(MVT::f32, TypeSoftenFloat);
     }
   }
 
@@ -810,6 +827,30 @@ void TargetLowering::computeRegisterProperties() {
     unsigned NElts = VT.getVectorNumElements();
     if (NElts != 1) {
       bool IsLegalWiderType = false;
+      // If we allow the promotion of vector elements using a flag,
+      // then return TypePromoteInteger on vector elements.
+      // First try to promote the elements of integer vectors. If no legal
+      // promotion was found, fallback to the widen-vector method.
+      if (mayPromoteElements)
+      for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
+        EVT SVT = (MVT::SimpleValueType)nVT;
+        // Promote vectors of integers to vectors with the same number
+        // of elements, with a wider element type.
+        if (SVT.getVectorElementType().getSizeInBits() > EltVT.getSizeInBits()
+            && SVT.getVectorNumElements() == NElts &&
+            isTypeLegal(SVT) && SVT.getScalarType().isInteger()) {
+          TransformToType[i] = SVT;
+          RegisterTypeForVT[i] = SVT;
+          NumRegistersForVT[i] = 1;
+          ValueTypeActions.setTypeAction(VT, TypePromoteInteger);
+          IsLegalWiderType = true;
+          break;
+        }
+      }
+
+      if (IsLegalWiderType) continue;
+
+      // Try to widen the vector.
       for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
         EVT SVT = (MVT::SimpleValueType)nVT;
         if (SVT.getVectorElementType() == EltVT &&
@@ -818,7 +859,7 @@ void TargetLowering::computeRegisterProperties() {
           TransformToType[i] = SVT;
           RegisterTypeForVT[i] = SVT;
           NumRegistersForVT[i] = 1;
-          ValueTypeActions.setTypeAction(VT, Promote);
+          ValueTypeActions.setTypeAction(VT, TypeWidenVector);
           IsLegalWiderType = true;
           break;
         }
@@ -838,10 +879,12 @@ void TargetLowering::computeRegisterProperties() {
     if (NVT == VT) {
       // Type is already a power of 2.  The default action is to split.
       TransformToType[i] = MVT::Other;
-      ValueTypeActions.setTypeAction(VT, Expand);
+      unsigned NumElts = VT.getVectorNumElements();
+      ValueTypeActions.setTypeAction(VT,
+            NumElts > 1 ? TypeSplitVector : TypeScalarizeVector);
     } else {
       TransformToType[i] = NVT;
-      ValueTypeActions.setTypeAction(VT, Promote);
+      ValueTypeActions.setTypeAction(VT, TypeWidenVector);
     }
   }
 
@@ -890,7 +933,7 @@ unsigned TargetLowering::getVectorTypeBreakdown(LLVMContext &Context, EVT VT,
   // If there is a wider vector type with the same element type as this one,
   // we should widen to that legal vector type.  This handles things like
   // <2 x float> -> <4 x float>.
-  if (NumElts != 1 && getTypeAction(VT) == Promote) {
+  if (NumElts != 1 && getTypeAction(Context, VT) == TypeWidenVector) {
     RegisterVT = getTypeToTransformTo(Context, VT);
     if (isTypeLegal(RegisterVT)) {
       IntermediateVT = RegisterVT;
@@ -928,8 +971,14 @@ unsigned TargetLowering::getVectorTypeBreakdown(LLVMContext &Context, EVT VT,
 
   EVT DestVT = getRegisterType(Context, NewVT);
   RegisterVT = DestVT;
+  unsigned NewVTSize = NewVT.getSizeInBits();
+
+  // Convert sizes such as i33 to i64.
+  if (!isPowerOf2_32(NewVTSize))
+    NewVTSize = NextPowerOf2(NewVTSize);
+
   if (DestVT.bitsLT(NewVT))   // Value is expanded, e.g. i64 -> i16.
-    return NumVectorRegs*(NewVT.getSizeInBits()/DestVT.getSizeInBits());
+    return NumVectorRegs*(NewVTSize/DestVT.getSizeInBits());
 
   // Otherwise, promotion or legal types use the same number of registers as
   // the vector decimated to the appropriate level.
@@ -1678,6 +1727,13 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
         ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
         if (!ShAmt)
           break;
+        SDValue Shift = In.getOperand(1);
+        if (TLO.LegalTypes()) {
+          uint64_t ShVal = ShAmt->getZExtValue();
+          Shift =
+            TLO.DAG.getConstant(ShVal, getShiftAmountTy(Op.getValueType()));
+        }
+
         APInt HighBits = APInt::getHighBitsSet(OperandBitWidth,
                                                OperandBitWidth - BitWidth);
         HighBits = HighBits.lshr(ShAmt->getZExtValue()).trunc(BitWidth);
@@ -1691,7 +1747,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
           return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl,
                                                    Op.getValueType(),
                                                    NewTrunc,
-                                                   In.getOperand(1)));
+                                                   Shift));
         }
         break;
       }
@@ -1716,26 +1772,28 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     break;
   }
   case ISD::BITCAST:
-#if 0
-    // If this is an FP->Int bitcast and if the sign bit is the only thing that
-    // is demanded, turn this into a FGETSIGN.
-    if (NewMask == EVT::getIntegerVTSignBit(Op.getValueType()) &&
-        MVT::isFloatingPoint(Op.getOperand(0).getValueType()) &&
-        !MVT::isVector(Op.getOperand(0).getValueType())) {
-      // Only do this xform if FGETSIGN is valid or if before legalize.
-      if (!TLO.AfterLegalize ||
-          isOperationLegal(ISD::FGETSIGN, Op.getValueType())) {
+    // If this is an FP->Int bitcast and if the sign bit is the only
+    // thing demanded, turn this into a FGETSIGN.
+    if (!Op.getOperand(0).getValueType().isVector() &&
+        NewMask == APInt::getSignBit(Op.getValueType().getSizeInBits()) &&
+        Op.getOperand(0).getValueType().isFloatingPoint()) {
+      bool OpVTLegal = isOperationLegalOrCustom(ISD::FGETSIGN, Op.getValueType());
+      bool i32Legal  = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32);
+      if ((OpVTLegal || i32Legal) && Op.getValueType().isSimple()) {
+        EVT Ty = OpVTLegal ? Op.getValueType() : MVT::i32;
         // Make a FGETSIGN + SHL to move the sign bit into the appropriate
         // place.  We expect the SHL to be eliminated by other optimizations.
-        SDValue Sign = TLO.DAG.getNode(ISD::FGETSIGN, Op.getValueType(),
-                                         Op.getOperand(0));
+        SDValue Sign = TLO.DAG.getNode(ISD::FGETSIGN, dl, Ty, Op.getOperand(0));
+        unsigned OpVTSizeInBits = Op.getValueType().getSizeInBits();
+        if (!OpVTLegal && OpVTSizeInBits > 32)
+          Sign = TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, Op.getValueType(), Sign);
         unsigned ShVal = Op.getValueType().getSizeInBits()-1;
-        SDValue ShAmt = TLO.DAG.getConstant(ShVal, getShiftAmountTy());
-        return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, Op.getValueType(),
+        SDValue ShAmt = TLO.DAG.getConstant(ShVal, Op.getValueType());
+        return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl,
+                                                 Op.getValueType(),
                                                  Sign, ShAmt));
       }
     }
-#endif
     break;
   case ISD::ADD:
   case ISD::MUL:
@@ -1842,7 +1900,6 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
                               ISD::CondCode Cond, bool foldBooleans,
                               DAGCombinerInfo &DCI, DebugLoc dl) const {
   SelectionDAG &DAG = DCI.DAG;
-  LLVMContext &Context = *DAG.getContext();
 
   // These setcc operations always fold.
   switch (Cond) {
@@ -1853,12 +1910,11 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
   case ISD::SETTRUE2:  return DAG.getConstant(1, VT);
   }
 
-  if (isa<ConstantSDNode>(N0.getNode())) {
-    // Ensure that the constant occurs on the RHS, and fold constant
-    // comparisons.
+  // Ensure that the constant occurs on the RHS, and fold constant
+  // comparisons.
+  if (isa<ConstantSDNode>(N0.getNode()))
     return DAG.getSetCC(dl, VT, N1, N0, ISD::getSetCCSwappedOperands(Cond));
-  }
-
+  
   if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
     const APInt &C1 = N1C->getAPIntValue();
 
@@ -1911,6 +1967,42 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
       // TODO: (ctpop x) == 1 -> x && (x & x-1) == 0 iff ctpop is illegal.
     }
 
+    // (zext x) == C --> x == (trunc C)
+    if (DCI.isBeforeLegalize() && N0->hasOneUse() &&
+        (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
+      unsigned MinBits = N0.getValueSizeInBits();
+      SDValue PreZExt;
+      if (N0->getOpcode() == ISD::ZERO_EXTEND) {
+        // ZExt
+        MinBits = N0->getOperand(0).getValueSizeInBits();
+        PreZExt = N0->getOperand(0);
+      } else if (N0->getOpcode() == ISD::AND) {
+        // DAGCombine turns costly ZExts into ANDs
+        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0->getOperand(1)))
+          if ((C->getAPIntValue()+1).isPowerOf2()) {
+            MinBits = C->getAPIntValue().countTrailingOnes();
+            PreZExt = N0->getOperand(0);
+          }
+      } else if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(N0)) {
+        // ZEXTLOAD
+        if (LN0->getExtensionType() == ISD::ZEXTLOAD) {
+          MinBits = LN0->getMemoryVT().getSizeInBits();
+          PreZExt = N0;
+        }
+      }
+
+      // Make sure we're not loosing bits from the constant.
+      if (MinBits < C1.getBitWidth() && MinBits > C1.getActiveBits()) {
+        EVT MinVT = EVT::getIntegerVT(*DAG.getContext(), MinBits);
+        if (isTypeDesirableForOp(ISD::SETCC, MinVT)) {
+          // Will get folded away.
+          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MinVT, PreZExt);
+          SDValue C = DAG.getConstant(C1.trunc(MinBits), MinVT);
+          return DAG.getSetCC(dl, VT, Trunc, C, Cond);
+        }
+      }
+    }
+
     // If the LHS is '(and load, const)', the RHS is 0,
     // the test is for equality or unsigned, and all 1 bits of the const are
     // in the same partial word, see if we can shorten the load.
@@ -1949,7 +2041,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
         }
       }
       if (bestWidth) {
-        EVT newVT = EVT::getIntegerVT(Context, bestWidth);
+        EVT newVT = EVT::getIntegerVT(*DAG.getContext(), bestWidth);
         if (newVT.isRound()) {
           EVT PtrType = Lod->getOperand(1).getValueType();
           SDValue Ptr = Lod->getBasePtr();
@@ -2578,9 +2670,13 @@ const char *TargetLowering::LowerXConstraint(EVT ConstraintVT) const{
 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
 /// vector.  If it is invalid, don't add anything to Ops.
 void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
-                                                  char ConstraintLetter,
+                                                  std::string &Constraint,
                                                   std::vector<SDValue> &Ops,
                                                   SelectionDAG &DAG) const {
+  
+  if (Constraint.length() > 1) return;
+  
+  char ConstraintLetter = Constraint[0];
   switch (ConstraintLetter) {
   default: break;
   case 'X':     // Allows any operand; labels (basic block) use this.
@@ -2769,6 +2865,12 @@ TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints(
           report_fatal_error("Indirect operand for inline asm not a pointer!");
         OpTy = PtrTy->getElementType();
       }
+      
+      // Look for vector wrapped in a struct. e.g. { <16 x i8> }.
+      if (const StructType *STy = dyn_cast<StructType>(OpTy))
+        if (STy->getNumElements() == 1)
+          OpTy = STy->getElementType(0);
+
       // If OpTy is not a single value, it may be a struct/union that we
       // can tile with integers.
       if (!OpTy->isSingleValueType() && OpTy->isSized()) {
@@ -3013,7 +3115,7 @@ static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo,
       assert(OpInfo.Codes[i].size() == 1 &&
              "Unhandled multi-letter 'other' constraint");
       std::vector<SDValue> ResultOps;
-      TLI.LowerAsmOperandForConstraint(Op, OpInfo.Codes[i][0],
+      TLI.LowerAsmOperandForConstraint(Op, OpInfo.Codes[i],
                                        ResultOps, *DAG);
       if (!ResultOps.empty()) {
         BestType = CType;
diff --git a/lib/CodeGen/ShrinkWrapping.cpp b/lib/CodeGen/ShrinkWrapping.cpp
index 7b5bca4..160f38f 100644
--- a/lib/CodeGen/ShrinkWrapping.cpp
+++ b/lib/CodeGen/ShrinkWrapping.cpp
@@ -277,7 +277,7 @@ void PEI::calculateAnticAvail(MachineFunction &Fn) {
   // Initialize data flow sets.
   clearAnticAvailSets();
 
-  // Calulate Antic{In,Out} and Avail{In,Out} iteratively on the MCFG.
+  // Calculate Antic{In,Out} and Avail{In,Out} iteratively on the MCFG.
   bool changed = true;
   unsigned iterations = 0;
   while (changed) {
diff --git a/lib/CodeGen/SimpleRegisterCoalescing.cpp b/lib/CodeGen/SimpleRegisterCoalescing.cpp
index c621726..221bec5 100644
--- a/lib/CodeGen/SimpleRegisterCoalescing.cpp
+++ b/lib/CodeGen/SimpleRegisterCoalescing.cpp
@@ -47,7 +47,6 @@ STATISTIC(numExtends  , "Number of copies extended");
 STATISTIC(NumReMats   , "Number of instructions re-materialized");
 STATISTIC(numPeep     , "Number of identity moves eliminated after coalescing");
 STATISTIC(numAborts   , "Number of times interval joining aborted");
-STATISTIC(numDeadValNo, "Number of valno def marked dead");
 
 char SimpleRegisterCoalescing::ID = 0;
 static cl::opt<bool>
@@ -61,9 +60,9 @@ DisableCrossClassJoin("disable-cross-class-join",
                cl::init(false), cl::Hidden);
 
 static cl::opt<bool>
-DisablePhysicalJoin("disable-physical-join",
-               cl::desc("Avoid coalescing physical register copies"),
-               cl::init(false), cl::Hidden);
+EnablePhysicalJoin("join-physregs",
+                   cl::desc("Join physical register copies"),
+                   cl::init(false), cl::Hidden);
 
 static cl::opt<bool>
 VerifyCoalescing("verify-coalescing",
@@ -208,15 +207,14 @@ bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(const CoalescerPair &CP,
   if (ValLR+1 != BLR) return false;
 
   // If a live interval is a physical register, conservatively check if any
-  // of its sub-registers is overlapping the live interval of the virtual
-  // register. If so, do not coalesce.
-  if (TargetRegisterInfo::isPhysicalRegister(IntB.reg) &&
-      *tri_->getSubRegisters(IntB.reg)) {
-    for (const unsigned* SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR)
-      if (li_->hasInterval(*SR) && IntA.overlaps(li_->getInterval(*SR))) {
+  // of its aliases is overlapping the live interval of the virtual register.
+  // If so, do not coalesce.
+  if (TargetRegisterInfo::isPhysicalRegister(IntB.reg)) {
+    for (const unsigned *AS = tri_->getAliasSet(IntB.reg); *AS; ++AS)
+      if (li_->hasInterval(*AS) && IntA.overlaps(li_->getInterval(*AS))) {
         DEBUG({
-            dbgs() << "\t\tInterfere with sub-register ";
-            li_->getInterval(*SR).print(dbgs(), tri_);
+            dbgs() << "\t\tInterfere with alias ";
+            li_->getInterval(*AS).print(dbgs(), tri_);
           });
         return false;
       }
@@ -254,7 +252,12 @@ bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(const CoalescerPair &CP,
 
   // Okay, merge "B1" into the same value number as "B0".
   if (BValNo != ValLR->valno) {
+    // If B1 is killed by a PHI, then the merged live range must also be killed
+    // by the same PHI, as B0 and B1 can not overlap.
+    bool HasPHIKill = BValNo->hasPHIKill();
     IntB.MergeValueNumberInto(BValNo, ValLR->valno);
+    if (HasPHIKill)
+      ValLR->valno->setHasPHIKill(true);
   }
   DEBUG({
       dbgs() << "   result = ";
@@ -273,7 +276,7 @@ bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(const CoalescerPair &CP,
   // merge, find the last use and trim the live range. That will also add the
   // isKill marker.
   if (ALR->end == CopyIdx)
-    TrimLiveIntervalToLastUse(CopyUseIdx, CopyMI->getParent(), IntA, ALR);
+    li_->shrinkToUses(&IntA);
 
   ++numExtends;
   return true;
@@ -427,6 +430,10 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(const CoalescerPair &CP,
   MachineInstr *NewMI = tii_->commuteInstruction(DefMI);
   if (!NewMI)
     return false;
+  if (TargetRegisterInfo::isVirtualRegister(IntA.reg) &&
+      TargetRegisterInfo::isVirtualRegister(IntB.reg) &&
+      !mri_->constrainRegClass(IntB.reg, mri_->getRegClass(IntA.reg)))
+    return false;
   if (NewMI != DefMI) {
     li_->ReplaceMachineInstrInMaps(DefMI, NewMI);
     MBB->insert(DefMI, NewMI);
@@ -504,98 +511,6 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(const CoalescerPair &CP,
   return true;
 }
 
-/// isSameOrFallThroughBB - Return true if MBB == SuccMBB or MBB simply
-/// fallthoughs to SuccMBB.
-static bool isSameOrFallThroughBB(MachineBasicBlock *MBB,
-                                  MachineBasicBlock *SuccMBB,
-                                  const TargetInstrInfo *tii_) {
-  if (MBB == SuccMBB)
-    return true;
-  MachineBasicBlock *TBB = 0, *FBB = 0;
-  SmallVector<MachineOperand, 4> Cond;
-  return !tii_->AnalyzeBranch(*MBB, TBB, FBB, Cond) && !TBB && !FBB &&
-    MBB->isSuccessor(SuccMBB);
-}
-
-/// removeRange - Wrapper for LiveInterval::removeRange. This removes a range
-/// from a physical register live interval as well as from the live intervals
-/// of its sub-registers.
-static void removeRange(LiveInterval &li,
-                        SlotIndex Start, SlotIndex End,
-                        LiveIntervals *li_, const TargetRegisterInfo *tri_) {
-  li.removeRange(Start, End, true);
-  if (TargetRegisterInfo::isPhysicalRegister(li.reg)) {
-    for (const unsigned* SR = tri_->getSubRegisters(li.reg); *SR; ++SR) {
-      if (!li_->hasInterval(*SR))
-        continue;
-      LiveInterval &sli = li_->getInterval(*SR);
-      SlotIndex RemoveStart = Start;
-      SlotIndex RemoveEnd = Start;
-
-      while (RemoveEnd != End) {
-        LiveInterval::iterator LR = sli.FindLiveRangeContaining(RemoveStart);
-        if (LR == sli.end())
-          break;
-        RemoveEnd = (LR->end < End) ? LR->end : End;
-        sli.removeRange(RemoveStart, RemoveEnd, true);
-        RemoveStart = RemoveEnd;
-      }
-    }
-  }
-}
-
-/// TrimLiveIntervalToLastUse - If there is a last use in the same basic block
-/// as the copy instruction, trim the live interval to the last use and return
-/// true.
-bool
-SimpleRegisterCoalescing::TrimLiveIntervalToLastUse(SlotIndex CopyIdx,
-                                                    MachineBasicBlock *CopyMBB,
-                                                    LiveInterval &li,
-                                                    const LiveRange *LR) {
-  SlotIndex MBBStart = li_->getMBBStartIdx(CopyMBB);
-  SlotIndex LastUseIdx;
-  MachineOperand *LastUse =
-    lastRegisterUse(LR->start, CopyIdx.getPrevSlot(), li.reg, LastUseIdx);
-  if (LastUse) {
-    MachineInstr *LastUseMI = LastUse->getParent();
-    if (!isSameOrFallThroughBB(LastUseMI->getParent(), CopyMBB, tii_)) {
-      // r1024 = op
-      // ...
-      // BB1:
-      //       = r1024
-      //
-      // BB2:
-      // r1025<dead> = r1024<kill>
-      if (MBBStart < LR->end)
-        removeRange(li, MBBStart, LR->end, li_, tri_);
-      return true;
-    }
-
-    // There are uses before the copy, just shorten the live range to the end
-    // of last use.
-    LastUse->setIsKill();
-    removeRange(li, LastUseIdx.getDefIndex(), LR->end, li_, tri_);
-    if (LastUseMI->isCopy()) {
-      MachineOperand &DefMO = LastUseMI->getOperand(0);
-      if (DefMO.getReg() == li.reg && !DefMO.getSubReg())
-        DefMO.setIsDead();
-    }
-    return true;
-  }
-
-  // Is it livein?
-  if (LR->start <= MBBStart && LR->end > MBBStart) {
-    if (LR->start == li_->getZeroIndex()) {
-      assert(TargetRegisterInfo::isPhysicalRegister(li.reg));
-      // Live-in to the function but dead. Remove it from entry live-in set.
-      mf_->begin()->removeLiveIn(li.reg);
-    }
-    // FIXME: Shorten intervals in BBs that reaches this BB.
-  }
-
-  return false;
-}
-
 /// ReMaterializeTrivialDef - If the source of a copy is defined by a trivial
 /// computation, replace the copy by rematerialize the definition.
 bool SimpleRegisterCoalescing::ReMaterializeTrivialDef(LiveInterval &SrcInt,
@@ -782,26 +697,6 @@ static bool removeIntervalIfEmpty(LiveInterval &li, LiveIntervals *li_,
   return false;
 }
 
-/// ShortenDeadCopyLiveRange - Shorten a live range defined by a dead copy.
-/// Return true if live interval is removed.
-bool SimpleRegisterCoalescing::ShortenDeadCopyLiveRange(LiveInterval &li,
-                                                        MachineInstr *CopyMI) {
-  SlotIndex CopyIdx = li_->getInstructionIndex(CopyMI);
-  LiveInterval::iterator MLR =
-    li.FindLiveRangeContaining(CopyIdx.getDefIndex());
-  if (MLR == li.end())
-    return false;  // Already removed by ShortenDeadCopySrcLiveRange.
-  SlotIndex RemoveStart = MLR->start;
-  SlotIndex RemoveEnd = MLR->end;
-  SlotIndex DefIdx = CopyIdx.getDefIndex();
-  // Remove the liverange that's defined by this.
-  if (RemoveStart == DefIdx && RemoveEnd == DefIdx.getStoreIndex()) {
-    removeRange(li, RemoveStart, RemoveEnd, li_, tri_);
-    return removeIntervalIfEmpty(li, li_, tri_);
-  }
-  return false;
-}
-
 /// RemoveDeadDef - If a def of a live interval is now determined dead, remove
 /// the val# it defines. If the live interval becomes empty, remove it as well.
 bool SimpleRegisterCoalescing::RemoveDeadDef(LiveInterval &li,
@@ -835,84 +730,6 @@ void SimpleRegisterCoalescing::RemoveCopyFlag(unsigned DstReg,
   }
 }
 
-/// PropagateDeadness - Propagate the dead marker to the instruction which
-/// defines the val#.
-static void PropagateDeadness(LiveInterval &li, MachineInstr *CopyMI,
-                              SlotIndex &LRStart, LiveIntervals *li_,
-                              const TargetRegisterInfo* tri_) {
-  MachineInstr *DefMI =
-    li_->getInstructionFromIndex(LRStart.getDefIndex());
-  if (DefMI && DefMI != CopyMI) {
-    int DeadIdx = DefMI->findRegisterDefOperandIdx(li.reg);
-    if (DeadIdx != -1)
-      DefMI->getOperand(DeadIdx).setIsDead();
-    else
-      DefMI->addOperand(MachineOperand::CreateReg(li.reg,
-                   /*def*/true, /*implicit*/true, /*kill*/false, /*dead*/true));
-    LRStart = LRStart.getNextSlot();
-  }
-}
-
-/// ShortenDeadCopySrcLiveRange - Shorten a live range as it's artificially
-/// extended by a dead copy. Mark the last use (if any) of the val# as kill as
-/// ends the live range there. If there isn't another use, then this live range
-/// is dead. Return true if live interval is removed.
-bool
-SimpleRegisterCoalescing::ShortenDeadCopySrcLiveRange(LiveInterval &li,
-                                                      MachineInstr *CopyMI) {
-  SlotIndex CopyIdx = li_->getInstructionIndex(CopyMI);
-  if (CopyIdx == SlotIndex()) {
-    // FIXME: special case: function live in. It can be a general case if the
-    // first instruction index starts at > 0 value.
-    assert(TargetRegisterInfo::isPhysicalRegister(li.reg));
-    // Live-in to the function but dead. Remove it from entry live-in set.
-    if (mf_->begin()->isLiveIn(li.reg))
-      mf_->begin()->removeLiveIn(li.reg);
-    if (const LiveRange *LR = li.getLiveRangeContaining(CopyIdx))
-      removeRange(li, LR->start, LR->end, li_, tri_);
-    return removeIntervalIfEmpty(li, li_, tri_);
-  }
-
-  LiveInterval::iterator LR =
-    li.FindLiveRangeContaining(CopyIdx.getPrevIndex().getStoreIndex());
-  if (LR == li.end())
-    // Livein but defined by a phi.
-    return false;
-
-  SlotIndex RemoveStart = LR->start;
-  SlotIndex RemoveEnd = CopyIdx.getStoreIndex();
-  if (LR->end > RemoveEnd)
-    // More uses past this copy? Nothing to do.
-    return false;
-
-  // If there is a last use in the same bb, we can't remove the live range.
-  // Shorten the live interval and return.
-  MachineBasicBlock *CopyMBB = CopyMI->getParent();
-  if (TrimLiveIntervalToLastUse(CopyIdx, CopyMBB, li, LR))
-    return false;
-
-  // There are other kills of the val#. Nothing to do.
-  if (!li.isOnlyLROfValNo(LR))
-    return false;
-
-  MachineBasicBlock *StartMBB = li_->getMBBFromIndex(RemoveStart);
-  if (!isSameOrFallThroughBB(StartMBB, CopyMBB, tii_))
-    // If the live range starts in another mbb and the copy mbb is not a fall
-    // through mbb, then we can only cut the range from the beginning of the
-    // copy mbb.
-    RemoveStart = li_->getMBBStartIdx(CopyMBB).getNextIndex().getBaseIndex();
-
-  if (LR->valno->def == RemoveStart) {
-    // If the def MI defines the val# and this copy is the only kill of the
-    // val#, then propagate the dead marker.
-    PropagateDeadness(li, CopyMI, RemoveStart, li_, tri_);
-    ++numDeadValNo;
-  }
-
-  removeRange(li, RemoveStart, RemoveEnd, li_, tri_);
-  return removeIntervalIfEmpty(li, li_, tri_);
-}
-
 /// shouldJoinPhys - Return true if a copy involving a physreg should be joined.
 /// We need to be careful about coalescing a source physical register with a
 /// virtual register. Once the coalescing is done, it cannot be broken and these
@@ -928,7 +745,7 @@ bool SimpleRegisterCoalescing::shouldJoinPhys(CoalescerPair &CP) {
   if (!Allocatable && CP.isFlipped() && JoinVInt.containsOneValue())
     return true;
 
-  if (DisablePhysicalJoin) {
+  if (!EnablePhysicalJoin) {
     DEBUG(dbgs() << "\tPhysreg joins disabled.\n");
     return false;
   }
@@ -955,7 +772,7 @@ bool SimpleRegisterCoalescing::shouldJoinPhys(CoalescerPair &CP) {
   //        CodeGen/X86/phys_subreg_coalesce-3.ll needs it.
   if (!CP.isPartial()) {
     const TargetRegisterClass *RC = mri_->getRegClass(CP.getSrcReg());
-    unsigned Threshold = allocatableRCRegs_[RC].count() * 2;
+    unsigned Threshold = RegClassInfo.getNumAllocatableRegs(RC) * 2;
     unsigned Length = li_->getApproximateInstructionCount(JoinVInt);
     if (Length > Threshold) {
       ++numAborts;
@@ -974,7 +791,7 @@ SimpleRegisterCoalescing::isWinToJoinCrossClass(unsigned SrcReg,
                                              const TargetRegisterClass *SrcRC,
                                              const TargetRegisterClass *DstRC,
                                              const TargetRegisterClass *NewRC) {
-  unsigned NewRCCount = allocatableRCRegs_[NewRC].count();
+  unsigned NewRCCount = RegClassInfo.getNumAllocatableRegs(NewRC);
   // This heuristics is good enough in practice, but it's obviously not *right*.
   // 4 is a magic number that works well enough for x86, ARM, etc. It filter
   // out all but the most restrictive register classes.
@@ -988,8 +805,14 @@ SimpleRegisterCoalescing::isWinToJoinCrossClass(unsigned SrcReg,
   LiveInterval &DstInt = li_->getInterval(DstReg);
   unsigned SrcSize = li_->getApproximateInstructionCount(SrcInt);
   unsigned DstSize = li_->getApproximateInstructionCount(DstInt);
-  if (SrcSize <= NewRCCount && DstSize <= NewRCCount)
+
+  // Coalesce aggressively if the intervals are small compared to the number of
+  // registers in the new class. The number 4 is fairly arbitrary, chosen to be
+  // less aggressive than the 8 used for the whole function size.
+  const unsigned ThresSize = 4 * NewRCCount;
+  if (SrcSize <= ThresSize && DstSize <= ThresSize)
     return true;
+
   // Estimate *register use density*. If it doubles or more, abort.
   unsigned SrcUses = std::distance(mri_->use_nodbg_begin(SrcReg),
                                    mri_->use_nodbg_end());
@@ -997,13 +820,13 @@ SimpleRegisterCoalescing::isWinToJoinCrossClass(unsigned SrcReg,
                                    mri_->use_nodbg_end());
   unsigned NewUses = SrcUses + DstUses;
   unsigned NewSize = SrcSize + DstSize;
-  if (SrcRC != NewRC && SrcSize > NewRCCount) {
-    unsigned SrcRCCount = allocatableRCRegs_[SrcRC].count();
+  if (SrcRC != NewRC && SrcSize > ThresSize) {
+    unsigned SrcRCCount = RegClassInfo.getNumAllocatableRegs(SrcRC);
     if (NewUses*SrcSize*SrcRCCount > 2*SrcUses*NewSize*NewRCCount)
       return false;
   }
-  if (DstRC != NewRC && DstSize > NewRCCount) {
-    unsigned DstRCCount = allocatableRCRegs_[DstRC].count();
+  if (DstRC != NewRC && DstSize > ThresSize) {
+    unsigned DstRCCount = RegClassInfo.getNumAllocatableRegs(DstRC);
     if (NewUses*DstSize*DstRCCount > 2*DstUses*NewSize*NewRCCount)
       return false;
   }
@@ -1033,6 +856,7 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) {
 
   // If they are already joined we continue.
   if (CP.getSrcReg() == CP.getDstReg()) {
+    markAsJoined(CopyMI);
     DEBUG(dbgs() << "\tCopy already coalesced.\n");
     return false;  // Not coalescable.
   }
@@ -1552,81 +1376,6 @@ void SimpleRegisterCoalescing::joinIntervals() {
   }
 }
 
-/// Return true if the two specified registers belong to different register
-/// classes.  The registers may be either phys or virt regs.
-bool
-SimpleRegisterCoalescing::differingRegisterClasses(unsigned RegA,
-                                                   unsigned RegB) const {
-  // Get the register classes for the first reg.
-  if (TargetRegisterInfo::isPhysicalRegister(RegA)) {
-    assert(TargetRegisterInfo::isVirtualRegister(RegB) &&
-           "Shouldn't consider two physregs!");
-    return !mri_->getRegClass(RegB)->contains(RegA);
-  }
-
-  // Compare against the regclass for the second reg.
-  const TargetRegisterClass *RegClassA = mri_->getRegClass(RegA);
-  if (TargetRegisterInfo::isVirtualRegister(RegB)) {
-    const TargetRegisterClass *RegClassB = mri_->getRegClass(RegB);
-    return RegClassA != RegClassB;
-  }
-  return !RegClassA->contains(RegB);
-}
-
-/// lastRegisterUse - Returns the last (non-debug) use of the specific register
-/// between cycles Start and End or NULL if there are no uses.
-MachineOperand *
-SimpleRegisterCoalescing::lastRegisterUse(SlotIndex Start,
-                                          SlotIndex End,
-                                          unsigned Reg,
-                                          SlotIndex &UseIdx) const{
-  UseIdx = SlotIndex();
-  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-    MachineOperand *LastUse = NULL;
-    for (MachineRegisterInfo::use_nodbg_iterator I = mri_->use_nodbg_begin(Reg),
-           E = mri_->use_nodbg_end(); I != E; ++I) {
-      MachineOperand &Use = I.getOperand();
-      MachineInstr *UseMI = Use.getParent();
-      if (UseMI->isIdentityCopy())
-        continue;
-      SlotIndex Idx = li_->getInstructionIndex(UseMI);
-      if (Idx >= Start && Idx < End && (!UseIdx.isValid() || Idx >= UseIdx)) {
-        LastUse = &Use;
-        UseIdx = Idx.getUseIndex();
-      }
-    }
-    return LastUse;
-  }
-
-  SlotIndex s = Start;
-  SlotIndex e = End.getPrevSlot().getBaseIndex();
-  while (e >= s) {
-    // Skip deleted instructions
-    MachineInstr *MI = li_->getInstructionFromIndex(e);
-    while (e != SlotIndex() && e.getPrevIndex() >= s && !MI) {
-      e = e.getPrevIndex();
-      MI = li_->getInstructionFromIndex(e);
-    }
-    if (e < s || MI == NULL)
-      return NULL;
-
-    // Ignore identity copies.
-    if (!MI->isIdentityCopy())
-      for (unsigned i = 0, NumOps = MI->getNumOperands(); i != NumOps; ++i) {
-        MachineOperand &Use = MI->getOperand(i);
-        if (Use.isReg() && Use.isUse() && Use.getReg() &&
-            tri_->regsOverlap(Use.getReg(), Reg)) {
-          UseIdx = e.getUseIndex();
-          return &Use;
-        }
-      }
-
-    e = e.getPrevIndex();
-  }
-
-  return NULL;
-}
-
 void SimpleRegisterCoalescing::releaseMemory() {
   JoinedCopies.clear();
   ReMatCopies.clear();
@@ -1651,10 +1400,7 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) {
   if (VerifyCoalescing)
     mf_->verify(this, "Before register coalescing");
 
-  for (TargetRegisterInfo::regclass_iterator I = tri_->regclass_begin(),
-         E = tri_->regclass_end(); I != E; ++I)
-    allocatableRCRegs_.insert(std::make_pair(*I,
-                                             tri_->getAllocatableSet(fn, *I)));
+  RegClassInfo.runOnMachineFunction(fn);
 
   // Join (coalesce) intervals if requested.
   if (EnableJoining) {
@@ -1691,13 +1437,11 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) {
           // or else the scavenger may complain. LowerSubregs will
           // delete them later.
           DoDelete = false;
-        
+
         if (MI->allDefsAreDead()) {
-          if (li_->hasInterval(SrcReg)) {
-            LiveInterval &li = li_->getInterval(SrcReg);
-            if (!ShortenDeadCopySrcLiveRange(li, MI))
-              ShortenDeadCopyLiveRange(li, MI);
-          }
+          if (TargetRegisterInfo::isVirtualRegister(SrcReg) &&
+              li_->hasInterval(SrcReg))
+            li_->shrinkToUses(&li_->getInterval(SrcReg));
           DoDelete = true;
         }
         if (!DoDelete) {
@@ -1749,24 +1493,6 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) {
           DeadDefs.clear();
       }
 
-      // If the move will be an identity move delete it
-      if (MI->isIdentityCopy()) {
-        unsigned SrcReg = MI->getOperand(1).getReg();
-        if (li_->hasInterval(SrcReg)) {
-          LiveInterval &RegInt = li_->getInterval(SrcReg);
-          // If def of this move instruction is dead, remove its live range
-          // from the destination register's live interval.
-          if (MI->allDefsAreDead()) {
-            if (!ShortenDeadCopySrcLiveRange(RegInt, MI))
-              ShortenDeadCopyLiveRange(RegInt, MI);
-          }
-        }
-        li_->RemoveMachineInstrFromMaps(MI);
-        mii = mbbi->erase(mii);
-        ++numPeep;
-        continue;
-      }
-
       ++mii;
 
       // Check for now unnecessary kill flags.
diff --git a/lib/CodeGen/SimpleRegisterCoalescing.h b/lib/CodeGen/SimpleRegisterCoalescing.h
index 65cf542..92f6c64 100644
--- a/lib/CodeGen/SimpleRegisterCoalescing.h
+++ b/lib/CodeGen/SimpleRegisterCoalescing.h
@@ -17,7 +17,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/RegisterCoalescer.h"
-#include "llvm/ADT/BitVector.h"
+#include "RegisterClassInfo.h"
 
 namespace llvm {
   class SimpleRegisterCoalescing;
@@ -47,8 +47,7 @@ namespace llvm {
     LiveDebugVariables *ldv_;
     const MachineLoopInfo* loopInfo;
     AliasAnalysis *AA;
-    
-    DenseMap<const TargetRegisterClass*, BitVector> allocatableRCRegs_;
+    RegisterClassInfo RegClassInfo;
 
     /// JoinedCopies - Keep track of copies eliminated due to coalescing.
     ///
@@ -103,10 +102,6 @@ namespace llvm {
     /// use this information below to update aliases.
     bool JoinIntervals(CoalescerPair &CP);
 
-    /// Return true if the two specified registers belong to different register
-    /// classes.  The registers may be either phys or virt regs.
-    bool differingRegisterClasses(unsigned RegA, unsigned RegB) const;
-
     /// AdjustCopiesBackFrom - We found a non-trivially-coalescable copy. If
     /// the source value number is defined by a copy from the destination reg
     /// see if we can merge these two destination reg valno# into a single
@@ -124,13 +119,6 @@ namespace llvm {
     /// can transform the copy into a noop by commuting the definition.
     bool RemoveCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI);
 
-    /// TrimLiveIntervalToLastUse - If there is a last use in the same basic
-    /// block as the copy instruction, trim the ive interval to the last use
-    /// and return true.
-    bool TrimLiveIntervalToLastUse(SlotIndex CopyIdx,
-                                   MachineBasicBlock *CopyMBB,
-                                   LiveInterval &li, const LiveRange *LR);
-
     /// ReMaterializeTrivialDef - If the source of a copy is defined by a trivial
     /// computation, replace the copy by rematerialize the definition.
     /// If PreserveSrcInt is true, make sure SrcInt is valid after the call.
@@ -156,16 +144,6 @@ namespace llvm {
     /// subregister.
     void UpdateRegDefsUses(const CoalescerPair &CP);
 
-    /// ShortenDeadCopyLiveRange - Shorten a live range defined by a dead copy.
-    /// Return true if live interval is removed.
-    bool ShortenDeadCopyLiveRange(LiveInterval &li, MachineInstr *CopyMI);
-
-    /// ShortenDeadCopyLiveRange - Shorten a live range as it's artificially
-    /// extended by a dead copy. Mark the last use (if any) of the val# as kill
-    /// as ends the live range there. If there isn't another use, then this
-    /// live range is dead. Return true if live interval is removed.
-    bool ShortenDeadCopySrcLiveRange(LiveInterval &li, MachineInstr *CopyMI);
-
     /// RemoveDeadDef - If a def of a live interval is now determined dead,
     /// remove the val# it defines. If the live interval becomes empty, remove
     /// it as well.
@@ -175,11 +153,6 @@ namespace llvm {
     /// VNInfo copy flag for DstReg and all aliases.
     void RemoveCopyFlag(unsigned DstReg, const MachineInstr *CopyMI);
 
-    /// lastRegisterUse - Returns the last use of the specific register between
-    /// cycles Start and End or NULL if there are no uses.
-    MachineOperand *lastRegisterUse(SlotIndex Start, SlotIndex End,
-                                    unsigned Reg, SlotIndex &LastUseIdx) const;
-
     /// markAsJoined - Remember that CopyMI has already been joined.
     void markAsJoined(MachineInstr *CopyMI);
   };
diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index aaa8568..92970e4 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp
@@ -443,16 +443,17 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
     BasicBlock::Create(F.getContext(), "eh.sjlj.setjmp.catch", &F);
 
   // Insert a load of the callsite in the dispatch block, and a switch on its
-  // value.  By default, we go to a block that just does an unwind (which is the
-  // correct action for a standard call).
-  BasicBlock *UnwindBlock =
-    BasicBlock::Create(F.getContext(), "unwindbb", &F);
-  Unwinds.push_back(new UnwindInst(F.getContext(), UnwindBlock));
+  // value. By default, we issue a trap statement.
+  BasicBlock *TrapBlock =
+    BasicBlock::Create(F.getContext(), "trapbb", &F);
+  CallInst::Create(Intrinsic::getDeclaration(F.getParent(), Intrinsic::trap),
+                   "", TrapBlock);
+  new UnreachableInst(F.getContext(), TrapBlock);
 
   Value *DispatchLoad = new LoadInst(CallSite, "invoke.num", true,
                                      DispatchBlock);
   SwitchInst *DispatchSwitch =
-    SwitchInst::Create(DispatchLoad, UnwindBlock, Invokes.size(),
+    SwitchInst::Create(DispatchLoad, TrapBlock, Invokes.size(),
                        DispatchBlock);
   // Split the entry block to insert the conditional branch for the setjmp.
   BasicBlock *ContBlock = EntryBB->splitBasicBlock(EntryBB->getTerminator(),
@@ -519,7 +520,7 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
 
   // Add a call to dispatch_setup after the setjmp call. This is expanded to any
   // target-specific setup that needs to be done.
-  CallInst::Create(DispatchSetupFn, "", EntryBB->getTerminator());
+  CallInst::Create(DispatchSetupFn, DispatchVal, "", EntryBB->getTerminator());
 
   // check the return value of the setjmp. non-zero goes to dispatcher.
   Value *IsNormal = new ICmpInst(EntryBB->getTerminator(),
@@ -561,7 +562,7 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
   // Replace all unwinds with a branch to the unwind handler.
   // ??? Should this ever happen with sjlj exceptions?
   for (unsigned i = 0, e = Unwinds.size(); i != e; ++i) {
-    BranchInst::Create(UnwindBlock, Unwinds[i]);
+    BranchInst::Create(TrapBlock, Unwinds[i]);
     Unwinds[i]->eraseFromParent();
   }
 
diff --git a/lib/CodeGen/SpillPlacement.cpp b/lib/CodeGen/SpillPlacement.cpp
index cab18a1..6949618 100644
--- a/lib/CodeGen/SpillPlacement.cpp
+++ b/lib/CodeGen/SpillPlacement.cpp
@@ -135,13 +135,10 @@ struct SpillPlacement::Node {
 
   /// addBias - Bias this node from an ingoing[0] or outgoing[1] link.
   /// Return the change to the total number of positive biases.
-  int addBias(float w, bool out) {
+  void addBias(float w, bool out) {
     // Normalize w relative to all connected blocks from that direction.
     w *= Scale[out];
-    int Before = Bias > 0;
     Bias += w;
-    int After = Bias > 0;
-    return After - Before;
   }
 
   /// update - Recompute Value from Bias and Links. Return true when node
@@ -230,14 +227,14 @@ void SpillPlacement::addConstraints(ArrayRef<BlockConstraint> LiveBlocks) {
     if (I->Entry != DontCare) {
       unsigned ib = bundles->getBundle(I->Number, 0);
       activate(ib);
-      PositiveNodes += nodes[ib].addBias(Freq * Bias[I->Entry], 1);
+      nodes[ib].addBias(Freq * Bias[I->Entry], 1);
     }
 
     // Live-out from block?
     if (I->Exit != DontCare) {
       unsigned ob = bundles->getBundle(I->Number, 1);
       activate(ob);
-      PositiveNodes += nodes[ob].addBias(Freq * Bias[I->Exit], 0);
+      nodes[ob].addBias(Freq * Bias[I->Exit], 0);
     }
   }
 }
@@ -254,16 +251,42 @@ void SpillPlacement::addLinks(ArrayRef<unsigned> Links) {
       continue;
     activate(ib);
     activate(ob);
+    if (nodes[ib].Links.empty() && !nodes[ib].mustSpill())
+      Linked.push_back(ib);
+    if (nodes[ob].Links.empty() && !nodes[ob].mustSpill())
+      Linked.push_back(ob);
     float Freq = getBlockFrequency(Number);
     nodes[ib].addLink(ob, Freq, 1);
     nodes[ob].addLink(ib, Freq, 0);
   }
 }
 
+bool SpillPlacement::scanActiveBundles() {
+  Linked.clear();
+  RecentPositive.clear();
+  for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n)) {
+    nodes[n].update(nodes);
+    // A node that must spill, or a node without any links is not going to
+    // change its value ever again, so exclude it from iterations.
+    if (nodes[n].mustSpill())
+      continue;
+    if (!nodes[n].Links.empty())
+      Linked.push_back(n);
+    if (nodes[n].preferReg())
+      RecentPositive.push_back(n);
+  }
+  return !RecentPositive.empty();
+}
+
 /// iterate - Repeatedly update the Hopfield nodes until stability or the
 /// maximum number of iterations is reached.
 /// @param Linked - Numbers of linked nodes that need updating.
-void SpillPlacement::iterate(const SmallVectorImpl<unsigned> &Linked) {
+void SpillPlacement::iterate() {
+  // First update the recently positive nodes. They have likely received new
+  // negative bias that will turn them off.
+  while (!RecentPositive.empty())
+    nodes[RecentPositive.pop_back_val()].update(nodes);
+
   if (Linked.empty())
     return;
 
@@ -279,10 +302,13 @@ void SpillPlacement::iterate(const SmallVectorImpl<unsigned> &Linked) {
     for (SmallVectorImpl<unsigned>::const_reverse_iterator I =
            llvm::next(Linked.rbegin()), E = Linked.rend(); I != E; ++I) {
       unsigned n = *I;
-      bool C = nodes[n].update(nodes);
-      Changed |= C;
+      if (nodes[n].update(nodes)) {
+        Changed = true;
+        if (nodes[n].preferReg())
+          RecentPositive.push_back(n);
+      }
     }
-    if (!Changed)
+    if (!Changed || !RecentPositive.empty())
       return;
 
     // Scan forwards, skipping the first node which was just updated.
@@ -290,38 +316,29 @@ void SpillPlacement::iterate(const SmallVectorImpl<unsigned> &Linked) {
     for (SmallVectorImpl<unsigned>::const_iterator I =
            llvm::next(Linked.begin()), E = Linked.end(); I != E; ++I) {
       unsigned n = *I;
-      bool C = nodes[n].update(nodes);
-      Changed |= C;
+      if (nodes[n].update(nodes)) {
+        Changed = true;
+        if (nodes[n].preferReg())
+          RecentPositive.push_back(n);
+      }
     }
-    if (!Changed)
+    if (!Changed || !RecentPositive.empty())
       return;
   }
 }
 
 void SpillPlacement::prepare(BitVector &RegBundles) {
+  Linked.clear();
+  RecentPositive.clear();
   // Reuse RegBundles as our ActiveNodes vector.
   ActiveNodes = &RegBundles;
   ActiveNodes->clear();
   ActiveNodes->resize(bundles->getNumBundles());
-  PositiveNodes = 0;
 }
 
 bool
 SpillPlacement::finish() {
   assert(ActiveNodes && "Call prepare() first");
-  // Update all active nodes, and find the ones that are actually linked to
-  // something so their value may change when iterating.
-  SmallVector<unsigned, 8> Linked;
-  for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n)) {
-    nodes[n].update(nodes);
-    // A node that must spill, or a node without any links is not going to
-    // change its value ever again, so exclude it from iterations.
-    if (!nodes[n].Links.empty() && !nodes[n].mustSpill())
-      Linked.push_back(n);
-  }
-
-  // Iterate the network to convergence.
-  iterate(Linked);
 
   // Write preferences back to ActiveNodes.
   bool Perfect = true;
diff --git a/lib/CodeGen/SpillPlacement.h b/lib/CodeGen/SpillPlacement.h
index 46e64e6..6952ad8 100644
--- a/lib/CodeGen/SpillPlacement.h
+++ b/lib/CodeGen/SpillPlacement.h
@@ -49,8 +49,12 @@ class SpillPlacement  : public MachineFunctionPass {
   // caller.
   BitVector *ActiveNodes;
 
-  // The number of active nodes with a positive bias.
-  unsigned PositiveNodes;
+  // Nodes with active links. Populated by scanActiveBundles.
+  SmallVector<unsigned, 8> Linked;
+
+  // Nodes that went positive during the last call to scanActiveBundles or
+  // iterate.
+  SmallVector<unsigned, 8> RecentPositive;
 
   // Block frequencies are computed once. Indexed by block number.
   SmallVector<float, 4> BlockFrequency;
@@ -95,9 +99,20 @@ public:
   /// addLinks - Add transparent blocks with the given numbers.
   void addLinks(ArrayRef<unsigned> Links);
 
-  /// getPositiveNodes - Return the total number of graph nodes with a positive
-  /// bias after adding constraints.
-  unsigned getPositiveNodes() const { return PositiveNodes; }
+  /// scanActiveBundles - Perform an initial scan of all bundles activated by
+  /// addConstraints and addLinks, updating their state. Add all the bundles
+  /// that now prefer a register to RecentPositive.
+  /// Prepare internal data structures for iterate.
+  /// Return true is there are any positive nodes.
+  bool scanActiveBundles();
+
+  /// iterate - Update the network iteratively until convergence, or new bundles
+  /// are found.
+  void iterate();
+
+  /// getRecentPositive - Return an array of bundles that became positive during
+  /// the previous call to scanActiveBundles or iterate.
+  ArrayRef<unsigned> getRecentPositive() { return RecentPositive; }
 
   /// finish - Compute the optimal spill code placement given the
   /// constraints. No MustSpill constraints will be violated, and the smallest
@@ -120,7 +135,6 @@ private:
   virtual void releaseMemory();
 
   void activate(unsigned);
-  void iterate(const SmallVectorImpl<unsigned>&);
 };
 
 } // end namespace llvm
diff --git a/lib/CodeGen/Spiller.cpp b/lib/CodeGen/Spiller.cpp
index b89139f..b6bbcd7 100644
--- a/lib/CodeGen/Spiller.cpp
+++ b/lib/CodeGen/Spiller.cpp
@@ -25,7 +25,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include <set>
 
 using namespace llvm;
 
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index 201e9b1..bf27cc8 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -30,6 +30,9 @@ using namespace llvm;
 
 STATISTIC(NumFinished, "Number of splits finished");
 STATISTIC(NumSimple,   "Number of splits that were simple");
+STATISTIC(NumCopies,   "Number of copies inserted for splitting");
+STATISTIC(NumRemats,   "Number of rematerialized defs for splitting");
+STATISTIC(NumRepairs,  "Number of invalid live ranges repaired");
 
 //===----------------------------------------------------------------------===//
 //                                 Split Analysis
@@ -51,6 +54,7 @@ void SplitAnalysis::clear() {
   UseBlocks.clear();
   ThroughBlocks.clear();
   CurLI = 0;
+  DidRepairRange = false;
 }
 
 SlotIndex SplitAnalysis::computeLastSplitPoint(unsigned Num) {
@@ -119,6 +123,8 @@ void SplitAnalysis::analyzeUses() {
   if (!calcLiveBlockInfo()) {
     // FIXME: calcLiveBlockInfo found inconsistencies in the live range.
     // I am looking at you, SimpleRegisterCoalescing!
+    DidRepairRange = true;
+    ++NumRepairs;
     DEBUG(dbgs() << "*** Fixing inconsistent live interval! ***\n");
     const_cast<LiveIntervals&>(LIS)
       .shrinkToUses(const_cast<LiveInterval*>(CurLI));
@@ -132,12 +138,14 @@ void SplitAnalysis::analyzeUses() {
   DEBUG(dbgs() << "Analyze counted "
                << UseSlots.size() << " instrs in "
                << UseBlocks.size() << " blocks, through "
-               << ThroughBlocks.size() << " blocks.\n");
+               << NumThroughBlocks << " blocks.\n");
 }
 
 /// calcLiveBlockInfo - Fill the LiveBlocks array with information about blocks
 /// where CurLI is live.
 bool SplitAnalysis::calcLiveBlockInfo() {
+  ThroughBlocks.resize(MF.getNumBlockIDs());
+  NumThroughBlocks = NumGapBlocks = 0;
   if (CurLI->empty())
     return true;
 
@@ -156,54 +164,63 @@ bool SplitAnalysis::calcLiveBlockInfo() {
     SlotIndex Start, Stop;
     tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB);
 
-    // LVI is the first live segment overlapping MBB.
-    BI.LiveIn = LVI->start <= Start;
-    if (!BI.LiveIn)
-      BI.Def = LVI->start;
-
-    // Find the first and last uses in the block.
-    bool Uses = UseI != UseE && *UseI < Stop;
-    if (Uses) {
+    // If the block contains no uses, the range must be live through. At one
+    // point, SimpleRegisterCoalescing could create dangling ranges that ended
+    // mid-block.
+    if (UseI == UseE || *UseI >= Stop) {
+      ++NumThroughBlocks;
+      ThroughBlocks.set(BI.MBB->getNumber());
+      // The range shouldn't end mid-block if there are no uses. This shouldn't
+      // happen.
+      if (LVI->end < Stop)
+        return false;
+    } else {
+      // This block has uses. Find the first and last uses in the block.
       BI.FirstUse = *UseI;
       assert(BI.FirstUse >= Start);
       do ++UseI;
       while (UseI != UseE && *UseI < Stop);
       BI.LastUse = UseI[-1];
       assert(BI.LastUse < Stop);
-    }
 
-    // Look for gaps in the live range.
-    bool hasGap = false;
-    BI.LiveOut = true;
-    while (LVI->end < Stop) {
-      SlotIndex LastStop = LVI->end;
-      if (++LVI == LVE || LVI->start >= Stop) {
-        BI.Kill = LastStop;
-        BI.LiveOut = false;
-        break;
-      }
-      if (LastStop < LVI->start) {
-        hasGap = true;
-        BI.Kill = LastStop;
-        BI.Def = LVI->start;
+      // LVI is the first live segment overlapping MBB.
+      BI.LiveIn = LVI->start <= Start;
+
+      // Look for gaps in the live range.
+      BI.LiveOut = true;
+      while (LVI->end < Stop) {
+        SlotIndex LastStop = LVI->end;
+        if (++LVI == LVE || LVI->start >= Stop) {
+          BI.LiveOut = false;
+          BI.LastUse = LastStop;
+          break;
+        }
+        if (LastStop < LVI->start) {
+          // There is a gap in the live range. Create duplicate entries for the
+          // live-in snippet and the live-out snippet.
+          ++NumGapBlocks;
+
+          // Push the Live-in part.
+          BI.LiveThrough = false;
+          BI.LiveOut = false;
+          UseBlocks.push_back(BI);
+          UseBlocks.back().LastUse = LastStop;
+
+          // Set up BI for the live-out part.
+          BI.LiveIn = false;
+          BI.LiveOut = true;
+          BI.FirstUse = LVI->start;
+        }
       }
-    }
 
-    // Don't set LiveThrough when the block has a gap.
-    BI.LiveThrough = !hasGap && BI.LiveIn && BI.LiveOut;
-    if (Uses)
+      // Don't set LiveThrough when the block has a gap.
+      BI.LiveThrough = BI.LiveIn && BI.LiveOut;
       UseBlocks.push_back(BI);
-    else
-      ThroughBlocks.push_back(BI.MBB->getNumber());
 
-    // FIXME: This should never happen. The live range stops or starts without a
-    // corresponding use. An earlier pass did something wrong.
-    if (!BI.LiveThrough && !Uses)
-      return false;
-
-    // LVI is now at LVE or LVI->end >= Stop.
-    if (LVI == LVE)
-      break;
+      // LVI is now at LVE or LVI->end >= Stop.
+      if (LVI == LVE)
+        break;
+    }
 
     // Live segment ends exactly at Stop. Move to the next segment.
     if (LVI->end == Stop && ++LVI == LVE)
@@ -215,9 +232,34 @@ bool SplitAnalysis::calcLiveBlockInfo() {
     else
       MFI = LIS.getMBBFromIndex(LVI->start);
   }
+
+  assert(getNumLiveBlocks() == countLiveBlocks(CurLI) && "Bad block count");
   return true;
 }
 
+unsigned SplitAnalysis::countLiveBlocks(const LiveInterval *cli) const {
+  if (cli->empty())
+    return 0;
+  LiveInterval *li = const_cast<LiveInterval*>(cli);
+  LiveInterval::iterator LVI = li->begin();
+  LiveInterval::iterator LVE = li->end();
+  unsigned Count = 0;
+
+  // Loop over basic blocks where li is live.
+  MachineFunction::const_iterator MFI = LIS.getMBBFromIndex(LVI->start);
+  SlotIndex Stop = LIS.getMBBEndIdx(MFI);
+  for (;;) {
+    ++Count;
+    LVI = li->advanceTo(LVI, Stop);
+    if (LVI == LVE)
+      return Count;
+    do {
+      ++MFI;
+      Stop = LIS.getMBBEndIdx(MFI);
+    } while (Stop <= LVI->start);
+  }
+}
+
 bool SplitAnalysis::isOriginalEndpoint(SlotIndex Idx) const {
   unsigned OrigReg = VRM.getOriginal(CurLI->reg);
   const LiveInterval &Orig = LIS.getInterval(OrigReg);
@@ -348,8 +390,33 @@ void SplitEditor::extendRange(unsigned RegIdx, SlotIndex Idx) {
   // Now for the fun part. We know that ParentVNI potentially has multiple defs,
   // and we may need to create even more phi-defs to preserve VNInfo SSA form.
   // Perform a search for all predecessor blocks where we know the dominating
-  // VNInfo. Insert phi-def VNInfos along the path back to IdxMBB.
+  // VNInfo.
+  VNInfo *VNI = findReachingDefs(LI, IdxMBB, Idx.getNextSlot());
 
+  // When there were multiple different values, we may need new PHIs.
+  if (!VNI)
+    return updateSSA();
+
+  // Poor man's SSA update for the single-value case.
+  LiveOutPair LOP(VNI, MDT[LIS.getMBBFromIndex(VNI->def)]);
+  for (SmallVectorImpl<LiveInBlock>::iterator I = LiveInBlocks.begin(),
+         E = LiveInBlocks.end(); I != E; ++I) {
+    MachineBasicBlock *MBB = I->DomNode->getBlock();
+    SlotIndex Start = LIS.getMBBStartIdx(MBB);
+    if (I->Kill.isValid())
+      LI->addRange(LiveRange(Start, I->Kill, VNI));
+    else {
+      LiveOutCache[MBB] = LOP;
+      LI->addRange(LiveRange(Start, LIS.getMBBEndIdx(MBB), VNI));
+    }
+  }
+}
+
+/// findReachingDefs - Search the CFG for known live-out values.
+/// Add required live-in blocks to LiveInBlocks.
+VNInfo *SplitEditor::findReachingDefs(LiveInterval *LI,
+                                      MachineBasicBlock *KillMBB,
+                                      SlotIndex Kill) {
   // Initialize the live-out cache the first time it is needed.
   if (LiveOutSeen.empty()) {
     unsigned N = VRM.getMachineFunction().getNumBlockIDs();
@@ -358,16 +425,15 @@ void SplitEditor::extendRange(unsigned RegIdx, SlotIndex Idx) {
   }
 
   // Blocks where LI should be live-in.
-  SmallVector<MachineDomTreeNode*, 16> LiveIn;
-  LiveIn.push_back(MDT[IdxMBB]);
+  SmallVector<MachineBasicBlock*, 16> WorkList(1, KillMBB);
 
   // Remember if we have seen more than one value.
   bool UniqueVNI = true;
-  VNInfo *IdxVNI = 0;
+  VNInfo *TheVNI = 0;
 
   // Using LiveOutCache as a visited set, perform a BFS for all reaching defs.
-  for (unsigned i = 0; i != LiveIn.size(); ++i) {
-    MachineBasicBlock *MBB = LiveIn[i]->getBlock();
+  for (unsigned i = 0; i != WorkList.size(); ++i) {
+    MachineBasicBlock *MBB = WorkList[i];
     assert(!MBB->pred_empty() && "Value live-in to entry block?");
     for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
            PE = MBB->pred_end(); PI != PE; ++PI) {
@@ -377,9 +443,9 @@ void SplitEditor::extendRange(unsigned RegIdx, SlotIndex Idx) {
        // Is this a known live-out block?
        if (LiveOutSeen.test(Pred->getNumber())) {
          if (VNInfo *VNI = LOP.first) {
-           if (IdxVNI && IdxVNI != VNI)
+           if (TheVNI && TheVNI != VNI)
              UniqueVNI = false;
-           IdxVNI = VNI;
+           TheVNI = VNI;
          }
          continue;
        }
@@ -395,64 +461,50 @@ void SplitEditor::extendRange(unsigned RegIdx, SlotIndex Idx) {
        LOP.first = VNI;
        if (VNI) {
          LOP.second = MDT[LIS.getMBBFromIndex(VNI->def)];
-         if (IdxVNI && IdxVNI != VNI)
+         if (TheVNI && TheVNI != VNI)
            UniqueVNI = false;
-         IdxVNI = VNI;
+         TheVNI = VNI;
          continue;
        }
        LOP.second = 0;
 
        // No, we need a live-in value for Pred as well
-       if (Pred != IdxMBB)
-         LiveIn.push_back(MDT[Pred]);
+       if (Pred != KillMBB)
+          WorkList.push_back(Pred);
        else
-         UniqueVNI = false; // Loopback to IdxMBB, ask updateSSA() for help.
+          // Loopback to KillMBB, so value is really live through.
+         Kill = SlotIndex();
     }
   }
 
-  // We may need to add phi-def values to preserve the SSA form.
-  if (UniqueVNI) {
-    LiveOutPair LOP(IdxVNI, MDT[LIS.getMBBFromIndex(IdxVNI->def)]);
-    // Update LiveOutCache, but skip IdxMBB at LiveIn[0].
-    for (unsigned i = 1, e = LiveIn.size(); i != e; ++i)
-      LiveOutCache[LiveIn[i]->getBlock()] = LOP;
-  } else
-    IdxVNI = updateSSA(RegIdx, LiveIn, Idx, IdxMBB);
-
-  // Since we went through the trouble of a full BFS visiting all reaching defs,
-  // the values in LiveIn are now accurate. No more phi-defs are needed
-  // for these blocks, so we can color the live ranges.
-  for (unsigned i = 0, e = LiveIn.size(); i != e; ++i) {
-    MachineBasicBlock *MBB = LiveIn[i]->getBlock();
-    SlotIndex Start = LIS.getMBBStartIdx(MBB);
-    VNInfo *VNI = LiveOutCache[MBB].first;
+  // Transfer WorkList to LiveInBlocks in reverse order.
+  // This ordering works best with updateSSA().
+  LiveInBlocks.clear();
+  LiveInBlocks.reserve(WorkList.size());
+  while(!WorkList.empty())
+    LiveInBlocks.push_back(MDT[WorkList.pop_back_val()]);
 
-    // Anything in LiveIn other than IdxMBB is live-through.
-    // In IdxMBB, we should stop at Idx unless the same value is live-out.
-    if (MBB == IdxMBB && IdxVNI != VNI)
-      LI->addRange(LiveRange(Start, Idx.getNextSlot(), IdxVNI));
-    else
-      LI->addRange(LiveRange(Start, LIS.getMBBEndIdx(MBB), VNI));
-  }
+  // The kill block may not be live-through.
+  assert(LiveInBlocks.back().DomNode->getBlock() == KillMBB);
+  LiveInBlocks.back().Kill = Kill;
+
+  return UniqueVNI ? TheVNI : 0;
 }
 
-VNInfo *SplitEditor::updateSSA(unsigned RegIdx,
-                               SmallVectorImpl<MachineDomTreeNode*> &LiveIn,
-                               SlotIndex Idx,
-                               const MachineBasicBlock *IdxMBB) {
+void SplitEditor::updateSSA() {
   // This is essentially the same iterative algorithm that SSAUpdater uses,
   // except we already have a dominator tree, so we don't have to recompute it.
-  LiveInterval *LI = Edit->get(RegIdx);
-  VNInfo *IdxVNI = 0;
   unsigned Changes;
   do {
     Changes = 0;
     // Propagate live-out values down the dominator tree, inserting phi-defs
-    // when necessary. Since LiveIn was created by a BFS, going backwards makes
-    // it more likely for us to visit immediate dominators before their
-    // children.
-    for (unsigned i = LiveIn.size(); i; --i) {
-      MachineDomTreeNode *Node = LiveIn[i-1];
+    // when necessary.
+    for (SmallVectorImpl<LiveInBlock>::iterator I = LiveInBlocks.begin(),
+           E = LiveInBlocks.end(); I != E; ++I) {
+      MachineDomTreeNode *Node = I->DomNode;
+      // Skip block if the live-in value has already been determined.
+      if (!Node)
+        continue;
       MachineBasicBlock *MBB = Node->getBlock();
       MachineDomTreeNode *IDom = Node->getIDom();
       LiveOutPair IDomValue;
@@ -461,9 +513,9 @@ VNInfo *SplitEditor::updateSSA(unsigned RegIdx,
       // This is probably an unreachable block that has survived somehow.
       bool needPHI = !IDom || !LiveOutSeen.test(IDom->getBlock()->getNumber());
 
-      // IDom dominates all of our predecessors, but it may not be the immediate
-      // dominator. Check if any of them have live-out values that are properly
-      // dominated by IDom. If so, we need a phi-def here.
+      // IDom dominates all of our predecessors, but it may not be their
+      // immediate dominator. Check if any of them have live-out values that are
+      // properly dominated by IDom. If so, we need a phi-def here.
       if (!needPHI) {
         IDomValue = LiveOutCache[IDom->getBlock()];
         for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
@@ -481,40 +533,35 @@ VNInfo *SplitEditor::updateSSA(unsigned RegIdx,
         }
       }
 
+      // The value may be live-through even if Kill is set, as can happen when
+      // we are called from extendRange. In that case LiveOutSeen is true, and
+      // LiveOutCache indicates a foreign or missing value.
+      LiveOutPair &LOP = LiveOutCache[MBB];
+
       // Create a phi-def if required.
       if (needPHI) {
         ++Changes;
         SlotIndex Start = LIS.getMBBStartIdx(MBB);
+        unsigned RegIdx = RegAssign.lookup(Start);
+        LiveInterval *LI = Edit->get(RegIdx);
         VNInfo *VNI = LI->getNextValue(Start, 0, LIS.getVNInfoAllocator());
         VNI->setIsPHIDef(true);
-        // We no longer need LI to be live-in.
-        LiveIn.erase(LiveIn.begin()+(i-1));
-        // Blocks in LiveIn are either IdxMBB, or have a value live-through.
-        if (MBB == IdxMBB)
-          IdxVNI = VNI;
-        // Check if we need to update live-out info.
-        LiveOutPair &LOP = LiveOutCache[MBB];
-        if (LOP.second == Node || !LiveOutSeen.test(MBB->getNumber())) {
-          // We already have a live-out defined in MBB, so this must be IdxMBB.
-          assert(MBB == IdxMBB && "Adding phi-def to known live-out");
-          LI->addRange(LiveRange(Start, Idx.getNextSlot(), VNI));
-        } else {
-          // This phi-def is also live-out, so color the whole block.
+        I->Value = VNI;
+        // This block is done, we know the final value.
+        I->DomNode = 0;
+        if (I->Kill.isValid())
+          LI->addRange(LiveRange(Start, I->Kill, VNI));
+        else {
           LI->addRange(LiveRange(Start, LIS.getMBBEndIdx(MBB), VNI));
           LOP = LiveOutPair(VNI, Node);
         }
       } else if (IDomValue.first) {
-        // No phi-def here. Remember incoming value for IdxMBB.
-        if (MBB == IdxMBB) {
-          IdxVNI = IDomValue.first;
-          // IdxMBB need not be live-out.
-          if (!LiveOutSeen.test(MBB->getNumber()))
-            continue;
-        }
-        assert(LiveOutSeen.test(MBB->getNumber()) && "Expected live-out block");
+        // No phi-def here. Remember incoming value.
+        I->Value = IDomValue.first;
+        if (I->Kill.isValid())
+          continue;
         // Propagate IDomValue if needed:
         // MBB is live-out and doesn't define its own value.
-        LiveOutPair &LOP = LiveOutCache[MBB];
         if (LOP.second != Node && LOP.first != IDomValue.first) {
           ++Changes;
           LOP = IDomValue;
@@ -523,8 +570,20 @@ VNInfo *SplitEditor::updateSSA(unsigned RegIdx,
     }
   } while (Changes);
 
-  assert(IdxVNI && "Didn't find value for Idx");
-  return IdxVNI;
+  // The values in LiveInBlocks are now accurate. No more phi-defs are needed
+  // for these blocks, so we can color the live ranges.
+  for (SmallVectorImpl<LiveInBlock>::iterator I = LiveInBlocks.begin(),
+         E = LiveInBlocks.end(); I != E; ++I) {
+    if (!I->DomNode)
+      continue;
+    assert(I->Value && "No live-in value found");
+    MachineBasicBlock *MBB = I->DomNode->getBlock();
+    SlotIndex Start = LIS.getMBBStartIdx(MBB);
+    unsigned RegIdx = RegAssign.lookup(Start);
+    LiveInterval *LI = Edit->get(RegIdx);
+    LI->addRange(LiveRange(Start, I->Kill.isValid() ?
+                                  I->Kill : LIS.getMBBEndIdx(MBB), I->Value));
+  }
 }
 
 VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
@@ -536,15 +595,22 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
   SlotIndex Def;
   LiveInterval *LI = Edit->get(RegIdx);
 
+  // We may be trying to avoid interference that ends at a deleted instruction,
+  // so always begin RegIdx 0 early and all others late.
+  bool Late = RegIdx != 0;
+
   // Attempt cheap-as-a-copy rematerialization.
   LiveRangeEdit::Remat RM(ParentVNI);
   if (Edit->canRematerializeAt(RM, UseIdx, true, LIS)) {
-    Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, LIS, TII, TRI);
+    Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, LIS, TII, TRI, Late);
+    ++NumRemats;
   } else {
     // Can't remat, just insert a copy from parent.
     CopyMI = BuildMI(MBB, I, DebugLoc(), TII.get(TargetOpcode::COPY), LI->reg)
                .addReg(Edit->getReg());
-    Def = LIS.InsertMachineInstrInMaps(CopyMI).getDefIndex();
+    Def = LIS.getSlotIndexes()->insertMachineInstrInMaps(CopyMI, Late)
+            .getDefIndex();
+    ++NumCopies;
   }
 
   // Define the value in Reg.
@@ -554,9 +620,7 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
 }
 
 /// Create a new virtual register and live interval.
-void SplitEditor::openIntv() {
-  assert(!OpenIdx && "Previous LI not closed before openIntv");
-
+unsigned SplitEditor::openIntv() {
   // Create the complement as index 0.
   if (Edit->empty())
     Edit->create(LIS, VRM);
@@ -564,6 +628,13 @@ void SplitEditor::openIntv() {
   // Create the open interval.
   OpenIdx = Edit->size();
   Edit->create(LIS, VRM);
+  return OpenIdx;
+}
+
+void SplitEditor::selectIntv(unsigned Idx) {
+  assert(Idx != 0 && "Cannot select the complement interval");
+  assert(Idx < Edit->size() && "Can only select previously opened interval");
+  OpenIdx = Idx;
 }
 
 SlotIndex SplitEditor::enterIntvBefore(SlotIndex Idx) {
@@ -686,18 +757,11 @@ void SplitEditor::overlapIntv(SlotIndex Start, SlotIndex End) {
   DEBUG(dump());
 }
 
-/// closeIntv - Indicate that we are done editing the currently open
-/// LiveInterval, and ranges can be trimmed.
-void SplitEditor::closeIntv() {
-  assert(OpenIdx && "openIntv not called before closeIntv");
-  OpenIdx = 0;
-}
-
-/// transferSimpleValues - Transfer all simply defined values to the new live
-/// ranges.
-/// Values that were rematerialized or that have multiple defs are left alone.
-bool SplitEditor::transferSimpleValues() {
+/// transferValues - Transfer all possible values to the new live ranges.
+/// Values that were rematerialized are left alone, they need extendRange().
+bool SplitEditor::transferValues() {
   bool Skipped = false;
+  LiveInBlocks.clear();
   RegAssignMap::const_iterator AssignI = RegAssign.begin();
   for (LiveInterval::const_iterator ParentI = Edit->getParent().begin(),
          ParentE = Edit->getParent().end(); ParentI != ParentE; ++ParentI) {
@@ -721,16 +785,98 @@ bool SplitEditor::transferSimpleValues() {
         RegIdx = 0;
         End = std::min(End, AssignI.start());
       }
+
+      // The interval [Start;End) is continuously mapped to RegIdx, ParentVNI.
       DEBUG(dbgs() << " [" << Start << ';' << End << ")=" << RegIdx);
+      LiveInterval *LI = Edit->get(RegIdx);
+
+      // Check for a simply defined value that can be blitted directly.
       if (VNInfo *VNI = Values.lookup(std::make_pair(RegIdx, ParentVNI->id))) {
         DEBUG(dbgs() << ':' << VNI->id);
-        Edit->get(RegIdx)->addRange(LiveRange(Start, End, VNI));
-      } else
+        LI->addRange(LiveRange(Start, End, VNI));
+        Start = End;
+        continue;
+      }
+
+      // Skip rematerialized values, we need to use extendRange() and
+      // extendPHIKillRanges() to completely recompute the live ranges.
+      if (Edit->didRematerialize(ParentVNI)) {
+        DEBUG(dbgs() << "(remat)");
         Skipped = true;
+        Start = End;
+        continue;
+      }
+
+      // Initialize the live-out cache the first time it is needed.
+      if (LiveOutSeen.empty()) {
+        unsigned N = VRM.getMachineFunction().getNumBlockIDs();
+        LiveOutSeen.resize(N);
+        LiveOutCache.resize(N);
+      }
+
+      // This value has multiple defs in RegIdx, but it wasn't rematerialized,
+      // so the live range is accurate. Add live-in blocks in [Start;End) to the
+      // LiveInBlocks.
+      MachineFunction::iterator MBB = LIS.getMBBFromIndex(Start);
+      SlotIndex BlockStart, BlockEnd;
+      tie(BlockStart, BlockEnd) = LIS.getSlotIndexes()->getMBBRange(MBB);
+
+      // The first block may be live-in, or it may have its own def.
+      if (Start != BlockStart) {
+        VNInfo *VNI = LI->extendInBlock(BlockStart,
+                                        std::min(BlockEnd, End).getPrevSlot());
+        assert(VNI && "Missing def for complex mapped value");
+        DEBUG(dbgs() << ':' << VNI->id << "*BB#" << MBB->getNumber());
+        // MBB has its own def. Is it also live-out?
+        if (BlockEnd <= End) {
+          LiveOutSeen.set(MBB->getNumber());
+          LiveOutCache[MBB] = LiveOutPair(VNI, MDT[MBB]);
+        }
+        // Skip to the next block for live-in.
+        ++MBB;
+        BlockStart = BlockEnd;
+      }
+
+      // Handle the live-in blocks covered by [Start;End).
+      assert(Start <= BlockStart && "Expected live-in block");
+      while (BlockStart < End) {
+        DEBUG(dbgs() << ">BB#" << MBB->getNumber());
+        BlockEnd = LIS.getMBBEndIdx(MBB);
+        if (BlockStart == ParentVNI->def) {
+          // This block has the def of a parent PHI, so it isn't live-in.
+          assert(ParentVNI->isPHIDef() && "Non-phi defined at block start?");
+          VNInfo *VNI = LI->extendInBlock(BlockStart,
+                                         std::min(BlockEnd, End).getPrevSlot());
+          assert(VNI && "Missing def for complex mapped parent PHI");
+          if (End >= BlockEnd) {
+            // Live-out as well.
+            LiveOutSeen.set(MBB->getNumber());
+            LiveOutCache[MBB] = LiveOutPair(VNI, MDT[MBB]);
+          }
+        } else {
+          // This block needs a live-in value.
+          LiveInBlocks.push_back(MDT[MBB]);
+          // The last block covered may not be live-out.
+          if (End < BlockEnd)
+            LiveInBlocks.back().Kill = End;
+          else {
+            // Live-out, but we need updateSSA to tell us the value.
+            LiveOutSeen.set(MBB->getNumber());
+            LiveOutCache[MBB] = LiveOutPair((VNInfo*)0,
+                                            (MachineDomTreeNode*)0);
+          }
+        }
+        BlockStart = BlockEnd;
+        ++MBB;
+      }
       Start = End;
     } while (Start != ParentI->end);
     DEBUG(dbgs() << '\n');
   }
+
+  if (!LiveInBlocks.empty())
+    updateSSA();
+
   return Skipped;
 }
 
@@ -835,8 +981,7 @@ void SplitEditor::deleteRematVictims() {
   Edit->eliminateDeadDefs(Dead, LIS, VRM, TII);
 }
 
-void SplitEditor::finish() {
-  assert(OpenIdx == 0 && "Previous LI not closed before rewrite");
+void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
   ++NumFinished;
 
   // At this point, the live intervals in Edit contain VNInfos corresponding to
@@ -866,24 +1011,31 @@ void SplitEditor::finish() {
     assert((*I)->hasAtLeastOneValue() && "Split interval has no value");
 #endif
 
-  // Transfer the simply mapped values, check if any are complex.
-  bool Complex = transferSimpleValues();
-  if (Complex)
+  // Transfer the simply mapped values, check if any are skipped.
+  bool Skipped = transferValues();
+  if (Skipped)
     extendPHIKillRanges();
   else
     ++NumSimple;
 
   // Rewrite virtual registers, possibly extending ranges.
-  rewriteAssigned(Complex);
+  rewriteAssigned(Skipped);
 
   // Delete defs that were rematted everywhere.
-  if (Complex)
+  if (Skipped)
     deleteRematVictims();
 
   // Get rid of unused values and set phi-kill flags.
   for (LiveRangeEdit::iterator I = Edit->begin(), E = Edit->end(); I != E; ++I)
     (*I)->RenumberValues(LIS);
 
+  // Provide a reverse mapping from original indices to Edit ranges.
+  if (LRMap) {
+    LRMap->clear();
+    for (unsigned i = 0, e = Edit->size(); i != e; ++i)
+      LRMap->push_back(i);
+  }
+
   // Now check if any registers were separated into multiple components.
   ConnectedVNInfoEqClasses ConEQ(LIS);
   for (unsigned i = 0, e = Edit->size(); i != e; ++i) {
@@ -895,13 +1047,18 @@ void SplitEditor::finish() {
     DEBUG(dbgs() << "  " << NumComp << " components: " << *li << '\n');
     SmallVector<LiveInterval*, 8> dups;
     dups.push_back(li);
-    for (unsigned i = 1; i != NumComp; ++i)
+    for (unsigned j = 1; j != NumComp; ++j)
       dups.push_back(&Edit->create(LIS, VRM));
     ConEQ.Distribute(&dups[0], MRI);
+    // The new intervals all map back to i.
+    if (LRMap)
+      LRMap->resize(Edit->size(), i);
   }
 
   // Calculate spill weight and allocation hints for new intervals.
   Edit->calculateRegClassAndHint(VRM.getMachineFunction(), LIS, SA.Loops);
+
+  assert(!LRMap || LRMap->size() == Edit->size());
 }
 
 
@@ -925,6 +1082,21 @@ bool SplitAnalysis::getMultiUseBlocks(BlockPtrSet &Blocks) {
   return !Blocks.empty();
 }
 
+void SplitEditor::splitSingleBlock(const SplitAnalysis::BlockInfo &BI) {
+  openIntv();
+  SlotIndex LastSplitPoint = SA.getLastSplitPoint(BI.MBB->getNumber());
+  SlotIndex SegStart = enterIntvBefore(std::min(BI.FirstUse,
+    LastSplitPoint));
+  if (!BI.LiveOut || BI.LastUse < LastSplitPoint) {
+    useIntv(SegStart, leaveIntvAfter(BI.LastUse));
+  } else {
+      // The last use is after the last valid split point.
+    SlotIndex SegStop = leaveIntvBefore(LastSplitPoint);
+    useIntv(SegStart, SegStop);
+    overlapIntv(SegStop, BI.LastUse);
+  }
+}
+
 /// splitSingleBlocks - Split CurLI into a separate live interval inside each
 /// basic block in Blocks.
 void SplitEditor::splitSingleBlocks(const SplitAnalysis::BlockPtrSet &Blocks) {
@@ -932,22 +1104,8 @@ void SplitEditor::splitSingleBlocks(const SplitAnalysis::BlockPtrSet &Blocks) {
   ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA.getUseBlocks();
   for (unsigned i = 0; i != UseBlocks.size(); ++i) {
     const SplitAnalysis::BlockInfo &BI = UseBlocks[i];
-    if (!Blocks.count(BI.MBB))
-      continue;
-
-    openIntv();
-    SlotIndex LastSplitPoint = SA.getLastSplitPoint(BI.MBB->getNumber());
-    SlotIndex SegStart = enterIntvBefore(std::min(BI.FirstUse,
-                                                  LastSplitPoint));
-    if (!BI.LiveOut || BI.LastUse < LastSplitPoint) {
-      useIntv(SegStart, leaveIntvAfter(BI.LastUse));
-    } else {
-      // The last use is after the last valid split point.
-      SlotIndex SegStop = leaveIntvBefore(LastSplitPoint);
-      useIntv(SegStart, SegStop);
-      overlapIntv(SegStop, BI.LastUse);
-    }
-    closeIntv();
+    if (Blocks.count(BI.MBB))
+      splitSingleBlock(BI);
   }
   finish();
 }
diff --git a/lib/CodeGen/SplitKit.h b/lib/CodeGen/SplitKit.h
index 20ac8a1..7174c0b 100644
--- a/lib/CodeGen/SplitKit.h
+++ b/lib/CodeGen/SplitKit.h
@@ -12,6 +12,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_CODEGEN_SPLITKIT_H
+#define LLVM_CODEGEN_SPLITKIT_H
+
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
@@ -60,17 +63,22 @@ public:
   ///  1. |   o---x   | Internal to block. Variable is only live in this block.
   ///  2. |---x       | Live-in, kill.
   ///  3. |       o---| Def, live-out.
-  ///  4. |---x   o---| Live-in, kill, def, live-out.
+  ///  4. |---x   o---| Live-in, kill, def, live-out. Counted by NumGapBlocks.
   ///  5. |---o---o---| Live-through with uses or defs.
-  ///  6. |-----------| Live-through without uses. Transparent.
+  ///  6. |-----------| Live-through without uses. Counted by NumThroughBlocks.
+  ///
+  /// Two BlockInfo entries are created for template 4. One for the live-in
+  /// segment, and one for the live-out segment. These entries look as if the
+  /// block were split in the middle where the live range isn't live.
+  ///
+  /// Live-through blocks without any uses don't get BlockInfo entries. They
+  /// are simply listed in ThroughBlocks instead.
   ///
   struct BlockInfo {
     MachineBasicBlock *MBB;
     SlotIndex FirstUse;   ///< First instr using current reg.
     SlotIndex LastUse;    ///< Last instr using current reg.
-    SlotIndex Kill;       ///< Interval end point inside block.
-    SlotIndex Def;        ///< Interval start point inside block.
-    bool LiveThrough;     ///< Live in whole block (Templ 5. or 6. above).
+    bool LiveThrough;     ///< Live in whole block (Templ 5. above).
     bool LiveIn;          ///< Current reg is live in.
     bool LiveOut;         ///< Current reg is live out.
   };
@@ -88,8 +96,18 @@ private:
   /// UseBlocks - Blocks where CurLI has uses.
   SmallVector<BlockInfo, 8> UseBlocks;
 
+  /// NumGapBlocks - Number of duplicate entries in UseBlocks for blocks where
+  /// the live range has a gap.
+  unsigned NumGapBlocks;
+
   /// ThroughBlocks - Block numbers where CurLI is live through without uses.
-  SmallVector<unsigned, 8> ThroughBlocks;
+  BitVector ThroughBlocks;
+
+  /// NumThroughBlocks - Number of live-through blocks.
+  unsigned NumThroughBlocks;
+
+  /// DidRepairRange - analyze was forced to shrinkToUses().
+  bool DidRepairRange;
 
   SlotIndex computeLastSplitPoint(unsigned Num);
 
@@ -107,6 +125,11 @@ public:
   /// split.
   void analyze(const LiveInterval *li);
 
+  /// didRepairRange() - Returns true if CurLI was invalid and has been repaired
+  /// by analyze(). This really shouldn't happen, but sometimes the coalescer
+  /// can create live ranges that end in mid-air.
+  bool didRepairRange() const { return DidRepairRange; }
+
   /// clear - clear all data structures so SplitAnalysis is ready to analyze a
   /// new interval.
   void clear();
@@ -133,11 +156,26 @@ public:
 
   /// getUseBlocks - Return an array of BlockInfo objects for the basic blocks
   /// where CurLI has uses.
-  ArrayRef<BlockInfo> getUseBlocks() { return UseBlocks; }
+  ArrayRef<BlockInfo> getUseBlocks() const { return UseBlocks; }
+
+  /// getNumThroughBlocks - Return the number of through blocks.
+  unsigned getNumThroughBlocks() const { return NumThroughBlocks; }
+
+  /// isThroughBlock - Return true if CurLI is live through MBB without uses.
+  bool isThroughBlock(unsigned MBB) const { return ThroughBlocks.test(MBB); }
+
+  /// getThroughBlocks - Return the set of through blocks.
+  const BitVector &getThroughBlocks() const { return ThroughBlocks; }
+
+  /// getNumLiveBlocks - Return the number of blocks where CurLI is live.
+  unsigned getNumLiveBlocks() const {
+    return getUseBlocks().size() - NumGapBlocks + getNumThroughBlocks();
+  }
 
-  /// getThroughBlocks - Return an array of block numbers where CurLI is live
-  /// through without uses.
-  ArrayRef<unsigned> getThroughBlocks() { return ThroughBlocks; }
+  /// countLiveBlocks - Return the number of blocks where li is live. This is
+  /// guaranteed to return the same number as getNumLiveBlocks() after calling
+  /// analyze(li).
+  unsigned countLiveBlocks(const LiveInterval *li) const;
 
   typedef SmallPtrSet<const MachineBasicBlock*, 16> BlockPtrSet;
 
@@ -223,6 +261,30 @@ class SplitEditor {
   // entry in LiveOutCache.
   BitVector LiveOutSeen;
 
+  /// LiveInBlock - Info for updateSSA() about a block where a register is
+  /// live-in.
+  /// The updateSSA caller provides DomNode and Kill inside MBB, updateSSA()
+  /// adds the computed live-in value.
+  struct LiveInBlock {
+    // Dominator tree node for the block.
+    // Cleared by updateSSA when the final value has been determined.
+    MachineDomTreeNode *DomNode;
+
+    // Live-in value filled in by updateSSA once it is known.
+    VNInfo *Value;
+
+    // Position in block where the live-in range ends, or SlotIndex() if the
+    // range passes through the block.
+    SlotIndex Kill;
+
+    LiveInBlock(MachineDomTreeNode *node) : DomNode(node), Value(0) {}
+  };
+
+  /// LiveInBlocks - List of live-in blocks used by findReachingDefs() and
+  /// updateSSA(). This list is usually empty, it exists here to avoid frequent
+  /// reallocations.
+  SmallVector<LiveInBlock, 16> LiveInBlocks;
+
   /// defValue - define a value in RegIdx from ParentVNI at Idx.
   /// Idx does not have to be ParentVNI->def, but it must be contained within
   /// ParentVNI's live range in ParentLI. The new value is added to the value
@@ -246,17 +308,22 @@ class SplitEditor {
   /// Insert PHIDefs as needed to preserve SSA form.
   void extendRange(unsigned RegIdx, SlotIndex Idx);
 
-  /// updateSSA - Insert PHIDefs as necessary and update LiveOutCache such that
-  /// Edit.get(RegIdx) is live-in to all the blocks in LiveIn.
-  /// Return the value that is eventually live-in to IdxMBB.
-  VNInfo *updateSSA(unsigned RegIdx,
-                    SmallVectorImpl<MachineDomTreeNode*> &LiveIn,
-                    SlotIndex Idx,
-                    const MachineBasicBlock *IdxMBB);
+  /// findReachingDefs - Starting from MBB, add blocks to LiveInBlocks until all
+  /// reaching defs for LI are found.
+  /// @param LI   Live interval whose value is needed.
+  /// @param MBB  Block where LI should be live-in.
+  /// @param Kill Kill point in MBB.
+  /// @return Unique value seen, or NULL.
+  VNInfo *findReachingDefs(LiveInterval *LI, MachineBasicBlock *MBB,
+                           SlotIndex Kill);
 
-  /// transferSimpleValues - Transfer simply defined values to the new ranges.
-  /// Return true if any complex ranges were skipped.
-  bool transferSimpleValues();
+  /// updateSSA - Compute and insert PHIDefs such that all blocks in
+  // LiveInBlocks get a known live-in value. Add live ranges to the blocks.
+  void updateSSA();
+
+  /// transferValues - Transfer values to the new ranges.
+  /// Return true if any ranges were skipped.
+  bool transferValues();
 
   /// extendPHIKillRanges - Extend the ranges of all values killed by original
   /// parent PHIDefs.
@@ -278,7 +345,15 @@ public:
   void reset(LiveRangeEdit&);
 
   /// Create a new virtual register and live interval.
-  void openIntv();
+  /// Return the interval index, starting from 1. Interval index 0 is the
+  /// implicit complement interval.
+  unsigned openIntv();
+
+  /// currentIntv - Return the current interval index.
+  unsigned currentIntv() const { return OpenIdx; }
+
+  /// selectIntv - Select a previously opened interval index.
+  void selectIntv(unsigned Idx);
 
   /// enterIntvBefore - Enter the open interval before the instruction at Idx.
   /// If the parent interval is not live before Idx, a COPY is not inserted.
@@ -321,22 +396,28 @@ public:
   ///
   void overlapIntv(SlotIndex Start, SlotIndex End);
 
-  /// closeIntv - Indicate that we are done editing the currently open
-  /// LiveInterval, and ranges can be trimmed.
-  void closeIntv();
-
   /// finish - after all the new live ranges have been created, compute the
   /// remaining live range, and rewrite instructions to use the new registers.
-  void finish();
+  /// @param LRMap When not null, this vector will map each live range in Edit
+  ///              back to the indices returned by openIntv.
+  ///              There may be extra indices created by dead code elimination.
+  void finish(SmallVectorImpl<unsigned> *LRMap = 0);
 
   /// dump - print the current interval maping to dbgs().
   void dump() const;
 
   // ===--- High level methods ---===
 
+  /// splitSingleBlock - Split CurLI into a separate live interval around the
+  /// uses in a single block. This is intended to be used as part of a larger
+  /// split, and doesn't call finish().
+  void splitSingleBlock(const SplitAnalysis::BlockInfo &BI);
+
   /// splitSingleBlocks - Split CurLI into a separate live interval inside each
   /// basic block in Blocks.
   void splitSingleBlocks(const SplitAnalysis::BlockPtrSet &Blocks);
 };
 
 }
+
+#endif
diff --git a/lib/CodeGen/StrongPHIElimination.cpp b/lib/CodeGen/StrongPHIElimination.cpp
index ec7829e..227eb47 100644
--- a/lib/CodeGen/StrongPHIElimination.cpp
+++ b/lib/CodeGen/StrongPHIElimination.cpp
@@ -587,7 +587,7 @@ StrongPHIElimination::SplitInterferencesForBasicBlock(
   }
 
   // We now walk the PHIs in successor blocks and check for interferences. This
-  // is necesary because the use of a PHI's operands are logically contained in
+  // is necessary because the use of a PHI's operands are logically contained in
   // the predecessor block. The def of a PHI's destination register is processed
   // along with the other defs in a basic block.
 
diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp
index 04d3d31..90cb72f 100644
--- a/lib/CodeGen/TailDuplication.cpp
+++ b/lib/CodeGen/TailDuplication.cpp
@@ -34,6 +34,7 @@ STATISTIC(NumTails     , "Number of tails duplicated");
 STATISTIC(NumTailDups  , "Number of tail duplicated blocks");
 STATISTIC(NumInstrDups , "Additional instructions due to tail duplication");
 STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
+STATISTIC(NumAddedPHIs , "Number of phis added");
 
 // Heuristic for tail duplication.
 static cl::opt<unsigned>
@@ -80,16 +81,21 @@ namespace {
     void ProcessPHI(MachineInstr *MI, MachineBasicBlock *TailBB,
                     MachineBasicBlock *PredBB,
                     DenseMap<unsigned, unsigned> &LocalVRMap,
-                    SmallVector<std::pair<unsigned,unsigned>, 4> &Copies);
+                    SmallVector<std::pair<unsigned,unsigned>, 4> &Copies,
+                    const DenseSet<unsigned> &UsedByPhi,
+                    bool Remove);
     void DuplicateInstruction(MachineInstr *MI,
                               MachineBasicBlock *TailBB,
                               MachineBasicBlock *PredBB,
                               MachineFunction &MF,
-                              DenseMap<unsigned, unsigned> &LocalVRMap);
+                              DenseMap<unsigned, unsigned> &LocalVRMap,
+                              const DenseSet<unsigned> &UsedByPhi);
     void UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead,
                               SmallVector<MachineBasicBlock*, 8> &TDBBs,
                               SmallSetVector<MachineBasicBlock*, 8> &Succs);
     bool TailDuplicateBlocks(MachineFunction &MF);
+    bool shouldTailDuplicate(const MachineFunction &MF,
+                             MachineBasicBlock &TailBB);
     bool TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
                        SmallVector<MachineBasicBlock*, 8> &TDBBs,
                        SmallVector<MachineInstr*, 16> &Copies);
@@ -146,11 +152,11 @@ static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) {
       for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
         MachineBasicBlock *PHIBB = MI->getOperand(i+1).getMBB();
         if (CheckExtra && !Preds.count(PHIBB)) {
-          // This is not a hard error.
           dbgs() << "Warning: malformed PHI in BB#" << MBB->getNumber()
                  << ": " << *MI;
           dbgs() << "  extra input from predecessor BB#"
                  << PHIBB->getNumber() << '\n';
+          llvm_unreachable(0);
         }
         if (PHIBB->getNumber() < 0) {
           dbgs() << "Malformed PHI in BB#" << MBB->getNumber() << ": " << *MI;
@@ -183,10 +189,6 @@ bool TailDuplicatePass::TailDuplicateBlocks(MachineFunction &MF) {
     if (NumTails == TailDupLimit)
       break;
 
-    // Only duplicate blocks that end with unconditional branches.
-    if (MBB->canFallThrough())
-      continue;
-
     // Save the successors list.
     SmallSetVector<MachineBasicBlock*, 8> Succs(MBB->succ_begin(),
                                                 MBB->succ_end());
@@ -240,7 +242,7 @@ bool TailDuplicatePass::TailDuplicateBlocks(MachineFunction &MF) {
             MachineOperand &UseMO = UI.getOperand();
             MachineInstr *UseMI = &*UI;
             ++UI;
-            if (UseMI->getParent() == DefBB)
+            if (UseMI->getParent() == DefBB && !UseMI->isPHI())
               continue;
             SSAUpdate.RewriteUse(UseMO);
           }
@@ -271,6 +273,7 @@ bool TailDuplicatePass::TailDuplicateBlocks(MachineFunction &MF) {
       MadeChange = true;
     }
   }
+  NumAddedPHIs += NewPHIs.size();
 
   return MadeChange;
 }
@@ -293,6 +296,24 @@ static unsigned getPHISrcRegOpIdx(MachineInstr *MI, MachineBasicBlock *SrcBB) {
   return 0;
 }
 
+
+// Remember which registers are used by phis in this block. This is
+// used to determine which registers are liveout while modifying the
+// block (which is why we need to copy the information).
+static void getRegsUsedByPHIs(const MachineBasicBlock &BB,
+                              DenseSet<unsigned> *UsedByPhi) {
+  for(MachineBasicBlock::const_iterator I = BB.begin(), E = BB.end();
+      I != E; ++I) {
+    const MachineInstr &MI = *I;
+    if (!MI.isPHI())
+      break;
+    for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
+      unsigned SrcReg = MI.getOperand(i).getReg();
+      UsedByPhi->insert(SrcReg);
+    }
+  }
+}
+
 /// AddSSAUpdateEntry - Add a definition and source virtual registers pair for
 /// SSA update.
 void TailDuplicatePass::AddSSAUpdateEntry(unsigned OrigReg, unsigned NewReg,
@@ -315,7 +336,9 @@ void TailDuplicatePass::ProcessPHI(MachineInstr *MI,
                                    MachineBasicBlock *TailBB,
                                    MachineBasicBlock *PredBB,
                                    DenseMap<unsigned, unsigned> &LocalVRMap,
-                         SmallVector<std::pair<unsigned,unsigned>, 4> &Copies) {
+                           SmallVector<std::pair<unsigned,unsigned>, 4> &Copies,
+                                   const DenseSet<unsigned> &RegsUsedByPhi,
+                                   bool Remove) {
   unsigned DefReg = MI->getOperand(0).getReg();
   unsigned SrcOpIdx = getPHISrcRegOpIdx(MI, PredBB);
   assert(SrcOpIdx && "Unable to find matching PHI source?");
@@ -327,9 +350,12 @@ void TailDuplicatePass::ProcessPHI(MachineInstr *MI,
   // available value liveout of the block.
   unsigned NewDef = MRI->createVirtualRegister(RC);
   Copies.push_back(std::make_pair(NewDef, SrcReg));
-  if (isDefLiveOut(DefReg, TailBB, MRI))
+  if (isDefLiveOut(DefReg, TailBB, MRI) || RegsUsedByPhi.count(DefReg))
     AddSSAUpdateEntry(DefReg, NewDef, PredBB);
 
+  if (!Remove)
+    return;
+
   // Remove PredBB from the PHI node.
   MI->RemoveOperand(SrcOpIdx+1);
   MI->RemoveOperand(SrcOpIdx);
@@ -343,7 +369,8 @@ void TailDuplicatePass::DuplicateInstruction(MachineInstr *MI,
                                      MachineBasicBlock *TailBB,
                                      MachineBasicBlock *PredBB,
                                      MachineFunction &MF,
-                                     DenseMap<unsigned, unsigned> &LocalVRMap) {
+                                     DenseMap<unsigned, unsigned> &LocalVRMap,
+                                     const DenseSet<unsigned> &UsedByPhi) {
   MachineInstr *NewMI = TII->duplicate(MI, MF);
   for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
     MachineOperand &MO = NewMI->getOperand(i);
@@ -357,7 +384,7 @@ void TailDuplicatePass::DuplicateInstruction(MachineInstr *MI,
       unsigned NewReg = MRI->createVirtualRegister(RC);
       MO.setReg(NewReg);
       LocalVRMap.insert(std::make_pair(Reg, NewReg));
-      if (isDefLiveOut(Reg, TailBB, MRI))
+      if (isDefLiveOut(Reg, TailBB, MRI) || UsedByPhi.count(Reg))
         AddSSAUpdateEntry(Reg, NewReg, PredBB);
     } else {
       DenseMap<unsigned, unsigned>::iterator VI = LocalVRMap.find(Reg);
@@ -416,6 +443,13 @@ TailDuplicatePass::UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead,
         // This register is defined in the tail block.
         for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) {
           MachineBasicBlock *SrcBB = LI->second[j].first;
+          // If we didn't duplicate a bb into a particular predecessor, we
+          // might still have added an entry to SSAUpdateVals to correcly
+          // recompute SSA. If that case, avoid adding a dummy extra argument
+          // this PHI.
+          if (!SrcBB->isSuccessor(SuccBB))
+            continue;
+
           unsigned SrcReg = LI->second[j].second;
           if (Idx != 0) {
             II->getOperand(Idx).setReg(SrcReg);
@@ -448,14 +482,19 @@ TailDuplicatePass::UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead,
   }
 }
 
-/// TailDuplicate - If it is profitable, duplicate TailBB's contents in each
-/// of its predecessors.
+/// shouldTailDuplicate - Determine if it is profitable to duplicate this block.
 bool
-TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
-                                 SmallVector<MachineBasicBlock*, 8> &TDBBs,
-                                 SmallVector<MachineInstr*, 16> &Copies) {
-  // Set the limit on the number of instructions to duplicate, with a default
-  // of one less than the tail-merge threshold. When optimizing for size,
+TailDuplicatePass::shouldTailDuplicate(const MachineFunction &MF,
+                                       MachineBasicBlock &TailBB) {
+  // Only duplicate blocks that end with unconditional branches.
+  if (TailBB.canFallThrough())
+    return false;
+
+  // Don't try to tail-duplicate single-block loops.
+  if (TailBB.isSuccessor(&TailBB))
+    return false;
+
+  // Set the limit on the cost to duplicate. When optimizing for size,
   // duplicate only one, because one branch instruction can be eliminated to
   // compensate for the duplication.
   unsigned MaxDuplicateCount;
@@ -465,49 +504,56 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
   else
     MaxDuplicateCount = TailDuplicateSize;
 
-  if (PreRegAlloc) {
-    if (TailBB->empty())
-      return false;
-    const TargetInstrDesc &TID = TailBB->back().getDesc();
-    // Pre-regalloc tail duplication hurts compile time and doesn't help
-    // much except for indirect branches and returns.
-    if (!TID.isIndirectBranch() && !TID.isReturn())
-      return false;
-    // If the target has hardware branch prediction that can handle indirect
-    // branches, duplicating them can often make them predictable when there
-    // are common paths through the code.  The limit needs to be high enough
-    // to allow undoing the effects of tail merging and other optimizations
-    // that rearrange the predecessors of the indirect branch.
-    MaxDuplicateCount = 20;
-  }
+  // If the target has hardware branch prediction that can handle indirect
+  // branches, duplicating them can often make them predictable when there
+  // are common paths through the code.  The limit needs to be high enough
+  // to allow undoing the effects of tail merging and other optimizations
+  // that rearrange the predecessors of the indirect branch.
 
-  // Don't try to tail-duplicate single-block loops.
-  if (TailBB->isSuccessor(TailBB))
-    return false;
+  if (PreRegAlloc && !TailBB.empty()) {
+    const TargetInstrDesc &TID = TailBB.back().getDesc();
+    if (TID.isIndirectBranch())
+      MaxDuplicateCount = 20;
+  }
 
   // Check the instructions in the block to determine whether tail-duplication
   // is invalid or unlikely to be profitable.
   unsigned InstrCount = 0;
-  bool HasCall = false;
-  for (MachineBasicBlock::iterator I = TailBB->begin();
-       I != TailBB->end(); ++I) {
+  for (MachineBasicBlock::const_iterator I = TailBB.begin(); I != TailBB.end();
+       ++I) {
     // Non-duplicable things shouldn't be tail-duplicated.
-    if (I->getDesc().isNotDuplicable()) return false;
+    if (I->getDesc().isNotDuplicable())
+      return false;
+
     // Do not duplicate 'return' instructions if this is a pre-regalloc run.
     // A return may expand into a lot more instructions (e.g. reload of callee
     // saved registers) after PEI.
-    if (PreRegAlloc && I->getDesc().isReturn()) return false;
-    // Don't duplicate more than the threshold.
-    if (InstrCount == MaxDuplicateCount) return false;
-    // Remember if we saw a call.
-    if (I->getDesc().isCall()) HasCall = true;
+    if (PreRegAlloc && I->getDesc().isReturn())
+      return false;
+
+    // Avoid duplicating calls before register allocation. Calls presents a
+    // barrier to register allocation so duplicating them may end up increasing
+    // spills.
+    if (PreRegAlloc && I->getDesc().isCall())
+      return false;
+
     if (!I->isPHI() && !I->isDebugValue())
       InstrCount += 1;
+
+    if (InstrCount > MaxDuplicateCount)
+      return false;
   }
-  // Don't tail-duplicate calls before register allocation. Calls presents a
-  // barrier to register allocation so duplicating them may end up increasing
-  // spills.
-  if (InstrCount > 1 && (PreRegAlloc && HasCall))
+
+  return true;
+}
+
+/// TailDuplicate - If it is profitable, duplicate TailBB's contents in each
+/// of its predecessors.
+bool
+TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
+                                 SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                                 SmallVector<MachineInstr*, 16> &Copies) {
+  if (!shouldTailDuplicate(MF, *TailBB))
     return false;
 
   DEBUG(dbgs() << "\n*** Tail-duplicating BB#" << TailBB->getNumber() << '\n');
@@ -518,13 +564,17 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
   bool Changed = false;
   SmallSetVector<MachineBasicBlock*, 8> Preds(TailBB->pred_begin(),
                                               TailBB->pred_end());
+  DenseSet<unsigned> UsedByPhi;
+  getRegsUsedByPHIs(*TailBB, &UsedByPhi);
   for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
        PE = Preds.end(); PI != PE; ++PI) {
     MachineBasicBlock *PredBB = *PI;
 
     assert(TailBB != PredBB &&
            "Single-block loop should have been rejected earlier!");
-    if (PredBB->succ_size() > 1) continue;
+    // EH edges are ignored by AnalyzeBranch.
+    if (PredBB->succ_size() > 1)
+      continue;
 
     MachineBasicBlock *PredTBB, *PredFBB;
     SmallVector<MachineOperand, 4> PredCond;
@@ -532,9 +582,6 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
       continue;
     if (!PredCond.empty())
       continue;
-    // EH edges are ignored by AnalyzeBranch.
-    if (PredBB->succ_size() != 1)
-      continue;
     // Don't duplicate into a fall-through predecessor (at least for now).
     if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough())
       continue;
@@ -557,11 +604,11 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
       if (MI->isPHI()) {
         // Replace the uses of the def of the PHI with the register coming
         // from PredBB.
-        ProcessPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos);
+        ProcessPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, true);
       } else {
         // Replace def of virtual registers with new registers, and update
         // uses with PHI source register or the new registers.
-        DuplicateInstruction(MI, TailBB, PredBB, MF, LocalVRMap);
+        DuplicateInstruction(MI, TailBB, PredBB, MF, LocalVRMap, UsedByPhi);
       }
     }
     MachineBasicBlock::iterator Loc = PredBB->getFirstTerminator();
@@ -570,6 +617,10 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
                                TII->get(TargetOpcode::COPY),
                                CopyInfos[i].first).addReg(CopyInfos[i].second));
     }
+
+    // Simplify
+    TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true);
+
     NumInstrDups += TailBB->size() - 1; // subtract one for removed branch
 
     // Update the CFG.
@@ -590,12 +641,11 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
   MachineBasicBlock *PrevBB = prior(MachineFunction::iterator(TailBB));
   MachineBasicBlock *PriorTBB = 0, *PriorFBB = 0;
   SmallVector<MachineOperand, 4> PriorCond;
-  bool PriorUnAnalyzable =
-    TII->AnalyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true);
   // This has to check PrevBB->succ_size() because EH edges are ignored by
   // AnalyzeBranch.
-  if (!PriorUnAnalyzable && PriorCond.empty() && !PriorTBB &&
-      TailBB->pred_size() == 1 && PrevBB->succ_size() == 1 &&
+  if (PrevBB->succ_size() == 1 && 
+      !TII->AnalyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true) &&
+      PriorCond.empty() && !PriorTBB && TailBB->pred_size() == 1 &&
       !TailBB->hasAddressTaken()) {
     DEBUG(dbgs() << "\nMerging into block: " << *PrevBB
           << "From MBB: " << *TailBB);
@@ -608,7 +658,7 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
         // Replace the uses of the def of the PHI with the register coming
         // from PredBB.
         MachineInstr *MI = &*I++;
-        ProcessPHI(MI, TailBB, PrevBB, LocalVRMap, CopyInfos);
+        ProcessPHI(MI, TailBB, PrevBB, LocalVRMap, CopyInfos, UsedByPhi, true);
         if (MI->getParent())
           MI->eraseFromParent();
       }
@@ -618,7 +668,7 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
         // Replace def of virtual registers with new registers, and update
         // uses with PHI source register or the new registers.
         MachineInstr *MI = &*I++;
-        DuplicateInstruction(MI, TailBB, PrevBB, MF, LocalVRMap);
+        DuplicateInstruction(MI, TailBB, PrevBB, MF, LocalVRMap, UsedByPhi);
         MI->eraseFromParent();
       }
       MachineBasicBlock::iterator Loc = PrevBB->getFirstTerminator();
@@ -639,6 +689,57 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
     Changed = true;
   }
 
+  // If this is after register allocation, there are no phis to fix.
+  if (!PreRegAlloc)
+    return Changed;
+
+  // If we made no changes so far, we are safe.
+  if (!Changed)
+    return Changed;
+
+
+  // Handle the nasty case in that we duplicated a block that is part of a loop
+  // into some but not all of its predecessors. For example:
+  //    1 -> 2 <-> 3                 |
+  //          \                      |
+  //           \---> rest            |
+  // if we duplicate 2 into 1 but not into 3, we end up with
+  // 12 -> 3 <-> 2 -> rest           |
+  //   \             /               |
+  //    \----->-----/                |
+  // If there was a "var = phi(1, 3)" in 2, it has to be ultimately replaced
+  // with a phi in 3 (which now dominates 2).
+  // What we do here is introduce a copy in 3 of the register defined by the
+  // phi, just like when we are duplicating 2 into 3, but we don't copy any
+  // real instructions or remove the 3 -> 2 edge from the phi in 2.
+  for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
+       PE = Preds.end(); PI != PE; ++PI) {
+    MachineBasicBlock *PredBB = *PI;
+    if (std::find(TDBBs.begin(), TDBBs.end(), PredBB) != TDBBs.end())
+      continue;
+
+    // EH edges
+    if (PredBB->succ_size() != 1)
+      continue;
+
+    DenseMap<unsigned, unsigned> LocalVRMap;
+    SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos;
+    MachineBasicBlock::iterator I = TailBB->begin();
+    // Process PHI instructions first.
+    while (I != TailBB->end() && I->isPHI()) {
+      // Replace the uses of the def of the PHI with the register coming
+      // from PredBB.
+      MachineInstr *MI = &*I++;
+      ProcessPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, false);
+    }
+    MachineBasicBlock::iterator Loc = PredBB->getFirstTerminator();
+    for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) {
+      Copies.push_back(BuildMI(*PredBB, Loc, DebugLoc(),
+                               TII->get(TargetOpcode::COPY),
+                               CopyInfos[i].first).addReg(CopyInfos[i].second));
+    }
+  }
+
   return Changed;
 }
 
@@ -655,4 +756,3 @@ void TailDuplicatePass::RemoveDeadBlock(MachineBasicBlock *MBB) {
   // Remove the block.
   MBB->eraseFromParent();
 }
-
diff --git a/lib/CodeGen/TargetInstrInfoImpl.cpp b/lib/CodeGen/TargetInstrInfoImpl.cpp
index 15340a3..34e2b33 100644
--- a/lib/CodeGen/TargetInstrInfoImpl.cpp
+++ b/lib/CodeGen/TargetInstrInfoImpl.cpp
@@ -212,8 +212,7 @@ static const TargetRegisterClass *canFoldCopy(const MachineInstr *MI,
   if (TargetRegisterInfo::isPhysicalRegister(LiveOp.getReg()))
     return RC->contains(LiveOp.getReg()) ? RC : 0;
 
-  const TargetRegisterClass *LiveRC = MRI.getRegClass(LiveReg);
-  if (RC == LiveRC || RC->hasSubClass(LiveRC))
+  if (RC->hasSubClassEq(MRI.getRegClass(LiveReg)))
     return RC;
 
   // FIXME: Allow folding when register classes are memory compatible.
@@ -388,11 +387,6 @@ isReallyTriviallyReMaterializableGeneric(const MachineInstr *MI,
     if (MO.isDef() != (i == 0))
       return false;
 
-    // For the def, it should be the only def of that register.
-    if (MO.isDef() && (llvm::next(MRI.def_begin(Reg)) != MRI.def_end() ||
-                       MRI.isLiveIn(Reg)))
-      return false;
-
     // Don't allow any virtual-register uses. Rematting an instruction with
     // virtual register uses would length the live ranges of the uses, which
     // is not necessarily a good idea, certainly not "trivial".
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index f332d12..2da1bd4 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -23,6 +23,7 @@
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Target/Mangler.h"
 #include "llvm/Target/TargetData.h"
@@ -176,12 +177,59 @@ const MCSection *TargetLoweringObjectFileELF::getEHFrameSection() const {
                                     SectionKind::getDataRel());
 }
 
+MCSymbol *
+TargetLoweringObjectFileELF::getCFIPersonalitySymbol(const GlobalValue *GV,
+                                                     Mangler *Mang,
+                                                MachineModuleInfo *MMI) const {
+  unsigned Encoding = getPersonalityEncoding();
+  switch (Encoding & 0x70) {
+  default:
+    report_fatal_error("We do not support this DWARF encoding yet!");
+  case dwarf::DW_EH_PE_absptr:
+    return  Mang->getSymbol(GV);
+    break;
+  case dwarf::DW_EH_PE_pcrel: {
+    return getContext().GetOrCreateSymbol(StringRef("DW.ref.") +
+                                          Mang->getSymbol(GV)->getName());
+    break;
+  }
+  }
+}
+
+void TargetLoweringObjectFileELF::emitPersonalityValue(MCStreamer &Streamer,
+                                                       const TargetMachine &TM,
+                                                       const MCSymbol *Sym) const {
+  SmallString<64> NameData("DW.ref.");
+  NameData += Sym->getName();
+  MCSymbol *Label = getContext().GetOrCreateSymbol(NameData);
+  Streamer.EmitSymbolAttribute(Label, MCSA_Hidden);
+  Streamer.EmitSymbolAttribute(Label, MCSA_Weak);
+  StringRef Prefix = ".data.";
+  NameData.insert(NameData.begin(), Prefix.begin(), Prefix.end());
+  unsigned Flags = ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::SHF_GROUP;
+  const MCSection *Sec = getContext().getELFSection(NameData,
+                                                    ELF::SHT_PROGBITS,
+                                                    Flags,
+                                                    SectionKind::getDataRel(),
+                                                    0, Label->getName());
+  Streamer.SwitchSection(Sec);
+  Streamer.EmitValueToAlignment(8);
+  Streamer.EmitSymbolAttribute(Label, MCSA_ELF_TypeObject);
+  const MCExpr *E = MCConstantExpr::Create(8, getContext());
+  Streamer.EmitELFSize(Label, E);
+  Streamer.EmitLabel(Label);
+
+  unsigned Size = TM.getTargetData()->getPointerSize();
+  Streamer.EmitSymbolValue(Sym, Size);
+}
+
 static SectionKind
 getELFKindForNamedSection(StringRef Name, SectionKind K) {
-  // FIXME: Why is this here? Codegen is should not be in the business
-  // of figuring section flags. If the user wrote section(".eh_frame"),
-  // we should just pass that to MC which will defer to the assembly
-  // or use its default if producing an object file.
+  // N.B.: The defaults used in here are no the same ones used in MC.
+  // We follow gcc, MC follows gas. For example, given ".section .eh_frame",
+  // both gas and MC will produce a section with no flags. Given
+  // section(".eh_frame") gcc will produce
+  // .section	.eh_frame,"a",@progbits
   if (Name.empty() || Name[0] != '.') return K;
 
   // Some lame default implementation based on some magic section names.
@@ -207,9 +255,6 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) {
       Name.startswith(".llvm.linkonce.tb."))
     return SectionKind::getThreadBSS();
 
-  if (Name == ".eh_frame")
-    return SectionKind::getDataRel();
-
   return K;
 }
 
@@ -424,8 +469,7 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
     }
 
     return TargetLoweringObjectFile::
-      getExprForDwarfReference(SSym, Mang, MMI,
-                               Encoding & ~dwarf::DW_EH_PE_indirect, Streamer);
+      getExprForDwarfReference(SSym, Encoding & ~dwarf::DW_EH_PE_indirect, Streamer);
   }
 
   return TargetLoweringObjectFile::
@@ -438,26 +482,13 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
 
 void TargetLoweringObjectFileMachO::Initialize(MCContext &Ctx,
                                                const TargetMachine &TM) {
-  // _foo.eh symbols are currently always exported so that the linker knows
-  // about them.  This is not necessary on 10.6 and later, but it
-  // doesn't hurt anything.
-  // FIXME: I need to get this from Triple.
-  IsFunctionEHSymbolGlobal = true;
   IsFunctionEHFrameSymbolPrivate = false;
   SupportsWeakOmittedEHFrame = false;
 
+  // .comm doesn't support alignment before Leopard.
   Triple T(((LLVMTargetMachine&)TM).getTargetTriple());
-  if (T.getOS() == Triple::Darwin) {
-    switch (T.getDarwinMajorNumber()) {
-    case 7:  // 10.3 Panther.
-    case 8:  // 10.4 Tiger.
-      CommDirectiveSupportsAlignment = false;
-      break;
-    case 9:   // 10.5 Leopard.
-    case 10:  // 10.6 SnowLeopard.
-      break;
-    }
-  }
+  if (T.isMacOSX() && T.isMacOSXVersionLT(10, 5))
+    CommDirectiveSupportsAlignment = false;
 
   TargetLoweringObjectFile::Initialize(Ctx, TM);
 
@@ -803,14 +834,36 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
     }
 
     return TargetLoweringObjectFile::
-      getExprForDwarfReference(SSym, Mang, MMI,
-                               Encoding & ~dwarf::DW_EH_PE_indirect, Streamer);
+      getExprForDwarfReference(SSym, Encoding & ~dwarf::DW_EH_PE_indirect, Streamer);
   }
 
   return TargetLoweringObjectFile::
     getExprForDwarfGlobalReference(GV, Mang, MMI, Encoding, Streamer);
 }
 
+MCSymbol *TargetLoweringObjectFileMachO::
+getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
+                        MachineModuleInfo *MMI) const {
+  // The mach-o version of this method defaults to returning a stub reference.
+  MachineModuleInfoMachO &MachOMMI =
+    MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+  SmallString<128> Name;
+  Mang->getNameWithPrefix(Name, GV, true);
+  Name += "$non_lazy_ptr";
+
+  // Add information about the stub reference to MachOMMI so that the stub
+  // gets emitted by the asmprinter.
+  MCSymbol *SSym = getContext().GetOrCreateSymbol(Name.str());
+  MachineModuleInfoImpl::StubValueTy &StubSym = MachOMMI.getGVStubEntry(SSym);
+  if (StubSym.getPointer() == 0) {
+    MCSymbol *Sym = Mang->getSymbol(GV);
+    StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
+  }
+
+  return SSym;
+}
+
 unsigned TargetLoweringObjectFileMachO::getPersonalityEncoding() const {
   return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4;
 }
@@ -819,7 +872,7 @@ unsigned TargetLoweringObjectFileMachO::getLSDAEncoding() const {
   return DW_EH_PE_pcrel;
 }
 
-unsigned TargetLoweringObjectFileMachO::getFDEEncoding() const {
+unsigned TargetLoweringObjectFileMachO::getFDEEncoding(bool CFI) const {
   return DW_EH_PE_pcrel;
 }
 
@@ -934,6 +987,20 @@ void TargetLoweringObjectFileCOFF::Initialize(MCContext &Ctx,
     getContext().getCOFFSection(".drectve",
                                 COFF::IMAGE_SCN_LNK_INFO,
                                 SectionKind::getMetadata());
+
+  PDataSection =
+    getContext().getCOFFSection(".pdata",
+                                COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                                COFF::IMAGE_SCN_MEM_READ |
+                                COFF::IMAGE_SCN_MEM_WRITE,
+                                SectionKind::getDataRel());
+
+  XDataSection =
+    getContext().getCOFFSection(".xdata",
+                                COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                                COFF::IMAGE_SCN_MEM_READ |
+                                COFF::IMAGE_SCN_MEM_WRITE,
+                                SectionKind::getDataRel());
 }
 
 const MCSection *TargetLoweringObjectFileCOFF::getEHFrameSection() const {
@@ -944,6 +1011,28 @@ const MCSection *TargetLoweringObjectFileCOFF::getEHFrameSection() const {
                                      SectionKind::getDataRel());
 }
 
+const MCSection *TargetLoweringObjectFileCOFF::getWin64EHFuncTableSection(
+                                                       StringRef suffix) const {
+  if (suffix == "")
+    return PDataSection;
+  return getContext().getCOFFSection((".pdata"+suffix).str(),
+                                     COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                                     COFF::IMAGE_SCN_MEM_READ |
+                                     COFF::IMAGE_SCN_MEM_WRITE,
+                                     SectionKind::getDataRel());
+}
+
+const MCSection *TargetLoweringObjectFileCOFF::getWin64EHTableSection(
+                                                       StringRef suffix) const {
+  if (suffix == "")
+    return XDataSection;
+  return getContext().getCOFFSection((".xdata"+suffix).str(),
+                                     COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                                     COFF::IMAGE_SCN_MEM_READ |
+                                     COFF::IMAGE_SCN_MEM_WRITE,
+                                     SectionKind::getDataRel());
+}
+
 
 static unsigned
 getCOFFSectionFlags(SectionKind K) {
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index 52ea872..f54d879 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1125,6 +1125,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) {
             break; // The tied operands have been eliminated.
         }
 
+        bool IsEarlyClobber = false;
         bool RemovedKillFlag = false;
         bool AllUsesCopied = true;
         unsigned LastCopiedReg = 0;
@@ -1132,7 +1133,11 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) {
         for (unsigned tpi = 0, tpe = TiedPairs.size(); tpi != tpe; ++tpi) {
           unsigned SrcIdx = TiedPairs[tpi].first;
           unsigned DstIdx = TiedPairs[tpi].second;
-          unsigned regA = mi->getOperand(DstIdx).getReg();
+
+          const MachineOperand &DstMO = mi->getOperand(DstIdx);
+          unsigned regA = DstMO.getReg();
+          IsEarlyClobber |= DstMO.isEarlyClobber();
+
           // Grab regB from the instruction because it may have changed if the
           // instruction was commuted.
           regB = mi->getOperand(SrcIdx).getReg();
@@ -1196,15 +1201,17 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) {
         }
 
         if (AllUsesCopied) {
-          // Replace other (un-tied) uses of regB with LastCopiedReg.
-          for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) {
-            MachineOperand &MO = mi->getOperand(i);
-            if (MO.isReg() && MO.getReg() == regB && MO.isUse()) {
-              if (MO.isKill()) {
-                MO.setIsKill(false);
-                RemovedKillFlag = true;
+          if (!IsEarlyClobber) {
+            // Replace other (un-tied) uses of regB with LastCopiedReg.
+            for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) {
+              MachineOperand &MO = mi->getOperand(i);
+              if (MO.isReg() && MO.getReg() == regB && MO.isUse()) {
+                if (MO.isKill()) {
+                  MO.setIsKill(false);
+                  RemovedKillFlag = true;
+                }
+                MO.setReg(LastCopiedReg);
               }
-              MO.setReg(LastCopiedReg);
             }
           }
 
diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp
index 48d8ab1..52693f0 100644
--- a/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/lib/CodeGen/UnreachableBlockElim.cpp
@@ -196,8 +196,11 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) {
         temp->eraseFromParent();
         ModifiedPHI = true;
 
-        if (Input != Output)
-          F.getRegInfo().replaceRegWith(Output, Input);
+        if (Input != Output) {
+          MachineRegisterInfo &MRI = F.getRegInfo();
+          MRI.constrainRegClass(Input, MRI.getRegClass(Output));
+          MRI.replaceRegWith(Output, Input);
+        }
 
         continue;
       }
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index 7a7ea69..7557979 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -42,6 +42,7 @@
 using namespace llvm;
 
 STATISTIC(NumSpills  , "Number of register spills");
+STATISTIC(NumIdCopies, "Number of identity moves eliminated after rewriting");
 
 //===----------------------------------------------------------------------===//
 //  VirtRegMap implementation
@@ -260,6 +261,8 @@ void VirtRegMap::rewrite(SlotIndexes *Indexes) {
                << "********** Function: "
                << MF->getFunction()->getName() << '\n');
   DEBUG(dump());
+  SmallVector<unsigned, 8> SuperDeads;
+  SmallVector<unsigned, 8> SuperDefs;
   SmallVector<unsigned, 8> SuperKills;
 
   for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
@@ -283,12 +286,13 @@ void VirtRegMap::rewrite(SlotIndexes *Indexes) {
         if (MO.getSubReg()) {
           // A virtual register kill refers to the whole register, so we may
           // have to add <imp-use,kill> operands for the super-register.
-          if (MO.isUse() && MO.isKill() && !MO.isUndef())
-            SuperKills.push_back(PhysReg);
-
-          // We don't have to deal with sub-register defs because
-          // LiveIntervalAnalysis already added the necessary <imp-def>
-          // operands.
+          if (MO.isUse()) {
+            if (MO.isKill() && !MO.isUndef())
+              SuperKills.push_back(PhysReg);
+          } else if (MO.isDead())
+            SuperDeads.push_back(PhysReg);
+          else
+            SuperDefs.push_back(PhysReg);
 
           // PhysReg operands cannot have subregister indexes.
           PhysReg = TRI->getSubReg(PhysReg, MO.getSubReg());
@@ -305,10 +309,17 @@ void VirtRegMap::rewrite(SlotIndexes *Indexes) {
       while (!SuperKills.empty())
         MI->addRegisterKilled(SuperKills.pop_back_val(), TRI, true);
 
+      while (!SuperDeads.empty())
+        MI->addRegisterDead(SuperDeads.pop_back_val(), TRI, true);
+
+      while (!SuperDefs.empty())
+        MI->addRegisterDefined(SuperDefs.pop_back_val(), TRI);
+
       DEBUG(dbgs() << "> " << *MI);
 
       // Finally, remove any identity copies.
       if (MI->isIdentityCopy()) {
+        ++NumIdCopies;
         if (MI->getNumOperands() == 2) {
           DEBUG(dbgs() << "Deleting identity copy.\n");
           RemoveMachineInstrFromMaps(MI);
diff --git a/lib/CodeGen/VirtRegRewriter.cpp b/lib/CodeGen/VirtRegRewriter.cpp
index 67be1b0..1850658 100644
--- a/lib/CodeGen/VirtRegRewriter.cpp
+++ b/lib/CodeGen/VirtRegRewriter.cpp
@@ -32,7 +32,7 @@ STATISTIC(NumCommutes, "Number of instructions commuted");
 STATISTIC(NumDRM     , "Number of re-materializable defs elided");
 STATISTIC(NumStores  , "Number of stores added");
 STATISTIC(NumPSpills , "Number of physical register spills");
-STATISTIC(NumOmitted , "Number of reloads omited");
+STATISTIC(NumOmitted , "Number of reloads omitted");
 STATISTIC(NumAvoided , "Number of reloads deemed unnecessary");
 STATISTIC(NumCopified, "Number of available reloads turned into copies");
 STATISTIC(NumReMats  , "Number of re-materialization");
@@ -669,7 +669,7 @@ static void UpdateKills(MachineInstr &MI, const TargetRegisterInfo* TRI,
   }
 }
 
-/// ReMaterialize - Re-materialize definition for Reg targetting DestReg.
+/// ReMaterialize - Re-materialize definition for Reg targeting DestReg.
 ///
 static void ReMaterialize(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator &MII,
diff --git a/lib/ExecutionEngine/CMakeLists.txt b/lib/ExecutionEngine/CMakeLists.txt
index 8bff265..58caae8 100644
--- a/lib/ExecutionEngine/CMakeLists.txt
+++ b/lib/ExecutionEngine/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_llvm_library(LLVMExecutionEngine
   ExecutionEngine.cpp
   ExecutionEngineBindings.cpp
+  TargetSelect.cpp
   )
 
 add_subdirectory(Interpreter)
diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp
index 13e07ac..7652090 100644
--- a/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -29,6 +29,7 @@
 #include "llvm/Support/DynamicLibrary.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
 #include <cmath>
 #include <cstring>
 using namespace llvm;
@@ -42,20 +43,14 @@ ExecutionEngine *(*ExecutionEngine::JITCtor)(
   JITMemoryManager *JMM,
   CodeGenOpt::Level OptLevel,
   bool GVsWithCode,
-  CodeModel::Model CMM,
-  StringRef MArch,
-  StringRef MCPU,
-  const SmallVectorImpl<std::string>& MAttrs) = 0;
+  TargetMachine *TM) = 0;
 ExecutionEngine *(*ExecutionEngine::MCJITCtor)(
   Module *M,
   std::string *ErrorStr,
   JITMemoryManager *JMM,
   CodeGenOpt::Level OptLevel,
   bool GVsWithCode,
-  CodeModel::Model CMM,
-  StringRef MArch,
-  StringRef MCPU,
-  const SmallVectorImpl<std::string>& MAttrs) = 0;
+  TargetMachine *TM) = 0;
 ExecutionEngine *(*ExecutionEngine::InterpCtor)(Module *M,
                                                 std::string *ErrorStr) = 0;
 
@@ -313,13 +308,17 @@ void ExecutionEngine::runStaticConstructorsDestructors(Module *module,
 
   // Should be an array of '{ i32, void ()* }' structs.  The first value is
   // the init priority, which we ignore.
+  if (isa<ConstantAggregateZero>(GV->getInitializer()))
+    return;
   ConstantArray *InitList = cast<ConstantArray>(GV->getInitializer());
   for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) {
+    if (isa<ConstantAggregateZero>(InitList->getOperand(i)))
+      continue;
     ConstantStruct *CS = cast<ConstantStruct>(InitList->getOperand(i));
 
     Constant *FP = CS->getOperand(1);
     if (FP->isNullValue())
-      break;  // Found a null terminator, exit.
+      continue;  // Found a sentinal value, ignore.
 
     // Strip off constant expression casts.
     if (ConstantExpr *CE = dyn_cast<ConstantExpr>(FP))
@@ -415,6 +414,35 @@ ExecutionEngine *ExecutionEngine::create(Module *M,
       .create();
 }
 
+/// createJIT - This is the factory method for creating a JIT for the current
+/// machine, it does not fall back to the interpreter.  This takes ownership
+/// of the module.
+ExecutionEngine *ExecutionEngine::createJIT(Module *M,
+                                            std::string *ErrorStr,
+                                            JITMemoryManager *JMM,
+                                            CodeGenOpt::Level OptLevel,
+                                            bool GVsWithCode,
+                                            CodeModel::Model CMM) {
+  if (ExecutionEngine::JITCtor == 0) {
+    if (ErrorStr)
+      *ErrorStr = "JIT has not been linked in.";
+    return 0;
+  }
+
+  // Use the defaults for extra parameters.  Users can use EngineBuilder to
+  // set them.
+  StringRef MArch = "";
+  StringRef MCPU = "";
+  SmallVector<std::string, 1> MAttrs;
+
+  TargetMachine *TM =
+          EngineBuilder::selectTarget(M, MArch, MCPU, MAttrs, ErrorStr);
+  if (!TM || (ErrorStr && ErrorStr->length() > 0)) return 0;
+  TM->setCodeModel(CMM);
+
+  return ExecutionEngine::JITCtor(M, ErrorStr, JMM, OptLevel, GVsWithCode, TM);
+}
+
 ExecutionEngine *EngineBuilder::create() {
   // Make sure we can resolve symbols in the program as well. The zero arg
   // to the function tells DynamicLibrary to load the program, not a library.
@@ -437,18 +465,21 @@ ExecutionEngine *EngineBuilder::create() {
   // Unless the interpreter was explicitly selected or the JIT is not linked,
   // try making a JIT.
   if (WhichEngine & EngineKind::JIT) {
-    if (UseMCJIT && ExecutionEngine::MCJITCtor) {
-      ExecutionEngine *EE =
-        ExecutionEngine::MCJITCtor(M, ErrorStr, JMM, OptLevel,
-                                   AllocateGVsWithCode, CMModel,
-                                   MArch, MCPU, MAttrs);
-      if (EE) return EE;
-    } else if (ExecutionEngine::JITCtor) {
-      ExecutionEngine *EE =
-        ExecutionEngine::JITCtor(M, ErrorStr, JMM, OptLevel,
-                                 AllocateGVsWithCode, CMModel,
-                                 MArch, MCPU, MAttrs);
-      if (EE) return EE;
+    if (TargetMachine *TM =
+        EngineBuilder::selectTarget(M, MArch, MCPU, MAttrs, ErrorStr)) {
+      TM->setCodeModel(CMModel);
+
+      if (UseMCJIT && ExecutionEngine::MCJITCtor) {
+        ExecutionEngine *EE =
+          ExecutionEngine::MCJITCtor(M, ErrorStr, JMM, OptLevel,
+                                     AllocateGVsWithCode, TM);
+        if (EE) return EE;
+      } else if (ExecutionEngine::JITCtor) {
+        ExecutionEngine *EE =
+          ExecutionEngine::JITCtor(M, ErrorStr, JMM, OptLevel,
+                                   AllocateGVsWithCode, TM);
+        if (EE) return EE;
+      }
     }
   }
 
@@ -835,7 +866,7 @@ void ExecutionEngine::StoreValueToMemory(const GenericValue &Val,
   case Type::PointerTyID:
     // Ensure 64 bit target pointers are fully initialized on 32 bit hosts.
     if (StoreBytes != sizeof(PointerTy))
-      memset(Ptr, 0, StoreBytes);
+      memset(&(Ptr->PointerVal), 0, StoreBytes);
 
     *((PointerTy*)Ptr) = Val.PointerVal;
     break;
diff --git a/lib/ExecutionEngine/JIT/CMakeLists.txt b/lib/ExecutionEngine/JIT/CMakeLists.txt
index 42020d6..cefb0ae 100644
--- a/lib/ExecutionEngine/JIT/CMakeLists.txt
+++ b/lib/ExecutionEngine/JIT/CMakeLists.txt
@@ -9,5 +9,4 @@ add_llvm_library(LLVMJIT
   JITEmitter.cpp
   JITMemoryManager.cpp
   OProfileJITEventListener.cpp
-  TargetSelect.cpp
   )
diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp
index 56121c1..8fceaf2 100644
--- a/lib/ExecutionEngine/JIT/JIT.cpp
+++ b/lib/ExecutionEngine/JIT/JIT.cpp
@@ -203,39 +203,18 @@ void DarwinRegisterFrame(void* FrameBegin) {
 /// createJIT - This is the factory method for creating a JIT for the current
 /// machine, it does not fall back to the interpreter.  This takes ownership
 /// of the module.
-ExecutionEngine *ExecutionEngine::createJIT(Module *M,
-                                            std::string *ErrorStr,
-                                            JITMemoryManager *JMM,
-                                            CodeGenOpt::Level OptLevel,
-                                            bool GVsWithCode,
-                                            CodeModel::Model CMM) {
-  // Use the defaults for extra parameters.  Users can use EngineBuilder to
-  // set them.
-  StringRef MArch = "";
-  StringRef MCPU = "";
-  SmallVector<std::string, 1> MAttrs;
-  return JIT::createJIT(M, ErrorStr, JMM, OptLevel, GVsWithCode, CMM,
-                        MArch, MCPU, MAttrs);
-}
-
 ExecutionEngine *JIT::createJIT(Module *M,
                                 std::string *ErrorStr,
                                 JITMemoryManager *JMM,
                                 CodeGenOpt::Level OptLevel,
                                 bool GVsWithCode,
-                                CodeModel::Model CMM,
-                                StringRef MArch,
-                                StringRef MCPU,
-                                const SmallVectorImpl<std::string>& MAttrs) {
+                                TargetMachine *TM) {
   // Try to register the program as a source of symbols to resolve against.
+  //
+  // FIXME: Don't do this here.
   sys::DynamicLibrary::LoadLibraryPermanently(0, NULL);
 
-  // Pick a target either via -march or by guessing the native arch.
-  TargetMachine *TM = JIT::selectTarget(M, MArch, MCPU, MAttrs, ErrorStr);
-  if (!TM || (ErrorStr && ErrorStr->length() > 0)) return 0;
-  TM->setCodeModel(CMM);
-
-  // If the target supports JIT code generation, create a the JIT.
+  // If the target supports JIT code generation, create the JIT.
   if (TargetJITInfo *TJ = TM->getJITInfo()) {
     return new JIT(M, *TM, *TJ, JMM, OptLevel, GVsWithCode);
   } else {
@@ -666,7 +645,7 @@ void JIT::jitTheFunction(Function *F, const MutexGuard &locked) {
 }
 
 /// getPointerToFunction - This method is used to get the address of the
-/// specified function, compiling it if neccesary.
+/// specified function, compiling it if necessary.
 ///
 void *JIT::getPointerToFunction(Function *F) {
 
diff --git a/lib/ExecutionEngine/JIT/JIT.h b/lib/ExecutionEngine/JIT/JIT.h
index b576c16..b879fc3 100644
--- a/lib/ExecutionEngine/JIT/JIT.h
+++ b/lib/ExecutionEngine/JIT/JIT.h
@@ -181,23 +181,12 @@ public:
   ///
   JITCodeEmitter *getCodeEmitter() const { return JCE; }
 
-  /// selectTarget - Pick a target either via -march or by guessing the native
-  /// arch.  Add any CPU features specified via -mcpu or -mattr.
-  static TargetMachine *selectTarget(Module *M,
-                                     StringRef MArch,
-                                     StringRef MCPU,
-                                     const SmallVectorImpl<std::string>& MAttrs,
-                                     std::string *Err);
-
   static ExecutionEngine *createJIT(Module *M,
                                     std::string *ErrorStr,
                                     JITMemoryManager *JMM,
                                     CodeGenOpt::Level OptLevel,
                                     bool GVsWithCode,
-                                    CodeModel::Model CMM,
-                                    StringRef MArch,
-                                    StringRef MCPU,
-                                    const SmallVectorImpl<std::string>& MAttrs);
+                                    TargetMachine *TM);
 
   // Run the JIT on F and return information about the generated code
   void runJITOnFunction(Function *F, MachineCodeInfo *MCI = 0);
diff --git a/lib/ExecutionEngine/JIT/JITDebugRegisterer.cpp b/lib/ExecutionEngine/JIT/JITDebugRegisterer.cpp
index 3b5acb7..e71c20b 100644
--- a/lib/ExecutionEngine/JIT/JITDebugRegisterer.cpp
+++ b/lib/ExecutionEngine/JIT/JITDebugRegisterer.cpp
@@ -27,7 +27,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Mutex.h"
 #include <string>
-#include <vector>
 
 namespace llvm {
 
@@ -143,7 +142,7 @@ void JITDebugRegisterer::RegisterFunction(const Function *F, DebugInfo &I) {
 
   // Add a mapping from F to the entry and buffer, so we can delete this
   // info later.
-  FnMap[F] = std::make_pair<std::string, jit_code_entry*>(Buffer, JITCodeEntry);
+  FnMap[F] = std::make_pair(Buffer, JITCodeEntry);
 
   // Acquire the lock and do the registration.
   {
diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp
index 3b4e750..d046b8a 100644
--- a/lib/ExecutionEngine/JIT/JITEmitter.cpp
+++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp
@@ -128,7 +128,7 @@ namespace {
       return GlobalToIndirectSymMap;
     }
 
-    pair<void *, Function *> LookupFunctionFromCallSite(
+    std::pair<void *, Function *> LookupFunctionFromCallSite(
         const MutexGuard &locked, void *CallSite) const {
       assert(locked.holds(TheJIT->lock));
 
@@ -646,7 +646,7 @@ void *JITResolver::JITCompilerFn(void *Stub) {
 
     // The address given to us for the stub may not be exactly right, it might
     // be a little bit after the stub.  As such, use upper_bound to find it.
-    pair<void*, Function*> I =
+    std::pair<void*, Function*> I =
       JR->state.LookupFunctionFromCallSite(locked, Stub);
     F = I.second;
     ActualPtr = I.first;
diff --git a/lib/ExecutionEngine/JIT/TargetSelect.cpp b/lib/ExecutionEngine/JIT/TargetSelect.cpp
deleted file mode 100644
index 8d92ab0..0000000
--- a/lib/ExecutionEngine/JIT/TargetSelect.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-//===-- TargetSelect.cpp - Target Chooser Code ----------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This just asks the TargetRegistry for the appropriate JIT to use, and allows
-// the user to specify a specific one on the commandline with -march=x. Clients
-// should initialize targets prior to calling createJIT.
-//
-//===----------------------------------------------------------------------===//
-
-#include "JIT.h"
-#include "llvm/Module.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/Host.h"
-#include "llvm/Target/SubtargetFeature.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegistry.h"
-using namespace llvm;
-
-/// selectTarget - Pick a target either via -march or by guessing the native
-/// arch.  Add any CPU features specified via -mcpu or -mattr.
-TargetMachine *JIT::selectTarget(Module *Mod,
-                                 StringRef MArch,
-                                 StringRef MCPU,
-                                 const SmallVectorImpl<std::string>& MAttrs,
-                                 std::string *ErrorStr) {
-  Triple TheTriple(Mod->getTargetTriple());
-  if (TheTriple.getTriple().empty())
-    TheTriple.setTriple(sys::getHostTriple());
-
-  // Adjust the triple to match what the user requested.
-  const Target *TheTarget = 0;
-  if (!MArch.empty()) {
-    for (TargetRegistry::iterator it = TargetRegistry::begin(),
-           ie = TargetRegistry::end(); it != ie; ++it) {
-      if (MArch == it->getName()) {
-        TheTarget = &*it;
-        break;
-      }
-    }
-
-    if (!TheTarget) {
-      *ErrorStr = "No available targets are compatible with this -march, "
-        "see -version for the available targets.\n";
-      return 0;
-    }
-
-    // Adjust the triple to match (if known), otherwise stick with the
-    // module/host triple.
-    Triple::ArchType Type = Triple::getArchTypeForLLVMName(MArch);
-    if (Type != Triple::UnknownArch)
-      TheTriple.setArch(Type);
-  } else {
-    std::string Error;
-    TheTarget = TargetRegistry::lookupTarget(TheTriple.getTriple(), Error);
-    if (TheTarget == 0) {
-      if (ErrorStr)
-        *ErrorStr = Error;
-      return 0;
-    }
-  }
-
-  if (!TheTarget->hasJIT()) {
-    errs() << "WARNING: This target JIT is not designed for the host you are"
-           << " running.  If bad things happen, please choose a different "
-           << "-march switch.\n";
-  }
-
-  // Package up features to be passed to target/subtarget
-  std::string FeaturesStr;
-  if (!MCPU.empty() || !MAttrs.empty()) {
-    SubtargetFeatures Features;
-    Features.setCPU(MCPU);
-    for (unsigned i = 0; i != MAttrs.size(); ++i)
-      Features.AddFeature(MAttrs[i]);
-    FeaturesStr = Features.getString();
-  }
-
-  // Allocate a target...
-  TargetMachine *Target =
-    TheTarget->createTargetMachine(TheTriple.getTriple(), FeaturesStr);
-  assert(Target && "Could not allocate target machine!");
-  return Target;
-}
diff --git a/lib/ExecutionEngine/MCJIT/CMakeLists.txt b/lib/ExecutionEngine/MCJIT/CMakeLists.txt
index 6553079..38fdffa 100644
--- a/lib/ExecutionEngine/MCJIT/CMakeLists.txt
+++ b/lib/ExecutionEngine/MCJIT/CMakeLists.txt
@@ -1,5 +1,4 @@
 add_llvm_library(LLVMMCJIT
   MCJIT.cpp
-  TargetSelect.cpp
   Intercept.cpp
   )
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index 148e0d9..4475f4d 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -1,4 +1,4 @@
-//===-- MCJIT.cpp - MC-based Just-in-Time Compiler --------------------------===//
+//===-- MCJIT.cpp - MC-based Just-in-Time Compiler ------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -38,27 +38,15 @@ ExecutionEngine *MCJIT::createJIT(Module *M,
                                   JITMemoryManager *JMM,
                                   CodeGenOpt::Level OptLevel,
                                   bool GVsWithCode,
-                                  CodeModel::Model CMM,
-                                  StringRef MArch,
-                                  StringRef MCPU,
-                                  const SmallVectorImpl<std::string>& MAttrs) {
+                                  TargetMachine *TM) {
   // Try to register the program as a source of symbols to resolve against.
   //
   // FIXME: Don't do this here.
   sys::DynamicLibrary::LoadLibraryPermanently(0, NULL);
 
-  // Pick a target either via -march or by guessing the native arch.
-  //
-  // FIXME: This should be lifted out of here, it isn't something which should
-  // be part of the JIT policy, rather the burden for this selection should be
-  // pushed to clients.
-  TargetMachine *TM = MCJIT::selectTarget(M, MArch, MCPU, MAttrs, ErrorStr);
-  if (!TM || (ErrorStr && ErrorStr->length() > 0)) return 0;
-  TM->setCodeModel(CMM);
-
   // If the target supports JIT code generation, create the JIT.
   if (TargetJITInfo *TJ = TM->getJITInfo())
-    return new MCJIT(M, TM, *TJ, new MCJITMemoryManager(JMM), OptLevel,
+    return new MCJIT(M, TM, *TJ, new MCJITMemoryManager(JMM, M), OptLevel,
                      GVsWithCode);
 
   if (ErrorStr)
@@ -93,6 +81,8 @@ MCJIT::MCJIT(Module *m, TargetMachine *tm, TargetJITInfo &tji,
                                                               Buffer.size()));
   if (Dyld.loadObject(MB))
     report_fatal_error(Dyld.getErrorString());
+  // Resolve any relocations.
+  Dyld.resolveRelocations();
 }
 
 MCJIT::~MCJIT() {
@@ -112,8 +102,12 @@ void *MCJIT::getPointerToFunction(Function *F) {
     return Addr;
   }
 
-  Twine Name = TM->getMCAsmInfo()->getGlobalPrefix() + F->getName();
-  return (void*)Dyld.getSymbolAddress(Name.str());
+  // FIXME: Should we be using the mangler for this? Probably.
+  StringRef BaseName = F->getName();
+  if (BaseName[0] == '\1')
+    return (void*)Dyld.getSymbolAddress(BaseName.substr(1));
+  return (void*)Dyld.getSymbolAddress((TM->getMCAsmInfo()->getGlobalPrefix()
+                                       + BaseName).str());
 }
 
 void *MCJIT::recompileAndRelinkFunction(Function *F) {
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.h b/lib/ExecutionEngine/MCJIT/MCJIT.h
index 1b50766..b64c21a 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.h
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.h
@@ -76,22 +76,12 @@ public:
     MCJITCtor = createJIT;
   }
 
-  // FIXME: This routine is scheduled for termination. Do not use it.
-  static TargetMachine *selectTarget(Module *M,
-                                     StringRef MArch,
-                                     StringRef MCPU,
-                                     const SmallVectorImpl<std::string>& MAttrs,
-                                     std::string *Err);
-
   static ExecutionEngine *createJIT(Module *M,
                                     std::string *ErrorStr,
                                     JITMemoryManager *JMM,
                                     CodeGenOpt::Level OptLevel,
                                     bool GVsWithCode,
-                                    CodeModel::Model CMM,
-                                    StringRef MArch,
-                                    StringRef MCPU,
-                                    const SmallVectorImpl<std::string>& MAttrs);
+                                    TargetMachine *TM);
 
   // @}
 };
diff --git a/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h b/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h
index 0108ecc..40bc031 100644
--- a/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h
+++ b/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h
@@ -26,13 +26,21 @@ class MCJITMemoryManager : public RTDyldMemoryManager {
   // FIXME: Multiple modules.
   Module *M;
 public:
-  MCJITMemoryManager(JITMemoryManager *jmm) : JMM(jmm) {}
+  MCJITMemoryManager(JITMemoryManager *jmm, Module *m) : JMM(jmm), M(m) {}
 
   // Allocate ActualSize bytes, or more, for the named function. Return
   // a pointer to the allocated memory and update Size to reflect how much
   // memory was acutally allocated.
   uint8_t *startFunctionBody(const char *Name, uintptr_t &Size) {
+    // FIXME: This should really reference the MCAsmInfo to get the global
+    //        prefix.
+    if (Name[0] == '_') ++Name;
     Function *F = M->getFunction(Name);
+    // Some ObjC names have a prefixed \01 in the IR. If we failed to find
+    // the symbol and it's of the ObjC conventions (starts with "-"), try
+    // prepending a \01 and see if we can find it that way.
+    if (!F && Name[0] == '-')
+      F = M->getFunction((Twine("\1") + Name).str());
     assert(F && "No matching function in JIT IR Module!");
     return JMM->startFunctionBody(F, Size);
   }
@@ -41,7 +49,15 @@ public:
   // memory was actually used.
   void endFunctionBody(const char *Name, uint8_t *FunctionStart,
                        uint8_t *FunctionEnd) {
+    // FIXME: This should really reference the MCAsmInfo to get the global
+    //        prefix.
+    if (Name[0] == '_') ++Name;
     Function *F = M->getFunction(Name);
+    // Some ObjC names have a prefixed \01 in the IR. If we failed to find
+    // the symbol and it's of the ObjC conventions (starts with "-"), try
+    // prepending a \01 and see if we can find it that way.
+    if (!F && Name[0] == '-')
+      F = M->getFunction((Twine("\1") + Name).str());
     assert(F && "No matching function in JIT IR Module!");
     JMM->endFunctionBody(F, FunctionStart, FunctionEnd);
   }
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index 29fced4..eda4cbb 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -41,17 +41,38 @@ class RuntimeDyldImpl {
   // The MemoryManager to load objects into.
   RTDyldMemoryManager *MemMgr;
 
+  // FIXME: This all assumes we're dealing with external symbols for anything
+  //        explicitly referenced. I.e., we can index by name and things
+  //        will work out. In practice, this may not be the case, so we
+  //        should find a way to effectively generalize.
 
   // For each function, we have a MemoryBlock of it's instruction data.
   StringMap<sys::MemoryBlock> Functions;
 
   // Master symbol table. As modules are loaded and external symbols are
   // resolved, their addresses are stored here.
-  StringMap<uint64_t> SymbolTable;
-
-  // FIXME: Should have multiple data blocks, one for each loaded chunk of
-  //        compiled code.
-  sys::MemoryBlock Data;
+  StringMap<uint8_t*> SymbolTable;
+
+  // For each symbol, keep a list of relocations based on it. Anytime
+  // its address is reassigned (the JIT re-compiled the function, e.g.),
+  // the relocations get re-resolved.
+  struct RelocationEntry {
+    std::string Target;     // Object this relocation is contained in.
+    uint64_t    Offset;     // Offset into the object for the relocation.
+    uint32_t    Data;       // Second word of the raw macho relocation entry.
+    int64_t     Addend;     // Addend encoded in the instruction itself, if any.
+    bool        isResolved; // Has this relocation been resolved previously?
+
+    RelocationEntry(StringRef t, uint64_t offset, uint32_t data, int64_t addend)
+      : Target(t), Offset(offset), Data(data), Addend(addend),
+        isResolved(false) {}
+  };
+  typedef SmallVector<RelocationEntry, 4> RelocationList;
+  StringMap<RelocationList> Relocations;
+
+  // FIXME: Also keep a map of all the relocations contained in an object. Use
+  // this to dynamically answer whether all of the relocations in it have
+  // been resolved or not.
 
   bool HasError;
   std::string ErrorStr;
@@ -65,12 +86,11 @@ class RuntimeDyldImpl {
 
   void extractFunction(StringRef Name, uint8_t *StartAddress,
                        uint8_t *EndAddress);
-  bool resolveRelocation(uint32_t BaseSection, macho::RelocationEntry RE,
-                         SmallVectorImpl<void *> &SectionBases,
-                         SmallVectorImpl<StringRef> &SymbolNames);
-  bool resolveX86_64Relocation(intptr_t Address, intptr_t Value, bool isPCRel,
+  bool resolveRelocation(uint8_t *Address, uint8_t *Value, bool isPCRel,
+                         unsigned Type, unsigned Size);
+  bool resolveX86_64Relocation(uintptr_t Address, uintptr_t Value, bool isPCRel,
                                unsigned Type, unsigned Size);
-  bool resolveARMRelocation(intptr_t Address, intptr_t Value, bool isPCRel,
+  bool resolveARMRelocation(uintptr_t Address, uintptr_t Value, bool isPCRel,
                             unsigned Type, unsigned Size);
 
   bool loadSegment32(const MachOObject *Obj,
@@ -85,13 +105,15 @@ public:
 
   bool loadObject(MemoryBuffer *InputBuffer);
 
-  uint64_t getSymbolAddress(StringRef Name) {
+  void *getSymbolAddress(StringRef Name) {
     // FIXME: Just look up as a function for now. Overly simple of course.
     // Work in progress.
-    return (uint64_t)Functions.lookup(Name).base();
+    return SymbolTable.lookup(Name);
   }
 
-  sys::MemoryBlock getMemoryBlock() { return Data; }
+  void resolveRelocations();
+
+  void reassignSymbolAddress(StringRef Name, uint8_t *Addr);
 
   // Is the linker in an error state?
   bool hasError() { return HasError; }
@@ -107,75 +129,41 @@ void RuntimeDyldImpl::extractFunction(StringRef Name, uint8_t *StartAddress,
                                       uint8_t *EndAddress) {
   // Allocate memory for the function via the memory manager.
   uintptr_t Size = EndAddress - StartAddress + 1;
-  uint8_t *Mem = MemMgr->startFunctionBody(Name.data(), Size);
+  uintptr_t AllocSize = Size;
+  uint8_t *Mem = MemMgr->startFunctionBody(Name.data(), AllocSize);
   assert(Size >= (uint64_t)(EndAddress - StartAddress + 1) &&
          "Memory manager failed to allocate enough memory!");
   // Copy the function payload into the memory block.
-  memcpy(Mem, StartAddress, EndAddress - StartAddress + 1);
+  memcpy(Mem, StartAddress, Size);
   MemMgr->endFunctionBody(Name.data(), Mem, Mem + Size);
   // Remember where we put it.
   Functions[Name] = sys::MemoryBlock(Mem, Size);
-  DEBUG(dbgs() << "    allocated to " << Mem << "\n");
+  // Default the assigned address for this symbol to wherever this
+  // allocated it.
+  SymbolTable[Name] = Mem;
+  DEBUG(dbgs() << "    allocated to [" << Mem << ", " << Mem + Size << "]\n");
 }
 
 bool RuntimeDyldImpl::
-resolveRelocation(uint32_t BaseSection, macho::RelocationEntry RE,
-                  SmallVectorImpl<void *> &SectionBases,
-                  SmallVectorImpl<StringRef> &SymbolNames) {
-  // struct relocation_info {
-  //   int32_t r_address;
-  //   uint32_t r_symbolnum:24,
-  //            r_pcrel:1,
-  //            r_length:2,
-  //            r_extern:1,
-  //            r_type:4;
-  // };
-  uint32_t SymbolNum = RE.Word1 & 0xffffff; // 24-bit value
-  bool isPCRel = (RE.Word1 >> 24) & 1;
-  unsigned Log2Size = (RE.Word1 >> 25) & 3;
-  bool isExtern = (RE.Word1 >> 27) & 1;
-  unsigned Type = (RE.Word1 >> 28) & 0xf;
-  if (RE.Word0 & macho::RF_Scattered)
-    return Error("NOT YET IMPLEMENTED: scattered relocations.");
-
-  // The address requiring a relocation.
-  intptr_t Address = (intptr_t)SectionBases[BaseSection] + RE.Word0;
-
-  // Figure out the target address of the relocation. If isExtern is true,
-  // this relocation references the symbol table, otherwise it references
-  // a section in the same object, numbered from 1 through NumSections
-  // (SectionBases is [0, NumSections-1]).
-  intptr_t Value;
-  if (isExtern) {
-    StringRef Name = SymbolNames[SymbolNum];
-    if (SymbolTable.lookup(Name)) {
-      // The symbol is in our symbol table, so we can resolve it directly.
-      Value = (intptr_t)SymbolTable[Name];
-    } else {
-      return Error("NOT YET IMPLEMENTED: relocations to pre-compiled code.");
-    }
-    DEBUG(dbgs() << "Resolve relocation(" << Type << ") from '" << Name
-                 << "' to " << format("0x%x", Address) << ".\n");
-  } else {
-    // For non-external relocations, the SymbolNum is actual a section number
-    // as described above.
-    Value = (intptr_t)SectionBases[SymbolNum - 1];
-  }
-
-  unsigned Size = 1 << Log2Size;
+resolveRelocation(uint8_t *Address, uint8_t *Value, bool isPCRel,
+                  unsigned Type, unsigned Size) {
+  // This just dispatches to the proper target specific routine.
   switch (CPUType) {
   default: assert(0 && "Unsupported CPU type!");
   case mach::CTM_x86_64:
-    return resolveX86_64Relocation(Address, Value, isPCRel, Type, Size);
+    return resolveX86_64Relocation((uintptr_t)Address, (uintptr_t)Value,
+                                   isPCRel, Type, Size);
   case mach::CTM_ARM:
-    return resolveARMRelocation(Address, Value, isPCRel, Type, Size);
+    return resolveARMRelocation((uintptr_t)Address, (uintptr_t)Value,
+                                isPCRel, Type, Size);
   }
   llvm_unreachable("");
 }
 
-bool RuntimeDyldImpl::resolveX86_64Relocation(intptr_t Address, intptr_t Value,
-                                              bool isPCRel, unsigned Type,
-                                              unsigned Size) {
+bool RuntimeDyldImpl::
+resolveX86_64Relocation(uintptr_t Address, uintptr_t Value,
+                        bool isPCRel, unsigned Type,
+                        unsigned Size) {
   // If the relocation is PC-relative, the value to be encoded is the
   // pointer difference.
   if (isPCRel)
@@ -210,7 +198,7 @@ bool RuntimeDyldImpl::resolveX86_64Relocation(intptr_t Address, intptr_t Value,
   return false;
 }
 
-bool RuntimeDyldImpl::resolveARMRelocation(intptr_t Address, intptr_t Value,
+bool RuntimeDyldImpl::resolveARMRelocation(uintptr_t Address, uintptr_t Value,
                                            bool isPCRel, unsigned Type,
                                            unsigned Size) {
   // If the relocation is PC-relative, the value to be encoded is the
@@ -225,6 +213,7 @@ bool RuntimeDyldImpl::resolveARMRelocation(intptr_t Address, intptr_t Value,
 
   switch(Type) {
   default:
+    llvm_unreachable("Invalid relocation type!");
   case macho::RIT_Vanilla: {
     llvm_unreachable("Invalid relocation type!");
     // Mask in the target value a byte at a time (we don't have an alignment
@@ -236,10 +225,6 @@ bool RuntimeDyldImpl::resolveARMRelocation(intptr_t Address, intptr_t Value,
     }
     break;
   }
-  case macho::RIT_Pair:
-  case macho::RIT_Difference:
-  case macho::RIT_ARM_LocalDifference:
-  case macho::RIT_ARM_PreboundLazyPointer:
   case macho::RIT_ARM_Branch24Bit: {
     // Mask the value into the target address. We know instructions are
     // 32-bit aligned, so we can do it all at once.
@@ -260,6 +245,10 @@ bool RuntimeDyldImpl::resolveARMRelocation(intptr_t Address, intptr_t Value,
   case macho::RIT_ARM_ThumbBranch32Bit:
   case macho::RIT_ARM_Half:
   case macho::RIT_ARM_HalfDifference:
+  case macho::RIT_Pair:
+  case macho::RIT_Difference:
+  case macho::RIT_ARM_LocalDifference:
+  case macho::RIT_ARM_PreboundLazyPointer:
     return Error("Relocation type not implemented yet!");
   }
   return false;
@@ -269,98 +258,137 @@ bool RuntimeDyldImpl::
 loadSegment32(const MachOObject *Obj,
               const MachOObject::LoadCommandInfo *SegmentLCI,
               const InMemoryStruct<macho::SymtabLoadCommand> &SymtabLC) {
-  InMemoryStruct<macho::SegmentLoadCommand> Segment32LC;
-  Obj->ReadSegmentLoadCommand(*SegmentLCI, Segment32LC);
-  if (!Segment32LC)
+  InMemoryStruct<macho::SegmentLoadCommand> SegmentLC;
+  Obj->ReadSegmentLoadCommand(*SegmentLCI, SegmentLC);
+  if (!SegmentLC)
     return Error("unable to load segment load command");
 
-  // Map the segment into memory.
-  std::string ErrorStr;
-  Data = sys::Memory::AllocateRWX(Segment32LC->VMSize, 0, &ErrorStr);
-  if (!Data.base())
-    return Error("unable to allocate memory block: '" + ErrorStr + "'");
-  memcpy(Data.base(), Obj->getData(Segment32LC->FileOffset,
-                                   Segment32LC->FileSize).data(),
-         Segment32LC->FileSize);
-  memset((char*)Data.base() + Segment32LC->FileSize, 0,
-         Segment32LC->VMSize - Segment32LC->FileSize);
-
-  // Bind the section indices to addresses and record the relocations we
-  // need to resolve.
-  typedef std::pair<uint32_t, macho::RelocationEntry> RelocationMap;
-  SmallVector<RelocationMap, 64> Relocations;
-
-  SmallVector<void *, 16> SectionBases;
-  for (unsigned i = 0; i != Segment32LC->NumSections; ++i) {
+  for (unsigned SectNum = 0; SectNum != SegmentLC->NumSections; ++SectNum) {
     InMemoryStruct<macho::Section> Sect;
-    Obj->ReadSection(*SegmentLCI, i, Sect);
-   if (!Sect)
-      return Error("unable to load section: '" + Twine(i) + "'");
-
-    // Remember any relocations the section has so we can resolve them later.
-    for (unsigned j = 0; j != Sect->NumRelocationTableEntries; ++j) {
-      InMemoryStruct<macho::RelocationEntry> RE;
-      Obj->ReadRelocationEntry(Sect->RelocationTableOffset, j, RE);
-      Relocations.push_back(RelocationMap(j, *RE));
-    }
-
-    // FIXME: Improve check.
-//    if (Sect->Flags != 0x80000400)
-//      return Error("unsupported section type!");
-
-    SectionBases.push_back((char*) Data.base() + Sect->Address);
-  }
+    Obj->ReadSection(*SegmentLCI, SectNum, Sect);
+    if (!Sect)
+      return Error("unable to load section: '" + Twine(SectNum) + "'");
 
-  // Bind all the symbols to address. Keep a record of the names for use
-  // by relocation resolution.
-  SmallVector<StringRef, 64> SymbolNames;
-  for (unsigned i = 0; i != SymtabLC->NumSymbolTableEntries; ++i) {
-    InMemoryStruct<macho::SymbolTableEntry> STE;
-    Obj->ReadSymbolTableEntry(SymtabLC->SymbolTableOffset, i, STE);
-    if (!STE)
-      return Error("unable to read symbol: '" + Twine(i) + "'");
-    // Get the symbol name.
-    StringRef Name = Obj->getStringAtIndex(STE->StringIndex);
-    SymbolNames.push_back(Name);
-
-    // Just skip undefined symbols. They'll be loaded from whatever
-    // module they come from (or system dylib) when we resolve relocations
-    // involving them.
-    if (STE->SectionIndex == 0)
+    // FIXME: For the time being, we're only loading text segments.
+    if (Sect->Flags != 0x80000400)
       continue;
 
-    unsigned Index = STE->SectionIndex - 1;
-    if (Index >= Segment32LC->NumSections)
-      return Error("invalid section index for symbol: '" + Twine() + "'");
-
-    // Get the section base address.
-    void *SectionBase = SectionBases[Index];
+    // Address and names of symbols in the section.
+    typedef std::pair<uint64_t, StringRef> SymbolEntry;
+    SmallVector<SymbolEntry, 64> Symbols;
+    // Index of all the names, in this section or not. Used when we're
+    // dealing with relocation entries.
+    SmallVector<StringRef, 64> SymbolNames;
+    for (unsigned i = 0; i != SymtabLC->NumSymbolTableEntries; ++i) {
+      InMemoryStruct<macho::SymbolTableEntry> STE;
+      Obj->ReadSymbolTableEntry(SymtabLC->SymbolTableOffset, i, STE);
+      if (!STE)
+        return Error("unable to read symbol: '" + Twine(i) + "'");
+      if (STE->SectionIndex > SegmentLC->NumSections)
+        return Error("invalid section index for symbol: '" + Twine(i) + "'");
+      // Get the symbol name.
+      StringRef Name = Obj->getStringAtIndex(STE->StringIndex);
+      SymbolNames.push_back(Name);
 
-    // Get the symbol address.
-    uint64_t Address = (uint64_t)SectionBase + STE->Value;
+      // Just skip symbols not defined in this section.
+      if ((unsigned)STE->SectionIndex - 1 != SectNum)
+        continue;
 
-    // FIXME: Check the symbol type and flags.
-    if (STE->Type != 0xF)
-      return Error("unexpected symbol type!");
-    if (STE->Flags != 0x0)
-      return Error("unexpected symbol type!");
+      // FIXME: Check the symbol type and flags.
+      if (STE->Type != 0xF)  // external, defined in this section.
+        continue;
+      // Flags == 0x8 marks a thumb function for ARM, which is fine as it
+      // doesn't require any special handling here.
+      if (STE->Flags != 0x0 && STE->Flags != 0x8)
+        continue;
 
-    DEBUG(dbgs() << "Symbol: '" << Name << "' @ " << Address << "\n");
+      // Remember the symbol.
+      Symbols.push_back(SymbolEntry(STE->Value, Name));
 
-    SymbolTable[Name] = Address;
-  }
+      DEBUG(dbgs() << "Function sym: '" << Name << "' @ " <<
+            (Sect->Address + STE->Value) << "\n");
+    }
+    // Sort the symbols by address, just in case they didn't come in that way.
+    array_pod_sort(Symbols.begin(), Symbols.end());
 
-  // Now resolve any relocations.
-  for (unsigned i = 0, e = Relocations.size(); i != e; ++i) {
-    if (resolveRelocation(Relocations[i].first, Relocations[i].second,
-                          SectionBases, SymbolNames))
-      return true;
-  }
+    // If there weren't any functions (odd, but just in case...)
+    if (!Symbols.size())
+      continue;
 
-  // We've loaded the section; now mark the functions in it as executable.
-  // FIXME: We really should use the MemoryManager for this.
-  sys::Memory::setRangeExecutable(Data.base(), Data.size());
+    // Extract the function data.
+    uint8_t *Base = (uint8_t*)Obj->getData(SegmentLC->FileOffset,
+                                           SegmentLC->FileSize).data();
+    for (unsigned i = 0, e = Symbols.size() - 1; i != e; ++i) {
+      uint64_t StartOffset = Sect->Address + Symbols[i].first;
+      uint64_t EndOffset = Symbols[i + 1].first - 1;
+      DEBUG(dbgs() << "Extracting function: " << Symbols[i].second
+                   << " from [" << StartOffset << ", " << EndOffset << "]\n");
+      extractFunction(Symbols[i].second, Base + StartOffset, Base + EndOffset);
+    }
+    // The last symbol we do after since the end address is calculated
+    // differently because there is no next symbol to reference.
+    uint64_t StartOffset = Symbols[Symbols.size() - 1].first;
+    uint64_t EndOffset = Sect->Size - 1;
+    DEBUG(dbgs() << "Extracting function: " << Symbols[Symbols.size()-1].second
+                 << " from [" << StartOffset << ", " << EndOffset << "]\n");
+    extractFunction(Symbols[Symbols.size()-1].second,
+                    Base + StartOffset, Base + EndOffset);
 
+    // Now extract the relocation information for each function and process it.
+    for (unsigned j = 0; j != Sect->NumRelocationTableEntries; ++j) {
+      InMemoryStruct<macho::RelocationEntry> RE;
+      Obj->ReadRelocationEntry(Sect->RelocationTableOffset, j, RE);
+      if (RE->Word0 & macho::RF_Scattered)
+        return Error("NOT YET IMPLEMENTED: scattered relocations.");
+      // Word0 of the relocation is the offset into the section where the
+      // relocation should be applied. We need to translate that into an
+      // offset into a function since that's our atom.
+      uint32_t Offset = RE->Word0;
+      // Look for the function containing the address. This is used for JIT
+      // code, so the number of functions in section is almost always going
+      // to be very small (usually just one), so until we have use cases
+      // where that's not true, just use a trivial linear search.
+      unsigned SymbolNum;
+      unsigned NumSymbols = Symbols.size();
+      assert(NumSymbols > 0 && Symbols[0].first <= Offset &&
+             "No symbol containing relocation!");
+      for (SymbolNum = 0; SymbolNum < NumSymbols - 1; ++SymbolNum)
+        if (Symbols[SymbolNum + 1].first > Offset)
+          break;
+      // Adjust the offset to be relative to the symbol.
+      Offset -= Symbols[SymbolNum].first;
+      // Get the name of the symbol containing the relocation.
+      StringRef TargetName = SymbolNames[SymbolNum];
+
+      bool isExtern = (RE->Word1 >> 27) & 1;
+      // Figure out the source symbol of the relocation. If isExtern is true,
+      // this relocation references the symbol table, otherwise it references
+      // a section in the same object, numbered from 1 through NumSections
+      // (SectionBases is [0, NumSections-1]).
+      // FIXME: Some targets (ARM) use internal relocations even for
+      // externally visible symbols, if the definition is in the same
+      // file as the reference. We need to convert those back to by-name
+      // references. We can resolve the address based on the section
+      // offset and see if we have a symbol at that address. If we do,
+      // use that; otherwise, puke.
+      if (!isExtern)
+        return Error("Internal relocations not supported.");
+      uint32_t SourceNum = RE->Word1 & 0xffffff; // 24-bit value
+      StringRef SourceName = SymbolNames[SourceNum];
+
+      // FIXME: Get the relocation addend from the target address.
+
+      // Now store the relocation information. Associate it with the source
+      // symbol.
+      Relocations[SourceName].push_back(RelocationEntry(TargetName,
+                                                        Offset,
+                                                        RE->Word1,
+                                                        0 /*Addend*/));
+      DEBUG(dbgs() << "Relocation at '" << TargetName << "' + " << Offset
+                   << " from '" << SourceName << "(Word1: "
+                   << format("0x%x", RE->Word1) << ")\n");
+    }
+  }
   return false;
 }
 
@@ -380,51 +408,55 @@ loadSegment64(const MachOObject *Obj,
     if (!Sect)
       return Error("unable to load section: '" + Twine(SectNum) + "'");
 
-    // FIXME: Improve check.
+    // FIXME: For the time being, we're only loading text segments.
     if (Sect->Flags != 0x80000400)
-      return Error("unsupported section type!");
+      continue;
 
     // Address and names of symbols in the section.
     typedef std::pair<uint64_t, StringRef> SymbolEntry;
     SmallVector<SymbolEntry, 64> Symbols;
+    // Index of all the names, in this section or not. Used when we're
+    // dealing with relocation entries.
+    SmallVector<StringRef, 64> SymbolNames;
     for (unsigned i = 0; i != SymtabLC->NumSymbolTableEntries; ++i) {
       InMemoryStruct<macho::Symbol64TableEntry> STE;
       Obj->ReadSymbol64TableEntry(SymtabLC->SymbolTableOffset, i, STE);
       if (!STE)
         return Error("unable to read symbol: '" + Twine(i) + "'");
       if (STE->SectionIndex > Segment64LC->NumSections)
-        return Error("invalid section index for symbol: '" + Twine() + "'");
+        return Error("invalid section index for symbol: '" + Twine(i) + "'");
+      // Get the symbol name.
+      StringRef Name = Obj->getStringAtIndex(STE->StringIndex);
+      SymbolNames.push_back(Name);
 
       // Just skip symbols not defined in this section.
-      if (STE->SectionIndex - 1 != SectNum)
+      if ((unsigned)STE->SectionIndex - 1 != SectNum)
         continue;
 
-      // Get the symbol name.
-      StringRef Name = Obj->getStringAtIndex(STE->StringIndex);
-
       // FIXME: Check the symbol type and flags.
       if (STE->Type != 0xF)  // external, defined in this section.
-        return Error("unexpected symbol type!");
+        continue;
       if (STE->Flags != 0x0)
-        return Error("unexpected symbol type!");
-
-      uint64_t BaseAddress = Sect->Address;
-      uint64_t Address = BaseAddress + STE->Value;
+        continue;
 
       // Remember the symbol.
-      Symbols.push_back(SymbolEntry(Address, Name));
+      Symbols.push_back(SymbolEntry(STE->Value, Name));
 
-      DEBUG(dbgs() << "Function sym: '" << Name << "' @ " << Address << "\n");
+      DEBUG(dbgs() << "Function sym: '" << Name << "' @ " <<
+            (Sect->Address + STE->Value) << "\n");
     }
-    // Sort the symbols by address, just in case they didn't come in that
-    // way.
+    // Sort the symbols by address, just in case they didn't come in that way.
     array_pod_sort(Symbols.begin(), Symbols.end());
 
+    // If there weren't any functions (odd, but just in case...)
+    if (!Symbols.size())
+      continue;
+
     // Extract the function data.
     uint8_t *Base = (uint8_t*)Obj->getData(Segment64LC->FileOffset,
                                            Segment64LC->FileSize).data();
     for (unsigned i = 0, e = Symbols.size() - 1; i != e; ++i) {
-      uint64_t StartOffset = Symbols[i].first;
+      uint64_t StartOffset = Sect->Address + Symbols[i].first;
       uint64_t EndOffset = Symbols[i + 1].first - 1;
       DEBUG(dbgs() << "Extracting function: " << Symbols[i].second
                    << " from [" << StartOffset << ", " << EndOffset << "]\n");
@@ -438,8 +470,56 @@ loadSegment64(const MachOObject *Obj,
                  << " from [" << StartOffset << ", " << EndOffset << "]\n");
     extractFunction(Symbols[Symbols.size()-1].second,
                     Base + StartOffset, Base + EndOffset);
-  }
 
+    // Now extract the relocation information for each function and process it.
+    for (unsigned j = 0; j != Sect->NumRelocationTableEntries; ++j) {
+      InMemoryStruct<macho::RelocationEntry> RE;
+      Obj->ReadRelocationEntry(Sect->RelocationTableOffset, j, RE);
+      if (RE->Word0 & macho::RF_Scattered)
+        return Error("NOT YET IMPLEMENTED: scattered relocations.");
+      // Word0 of the relocation is the offset into the section where the
+      // relocation should be applied. We need to translate that into an
+      // offset into a function since that's our atom.
+      uint32_t Offset = RE->Word0;
+      // Look for the function containing the address. This is used for JIT
+      // code, so the number of functions in section is almost always going
+      // to be very small (usually just one), so until we have use cases
+      // where that's not true, just use a trivial linear search.
+      unsigned SymbolNum;
+      unsigned NumSymbols = Symbols.size();
+      assert(NumSymbols > 0 && Symbols[0].first <= Offset &&
+             "No symbol containing relocation!");
+      for (SymbolNum = 0; SymbolNum < NumSymbols - 1; ++SymbolNum)
+        if (Symbols[SymbolNum + 1].first > Offset)
+          break;
+      // Adjust the offset to be relative to the symbol.
+      Offset -= Symbols[SymbolNum].first;
+      // Get the name of the symbol containing the relocation.
+      StringRef TargetName = SymbolNames[SymbolNum];
+
+      bool isExtern = (RE->Word1 >> 27) & 1;
+      // Figure out the source symbol of the relocation. If isExtern is true,
+      // this relocation references the symbol table, otherwise it references
+      // a section in the same object, numbered from 1 through NumSections
+      // (SectionBases is [0, NumSections-1]).
+      if (!isExtern)
+        return Error("Internal relocations not supported.");
+      uint32_t SourceNum = RE->Word1 & 0xffffff; // 24-bit value
+      StringRef SourceName = SymbolNames[SourceNum];
+
+      // FIXME: Get the relocation addend from the target address.
+
+      // Now store the relocation information. Associate it with the source
+      // symbol.
+      Relocations[SourceName].push_back(RelocationEntry(TargetName,
+                                                        Offset,
+                                                        RE->Word1,
+                                                        0 /*Addend*/));
+      DEBUG(dbgs() << "Relocation at '" << TargetName << "' + " << Offset
+                   << " from '" << SourceName << "(Word1: "
+                   << format("0x%x", RE->Word1) << ")\n");
+    }
+  }
   return false;
 }
 
@@ -530,6 +610,40 @@ bool RuntimeDyldImpl::loadObject(MemoryBuffer *InputBuffer) {
   return false;
 }
 
+// Resolve the relocations for all symbols we currently know about.
+void RuntimeDyldImpl::resolveRelocations() {
+  // Just iterate over the symbols in our symbol table and assign their
+  // addresses.
+  StringMap<uint8_t*>::iterator i = SymbolTable.begin();
+  StringMap<uint8_t*>::iterator e = SymbolTable.end();
+  for (;i != e; ++i)
+    reassignSymbolAddress(i->getKey(), i->getValue());
+}
+
+// Assign an address to a symbol name and resolve all the relocations
+// associated with it.
+void RuntimeDyldImpl::reassignSymbolAddress(StringRef Name, uint8_t *Addr) {
+  // Assign the address in our symbol table.
+  SymbolTable[Name] = Addr;
+
+  RelocationList &Relocs = Relocations[Name];
+  for (unsigned i = 0, e = Relocs.size(); i != e; ++i) {
+    RelocationEntry &RE = Relocs[i];
+    uint8_t *Target = SymbolTable[RE.Target] + RE.Offset;
+    bool isPCRel = (RE.Data >> 24) & 1;
+    unsigned Type = (RE.Data >> 28) & 0xf;
+    unsigned Size = 1 << ((RE.Data >> 25) & 3);
+
+    DEBUG(dbgs() << "Resolving relocation at '" << RE.Target
+          << "' + " << RE.Offset << " (" << format("%p", Target) << ")"
+          << " from '" << Name << " (" << format("%p", Addr) << ")"
+          << "(" << (isPCRel ? "pcrel" : "absolute")
+          << ", type: " << Type << ", Size: " << Size << ").\n");
+
+    resolveRelocation(Target, Addr, isPCRel, Type, Size);
+    RE.isResolved = true;
+  }
+}
 
 //===----------------------------------------------------------------------===//
 // RuntimeDyld class implementation
@@ -545,12 +659,16 @@ bool RuntimeDyld::loadObject(MemoryBuffer *InputBuffer) {
   return Dyld->loadObject(InputBuffer);
 }
 
-uint64_t RuntimeDyld::getSymbolAddress(StringRef Name) {
+void *RuntimeDyld::getSymbolAddress(StringRef Name) {
   return Dyld->getSymbolAddress(Name);
 }
 
-sys::MemoryBlock RuntimeDyld::getMemoryBlock() {
-  return Dyld->getMemoryBlock();
+void RuntimeDyld::resolveRelocations() {
+  Dyld->resolveRelocations();
+}
+
+void RuntimeDyld::reassignSymbolAddress(StringRef Name, uint8_t *Addr) {
+  Dyld->reassignSymbolAddress(Name, Addr);
 }
 
 StringRef RuntimeDyld::getErrorString() {
diff --git a/lib/ExecutionEngine/MCJIT/TargetSelect.cpp b/lib/ExecutionEngine/TargetSelect.cpp
index 50f6593..a8822e5 100644
--- a/lib/ExecutionEngine/MCJIT/TargetSelect.cpp
+++ b/lib/ExecutionEngine/TargetSelect.cpp
@@ -13,7 +13,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MCJIT.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
 #include "llvm/Module.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Support/CommandLine.h"
@@ -26,11 +26,11 @@ using namespace llvm;
 
 /// selectTarget - Pick a target either via -march or by guessing the native
 /// arch.  Add any CPU features specified via -mcpu or -mattr.
-TargetMachine *MCJIT::selectTarget(Module *Mod,
-                                 StringRef MArch,
-                                 StringRef MCPU,
-                                 const SmallVectorImpl<std::string>& MAttrs,
-                                 std::string *ErrorStr) {
+TargetMachine *EngineBuilder::selectTarget(Module *Mod,
+                              StringRef MArch,
+                              StringRef MCPU,
+                              const SmallVectorImpl<std::string>& MAttrs,
+                              std::string *ErrorStr) {
   Triple TheTriple(Mod->getTargetTriple());
   if (TheTriple.getTriple().empty())
     TheTriple.setTriple(sys::getHostTriple());
diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt
index 6aed059..a77ecd3 100644
--- a/lib/MC/CMakeLists.txt
+++ b/lib/MC/CMakeLists.txt
@@ -30,6 +30,7 @@ add_llvm_library(LLVMMC
   MCStreamer.cpp
   MCSymbol.cpp
   MCValue.cpp
+  MCWin64EH.cpp
   MachObjectWriter.cpp
   WinCOFFStreamer.cpp
   WinCOFFObjectWriter.cpp
diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp
index b7d30cd..59e1b8e 100644
--- a/lib/MC/ELFObjectWriter.cpp
+++ b/lib/MC/ELFObjectWriter.cpp
@@ -25,6 +25,8 @@
 #include "llvm/Support/ELF.h"
 #include "llvm/Target/TargetAsmBackend.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/Statistic.h"
 
 #include "../Target/X86/X86FixupKinds.h"
 #include "../Target/ARM/ARMFixupKinds.h"
@@ -32,6 +34,9 @@
 #include <vector>
 using namespace llvm;
 
+#undef  DEBUG_TYPE
+#define DEBUG_TYPE "reloc-info"
+
 bool ELFObjectWriter::isFixupKindPCRel(const MCAssembler &Asm, unsigned Kind) {
   const MCFixupKindInfo &FKI =
     Asm.getBackend().getFixupKindInfo((MCFixupKind) Kind);
@@ -46,6 +51,7 @@ bool ELFObjectWriter::RelocNeedsGOT(MCSymbolRefExpr::VariantKind Variant) {
   case MCSymbolRefExpr::VK_GOT:
   case MCSymbolRefExpr::VK_PLT:
   case MCSymbolRefExpr::VK_GOTPCREL:
+  case MCSymbolRefExpr::VK_GOTOFF:
   case MCSymbolRefExpr::VK_TPOFF:
   case MCSymbolRefExpr::VK_TLSGD:
   case MCSymbolRefExpr::VK_GOTTPOFF:
@@ -181,8 +187,13 @@ uint64_t ELFObjectWriter::SymbolValue(MCSymbolData &Data,
   if (!Symbol.isInSection())
     return 0;
 
-  if (Data.getFragment())
-    return Layout.getSymbolOffset(&Data);
+
+  if (Data.getFragment()) {
+    if (Data.getFlags() & ELF_Other_ThumbFunc)
+      return Layout.getSymbolOffset(&Data)+1;
+    else
+      return Layout.getSymbolOffset(&Data);
+  }
 
   return 0;
 }
@@ -319,7 +330,9 @@ void ELFObjectWriter::WriteSymbolTable(MCDataFragment *SymtabF,
 
 const MCSymbol *ELFObjectWriter::SymbolToReloc(const MCAssembler &Asm,
                                                const MCValue &Target,
-                                               const MCFragment &F) const {
+                                               const MCFragment &F, 
+                                               const MCFixup &Fixup,
+                                               bool IsPCRel) const {
   const MCSymbol &Symbol = Target.getSymA()->getSymbol();
   const MCSymbol &ASymbol = Symbol.AliasedSymbol();
   const MCSymbol *Renamed = Renames.lookup(&Symbol);
@@ -342,7 +355,7 @@ const MCSymbol *ELFObjectWriter::SymbolToReloc(const MCAssembler &Asm,
   const SectionKind secKind = Section.getKind();
 
   if (secKind.isBSS())
-    return ExplicitRelSym(Asm, Target, F, true);
+    return ExplicitRelSym(Asm, Target, F, Fixup, IsPCRel);
 
   if (secKind.isThreadLocal()) {
     if (Renamed)
@@ -365,13 +378,14 @@ const MCSymbol *ELFObjectWriter::SymbolToReloc(const MCAssembler &Asm,
 
   if (Section.getFlags() & ELF::SHF_MERGE) {
     if (Target.getConstant() == 0)
-      return NULL;
+      return ExplicitRelSym(Asm, Target, F, Fixup, IsPCRel);
     if (Renamed)
       return Renamed;
     return &Symbol;
   }
 
-  return ExplicitRelSym(Asm, Target, F, false);
+  return ExplicitRelSym(Asm, Target, F, Fixup, IsPCRel);
+
 }
 
 
@@ -390,7 +404,7 @@ void ELFObjectWriter::RecordRelocation(const MCAssembler &Asm,
   if (!Target.isAbsolute()) {
     const MCSymbol &Symbol = Target.getSymA()->getSymbol();
     const MCSymbol &ASymbol = Symbol.AliasedSymbol();
-    RelocSymbol = SymbolToReloc(Asm, Target, *Fragment);
+    RelocSymbol = SymbolToReloc(Asm, Target, *Fragment, Fixup, IsPCRel);
 
     if (const MCSymbolRefExpr *RefB = Target.getSymB()) {
       const MCSymbol &SymbolB = RefB->getSymbol();
@@ -532,6 +546,7 @@ void ELFObjectWriter::ComputeSymbolTable(MCAssembler &Asm,
                                          RevGroupMapTy RevGroupMap,
                                          unsigned NumRegularSections) {
   // FIXME: Is this the correct place to do this?
+  // FIXME: Why is an undefined reference to _GLOBAL_OFFSET_TABLE_ needed?
   if (NeedsGOT) {
     llvm::StringRef Name = "_GLOBAL_OFFSET_TABLE_";
     MCSymbol *Sym = Asm.getContext().GetOrCreateSymbol(Name);
@@ -1261,32 +1276,93 @@ void ARMELFObjectWriter::WriteEFlags() {
 
 // In ARM, _MergedGlobals and other most symbols get emitted directly.
 // I.e. not as an offset to a section symbol.
-// This code is a first-cut approximation of what ARM/gcc does.
+// This code is an approximation of what ARM/gcc does.
+
+STATISTIC(PCRelCount, "Total number of PIC Relocations");
+STATISTIC(NonPCRelCount, "Total number of non-PIC relocations");
 
 const MCSymbol *ARMELFObjectWriter::ExplicitRelSym(const MCAssembler &Asm,
                                                    const MCValue &Target,
                                                    const MCFragment &F,
-                                                   bool IsBSS) const {
+                                                   const MCFixup &Fixup,
+                                                   bool IsPCRel) const {
   const MCSymbol &Symbol = Target.getSymA()->getSymbol();
   bool EmitThisSym = false;
 
-  if (IsBSS) {
-    EmitThisSym = StringSwitch<bool>(Symbol.getName())
-      .Case("_MergedGlobals", true)
-      .Default(false);
+  const MCSectionELF &Section =
+    static_cast<const MCSectionELF&>(Symbol.getSection());
+  bool InNormalSection = true;
+  unsigned RelocType = 0;
+  RelocType = GetRelocTypeInner(Target, Fixup, IsPCRel);
+
+  DEBUG(
+      const MCSymbolRefExpr::VariantKind Kind = Target.getSymA()->getKind();
+      MCSymbolRefExpr::VariantKind Kind2;
+      Kind2 = Target.getSymB() ?  Target.getSymB()->getKind() :
+        MCSymbolRefExpr::VK_None;
+      dbgs() << "considering symbol "
+        << Section.getSectionName() << "/"
+        << Symbol.getName() << "/"
+        << " Rel:" << (unsigned)RelocType
+        << " Kind: " << (int)Kind << "/" << (int)Kind2
+        << " Tmp:"
+        << Symbol.isAbsolute() << "/" << Symbol.isDefined() << "/"
+        << Symbol.isVariable() << "/" << Symbol.isTemporary()
+        << " Counts:" << PCRelCount << "/" << NonPCRelCount << "\n");
+
+  if (IsPCRel) { ++PCRelCount;
+    switch (RelocType) {
+    default:
+      // Most relocation types are emitted as explicit symbols
+      InNormalSection =
+        StringSwitch<bool>(Section.getSectionName())
+        .Case(".data.rel.ro.local", false)
+        .Case(".data.rel", false)
+        .Case(".bss", false)
+        .Default(true);
+      EmitThisSym = true;
+      break;
+    case ELF::R_ARM_ABS32:
+      // But things get strange with R_ARM_ABS32
+      // In this case, most things that go in .rodata show up
+      // as section relative relocations
+      InNormalSection =
+        StringSwitch<bool>(Section.getSectionName())
+        .Case(".data.rel.ro.local", false)
+        .Case(".data.rel", false)
+        .Case(".rodata", false)
+        .Case(".bss", false)
+        .Default(true);
+      EmitThisSym = false;
+      break;
+    }
   } else {
-    EmitThisSym = StringSwitch<bool>(Symbol.getName())
-      .Case("_MergedGlobals", true)
-      .StartsWith(".L.str", true)
-      .Default(false);
+    NonPCRelCount++;
+    InNormalSection =
+      StringSwitch<bool>(Section.getSectionName())
+      .Case(".data.rel.ro.local", false)
+      .Case(".rodata", false)
+      .Case(".data.rel", false)
+      .Case(".bss", false)
+      .Default(true);
+
+    switch (RelocType) {
+    default: EmitThisSym = true; break;
+    case ELF::R_ARM_ABS32: EmitThisSym = false; break;
+    }
   }
+
   if (EmitThisSym)
     return &Symbol;
-  if (! Symbol.isTemporary())
+  if (! Symbol.isTemporary() && InNormalSection) {
     return &Symbol;
+  }
   return NULL;
 }
 
+// Need to examine the Fixup when determining whether to 
+// emit the relocation as an explicit symbol or as a section relative
+// offset
 unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target,
                                           const MCFixup &Fixup,
                                           bool IsPCRel,
@@ -1295,6 +1371,20 @@ unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target,
   MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
     MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
 
+  unsigned Type = GetRelocTypeInner(Target, Fixup, IsPCRel);
+
+  if (RelocNeedsGOT(Modifier))
+    NeedsGOT = true;
+  
+  return Type;
+}
+
+unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
+                                               const MCFixup &Fixup,
+                                               bool IsPCRel) const  {
+  MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
+    MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+
   unsigned Type = 0;
   if (IsPCRel) {
     switch ((unsigned)Fixup.getKind()) {
@@ -1303,7 +1393,7 @@ unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target,
       switch (Modifier) {
       default: llvm_unreachable("Unsupported Modifier");
       case MCSymbolRefExpr::VK_None:
-        Type = ELF::R_ARM_BASE_PREL;
+        Type = ELF::R_ARM_REL32;
         break;
       case MCSymbolRefExpr::VK_ARM_TLSGD:
         assert(0 && "unimplemented");
@@ -1342,6 +1432,17 @@ unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target,
     case ARM::fixup_t2_movw_lo16_pcrel:
       Type = ELF::R_ARM_THM_MOVW_PREL_NC;
       break;
+    case ARM::fixup_arm_thumb_bl:
+    case ARM::fixup_arm_thumb_blx:
+      switch (Modifier) {
+      case MCSymbolRefExpr::VK_ARM_PLT:
+        Type = ELF::R_ARM_THM_CALL;
+        break;
+      default:
+        Type = ELF::R_ARM_NONE;
+        break;
+      }
+      break;
     }
   } else {
     switch ((unsigned)Fixup.getKind()) {
@@ -1399,9 +1500,6 @@ unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target,
     }
   }
 
-  if (RelocNeedsGOT(Modifier))
-    NeedsGOT = true;
-
   return Type;
 }
 
@@ -1476,13 +1574,17 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
     if (IsPCRel) {
       switch ((unsigned)Fixup.getKind()) {
       default: llvm_unreachable("invalid fixup kind!");
+
+      case FK_Data_8: Type = ELF::R_X86_64_PC64; break;
+      case FK_Data_4: Type = ELF::R_X86_64_PC32; break;
+      case FK_Data_2: Type = ELF::R_X86_64_PC16; break;
+
       case FK_PCRel_8:
         assert(Modifier == MCSymbolRefExpr::VK_None);
         Type = ELF::R_X86_64_PC64;
         break;
       case X86::reloc_signed_4byte:
       case X86::reloc_riprel_4byte_movq_load:
-      case FK_Data_4: // FIXME?
       case X86::reloc_riprel_4byte:
       case FK_PCRel_4:
         switch (Modifier) {
@@ -1609,6 +1711,9 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
         case MCSymbolRefExpr::VK_DTPOFF:
           Type = ELF::R_386_TLS_LDO_32;
           break;
+        case MCSymbolRefExpr::VK_GOTTPOFF:
+          Type = ELF::R_386_TLS_IE_32;
+          break;
         }
         break;
       case FK_Data_2: Type = ELF::R_386_16; break;
diff --git a/lib/MC/ELFObjectWriter.h b/lib/MC/ELFObjectWriter.h
index f1d514a..7593099 100644
--- a/lib/MC/ELFObjectWriter.h
+++ b/lib/MC/ELFObjectWriter.h
@@ -140,15 +140,18 @@ class ELFObjectWriter : public MCObjectWriter {
     unsigned ShstrtabIndex;
 
 
-    const MCSymbol *SymbolToReloc(const MCAssembler &Asm,
-                                  const MCValue &Target,
-                                  const MCFragment &F) const;
+    virtual const MCSymbol *SymbolToReloc(const MCAssembler &Asm,
+                                          const MCValue &Target,
+                                          const MCFragment &F,
+                                          const MCFixup &Fixup,
+                                          bool IsPCRel) const;
 
     // For arch-specific emission of explicit reloc symbol
     virtual const MCSymbol *ExplicitRelSym(const MCAssembler &Asm,
                                            const MCValue &Target,
                                            const MCFragment &F,
-                                           bool IsBSS) const {
+                                           const MCFixup &Fixup,
+                                           bool IsPCRel) const {
       return NULL;
     }
 
@@ -380,11 +383,16 @@ class ELFObjectWriter : public MCObjectWriter {
     virtual const MCSymbol *ExplicitRelSym(const MCAssembler &Asm,
                                            const MCValue &Target,
                                            const MCFragment &F,
-                                           bool IsBSS) const;
+                                           const MCFixup &Fixup,
+                                           bool IsPCRel) const;
 
     virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
                                   bool IsPCRel, bool IsRelocWithSymbol,
                                   int64_t Addend);
+  private:
+    unsigned GetRelocTypeInner(const MCValue &Target,
+                               const MCFixup &Fixup, bool IsPCRel) const;
+    
   };
 
   //===- MBlazeELFObjectWriter -------------------------------------------===//
diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp
index 116c007..73b259e 100644
--- a/lib/MC/MCAsmInfo.cpp
+++ b/lib/MC/MCAsmInfo.cpp
@@ -13,7 +13,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Dwarf.h"
 #include <cctype>
 #include <cstring>
 using namespace llvm;
@@ -70,9 +74,8 @@ MCAsmInfo::MCAsmInfo() {
   HasLEB128 = false;
   SupportsDebugInformation = false;
   ExceptionsType = ExceptionHandling::None;
-  DwarfRequiresFrameSection = true;
   DwarfUsesInlineInfoSection = false;
-  DwarfUsesAbsoluteLabelForStmtList = true;
+  DwarfRequiresRelocationForSectionOffset = true;
   DwarfSectionOffsetDirective = 0;
   DwarfUsesLabelOffsetForRanges = true;
   HasMicrosoftFastStdCallMangling = false;
@@ -106,3 +109,25 @@ unsigned MCAsmInfo::getSLEB128Size(int Value) {
   } while (IsMore);
   return Size;
 }
+
+const MCExpr *
+MCAsmInfo::getExprForPersonalitySymbol(const MCSymbol *Sym,
+                                       unsigned Encoding,
+                                       MCStreamer &Streamer) const {
+  return getExprForFDESymbol(Sym, Encoding, Streamer);
+}
+
+const MCExpr *
+MCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym,
+                               unsigned Encoding,
+                               MCStreamer &Streamer) const {
+  if (!(Encoding & dwarf::DW_EH_PE_pcrel))
+    return MCSymbolRefExpr::Create(Sym, Streamer.getContext());
+
+  MCContext &Context = Streamer.getContext();
+  const MCExpr *Res = MCSymbolRefExpr::Create(Sym, Context);
+  MCSymbol *PCSym = Context.CreateTempSymbol();
+  Streamer.EmitLabel(PCSym);
+  const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, Context);
+  return MCBinaryExpr::CreateSub(Res, PC, Context);
+}
diff --git a/lib/MC/MCAsmInfoDarwin.cpp b/lib/MC/MCAsmInfoDarwin.cpp
index 526ad0d..5851cb0 100644
--- a/lib/MC/MCAsmInfoDarwin.cpp
+++ b/lib/MC/MCAsmInfoDarwin.cpp
@@ -13,6 +13,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
 using namespace llvm;
 
 MCAsmInfoDarwin::MCAsmInfoDarwin() {
@@ -53,7 +56,6 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() {
   HasNoDeadStrip = true;
   HasSymbolResolver = true;
 
-  DwarfUsesAbsoluteLabelForStmtList = false;
+  DwarfRequiresRelocationForSectionOffset = false;
   DwarfUsesLabelOffsetForRanges = false;
 }
-
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index c0fcce7..e8b09fc 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -45,20 +45,27 @@ class MCAsmStreamer : public MCStreamer {
   unsigned IsVerboseAsm : 1;
   unsigned ShowInst : 1;
   unsigned UseLoc : 1;
+  unsigned UseCFI : 1;
+
+  enum EHSymbolFlags { EHGlobal         = 1,
+                       EHWeakDefinition = 1 << 1,
+                       EHPrivateExtern  = 1 << 2 };
+  DenseMap<const MCSymbol*, unsigned> FlagMap;
 
   bool needsSet(const MCExpr *Value);
 
+  void EmitRegisterName(int64_t Register);
+
 public:
   MCAsmStreamer(MCContext &Context, formatted_raw_ostream &os,
-                bool isVerboseAsm,
-                bool useLoc,
+                bool isVerboseAsm, bool useLoc, bool useCFI,
                 MCInstPrinter *printer, MCCodeEmitter *emitter,
                 TargetAsmBackend *asmbackend,
                 bool showInst)
     : MCStreamer(Context), OS(os), MAI(Context.getAsmInfo()),
       InstPrinter(printer), Emitter(emitter), AsmBackend(asmbackend),
       CommentStream(CommentToEmit), IsVerboseAsm(isVerboseAsm),
-      ShowInst(showInst), UseLoc(useLoc) {
+      ShowInst(showInst), UseLoc(useLoc), UseCFI(useCFI) {
     if (InstPrinter && IsVerboseAsm)
       InstPrinter->setCommentStream(CommentStream);
   }
@@ -118,7 +125,8 @@ public:
   }
 
   virtual void EmitLabel(MCSymbol *Symbol);
-
+  virtual void EmitEHSymAttributes(const MCSymbol *Symbol,
+                                   MCSymbol *EHSymbol);
   virtual void EmitAssemblerFlag(MCAssemblerFlag Flag);
   virtual void EmitThumbFunc(MCSymbol *Func);
 
@@ -127,6 +135,8 @@ public:
   virtual void EmitDwarfAdvanceLineAddr(int64_t LineDelta,
                                         const MCSymbol *LastLabel,
                                         const MCSymbol *Label);
+  virtual void EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
+                                         const MCSymbol *Label);
 
   virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
 
@@ -154,13 +164,13 @@ public:
   virtual void EmitBytes(StringRef Data, unsigned AddrSpace);
 
   virtual void EmitValueImpl(const MCExpr *Value, unsigned Size,
-                             bool isPCRel, unsigned AddrSpace);
+                             unsigned AddrSpace);
   virtual void EmitIntValue(uint64_t Value, unsigned Size,
                             unsigned AddrSpace = 0);
 
-  virtual void EmitULEB128Value(const MCExpr *Value, unsigned AddrSpace = 0);
+  virtual void EmitULEB128Value(const MCExpr *Value);
 
-  virtual void EmitSLEB128Value(const MCExpr *Value, unsigned AddrSpace = 0);
+  virtual void EmitSLEB128Value(const MCExpr *Value);
 
   virtual void EmitGPRel32Value(const MCExpr *Value);
 
@@ -182,15 +192,38 @@ public:
   virtual bool EmitDwarfFileDirective(unsigned FileNo, StringRef Filename);
   virtual void EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
                                      unsigned Column, unsigned Flags,
-                                     unsigned Isa, unsigned Discriminator);
-
-  virtual bool EmitCFIStartProc();
-  virtual bool EmitCFIEndProc();
-  virtual bool EmitCFIDefCfaOffset(int64_t Offset);
-  virtual bool EmitCFIDefCfaRegister(int64_t Register);
-  virtual bool EmitCFIOffset(int64_t Register, int64_t Offset);
-  virtual bool EmitCFIPersonality(const MCSymbol *Sym, unsigned Encoding);
-  virtual bool EmitCFILsda(const MCSymbol *Sym, unsigned Encoding);
+                                     unsigned Isa, unsigned Discriminator,
+                                     StringRef FileName);
+
+  virtual void EmitCFISections(bool EH, bool Debug);
+  virtual void EmitCFIStartProc();
+  virtual void EmitCFIEndProc();
+  virtual void EmitCFIDefCfa(int64_t Register, int64_t Offset);
+  virtual void EmitCFIDefCfaOffset(int64_t Offset);
+  virtual void EmitCFIDefCfaRegister(int64_t Register);
+  virtual void EmitCFIOffset(int64_t Register, int64_t Offset);
+  virtual void EmitCFIPersonality(const MCSymbol *Sym, unsigned Encoding);
+  virtual void EmitCFILsda(const MCSymbol *Sym, unsigned Encoding);
+  virtual void EmitCFIRememberState();
+  virtual void EmitCFIRestoreState();
+  virtual void EmitCFISameValue(int64_t Register);
+  virtual void EmitCFIRelOffset(int64_t Register, int64_t Offset);
+  virtual void EmitCFIAdjustCfaOffset(int64_t Adjustment);
+
+  virtual void EmitWin64EHStartProc(const MCSymbol *Symbol);
+  virtual void EmitWin64EHEndProc();
+  virtual void EmitWin64EHStartChained();
+  virtual void EmitWin64EHEndChained();
+  virtual void EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind,
+                                  bool Except);
+  virtual void EmitWin64EHHandlerData();
+  virtual void EmitWin64EHPushReg(unsigned Register);
+  virtual void EmitWin64EHSetFrame(unsigned Register, unsigned Offset);
+  virtual void EmitWin64EHAllocStack(unsigned Size);
+  virtual void EmitWin64EHSaveReg(unsigned Register, unsigned Offset);
+  virtual void EmitWin64EHSaveXMM(unsigned Register, unsigned Offset);
+  virtual void EmitWin64EHPushFrame(bool Code);
+  virtual void EmitWin64EHEndProlog();
 
   virtual void EmitFnStart();
   virtual void EmitFnEnd();
@@ -269,14 +302,27 @@ void MCAsmStreamer::ChangeSection(const MCSection *Section) {
   Section->PrintSwitchToSection(MAI, OS);
 }
 
+void MCAsmStreamer::EmitEHSymAttributes(const MCSymbol *Symbol,
+                                        MCSymbol *EHSymbol) {
+  if (UseCFI)
+    return;
+
+  unsigned Flags = FlagMap.lookup(Symbol);
+
+  if (Flags & EHGlobal)
+    EmitSymbolAttribute(EHSymbol, MCSA_Global);
+  if (Flags & EHWeakDefinition)
+    EmitSymbolAttribute(EHSymbol, MCSA_WeakDefinition);
+  if (Flags & EHPrivateExtern)
+    EmitSymbolAttribute(EHSymbol, MCSA_PrivateExtern);
+}
+
 void MCAsmStreamer::EmitLabel(MCSymbol *Symbol) {
   assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
-  assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
-  assert(getCurrentSection() && "Cannot emit before setting section!");
+  MCStreamer::EmitLabel(Symbol);
 
   OS << *Symbol << MAI.getLabelSuffix();
   EmitEOL();
-  Symbol->setSection(*getCurrentSection());
 }
 
 void MCAsmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
@@ -294,7 +340,8 @@ void MCAsmStreamer::EmitThumbFunc(MCSymbol *Func) {
   // This needs to emit to a temporary string to get properly quoted
   // MCSymbols when they have spaces in them.
   OS << "\t.thumb_func";
-  if (Func)
+  // Only Mach-O hasSubsectionsViaSymbols()
+  if (MAI.hasSubsectionsViaSymbols())
     OS << '\t' << *Func;
   EmitEOL();
 }
@@ -319,6 +366,15 @@ void MCAsmStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta,
                        getContext().getTargetAsmInfo().getPointerSize());
 }
 
+void MCAsmStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
+                                              const MCSymbol *Label) {
+  EmitIntValue(dwarf::DW_CFA_advance_loc4, 1);
+  const MCExpr *AddrDelta = BuildSymbolDiff(getContext(), Label, LastLabel);
+  AddrDelta = ForceExpAbs(AddrDelta);
+  EmitValue(AddrDelta, 4);
+}
+
+
 void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
                                         MCSymbolAttr Attribute) {
   switch (Attribute) {
@@ -347,6 +403,7 @@ void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
     return;
   case MCSA_Global: // .globl/.global
     OS << MAI.getGlobalDirective();
+    FlagMap[Symbol] |= EHGlobal;
     break;
   case MCSA_Hidden:         OS << "\t.hidden\t";          break;
   case MCSA_IndirectSymbol: OS << "\t.indirect_symbol\t"; break;
@@ -355,11 +412,17 @@ void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
   case MCSA_Local:          OS << "\t.local\t";           break;
   case MCSA_NoDeadStrip:    OS << "\t.no_dead_strip\t";   break;
   case MCSA_SymbolResolver: OS << "\t.symbol_resolver\t"; break;
-  case MCSA_PrivateExtern:  OS << "\t.private_extern\t";  break;
+  case MCSA_PrivateExtern:
+    OS << "\t.private_extern\t";
+    FlagMap[Symbol] |= EHPrivateExtern;
+    break;
   case MCSA_Protected:      OS << "\t.protected\t";       break;
   case MCSA_Reference:      OS << "\t.reference\t";       break;
   case MCSA_Weak:           OS << "\t.weak\t";            break;
-  case MCSA_WeakDefinition: OS << "\t.weak_definition\t"; break;
+  case MCSA_WeakDefinition:
+    OS << "\t.weak_definition\t";
+    FlagMap[Symbol] |= EHWeakDefinition;
+    break;
       // .weak_reference
   case MCSA_WeakReference:  OS << MAI.getWeakRefDirective(); break;
   case MCSA_WeakDefAutoPrivate: OS << "\t.weak_def_can_be_hidden\t"; break;
@@ -522,9 +585,8 @@ void MCAsmStreamer::EmitIntValue(uint64_t Value, unsigned Size,
 }
 
 void MCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
-                                  bool isPCRel, unsigned AddrSpace) {
+                                  unsigned AddrSpace) {
   assert(getCurrentSection() && "Cannot emit contents before setting section!");
-  assert(!isPCRel && "Cannot emit pc relative relocations!");
   const char *Directive = 0;
   switch (Size) {
   default: break;
@@ -553,10 +615,10 @@ void MCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
   EmitEOL();
 }
 
-void MCAsmStreamer::EmitULEB128Value(const MCExpr *Value, unsigned AddrSpace) {
+void MCAsmStreamer::EmitULEB128Value(const MCExpr *Value) {
   int64_t IntValue;
   if (Value->EvaluateAsAbsolute(IntValue)) {
-    EmitULEB128IntValue(IntValue, AddrSpace);
+    EmitULEB128IntValue(IntValue);
     return;
   }
   assert(MAI.hasLEB128() && "Cannot print a .uleb");
@@ -564,10 +626,10 @@ void MCAsmStreamer::EmitULEB128Value(const MCExpr *Value, unsigned AddrSpace) {
   EmitEOL();
 }
 
-void MCAsmStreamer::EmitSLEB128Value(const MCExpr *Value, unsigned AddrSpace) {
+void MCAsmStreamer::EmitSLEB128Value(const MCExpr *Value) {
   int64_t IntValue;
   if (Value->EvaluateAsAbsolute(IntValue)) {
-    EmitSLEB128IntValue(IntValue, AddrSpace);
+    EmitSLEB128IntValue(IntValue);
     return;
   }
   assert(MAI.hasLEB128() && "Cannot print a .sleb");
@@ -683,9 +745,10 @@ bool MCAsmStreamer::EmitDwarfFileDirective(unsigned FileNo, StringRef Filename){
 void MCAsmStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
                                           unsigned Column, unsigned Flags,
                                           unsigned Isa,
-                                          unsigned Discriminator) {
+                                          unsigned Discriminator,
+                                          StringRef FileName) {
   this->MCStreamer::EmitDwarfLocDirective(FileNo, Line, Column, Flags,
-                                          Isa, Discriminator);
+                                          Isa, Discriminator, FileName);
   if (!UseLoc)
     return;
 
@@ -711,78 +774,289 @@ void MCAsmStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
     OS << "isa " << Isa;
   if (Discriminator)
     OS << "discriminator " << Discriminator;
+
+  if (IsVerboseAsm) {
+    OS.PadToColumn(MAI.getCommentColumn());
+    OS << MAI.getCommentString() << ' ' << FileName << ':' 
+       << Line << ':' << Column;
+  }
   EmitEOL();
 }
 
-bool MCAsmStreamer::EmitCFIStartProc() {
-  if (this->MCStreamer::EmitCFIStartProc())
-    return true;
+void MCAsmStreamer::EmitCFISections(bool EH, bool Debug) {
+  MCStreamer::EmitCFISections(EH, Debug);
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_sections ";
+  if (EH) {
+    OS << ".eh_frame";
+    if (Debug)
+      OS << ", .debug_frame";
+  } else if (Debug) {
+    OS << ".debug_frame";
+  }
 
-  OS << "\t.cfi_startproc";
   EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFIStartProc() {
+  MCStreamer::EmitCFIStartProc();
+
+  if (!UseCFI)
+    return;
 
-  return false;
+  OS << "\t.cfi_startproc";
+  EmitEOL();
 }
 
-bool MCAsmStreamer::EmitCFIEndProc() {
-  if (this->MCStreamer::EmitCFIEndProc())
-    return true;
+void MCAsmStreamer::EmitCFIEndProc() {
+  MCStreamer::EmitCFIEndProc();
+
+  if (!UseCFI)
+    return;
 
   OS << "\t.cfi_endproc";
   EmitEOL();
+}
+
+void MCAsmStreamer::EmitRegisterName(int64_t Register) {
+  if (InstPrinter) {
+    const TargetAsmInfo &asmInfo = getContext().getTargetAsmInfo();
+    unsigned LLVMRegister = asmInfo.getLLVMRegNum(Register, true);
+    InstPrinter->printRegName(OS, LLVMRegister);
+  } else {
+    OS << Register;
+  }
+}
+
+void MCAsmStreamer::EmitCFIDefCfa(int64_t Register, int64_t Offset) {
+  MCStreamer::EmitCFIDefCfa(Register, Offset);
 
-  return false;
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_def_cfa ";
+  EmitRegisterName(Register);
+  OS << ", " << Offset;
+  EmitEOL();
 }
 
-bool MCAsmStreamer::EmitCFIDefCfaOffset(int64_t Offset) {
-  if (this->MCStreamer::EmitCFIDefCfaOffset(Offset))
-    return true;
+void MCAsmStreamer::EmitCFIDefCfaOffset(int64_t Offset) {
+  MCStreamer::EmitCFIDefCfaOffset(Offset);
+
+  if (!UseCFI)
+    return;
 
   OS << "\t.cfi_def_cfa_offset " << Offset;
   EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFIDefCfaRegister(int64_t Register) {
+  MCStreamer::EmitCFIDefCfaRegister(Register);
 
-  return false;
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_def_cfa_register ";
+  EmitRegisterName(Register);
+  EmitEOL();
 }
 
-bool MCAsmStreamer::EmitCFIDefCfaRegister(int64_t Register) {
-  if (this->MCStreamer::EmitCFIDefCfaRegister(Register))
-    return true;
+void MCAsmStreamer::EmitCFIOffset(int64_t Register, int64_t Offset) {
+  this->MCStreamer::EmitCFIOffset(Register, Offset);
+
+  if (!UseCFI)
+    return;
 
-  OS << "\t.cfi_def_cfa_register " << Register;
+  OS << "\t.cfi_offset ";
+  EmitRegisterName(Register);
+  OS << ", " << Offset;
   EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFIPersonality(const MCSymbol *Sym,
+                                       unsigned Encoding) {
+  MCStreamer::EmitCFIPersonality(Sym, Encoding);
 
-  return false;
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_personality " << Encoding << ", " << *Sym;
+  EmitEOL();
 }
 
-bool MCAsmStreamer::EmitCFIOffset(int64_t Register, int64_t Offset) {
-  if (this->MCStreamer::EmitCFIOffset(Register, Offset))
-    return true;
+void MCAsmStreamer::EmitCFILsda(const MCSymbol *Sym, unsigned Encoding) {
+  MCStreamer::EmitCFILsda(Sym, Encoding);
 
-  OS << "\t.cfi_offset " << Register << ", " << Offset;
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_lsda " << Encoding << ", " << *Sym;
   EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFIRememberState() {
+  MCStreamer::EmitCFIRememberState();
 
-  return false;
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_remember_state";
+  EmitEOL();
 }
 
-bool MCAsmStreamer::EmitCFIPersonality(const MCSymbol *Sym,
-                                       unsigned Encoding) {
-  if (this->MCStreamer::EmitCFIPersonality(Sym, Encoding))
-    return true;
+void MCAsmStreamer::EmitCFIRestoreState() {
+  MCStreamer::EmitCFIRestoreState();
 
-  OS << "\t.cfi_personality " << Encoding << ", " << *Sym;
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_restore_state";
   EmitEOL();
+}
 
-  return false;
+void MCAsmStreamer::EmitCFISameValue(int64_t Register) {
+  MCStreamer::EmitCFISameValue(Register);
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_same_value ";
+  EmitRegisterName(Register);
+  EmitEOL();
 }
 
-bool MCAsmStreamer::EmitCFILsda(const MCSymbol *Sym, unsigned Encoding) {
-  if (this->MCStreamer::EmitCFILsda(Sym, Encoding))
-    return true;
+void MCAsmStreamer::EmitCFIRelOffset(int64_t Register, int64_t Offset) {
+  MCStreamer::EmitCFIRelOffset(Register, Offset);
 
-  OS << "\t.cfi_lsda " << Encoding << ", " << *Sym;
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_rel_offset ";
+  EmitRegisterName(Register);
+  OS << ", " << Offset;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFIAdjustCfaOffset(int64_t Adjustment) {
+  MCStreamer::EmitCFIAdjustCfaOffset(Adjustment);
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_adjust_cfa_offset " << Adjustment;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHStartProc(const MCSymbol *Symbol) {
+  MCStreamer::EmitWin64EHStartProc(Symbol);
+
+  OS << ".seh_proc " << *Symbol;
   EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHEndProc() {
+  MCStreamer::EmitWin64EHEndProc();
 
-  return false;
+  OS << "\t.seh_endproc";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHStartChained() {
+  MCStreamer::EmitWin64EHStartChained();
+
+  OS << "\t.seh_startchained";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHEndChained() {
+  MCStreamer::EmitWin64EHEndChained();
+
+  OS << "\t.seh_endchained";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind,
+                                       bool Except) {
+  MCStreamer::EmitWin64EHHandler(Sym, Unwind, Except);
+
+  OS << "\t.seh_handler " << *Sym;
+  if (Unwind)
+    OS << ", @unwind";
+  if (Except)
+    OS << ", @except";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHHandlerData() {
+  MCStreamer::EmitWin64EHHandlerData();
+
+  // Switch sections. Don't call SwitchSection directly, because that will
+  // cause the section switch to be visible in the emitted assembly.
+  // We only do this so the section switch that terminates the handler
+  // data block is visible.
+  MCWin64EHUnwindInfo *CurFrame = getCurrentW64UnwindInfo();
+  StringRef suffix=MCWin64EHUnwindEmitter::GetSectionSuffix(CurFrame->Function);
+  const MCSection *xdataSect =
+    getContext().getTargetAsmInfo().getWin64EHTableSection(suffix);
+  if (xdataSect)
+    SwitchSectionNoChange(xdataSect);
+
+  OS << "\t.seh_handlerdata";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHPushReg(unsigned Register) {
+  MCStreamer::EmitWin64EHPushReg(Register);
+
+  OS << "\t.seh_pushreg " << Register;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHSetFrame(unsigned Register, unsigned Offset) {
+  MCStreamer::EmitWin64EHSetFrame(Register, Offset);
+
+  OS << "\t.seh_setframe " << Register << ", " << Offset;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHAllocStack(unsigned Size) {
+  MCStreamer::EmitWin64EHAllocStack(Size);
+
+  OS << "\t.seh_stackalloc " << Size;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHSaveReg(unsigned Register, unsigned Offset) {
+  MCStreamer::EmitWin64EHSaveReg(Register, Offset);
+
+  OS << "\t.seh_savereg " << Register << ", " << Offset;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHSaveXMM(unsigned Register, unsigned Offset) {
+  MCStreamer::EmitWin64EHSaveXMM(Register, Offset);
+
+  OS << "\t.seh_savexmm " << Register << ", " << Offset;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHPushFrame(bool Code) {
+  MCStreamer::EmitWin64EHPushFrame(Code);
+
+  OS << "\t.seh_pushframe";
+  if (Code)
+    OS << " @code";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHEndProlog(void) {
+  MCStreamer::EmitWin64EHEndProlog();
+
+  OS << "\t.seh_endprologue";
+  EmitEOL();
 }
 
 void MCAsmStreamer::AddEncodingComment(const MCInst &Inst) {
@@ -895,8 +1169,10 @@ void MCAsmStreamer::EmitPersonality(const MCSymbol *Personality) {
 }
 
 void MCAsmStreamer::EmitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset) {
-  OS << "\t.setfp\t" << InstPrinter->getRegName(FpReg)
-     << ", "        << InstPrinter->getRegName(SpReg);
+  OS << "\t.setfp\t";
+  InstPrinter->printRegName(OS, FpReg);
+  OS << ", ";
+  InstPrinter->printRegName(OS, SpReg);
   if (Offset)
     OS << ", #" << Offset;
   EmitEOL();
@@ -915,10 +1191,12 @@ void MCAsmStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
   else
     OS << "\t.save\t{";
 
-  OS << InstPrinter->getRegName(RegList[0]);
+  InstPrinter->printRegName(OS, RegList[0]);
 
-  for (unsigned i = 1, e = RegList.size(); i != e; ++i)
-    OS << ", " << InstPrinter->getRegName(RegList[i]);
+  for (unsigned i = 1, e = RegList.size(); i != e; ++i) {
+    OS << ", ";
+    InstPrinter->printRegName(OS, RegList[i]);
+  }
 
   OS << "}";
   EmitEOL();
@@ -927,9 +1205,6 @@ void MCAsmStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
 void MCAsmStreamer::EmitInstruction(const MCInst &Inst) {
   assert(getCurrentSection() && "Cannot emit contents before setting section!");
 
-  if (!UseLoc)
-    MCLineEntry::Make(this, getCurrentSection());
-
   // Show the encoding in a comment if we have a code emitter.
   if (Emitter)
     AddEncodingComment(Inst);
@@ -962,13 +1237,17 @@ void MCAsmStreamer::Finish() {
   // Dump out the dwarf file & directory tables and line tables.
   if (getContext().hasDwarfFiles() && !UseLoc)
     MCDwarfFileTable::Emit(this);
+
+  if (!UseCFI)
+    EmitFrames(false);
 }
 
 MCStreamer *llvm::createAsmStreamer(MCContext &Context,
                                     formatted_raw_ostream &OS,
                                     bool isVerboseAsm, bool useLoc,
+                                    bool useCFI,
                                     MCInstPrinter *IP, MCCodeEmitter *CE,
                                     TargetAsmBackend *TAB, bool ShowInst) {
-  return new MCAsmStreamer(Context, OS, isVerboseAsm, useLoc,
+  return new MCAsmStreamer(Context, OS, isVerboseAsm, useLoc, useCFI,
                            IP, CE, TAB, ShowInst);
 }
diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp
index 9992646..527a63c 100644
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp
@@ -28,7 +28,6 @@
 #include "llvm/Target/TargetRegistry.h"
 #include "llvm/Target/TargetAsmBackend.h"
 
-#include <vector>
 using namespace llvm;
 
 namespace {
@@ -103,6 +102,33 @@ uint64_t MCAsmLayout::getFragmentOffset(const MCFragment *F) const {
 }
 
 uint64_t MCAsmLayout::getSymbolOffset(const MCSymbolData *SD) const {
+  const MCSymbol &S = SD->getSymbol();
+
+  // If this is a variable, then recursively evaluate now.
+  if (S.isVariable()) {
+    MCValue Target;
+    if (!S.getVariableValue()->EvaluateAsRelocatable(Target, *this))
+      report_fatal_error("unable to evaluate offset for variable '" +
+                         S.getName() + "'");
+
+    // Verify that any used symbols are defined.
+    if (Target.getSymA() && Target.getSymA()->getSymbol().isUndefined())
+      report_fatal_error("unable to evaluate offset to undefined symbol '" +
+                         Target.getSymA()->getSymbol().getName() + "'");
+    if (Target.getSymB() && Target.getSymB()->getSymbol().isUndefined())
+      report_fatal_error("unable to evaluate offset to undefined symbol '" +
+                         Target.getSymB()->getSymbol().getName() + "'");
+      
+    uint64_t Offset = Target.getConstant();
+    if (Target.getSymA())
+      Offset += getSymbolOffset(&Assembler.getSymbolData(
+                                  Target.getSymA()->getSymbol()));
+    if (Target.getSymB())
+      Offset -= getSymbolOffset(&Assembler.getSymbolData(
+                                  Target.getSymB()->getSymbol()));
+    return Offset;
+  }
+
   assert(SD->getFragment() && "Invalid getOffset() on undefined symbol!");
   return getFragmentOffset(SD->getFragment()) + SD->getOffset();
 }
@@ -692,7 +718,9 @@ bool MCAssembler::RelaxInstruction(MCAsmLayout &Layout,
 bool MCAssembler::RelaxLEB(MCAsmLayout &Layout, MCLEBFragment &LF) {
   int64_t Value = 0;
   uint64_t OldSize = LF.getContents().size();
-  LF.getValue().EvaluateAsAbsolute(Value, Layout);
+  bool IsAbs = LF.getValue().EvaluateAsAbsolute(Value, Layout);
+  (void)IsAbs;
+  assert(IsAbs);
   SmallString<8> &Data = LF.getContents();
   Data.clear();
   raw_svector_ostream OSE(Data);
diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp
index 7c68713..8faa72e 100644
--- a/lib/MC/MCContext.cpp
+++ b/lib/MC/MCContext.cpp
@@ -27,7 +27,9 @@ typedef StringMap<const MCSectionCOFF*> COFFUniqueMapTy;
 
 
 MCContext::MCContext(const MCAsmInfo &mai, const TargetAsmInfo *tai) :
-  MAI(mai), TAI(tai), NextUniqueID(0),
+  MAI(mai), TAI(tai),
+  Allocator(), Symbols(Allocator), UsedNames(Allocator),
+  NextUniqueID(0),
   CurrentDwarfLoc(0,0,0,DWARF2_FLAG_IS_STMT,0,0),
   AllowTemporaryLabels(true) {
   MachOUniquingMap = 0;
@@ -85,12 +87,11 @@ MCSymbol *MCContext::CreateSymbol(StringRef Name) {
   StringMapEntry<bool> *NameEntry = &UsedNames.GetOrCreateValue(Name);
   if (NameEntry->getValue()) {
     assert(isTemporary && "Cannot rename non temporary symbols");
-    SmallString<128> NewName;
+    SmallString<128> NewName = Name;
     do {
-      Twine T = Name + Twine(NextUniqueID++);
-      T.toVector(NewName);
-      StringRef foo = NewName;
-      NameEntry = &UsedNames.GetOrCreateValue(foo);
+      NewName.resize(Name.size());
+      raw_svector_ostream(NewName) << NextUniqueID++;
+      NameEntry = &UsedNames.GetOrCreateValue(NewName);
     } while (NameEntry->getValue());
   }
   NameEntry->setValue(true);
@@ -110,9 +111,8 @@ MCSymbol *MCContext::GetOrCreateSymbol(const Twine &Name) {
 
 MCSymbol *MCContext::CreateTempSymbol() {
   SmallString<128> NameSV;
-  Twine Name = Twine(MAI.getPrivateGlobalPrefix()) + "tmp" +
-    Twine(NextUniqueID++);
-  Name.toVector(NameSV);
+  raw_svector_ostream(NameSV)
+    << MAI.getPrivateGlobalPrefix() << "tmp" << NextUniqueID++;
   return CreateSymbol(NameSV);
 }
 
diff --git a/lib/MC/MCDisassembler/Disassembler.cpp b/lib/MC/MCDisassembler/Disassembler.cpp
index 4707198..6e636f0 100644
--- a/lib/MC/MCDisassembler/Disassembler.cpp
+++ b/lib/MC/MCDisassembler/Disassembler.cpp
@@ -6,11 +6,10 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
+
 #include "Disassembler.h"
-#include <stdio.h>
 #include "llvm-c/Disassembler.h"
 
-#include <string>
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCDisassembler.h"
 #include "llvm/MC/MCInst.h"
@@ -27,17 +26,12 @@ class Target;
 } // namespace llvm
 using namespace llvm;
 
-#ifdef __cplusplus
-extern "C" {
-#endif // __cplusplus
-
-//
 // LLVMCreateDisasm() creates a disassembler for the TripleName.  Symbolic
 // disassembly is supported by passing a block of information in the DisInfo
-// parameter and specifing the TagType and call back functions as described in
+// parameter and specifying the TagType and callback functions as described in
 // the header llvm-c/Disassembler.h .  The pointer to the block and the 
-// functions can all be passed as NULL.  If successfull this returns a
-// disassembler context if not it returns NULL.
+// functions can all be passed as NULL.  If successful, this returns a
+// disassembler context.  If not, it returns NULL.
 //
 LLVMDisasmContextRef LLVMCreateDisasm(const char *TripleName, void *DisInfo,
                                       int TagType, LLVMOpInfoCallback GetOpInfo,
@@ -77,8 +71,9 @@ LLVMDisasmContextRef LLVMCreateDisasm(const char *TripleName, void *DisInfo,
   assert(Ctx && "Unable to create MCContext!");
 
   // Set up disassembler.
-  const MCDisassembler *DisAsm = TheTarget->createMCDisassembler();
+  MCDisassembler *DisAsm = TheTarget->createMCDisassembler();
   assert(DisAsm && "Unable to create disassembler!");
+  DisAsm->setupForSymbolicDisassembly(GetOpInfo, DisInfo, Ctx);
 
   // Set up the instruction printer.
   int AsmPrinterVariant = MAI->getAssemblerDialect();
@@ -107,7 +102,6 @@ namespace {
 // The memory object created by LLVMDisasmInstruction().
 //
 class DisasmMemoryObject : public MemoryObject {
-private:
   uint8_t *Bytes;
   uint64_t Size;
   uint64_t BasePC;
@@ -125,12 +119,12 @@ public:
     return 0;
   }
 };
-} // namespace
+} // end anonymous namespace
 
 //
-// LLVMDisasmInstruction() disassmbles a single instruction using the
+// LLVMDisasmInstruction() disassembles a single instruction using the
 // disassembler context specified in the parameter DC.  The bytes of the
-// instuction are specified in the parameter Bytes, and contains at least
+// instruction are specified in the parameter Bytes, and contains at least
 // BytesSize number of bytes.  The instruction is at the address specified by
 // the PC parameter.  If a valid instruction can be disassembled its string is
 // returned indirectly in OutString which whos size is specified in the
@@ -153,21 +147,15 @@ size_t LLVMDisasmInstruction(LLVMDisasmContextRef DCR, uint8_t *Bytes,
   if (!DisAsm->getInstruction(Inst, Size, MemoryObject, PC, /*REMOVE*/ nulls()))
     return 0;
 
-  std::string InsnStr;
-  raw_string_ostream OS(InsnStr);
-  raw_ostream &Out = OS;
-  IP->printInst(&Inst, Out);
-
-  std::string p;
-  p = OS.str();
-#ifdef LLVM_ON_WIN32
-  sprintf(OutString, "%s", p.c_str());
-#else
-  snprintf(OutString, OutStringSize, "%s", p.c_str());
-#endif
-  return Size;
-}
+  SmallVector<char, 64> InsnStr;
+  raw_svector_ostream OS(InsnStr);
+  IP->printInst(&Inst, OS);
+  OS.flush();
+
+  assert(OutStringSize != 0 && "Output buffer cannot be zero size");
+  size_t OutputSize = std::min(OutStringSize-1, InsnStr.size());
+  std::memcpy(OutString, InsnStr.data(), OutputSize);
+  OutString[OutputSize] = '\0'; // Terminate string.
 
-#ifdef __cplusplus
+  return Size;
 }
-#endif // __cplusplus
diff --git a/lib/MC/MCDisassembler/Disassembler.h b/lib/MC/MCDisassembler/Disassembler.h
index f844951..f0ec42a 100644
--- a/lib/MC/MCDisassembler/Disassembler.h
+++ b/lib/MC/MCDisassembler/Disassembler.h
@@ -13,6 +13,10 @@
 // syntax.
 //
 //===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MC_DISASSEMBLER_H
+#define LLVM_MC_DISASSEMBLER_H
+
 #include "llvm-c/Disassembler.h"
 #include <string>
 #include "llvm/ADT/OwningPtr.h"
@@ -69,7 +73,7 @@ private:
 
 public:
   LLVMDisasmContext(std::string tripleName, void *disInfo, int tagType,
-	  LLVMOpInfoCallback getOpInfo,
+                    LLVMOpInfoCallback getOpInfo,
                     LLVMSymbolLookupCallback symbolLookUp,
                     const Target *theTarget, const MCAsmInfo *mAI,
                     llvm::TargetMachine *tM, const TargetAsmInfo *tai,
@@ -88,3 +92,5 @@ public:
 };
 
 } // namespace llvm
+
+#endif
diff --git a/lib/MC/MCDisassembler/EDDisassembler.cpp b/lib/MC/MCDisassembler/EDDisassembler.cpp
index e36b3a4..91c5284 100644
--- a/lib/MC/MCDisassembler/EDDisassembler.cpp
+++ b/lib/MC/MCDisassembler/EDDisassembler.cpp
@@ -334,6 +334,15 @@ int EDDisassembler::printInst(std::string &str, MCInst &inst) {
   return 0;
 }
 
+static void diag_handler(const SMDiagnostic &diag,
+                         void *context)
+{
+  if (context) {
+    EDDisassembler *disassembler = static_cast<EDDisassembler*>(context);
+    diag.Print("", disassembler->ErrorStream);
+  }
+}
+
 int EDDisassembler::parseInst(SmallVectorImpl<MCParsedAsmOperand*> &operands,
                               SmallVectorImpl<AsmToken> &tokens,
                               const std::string &str) {
@@ -356,6 +365,7 @@ int EDDisassembler::parseInst(SmallVectorImpl<MCParsedAsmOperand*> &operands,
   SMLoc instLoc;
   
   SourceMgr sourceMgr;
+  sourceMgr.setDiagHandler(diag_handler, static_cast<void*>(this));
   sourceMgr.AddNewSourceBuffer(buf, SMLoc()); // ownership of buf handed over
   MCContext context(*AsmInfo, NULL);
   OwningPtr<MCStreamer> streamer(createNullStreamer(context));
diff --git a/lib/MC/MCDisassembler/EDOperand.cpp b/lib/MC/MCDisassembler/EDOperand.cpp
index 04b21cb..492bb08 100644
--- a/lib/MC/MCDisassembler/EDOperand.cpp
+++ b/lib/MC/MCDisassembler/EDOperand.cpp
@@ -198,15 +198,24 @@ int EDOperand::evaluate(uint64_t &result,
     default:
       return -1;
     case kOperandTypeImmediate:
+      if (!Inst.Inst->getOperand(MCOpIndex).isImm())
+        return -1;
+            
       result = Inst.Inst->getOperand(MCOpIndex).getImm();
       return 0;
     case kOperandTypeRegister:
     {
+      if (!Inst.Inst->getOperand(MCOpIndex).isReg())
+        return -1;
+        
       unsigned reg = Inst.Inst->getOperand(MCOpIndex).getReg();
       return callback(&result, reg, arg);
     }
     case kOperandTypeARMBranchTarget:
     {
+      if (!Inst.Inst->getOperand(MCOpIndex).isImm())
+        return -1;
+        
       int64_t displacement = Inst.Inst->getOperand(MCOpIndex).getImm();
       
       uint64_t pcVal;
diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
index 112d7d8..13cb81a 100644
--- a/lib/MC/MCDwarf.cpp
+++ b/lib/MC/MCDwarf.cpp
@@ -17,6 +17,7 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
@@ -439,19 +440,105 @@ static int getDataAlignmentFactor(MCStreamer &streamer) {
    return -size;
 }
 
-static void EmitCFIInstruction(MCStreamer &Streamer,
-                               const MCCFIInstruction &Instr) {
+static unsigned getSizeForEncoding(MCStreamer &streamer,
+                                   unsigned symbolEncoding) {
+  MCContext &context = streamer.getContext();
+  const TargetAsmInfo &asmInfo = context.getTargetAsmInfo();
+  unsigned format = symbolEncoding & 0x0f;
+  switch (format) {
+  default:
+    assert(0 && "Unknown Encoding");
+  case dwarf::DW_EH_PE_absptr:
+  case dwarf::DW_EH_PE_signed:
+    return asmInfo.getPointerSize();
+  case dwarf::DW_EH_PE_udata2:
+  case dwarf::DW_EH_PE_sdata2:
+    return 2;
+  case dwarf::DW_EH_PE_udata4:
+  case dwarf::DW_EH_PE_sdata4:
+    return 4;
+  case dwarf::DW_EH_PE_udata8:
+  case dwarf::DW_EH_PE_sdata8:
+    return 8;
+  }
+}
+
+static void EmitSymbol(MCStreamer &streamer, const MCSymbol &symbol,
+                       unsigned symbolEncoding) {
+  MCContext &context = streamer.getContext();
+  const MCAsmInfo &asmInfo = context.getAsmInfo();
+  const MCExpr *v = asmInfo.getExprForFDESymbol(&symbol,
+                                                symbolEncoding,
+                                                streamer);
+  unsigned size = getSizeForEncoding(streamer, symbolEncoding);
+  streamer.EmitAbsValue(v, size);
+}
+
+static void EmitPersonality(MCStreamer &streamer, const MCSymbol &symbol,
+                            unsigned symbolEncoding) {
+  MCContext &context = streamer.getContext();
+  const MCAsmInfo &asmInfo = context.getAsmInfo();
+  const MCExpr *v = asmInfo.getExprForPersonalitySymbol(&symbol,
+                                                        symbolEncoding,
+                                                        streamer);
+  unsigned size = getSizeForEncoding(streamer, symbolEncoding);
+  streamer.EmitValue(v, size);
+}
+
+static const MachineLocation TranslateMachineLocation(
+                                                  const TargetAsmInfo &AsmInfo,
+                                                  const MachineLocation &Loc) {
+  unsigned Reg = Loc.getReg() == MachineLocation::VirtualFP ?
+    MachineLocation::VirtualFP :
+    unsigned(AsmInfo.getDwarfRegNum(Loc.getReg(), true));
+  const MachineLocation &NewLoc = Loc.isReg() ?
+    MachineLocation(Reg) : MachineLocation(Reg, Loc.getOffset());
+  return NewLoc;
+}
+
+namespace {
+  class FrameEmitterImpl {
+    int CFAOffset;
+    int CIENum;
+    bool UsingCFI;
+    bool IsEH;
+    const MCSymbol *SectionStart;
+
+  public:
+    FrameEmitterImpl(bool usingCFI, bool isEH, const MCSymbol *sectionStart) :
+      CFAOffset(0), CIENum(0), UsingCFI(usingCFI), IsEH(isEH),
+      SectionStart(sectionStart) {
+    }
+
+    const MCSymbol &EmitCIE(MCStreamer &streamer,
+                            const MCSymbol *personality,
+                            unsigned personalityEncoding,
+                            const MCSymbol *lsda,
+                            unsigned lsdaEncoding);
+    MCSymbol *EmitFDE(MCStreamer &streamer,
+                      const MCSymbol &cieStart,
+                      const MCDwarfFrameInfo &frame);
+    void EmitCFIInstructions(MCStreamer &streamer,
+                             const std::vector<MCCFIInstruction> &Instrs,
+                             MCSymbol *BaseLabel);
+    void EmitCFIInstruction(MCStreamer &Streamer,
+                            const MCCFIInstruction &Instr);
+  };
+}
+
+void FrameEmitterImpl::EmitCFIInstruction(MCStreamer &Streamer,
+                                          const MCCFIInstruction &Instr) {
   int dataAlignmentFactor = getDataAlignmentFactor(Streamer);
 
   switch (Instr.getOperation()) {
-  case MCCFIInstruction::Move: {
+  case MCCFIInstruction::Move:
+  case MCCFIInstruction::RelMove: {
     const MachineLocation &Dst = Instr.getDestination();
     const MachineLocation &Src = Instr.getSource();
+    const bool IsRelative = Instr.getOperation() == MCCFIInstruction::RelMove;
 
     // If advancing cfa.
     if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) {
-      assert(!Src.isReg() && "Machine move not supported yet.");
-
       if (Src.getReg() == MachineLocation::VirtualFP) {
         Streamer.EmitIntValue(dwarf::DW_CFA_def_cfa_offset, 1);
       } else {
@@ -459,7 +546,12 @@ static void EmitCFIInstruction(MCStreamer &Streamer,
         Streamer.EmitULEB128IntValue(Src.getReg());
       }
 
-      Streamer.EmitULEB128IntValue(-Src.getOffset(), 1);
+      if (IsRelative)
+        CFAOffset += Src.getOffset();
+      else
+        CFAOffset = -Src.getOffset();
+
+      Streamer.EmitULEB128IntValue(CFAOffset);
       return;
     }
 
@@ -471,7 +563,11 @@ static void EmitCFIInstruction(MCStreamer &Streamer,
     }
 
     unsigned Reg = Src.getReg();
-    int Offset = Dst.getOffset() / dataAlignmentFactor;
+
+    int Offset = Dst.getOffset();
+    if (IsRelative)
+      Offset -= CFAOffset;
+    Offset = Offset / dataAlignmentFactor;
 
     if (Offset < 0) {
       Streamer.EmitIntValue(dwarf::DW_CFA_offset_extended_sf, 1);
@@ -479,11 +575,11 @@ static void EmitCFIInstruction(MCStreamer &Streamer,
       Streamer.EmitSLEB128IntValue(Offset);
     } else if (Reg < 64) {
       Streamer.EmitIntValue(dwarf::DW_CFA_offset + Reg, 1);
-      Streamer.EmitULEB128IntValue(Offset, 1);
+      Streamer.EmitULEB128IntValue(Offset);
     } else {
       Streamer.EmitIntValue(dwarf::DW_CFA_offset_extended, 1);
-      Streamer.EmitULEB128IntValue(Reg, 1);
-      Streamer.EmitULEB128IntValue(Offset, 1);
+      Streamer.EmitULEB128IntValue(Reg);
+      Streamer.EmitULEB128IntValue(Offset);
     }
     return;
   }
@@ -493,15 +589,21 @@ static void EmitCFIInstruction(MCStreamer &Streamer,
   case MCCFIInstruction::Restore:
     Streamer.EmitIntValue(dwarf::DW_CFA_restore_state, 1);
     return;
+  case MCCFIInstruction::SameValue: {
+    unsigned Reg = Instr.getDestination().getReg();
+    Streamer.EmitIntValue(dwarf::DW_CFA_same_value, 1);
+    Streamer.EmitULEB128IntValue(Reg);
+    return;
+  }
   }
   llvm_unreachable("Unhandled case in switch");
 }
 
 /// EmitFrameMoves - Emit frame instructions to describe the layout of the
 /// frame.
-static void EmitCFIInstructions(MCStreamer &streamer,
-                                const std::vector<MCCFIInstruction> &Instrs,
-                                MCSymbol *BaseLabel) {
+void FrameEmitterImpl::EmitCFIInstructions(MCStreamer &streamer,
+                                    const std::vector<MCCFIInstruction> &Instrs,
+                                           MCSymbol *BaseLabel) {
   for (unsigned i = 0, N = Instrs.size(); i < N; ++i) {
     const MCCFIInstruction &Instr = Instrs[i];
     MCSymbol *Label = Instr.getLabel();
@@ -521,90 +623,48 @@ static void EmitCFIInstructions(MCStreamer &streamer,
   }
 }
 
-static void EmitSymbol(MCStreamer &streamer, const MCSymbol &symbol,
-                       unsigned symbolEncoding) {
+const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer,
+                                          const MCSymbol *personality,
+                                          unsigned personalityEncoding,
+                                          const MCSymbol *lsda,
+                                          unsigned lsdaEncoding) {
   MCContext &context = streamer.getContext();
   const TargetAsmInfo &asmInfo = context.getTargetAsmInfo();
-  unsigned format = symbolEncoding & 0x0f;
-  unsigned application = symbolEncoding & 0x70;
-  unsigned size;
-  switch (format) {
-  default:
-    assert(0 && "Unknown Encoding");
-  case dwarf::DW_EH_PE_absptr:
-  case dwarf::DW_EH_PE_signed:
-    size = asmInfo.getPointerSize();
-    break;
-  case dwarf::DW_EH_PE_udata2:
-  case dwarf::DW_EH_PE_sdata2:
-    size = 2;
-    break;
-  case dwarf::DW_EH_PE_udata4:
-  case dwarf::DW_EH_PE_sdata4:
-    size = 4;
-    break;
-  case dwarf::DW_EH_PE_udata8:
-  case dwarf::DW_EH_PE_sdata8:
-    size = 8;
-    break;
-  }
-  switch (application) {
-  default:
-    assert(0 && "Unknown Encoding");
-    break;
-  case 0:
-    streamer.EmitSymbolValue(&symbol, size);
-    break;
-  case dwarf::DW_EH_PE_pcrel:
-    streamer.EmitPCRelSymbolValue(&symbol, size);
-    break;
-  }
-}
 
-static const MachineLocation TranslateMachineLocation(
-                                                  const TargetAsmInfo &AsmInfo,
-                                                  const MachineLocation &Loc) {
-  unsigned Reg = Loc.getReg() == MachineLocation::VirtualFP ?
-    MachineLocation::VirtualFP :
-    unsigned(AsmInfo.getDwarfRegNum(Loc.getReg(), true));
-  const MachineLocation &NewLoc = Loc.isReg() ?
-    MachineLocation(Reg) : MachineLocation(Reg, Loc.getOffset());
-  return NewLoc;
-}
+  MCSymbol *sectionStart;
+  if (asmInfo.isFunctionEHFrameSymbolPrivate() || !IsEH)
+    sectionStart = context.CreateTempSymbol();
+  else
+    sectionStart = context.GetOrCreateSymbol(Twine("EH_frame") + Twine(CIENum));
+
+  CIENum++;
 
-static const MCSymbol &EmitCIE(MCStreamer &streamer,
-                               const MCSymbol *personality,
-                               unsigned personalityEncoding,
-                               const MCSymbol *lsda,
-                               unsigned lsdaEncoding) {
-  MCContext &context = streamer.getContext();
-  const TargetAsmInfo &asmInfo = context.getTargetAsmInfo();
-  const MCSection &section = *asmInfo.getEHFrameSection();
-  streamer.SwitchSection(&section);
-  MCSymbol *sectionStart = streamer.getContext().CreateTempSymbol();
   MCSymbol *sectionEnd = streamer.getContext().CreateTempSymbol();
 
   // Length
   const MCExpr *Length = MakeStartMinusEndExpr(streamer, *sectionStart,
                                                *sectionEnd, 4);
   streamer.EmitLabel(sectionStart);
-  streamer.EmitValue(Length, 4);
+  streamer.EmitAbsValue(Length, 4);
 
   // CIE ID
-  streamer.EmitIntValue(0, 4);
+  unsigned CIE_ID = IsEH ? 0 : -1;
+  streamer.EmitIntValue(CIE_ID, 4);
 
   // Version
   streamer.EmitIntValue(dwarf::DW_CIE_VERSION, 1);
 
   // Augmentation String
   SmallString<8> Augmentation;
-  Augmentation += "z";
-  if (personality)
-    Augmentation += "P";
-  if (lsda)
-    Augmentation += "L";
-  Augmentation += "R";
-  streamer.EmitBytes(Augmentation.str(), 0);
+  if (IsEH) {
+    Augmentation += "z";
+    if (personality)
+      Augmentation += "P";
+    if (lsda)
+      Augmentation += "L";
+    Augmentation += "R";
+    streamer.EmitBytes(Augmentation.str(), 0);
+  }
   streamer.EmitIntValue(0, 1);
 
   // Code Alignment Factor
@@ -617,28 +677,34 @@ static const MCSymbol &EmitCIE(MCStreamer &streamer,
   streamer.EmitULEB128IntValue(asmInfo.getDwarfRARegNum(true));
 
   // Augmentation Data Length (optional)
-  MCSymbol *augmentationStart = streamer.getContext().CreateTempSymbol();
-  MCSymbol *augmentationEnd = streamer.getContext().CreateTempSymbol();
-  const MCExpr *augmentationLength = MakeStartMinusEndExpr(streamer,
-                                                           *augmentationStart,
-                                                           *augmentationEnd, 0);
-  streamer.EmitULEB128Value(augmentationLength);
-
-  // Augmentation Data (optional)
-  streamer.EmitLabel(augmentationStart);
-  if (personality) {
-    // Personality Encoding
-    streamer.EmitIntValue(personalityEncoding, 1);
-    // Personality
-    EmitSymbol(streamer, *personality, personalityEncoding);
-  }
-  if (lsda) {
-    // LSDA Encoding
-    streamer.EmitIntValue(lsdaEncoding, 1);
+
+  unsigned augmentationLength = 0;
+  if (IsEH) {
+    if (personality) {
+      // Personality Encoding
+      augmentationLength += 1;
+      // Personality
+      augmentationLength += getSizeForEncoding(streamer, personalityEncoding);
+    }
+    if (lsda)
+      augmentationLength += 1;
+    // Encoding of the FDE pointers
+    augmentationLength += 1;
+
+    streamer.EmitULEB128IntValue(augmentationLength);
+
+    // Augmentation Data (optional)
+    if (personality) {
+      // Personality Encoding
+      streamer.EmitIntValue(personalityEncoding, 1);
+      // Personality
+      EmitPersonality(streamer, *personality, personalityEncoding);
+    }
+    if (lsda)
+      streamer.EmitIntValue(lsdaEncoding, 1); // LSDA Encoding
+    // Encoding of the FDE pointers
+    streamer.EmitIntValue(asmInfo.getFDEEncoding(UsingCFI), 1);
   }
-  // Encoding of the FDE pointers
-  streamer.EmitIntValue(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4, 1);
-  streamer.EmitLabel(augmentationEnd);
 
   // Initial Instructions
 
@@ -658,56 +724,80 @@ static const MCSymbol &EmitCIE(MCStreamer &streamer,
   EmitCFIInstructions(streamer, Instructions, NULL);
 
   // Padding
-  streamer.EmitValueToAlignment(4);
+  streamer.EmitValueToAlignment(IsEH ? 4 : asmInfo.getPointerSize());
 
   streamer.EmitLabel(sectionEnd);
   return *sectionStart;
 }
 
-static MCSymbol *EmitFDE(MCStreamer &streamer,
-                         const MCSymbol &cieStart,
-                         const MCDwarfFrameInfo &frame) {
+MCSymbol *FrameEmitterImpl::EmitFDE(MCStreamer &streamer,
+                                    const MCSymbol &cieStart,
+                                    const MCDwarfFrameInfo &frame) {
   MCContext &context = streamer.getContext();
   MCSymbol *fdeStart = context.CreateTempSymbol();
   MCSymbol *fdeEnd = context.CreateTempSymbol();
+  const TargetAsmInfo &TAsmInfo = context.getTargetAsmInfo();
+
+  if (!TAsmInfo.isFunctionEHFrameSymbolPrivate() && IsEH) {
+    MCSymbol *EHSym = context.GetOrCreateSymbol(
+      frame.Function->getName() + Twine(".eh"));
+    streamer.EmitEHSymAttributes(frame.Function, EHSym);
+    streamer.EmitLabel(EHSym);
+  }
 
   // Length
   const MCExpr *Length = MakeStartMinusEndExpr(streamer, *fdeStart, *fdeEnd, 0);
-  streamer.EmitValue(Length, 4);
+  streamer.EmitAbsValue(Length, 4);
 
   streamer.EmitLabel(fdeStart);
+
   // CIE Pointer
-  const MCExpr *offset = MakeStartMinusEndExpr(streamer, cieStart, *fdeStart,
-                                               0);
-  streamer.EmitValue(offset, 4);
+  const MCAsmInfo &asmInfo = context.getAsmInfo();
+  if (IsEH) {
+    const MCExpr *offset = MakeStartMinusEndExpr(streamer, cieStart, *fdeStart,
+                                                 0);
+    streamer.EmitAbsValue(offset, 4);
+  } else if (!asmInfo.doesDwarfRequireRelocationForSectionOffset()) {
+    const MCExpr *offset = MakeStartMinusEndExpr(streamer, *SectionStart,
+                                                 cieStart, 0);
+    streamer.EmitAbsValue(offset, 4);
+  } else {
+    streamer.EmitSymbolValue(&cieStart, 4);
+  }
+  unsigned fdeEncoding = TAsmInfo.getFDEEncoding(UsingCFI);
+  unsigned size = getSizeForEncoding(streamer, fdeEncoding);
 
   // PC Begin
-  streamer.EmitPCRelSymbolValue(frame.Begin, 4);
+  unsigned PCBeginEncoding = IsEH ? fdeEncoding :
+    (unsigned)dwarf::DW_EH_PE_absptr;
+  unsigned PCBeginSize = getSizeForEncoding(streamer, PCBeginEncoding);
+  EmitSymbol(streamer, *frame.Begin, PCBeginEncoding);
 
   // PC Range
   const MCExpr *Range = MakeStartMinusEndExpr(streamer, *frame.Begin,
                                               *frame.End, 0);
-  streamer.EmitValue(Range, 4);
-
-  // Augmentation Data Length
-  MCSymbol *augmentationStart = streamer.getContext().CreateTempSymbol();
-  MCSymbol *augmentationEnd = streamer.getContext().CreateTempSymbol();
-  const MCExpr *augmentationLength = MakeStartMinusEndExpr(streamer,
-                                                           *augmentationStart,
-                                                           *augmentationEnd, 0);
-  streamer.EmitULEB128Value(augmentationLength);
-
-  // Augmentation Data
-  streamer.EmitLabel(augmentationStart);
-  if (frame.Lsda)
-    EmitSymbol(streamer, *frame.Lsda, frame.LsdaEncoding);
-  streamer.EmitLabel(augmentationEnd);
+  streamer.EmitAbsValue(Range, size);
+
+  if (IsEH) {
+    // Augmentation Data Length
+    unsigned augmentationLength = 0;
+
+    if (frame.Lsda)
+      augmentationLength += getSizeForEncoding(streamer, frame.LsdaEncoding);
+
+    streamer.EmitULEB128IntValue(augmentationLength);
+
+    // Augmentation Data
+    if (frame.Lsda)
+      EmitSymbol(streamer, *frame.Lsda, frame.LsdaEncoding);
+  }
+
   // Call Frame Instructions
 
   EmitCFIInstructions(streamer, frame.Instructions, frame.Begin);
 
   // Padding
-  streamer.EmitValueToAlignment(4);
+  streamer.EmitValueToAlignment(PCBeginSize);
 
   return fdeEnd;
 }
@@ -753,22 +843,32 @@ namespace llvm {
   };
 }
 
-void MCDwarfFrameEmitter::Emit(MCStreamer &streamer) {
-  const MCContext &context = streamer.getContext();
+void MCDwarfFrameEmitter::Emit(MCStreamer &streamer,
+                               bool usingCFI,
+                               bool isEH) {
+  MCContext &context = streamer.getContext();
   const TargetAsmInfo &asmInfo = context.getTargetAsmInfo();
+  const MCSection &section = isEH ?
+    *asmInfo.getEHFrameSection() : *asmInfo.getDwarfFrameSection();
+  streamer.SwitchSection(&section);
+  MCSymbol *SectionStart = context.CreateTempSymbol();
+  streamer.EmitLabel(SectionStart);
+
   MCSymbol *fdeEnd = NULL;
   DenseMap<CIEKey, const MCSymbol*> CIEStarts;
+  FrameEmitterImpl Emitter(usingCFI, isEH, SectionStart);
 
+  const MCSymbol *DummyDebugKey = NULL;
   for (unsigned i = 0, n = streamer.getNumFrameInfos(); i < n; ++i) {
     const MCDwarfFrameInfo &frame = streamer.getFrameInfo(i);
     CIEKey key(frame.Personality, frame.PersonalityEncoding,
                frame.LsdaEncoding);
-    const MCSymbol *&cieStart = CIEStarts[key];
+    const MCSymbol *&cieStart = isEH ? CIEStarts[key] : DummyDebugKey;
     if (!cieStart)
-      cieStart = &EmitCIE(streamer, frame.Personality,
-                          frame.PersonalityEncoding, frame.Lsda,
-                          frame.LsdaEncoding);
-    fdeEnd = EmitFDE(streamer, *cieStart, frame);
+      cieStart = &Emitter.EmitCIE(streamer, frame.Personality,
+                                  frame.PersonalityEncoding, frame.Lsda,
+                                  frame.LsdaEncoding);
+    fdeEnd = Emitter.EmitFDE(streamer, *cieStart, frame);
     if (i != n - 1)
       streamer.EmitLabel(fdeEnd);
   }
diff --git a/lib/MC/MCELF.cpp b/lib/MC/MCELF.cpp
index ce7783e..2c3f8e8 100644
--- a/lib/MC/MCELF.cpp
+++ b/lib/MC/MCELF.cpp
@@ -57,13 +57,13 @@ void MCELF::SetVisibility(MCSymbolData &SD, unsigned Visibility) {
   assert(Visibility == ELF::STV_DEFAULT || Visibility == ELF::STV_INTERNAL ||
          Visibility == ELF::STV_HIDDEN || Visibility == ELF::STV_PROTECTED);
 
-  uint32_t OtherFlags = SD.getFlags() & ~(0xf << ELF_STV_Shift);
+  uint32_t OtherFlags = SD.getFlags() & ~(0x3 << ELF_STV_Shift);
   SD.setFlags(OtherFlags | (Visibility << ELF_STV_Shift));
 }
 
 unsigned MCELF::GetVisibility(MCSymbolData &SD) {
   unsigned Visibility =
-    (SD.getFlags() & (0xf << ELF_STV_Shift)) >> ELF_STV_Shift;
+    (SD.getFlags() & (0x3 << ELF_STV_Shift)) >> ELF_STV_Shift;
   assert(Visibility == ELF::STV_DEFAULT || Visibility == ELF::STV_INTERNAL ||
          Visibility == ELF::STV_HIDDEN || Visibility == ELF::STV_PROTECTED);
   return Visibility;
diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp
index 9fc9173..bbb2789 100644
--- a/lib/MC/MCELFStreamer.cpp
+++ b/lib/MC/MCELFStreamer.cpp
@@ -66,6 +66,11 @@ void MCELFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
 
 void MCELFStreamer::EmitThumbFunc(MCSymbol *Func) {
   // FIXME: Anything needed here to flag the function as thumb?
+
+  getAssembler().setIsThumbFunc(Func);
+
+  MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Func);
+  SD.setFlags(SD.getFlags() | ELF_Other_ThumbFunc);
 }
 
 void MCELFStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
@@ -345,8 +350,7 @@ void MCELFStreamer::EmitInstToData(const MCInst &Inst) {
 }
 
 void MCELFStreamer::Finish() {
-  if (getNumFrameInfos())
-    MCDwarfFrameEmitter::Emit(*this);
+  EmitFrames(true);
 
   for (std::vector<LocalCommon>::const_iterator i = LocalCommons.begin(),
                                                 e = LocalCommons.end();
diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp
index 2debe18..fcf1aab 100644
--- a/lib/MC/MCExpr.cpp
+++ b/lib/MC/MCExpr.cpp
@@ -42,8 +42,8 @@ void MCExpr::print(raw_ostream &OS) const {
     // absolute names.
     bool UseParens = Sym.getName()[0] == '$';
 
-    if (SRE.getKind() == MCSymbolRefExpr::VK_PPC_HA16 ||
-        SRE.getKind() == MCSymbolRefExpr::VK_PPC_LO16) {
+    if (SRE.getKind() == MCSymbolRefExpr::VK_PPC_DARWIN_HA16 ||
+        SRE.getKind() == MCSymbolRefExpr::VK_PPC_DARWIN_LO16) {
       OS << MCSymbolRefExpr::getVariantKindName(SRE.getKind());
       UseParens = true;
     }
@@ -61,8 +61,8 @@ void MCExpr::print(raw_ostream &OS) const {
         SRE.getKind() == MCSymbolRefExpr::VK_ARM_GOTTPOFF)
       OS << MCSymbolRefExpr::getVariantKindName(SRE.getKind());
     else if (SRE.getKind() != MCSymbolRefExpr::VK_None &&
-             SRE.getKind() != MCSymbolRefExpr::VK_PPC_HA16 &&
-             SRE.getKind() != MCSymbolRefExpr::VK_PPC_LO16)
+             SRE.getKind() != MCSymbolRefExpr::VK_PPC_DARWIN_HA16 &&
+             SRE.getKind() != MCSymbolRefExpr::VK_PPC_DARWIN_LO16)
       OS << '@' << MCSymbolRefExpr::getVariantKindName(SRE.getKind());
 
     return;
@@ -197,8 +197,10 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
   case VK_ARM_GOTTPOFF: return "(gottpoff)";
   case VK_ARM_TLSGD: return "(tlsgd)";
   case VK_PPC_TOC: return "toc";
-  case VK_PPC_HA16: return "ha16";
-  case VK_PPC_LO16: return "lo16";
+  case VK_PPC_DARWIN_HA16: return "ha16";
+  case VK_PPC_DARWIN_LO16: return "lo16";
+  case VK_PPC_GAS_HA16: return "ha";
+  case VK_PPC_GAS_LO16: return "l";
   }
 }
 
@@ -389,7 +391,7 @@ static bool EvaluateSymbolicAdd(const MCAssembler *Asm,
     //   (LHS_A - RHS_B),
     //   (RHS_A - LHS_B),
     //   (RHS_A - RHS_B).
-    // Since we are attempting to be as aggresive as possible about folding, we
+    // Since we are attempting to be as aggressive as possible about folding, we
     // attempt to evaluate each possible alternative.
     AttemptToFoldSymbolOffsetDifference(Asm, Layout, Addrs, InSet, LHS_A, LHS_B,
                                         Result_Cst);
@@ -559,3 +561,45 @@ bool MCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
   assert(0 && "Invalid assembly expression kind!");
   return false;
 }
+
+const MCSection *MCExpr::FindAssociatedSection() const {
+  switch (getKind()) {
+  case Target:
+    // We never look through target specific expressions.
+    return cast<MCTargetExpr>(this)->FindAssociatedSection();
+
+  case Constant:
+    return MCSymbol::AbsolutePseudoSection;
+
+  case SymbolRef: {
+    const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(this);
+    const MCSymbol &Sym = SRE->getSymbol();
+
+    if (Sym.isDefined())
+      return &Sym.getSection();
+
+    return 0;
+  }
+
+  case Unary:
+    return cast<MCUnaryExpr>(this)->getSubExpr()->FindAssociatedSection();
+
+  case Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(this);
+    const MCSection *LHS_S = BE->getLHS()->FindAssociatedSection();
+    const MCSection *RHS_S = BE->getRHS()->FindAssociatedSection();
+
+    // If either section is absolute, return the other.
+    if (LHS_S == MCSymbol::AbsolutePseudoSection)
+      return RHS_S;
+    if (RHS_S == MCSymbol::AbsolutePseudoSection)
+      return LHS_S;
+
+    // Otherwise, return the first non-null section.
+    return LHS_S ? LHS_S : RHS_S;
+  }
+  }
+
+  assert(0 && "Invalid assembly expression kind!");
+  return 0;
+}
diff --git a/lib/MC/MCInstPrinter.cpp b/lib/MC/MCInstPrinter.cpp
index 212b85e..81a939f 100644
--- a/lib/MC/MCInstPrinter.cpp
+++ b/lib/MC/MCInstPrinter.cpp
@@ -20,7 +20,6 @@ StringRef MCInstPrinter::getOpcodeName(unsigned Opcode) const {
   return "";
 }
 
-StringRef MCInstPrinter::getRegName(unsigned RegNo) const {
+void MCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
   assert(0 && "Target should implement this");
-  return "";
 }
diff --git a/lib/MC/MCLoggingStreamer.cpp b/lib/MC/MCLoggingStreamer.cpp
index 012c7f6..46ea9b8 100644
--- a/lib/MC/MCLoggingStreamer.cpp
+++ b/lib/MC/MCLoggingStreamer.cpp
@@ -154,21 +154,19 @@ public:
   }
 
   virtual void EmitValueImpl(const MCExpr *Value, unsigned Size,
-                             bool isPCRel, unsigned AddrSpace){
+                             unsigned AddrSpace){
     LogCall("EmitValue");
-    return Child->EmitValueImpl(Value, Size, isPCRel, AddrSpace);
+    return Child->EmitValueImpl(Value, Size, AddrSpace);
   }
 
-  virtual void EmitULEB128Value(const MCExpr *Value,
-                                unsigned AddrSpace = 0) {
+  virtual void EmitULEB128Value(const MCExpr *Value) {
     LogCall("EmitULEB128Value");
-    return Child->EmitULEB128Value(Value, AddrSpace);
+    return Child->EmitULEB128Value(Value);
   }
 
-  virtual void EmitSLEB128Value(const MCExpr *Value,
-                                unsigned AddrSpace = 0) {
+  virtual void EmitSLEB128Value(const MCExpr *Value) {
     LogCall("EmitSLEB128Value");
-    return Child->EmitSLEB128Value(Value, AddrSpace);
+    return Child->EmitSLEB128Value(Value);
   }
 
   virtual void EmitGPRel32Value(const MCExpr *Value) {
@@ -215,13 +213,14 @@ public:
 
   virtual void EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
                                      unsigned Column, unsigned Flags,
-                                     unsigned Isa, unsigned Discriminator) {
+                                     unsigned Isa, unsigned Discriminator,
+                                     StringRef FileName) {
     LogCall("EmitDwarfLocDirective",
             "FileNo:" + Twine(FileNo) + " Line:" + Twine(Line) +
             " Column:" + Twine(Column) + " Flags:" + Twine(Flags) +
             " Isa:" + Twine(Isa) + " Discriminator:" + Twine(Discriminator));
             return Child->EmitDwarfLocDirective(FileNo, Line, Column, Flags,
-                                                Isa, Discriminator);
+                                                Isa, Discriminator, FileName);
   }
 
   virtual void EmitInstruction(const MCInst &Inst) {
diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp
index d1f9f5c..12aeb4f 100644
--- a/lib/MC/MCMachOStreamer.cpp
+++ b/lib/MC/MCMachOStreamer.cpp
@@ -44,6 +44,8 @@ public:
 
   virtual void InitSections();
   virtual void EmitLabel(MCSymbol *Symbol);
+  virtual void EmitEHSymAttributes(const MCSymbol *Symbol,
+                                   MCSymbol *EHSymbol);
   virtual void EmitAssemblerFlag(MCAssemblerFlag Flag);
   virtual void EmitThumbFunc(MCSymbol *Func);
   virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value);
@@ -101,6 +103,18 @@ void MCMachOStreamer::InitSections() {
 
 }
 
+void MCMachOStreamer::EmitEHSymAttributes(const MCSymbol *Symbol,
+                                          MCSymbol *EHSymbol) {
+  MCSymbolData &SD =
+    getAssembler().getOrCreateSymbolData(*Symbol);
+  if (SD.isExternal())
+    EmitSymbolAttribute(EHSymbol, MCSA_Global);
+  if (SD.getFlags() & SF_WeakDefinition)
+    EmitSymbolAttribute(EHSymbol, MCSA_WeakDefinition);
+  if (SD.isPrivateExtern())
+    EmitSymbolAttribute(EHSymbol, MCSA_PrivateExtern);
+}
+
 void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) {
   assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
 
@@ -363,6 +377,8 @@ void MCMachOStreamer::EmitInstToData(const MCInst &Inst) {
 }
 
 void MCMachOStreamer::Finish() {
+  EmitFrames(true);
+
   // We have to set the fragment atom associations so we can relax properly for
   // Mach-O.
 
diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp
index 08ddf01..f38b822 100644
--- a/lib/MC/MCNullStreamer.cpp
+++ b/lib/MC/MCNullStreamer.cpp
@@ -67,11 +67,9 @@ namespace {
     virtual void EmitBytes(StringRef Data, unsigned AddrSpace) {}
 
     virtual void EmitValueImpl(const MCExpr *Value, unsigned Size,
-                               bool isPCRel, unsigned AddrSpace) {}
-    virtual void EmitULEB128Value(const MCExpr *Value,
-                                  unsigned AddrSpace = 0) {}
-    virtual void EmitSLEB128Value(const MCExpr *Value,
-                                  unsigned AddrSpace = 0) {}
+                               unsigned AddrSpace) {}
+    virtual void EmitULEB128Value(const MCExpr *Value) {}
+    virtual void EmitSLEB128Value(const MCExpr *Value) {}
     virtual void EmitGPRel32Value(const MCExpr *Value) {}
     virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0,
                                       unsigned ValueSize = 1,
@@ -89,7 +87,8 @@ namespace {
     }
     virtual void EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
                                        unsigned Column, unsigned Flags,
-                                       unsigned Isa, unsigned Discriminator) {}
+                                       unsigned Isa, unsigned Discriminator,
+                                       StringRef FileName) {}
     virtual void EmitInstruction(const MCInst &Inst) {}
 
     virtual void Finish() {}
diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp
index ef22eaa..e230c53 100644
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp
@@ -90,7 +90,7 @@ const MCExpr *MCObjectStreamer::AddValueSymbols(const MCExpr *Value) {
 }
 
 void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
-                                     bool isPCRel, unsigned AddrSpace) {
+                                     unsigned AddrSpace) {
   assert(AddrSpace == 0 && "Address space must be 0!");
   MCDataFragment *DF = getOrCreateDataFragment();
 
@@ -102,15 +102,12 @@ void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
   }
   DF->addFixup(MCFixup::Create(DF->getContents().size(),
                                Value,
-                               MCFixup::getKindForSize(Size, isPCRel)));
+                               MCFixup::getKindForSize(Size, false)));
   DF->getContents().resize(DF->getContents().size() + Size, 0);
 }
 
 void MCObjectStreamer::EmitLabel(MCSymbol *Symbol) {
-  assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
-  assert(getCurrentSection() && "Cannot emit before setting section!");
-
-  Symbol->setSection(*getCurrentSection());
+  MCStreamer::EmitLabel(Symbol);
 
   MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
 
@@ -124,23 +121,23 @@ void MCObjectStreamer::EmitLabel(MCSymbol *Symbol) {
   SD.setOffset(F->getContents().size());
 }
 
-void MCObjectStreamer::EmitULEB128Value(const MCExpr *Value,
-                                        unsigned AddrSpace) {
+void MCObjectStreamer::EmitULEB128Value(const MCExpr *Value) {
   int64_t IntValue;
   if (Value->EvaluateAsAbsolute(IntValue, getAssembler())) {
-    EmitULEB128IntValue(IntValue, AddrSpace);
+    EmitULEB128IntValue(IntValue);
     return;
   }
+  Value = ForceExpAbs(Value);
   new MCLEBFragment(*Value, false, getCurrentSectionData());
 }
 
-void MCObjectStreamer::EmitSLEB128Value(const MCExpr *Value,
-                                        unsigned AddrSpace) {
+void MCObjectStreamer::EmitSLEB128Value(const MCExpr *Value) {
   int64_t IntValue;
   if (Value->EvaluateAsAbsolute(IntValue, getAssembler())) {
-    EmitSLEB128IntValue(IntValue, AddrSpace);
+    EmitSLEB128IntValue(IntValue);
     return;
   }
+  Value = ForceExpAbs(Value);
   new MCLEBFragment(*Value, true, getCurrentSectionData());
 }
 
@@ -191,30 +188,11 @@ void MCObjectStreamer::EmitInstruction(const MCInst &Inst) {
 void MCObjectStreamer::EmitInstToFragment(const MCInst &Inst) {
   MCInstFragment *IF = new MCInstFragment(Inst, getCurrentSectionData());
 
-  raw_svector_ostream VecOS(IF->getCode());
+  SmallString<128> Code;
+  raw_svector_ostream VecOS(Code);
   getAssembler().getEmitter().EncodeInstruction(Inst, VecOS, IF->getFixups());
-}
-
-static const MCExpr *BuildSymbolDiff(MCContext &Context,
-                                     const MCSymbol *A, const MCSymbol *B) {
-  MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
-  const MCExpr *ARef =
-    MCSymbolRefExpr::Create(A, Variant, Context);
-  const MCExpr *BRef =
-    MCSymbolRefExpr::Create(B, Variant, Context);
-  const MCExpr *AddrDelta =
-    MCBinaryExpr::Create(MCBinaryExpr::Sub, ARef, BRef, Context);
-  return AddrDelta;
-}
-
-static const MCExpr *ForceExpAbs(MCObjectStreamer *Streamer,
-                                  MCContext &Context, const MCExpr* Expr) {
- if (Context.getAsmInfo().hasAggressiveSymbolFolding())
-   return Expr;
-
- MCSymbol *ABS = Context.CreateTempSymbol();
- Streamer->EmitAssignment(ABS, Expr);
- return MCSymbolRefExpr::Create(ABS, Context);
+  VecOS.flush();
+  IF->getCode().append(Code.begin(), Code.end());
 }
 
 void MCObjectStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta,
@@ -231,7 +209,7 @@ void MCObjectStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta,
     MCDwarfLineAddr::Emit(this, LineDelta, Res);
     return;
   }
-  AddrDelta = ForceExpAbs(this, getContext(), AddrDelta);
+  AddrDelta = ForceExpAbs(AddrDelta);
   new MCDwarfLineAddrFragment(LineDelta, *AddrDelta, getCurrentSectionData());
 }
 
@@ -243,7 +221,7 @@ void MCObjectStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
     MCDwarfFrameEmitter::EmitAdvanceLoc(*this, Res);
     return;
   }
-  AddrDelta = ForceExpAbs(this, getContext(), AddrDelta);
+  AddrDelta = ForceExpAbs(AddrDelta);
   new MCDwarfCallFrameFragment(*AddrDelta, getCurrentSectionData());
 }
 
diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp
index 6bd8986..0c1f8f0 100644
--- a/lib/MC/MCParser/AsmLexer.cpp
+++ b/lib/MC/MCParser/AsmLexer.cpp
@@ -213,13 +213,13 @@ AsmToken AsmLexer::LexDigit() {
 
     // Requires at least one binary digit.
     if (CurPtr == NumStart)
-      return ReturnError(TokStart, "Invalid binary number");
+      return ReturnError(TokStart, "invalid binary number");
 
     StringRef Result(TokStart, CurPtr - TokStart);
 
     long long Value;
     if (Result.substr(2).getAsInteger(2, Value))
-      return ReturnError(TokStart, "Invalid binary number");
+      return ReturnError(TokStart, "invalid binary number");
 
     // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
     // suffixes on integer literals.
@@ -236,11 +236,11 @@ AsmToken AsmLexer::LexDigit() {
 
     // Requires at least one hex digit.
     if (CurPtr == NumStart)
-      return ReturnError(CurPtr-2, "Invalid hexadecimal number");
+      return ReturnError(CurPtr-2, "invalid hexadecimal number");
 
     unsigned long long Result;
     if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
-      return ReturnError(TokStart, "Invalid hexadecimal number");
+      return ReturnError(TokStart, "invalid hexadecimal number");
 
     // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
     // suffixes on integer literals.
@@ -251,13 +251,13 @@ AsmToken AsmLexer::LexDigit() {
   }
 
   // Must be an octal number, it starts with 0.
-  while (*CurPtr >= '0' && *CurPtr <= '7')
+  while (*CurPtr >= '0' && *CurPtr <= '9')
     ++CurPtr;
 
   StringRef Result(TokStart, CurPtr - TokStart);
   long long Value;
   if (Result.getAsInteger(8, Value))
-    return ReturnError(TokStart, "Invalid octal number");
+    return ReturnError(TokStart, "invalid octal number");
 
   // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
   // suffixes on integer literals.
@@ -388,6 +388,7 @@ AsmToken AsmLexer::LexToken() {
   case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
   case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
   case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1));
+  case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
   case '=':
     if (*CurPtr == '=')
       return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index 09c92b8..bbd6635 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -27,6 +27,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCDwarf.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/raw_ostream.h"
@@ -36,15 +37,21 @@
 #include <vector>
 using namespace llvm;
 
+static cl::opt<bool>
+FatalAssemblerWarnings("fatal-assembler-warnings",
+                       cl::desc("Consider warnings as error"));
+
 namespace {
 
 /// \brief Helper class for tracking macro definitions.
 struct Macro {
   StringRef Name;
   StringRef Body;
+  std::vector<StringRef> Parameters;
 
 public:
-  Macro(StringRef N, StringRef B) : Name(N), Body(B) {}
+  Macro(StringRef N, StringRef B, const std::vector<StringRef> &P) :
+    Name(N), Body(B), Parameters(P) {}
 };
 
 /// \brief Helper class for storing information about an active macro
@@ -64,7 +71,7 @@ struct MacroInstantiation {
 
 public:
   MacroInstantiation(const Macro *M, SMLoc IL, SMLoc EL,
-                     const std::vector<std::vector<AsmToken> > &A);
+                     MemoryBuffer *I);
 };
 
 /// \brief The concrete assembly parser instance.
@@ -77,6 +84,7 @@ private:
   AsmLexer Lexer;
   MCContext &Ctx;
   MCStreamer &Out;
+  const MCAsmInfo &MAI;
   SourceMgr &SrcMgr;
   MCAsmParserExtension *GenericParser;
   MCAsmParserExtension *PlatformParser;
@@ -128,7 +136,7 @@ public:
   virtual MCContext &getContext() { return Ctx; }
   virtual MCStreamer &getStreamer() { return Out; }
 
-  virtual void Warning(SMLoc L, const Twine &Meg);
+  virtual bool Warning(SMLoc L, const Twine &Msg);
   virtual bool Error(SMLoc L, const Twine &Msg);
 
   const AsmToken &Lex();
@@ -146,11 +154,16 @@ private:
   bool ParseStatement();
 
   bool HandleMacroEntry(StringRef Name, SMLoc NameLoc, const Macro *M);
+  bool expandMacro(SmallString<256> &Buf, StringRef Body,
+                   const std::vector<StringRef> &Parameters,
+                   const std::vector<std::vector<AsmToken> > &A,
+                   const SMLoc &L);
   void HandleMacroExit();
 
   void PrintMacroInstantiations();
-  void PrintMessage(SMLoc Loc, const Twine &Msg, const char *Type) const {
-    SrcMgr.PrintMessage(Loc, Msg, Type);
+  void PrintMessage(SMLoc Loc, const Twine &Msg, const char *Type,
+                    bool ShowLine = true) const {
+    SrcMgr.PrintMessage(Loc, Msg, Type, ShowLine);
   }
 
   /// EnterIncludeFile - Enter the specified file. This returns true on failure.
@@ -243,6 +256,8 @@ public:
     AddDirectiveHandler<&GenericAsmParser::ParseDirectiveStabs>(".stabs");
 
     // CFI directives.
+    AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFISections>(
+                                                               ".cfi_sections");
     AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIStartProc>(
                                                               ".cfi_startproc");
     AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIEndProc>(
@@ -251,10 +266,14 @@ public:
                                                          ".cfi_def_cfa");
     AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIDefCfaOffset>(
                                                          ".cfi_def_cfa_offset");
+    AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIAdjustCfaOffset>(
+                                                      ".cfi_adjust_cfa_offset");
     AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIDefCfaRegister>(
                                                        ".cfi_def_cfa_register");
     AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIOffset>(
                                                                  ".cfi_offset");
+    AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIRelOffset>(
+                                                             ".cfi_rel_offset");
     AddDirectiveHandler<
      &GenericAsmParser::ParseDirectiveCFIPersonalityOrLsda>(".cfi_personality");
     AddDirectiveHandler<
@@ -263,6 +282,8 @@ public:
       &GenericAsmParser::ParseDirectiveCFIRememberState>(".cfi_remember_state");
     AddDirectiveHandler<
       &GenericAsmParser::ParseDirectiveCFIRestoreState>(".cfi_restore_state");
+    AddDirectiveHandler<
+      &GenericAsmParser::ParseDirectiveCFISameValue>(".cfi_same_value");
 
     // Macro directives.
     AddDirectiveHandler<&GenericAsmParser::ParseDirectiveMacrosOnOff>(
@@ -283,15 +304,19 @@ public:
   bool ParseDirectiveLine(StringRef, SMLoc DirectiveLoc);
   bool ParseDirectiveLoc(StringRef, SMLoc DirectiveLoc);
   bool ParseDirectiveStabs(StringRef, SMLoc DirectiveLoc);
+  bool ParseDirectiveCFISections(StringRef, SMLoc DirectiveLoc);
   bool ParseDirectiveCFIStartProc(StringRef, SMLoc DirectiveLoc);
   bool ParseDirectiveCFIEndProc(StringRef, SMLoc DirectiveLoc);
   bool ParseDirectiveCFIDefCfa(StringRef, SMLoc DirectiveLoc);
   bool ParseDirectiveCFIDefCfaOffset(StringRef, SMLoc DirectiveLoc);
+  bool ParseDirectiveCFIAdjustCfaOffset(StringRef, SMLoc DirectiveLoc);
   bool ParseDirectiveCFIDefCfaRegister(StringRef, SMLoc DirectiveLoc);
   bool ParseDirectiveCFIOffset(StringRef, SMLoc DirectiveLoc);
+  bool ParseDirectiveCFIRelOffset(StringRef, SMLoc DirectiveLoc);
   bool ParseDirectiveCFIPersonalityOrLsda(StringRef, SMLoc DirectiveLoc);
   bool ParseDirectiveCFIRememberState(StringRef, SMLoc DirectiveLoc);
   bool ParseDirectiveCFIRestoreState(StringRef, SMLoc DirectiveLoc);
+  bool ParseDirectiveCFISameValue(StringRef, SMLoc DirectiveLoc);
 
   bool ParseDirectiveMacrosOnOff(StringRef, SMLoc DirectiveLoc);
   bool ParseDirectiveMacro(StringRef, SMLoc DirectiveLoc);
@@ -314,7 +339,7 @@ enum { DEFAULT_ADDRSPACE = 0 };
 
 AsmParser::AsmParser(const Target &T, SourceMgr &_SM, MCContext &_Ctx,
                      MCStreamer &_Out, const MCAsmInfo &_MAI)
-  : Lexer(_MAI), Ctx(_Ctx), Out(_Out), SrcMgr(_SM),
+  : Lexer(_MAI), Ctx(_Ctx), Out(_Out), MAI(_MAI), SrcMgr(_SM),
     GenericParser(new GenericAsmParser), PlatformParser(0),
     CurBuffer(0), MacrosEnabled(true) {
   Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer));
@@ -358,9 +383,12 @@ void AsmParser::PrintMacroInstantiations() {
                  "note");
 }
 
-void AsmParser::Warning(SMLoc L, const Twine &Msg) {
+bool AsmParser::Warning(SMLoc L, const Twine &Msg) {
+  if (FatalAssemblerWarnings)
+    return Error(L, Msg);
   PrintMessage(L, Msg, "warning");
   PrintMacroInstantiations();
+  return false;
 }
 
 bool AsmParser::Error(SMLoc L, const Twine &Msg) {
@@ -371,7 +399,8 @@ bool AsmParser::Error(SMLoc L, const Twine &Msg) {
 }
 
 bool AsmParser::EnterIncludeFile(const std::string &Filename) {
-  int NewBuf = SrcMgr.AddIncludeFile(Filename, Lexer.getLoc());
+  std::string IncludedFile;
+  int NewBuf = SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile);
   if (NewBuf == -1)
     return true;
 
@@ -439,6 +468,29 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
       TokError("unassigned file number: " + Twine(i) + " for .file directives");
   }
 
+  // Check to see that all assembler local symbols were actually defined.
+  // Targets that don't do subsections via symbols may not want this, though,
+  // so conservatively exclude them. Only do this if we're finalizing, though,
+  // as otherwise we won't necessarilly have seen everything yet.
+  if (!NoFinalize && MAI.hasSubsectionsViaSymbols()) {
+    const MCContext::SymbolTable &Symbols = getContext().getSymbols();
+    for (MCContext::SymbolTable::const_iterator i = Symbols.begin(),
+         e = Symbols.end();
+         i != e; ++i) {
+      MCSymbol *Sym = i->getValue();
+      // Variable symbols may not be marked as defined, so check those
+      // explicitly. If we know it's a variable, we have a definition for
+      // the purposes of this check.
+      if (Sym->isTemporary() && !Sym->isVariable() && !Sym->isDefined())
+        // FIXME: We would really like to refer back to where the symbol was
+        // first referenced for a source location. We need to add something
+        // to track that. Currently, we just point to the end of the file.
+        PrintMessage(getLexer().getLoc(), "assembler local symbol '" +
+                     Sym->getName() + "' not defined", "error", false);
+    }
+  }
+
+
   // Finalize the output stream if there are no errors and if the client wants
   // us to.
   if (!HadError && !NoFinalize)
@@ -517,6 +569,9 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
   switch (Lexer.getKind()) {
   default:
     return TokError("unknown token in expression");
+  // If we have an error assume that we've already handled it.
+  case AsmToken::Error:
+    return true;
   case AsmToken::Exclaim:
     Lex(); // Eat the operator.
     if (ParsePrimaryExpr(Res, EndLoc))
@@ -530,7 +585,7 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
 
     StringRef Identifier;
     if (ParseIdentifier(Identifier))
-      return false;
+      return true;
 
     // This is a symbol reference.
     std::pair<StringRef, StringRef> Split = Identifier.split('@');
@@ -1114,9 +1169,9 @@ bool AsmParser::ParseStatement() {
     if (!getTargetParser().ParseDirective(ID))
       return false;
 
-    Warning(IDLoc, "ignoring directive for now");
+    bool retval = Warning(IDLoc, "ignoring directive for now");
     EatToEndOfStatement();
-    return false;
+    return retval;
   }
 
   CheckForValidSection();
@@ -1159,27 +1214,33 @@ bool AsmParser::ParseStatement() {
   return false;
 }
 
-MacroInstantiation::MacroInstantiation(const Macro *M, SMLoc IL, SMLoc EL,
-                                   const std::vector<std::vector<AsmToken> > &A)
-  : TheMacro(M), InstantiationLoc(IL), ExitLoc(EL)
-{
-  // Macro instantiation is lexical, unfortunately. We construct a new buffer
-  // to hold the macro body with substitutions.
-  SmallString<256> Buf;
+bool AsmParser::expandMacro(SmallString<256> &Buf, StringRef Body,
+                            const std::vector<StringRef> &Parameters,
+                            const std::vector<std::vector<AsmToken> > &A,
+                            const SMLoc &L) {
   raw_svector_ostream OS(Buf);
+  unsigned NParameters = Parameters.size();
+  if (NParameters != 0 && NParameters != A.size())
+    return Error(L, "Wrong number of arguments");
 
-  StringRef Body = M->Body;
   while (!Body.empty()) {
     // Scan for the next substitution.
     std::size_t End = Body.size(), Pos = 0;
     for (; Pos != End; ++Pos) {
       // Check for a substitution or escape.
-      if (Body[Pos] != '$' || Pos + 1 == End)
-        continue;
-
-      char Next = Body[Pos + 1];
-      if (Next == '$' || Next == 'n' || isdigit(Next))
-        break;
+      if (!NParameters) {
+        // This macro has no parameters, look for $0, $1, etc.
+        if (Body[Pos] != '$' || Pos + 1 == End)
+          continue;
+
+        char Next = Body[Pos + 1];
+        if (Next == '$' || Next == 'n' || isdigit(Next))
+          break;
+      } else {
+        // This macro has parameters, look for \foo, \bar, etc.
+        if (Body[Pos] == '\\' && Pos + 1 != End)
+          break;
+      }
     }
 
     // Add the prefix.
@@ -1189,41 +1250,69 @@ MacroInstantiation::MacroInstantiation(const Macro *M, SMLoc IL, SMLoc EL,
     if (Pos == End)
       break;
 
-    switch (Body[Pos+1]) {
-       // $$ => $
-    case '$':
-      OS << '$';
-      break;
+    if (!NParameters) {
+      switch (Body[Pos+1]) {
+        // $$ => $
+      case '$':
+        OS << '$';
+        break;
 
-      // $n => number of arguments
-    case 'n':
-      OS << A.size();
-      break;
+        // $n => number of arguments
+      case 'n':
+        OS << A.size();
+        break;
 
-       // $[0-9] => argument
-    default: {
-      // Missing arguments are ignored.
-      unsigned Index = Body[Pos+1] - '0';
-      if (Index >= A.size())
+        // $[0-9] => argument
+      default: {
+        // Missing arguments are ignored.
+        unsigned Index = Body[Pos+1] - '0';
+        if (Index >= A.size())
+          break;
+
+        // Otherwise substitute with the token values, with spaces eliminated.
+        for (std::vector<AsmToken>::const_iterator it = A[Index].begin(),
+               ie = A[Index].end(); it != ie; ++it)
+          OS << it->getString();
         break;
+      }
+      }
+      Pos += 2;
+    } else {
+      unsigned I = Pos + 1;
+      while (isalnum(Body[I]) && I + 1 != End)
+        ++I;
+
+      const char *Begin = Body.data() + Pos +1;
+      StringRef Argument(Begin, I - (Pos +1));
+      unsigned Index = 0;
+      for (; Index < NParameters; ++Index)
+        if (Parameters[Index] == Argument)
+          break;
+
+      // FIXME: We should error at the macro definition.
+      if (Index == NParameters)
+        return Error(L, "Parameter not found");
 
-      // Otherwise substitute with the token values, with spaces eliminated.
       for (std::vector<AsmToken>::const_iterator it = A[Index].begin(),
              ie = A[Index].end(); it != ie; ++it)
         OS << it->getString();
-      break;
-    }
-    }
 
+      Pos += 1 + Argument.size();
+    }
     // Update the scan point.
-    Body = Body.substr(Pos + 2);
+    Body = Body.substr(Pos);
   }
 
   // We include the .endmacro in the buffer as our queue to exit the macro
   // instantiation.
   OS << ".endmacro\n";
+  return false;
+}
 
-  Instantiation = MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
+MacroInstantiation::MacroInstantiation(const Macro *M, SMLoc IL, SMLoc EL,
+                                       MemoryBuffer *I)
+  : TheMacro(M), Instantiation(I), InstantiationLoc(IL), ExitLoc(EL)
+{
 }
 
 bool AsmParser::HandleMacroEntry(StringRef Name, SMLoc NameLoc,
@@ -1260,11 +1349,22 @@ bool AsmParser::HandleMacroEntry(StringRef Name, SMLoc NameLoc,
     Lex();
   }
 
+  // Macro instantiation is lexical, unfortunately. We construct a new buffer
+  // to hold the macro body with substitutions.
+  SmallString<256> Buf;
+  StringRef Body = M->Body;
+
+  if (expandMacro(Buf, Body, M->Parameters, MacroArguments, getTok().getLoc()))
+    return true;
+
+  MemoryBuffer *Instantiation =
+    MemoryBuffer::getMemBufferCopy(Buf.str(), "<instantiation>");
+
   // Create the macro instantiation object and add to the current macro
   // instantiation stack.
   MacroInstantiation *MI = new MacroInstantiation(M, NameLoc,
                                                   getTok().getLoc(),
-                                                  MacroArguments);
+                                                  Instantiation);
   ActiveMacros.push_back(MI);
 
   // Jump to the macro instantiation and prime the lexer.
@@ -1336,7 +1436,7 @@ bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef) {
     // FIXME: Diagnose assignment to protected identifier (e.g., register name).
     if (Sym->isUndefined() && !Sym->isUsed() && !Sym->isVariable())
       ; // Allow redefinitions of undefined symbols only used in directives.
-    else if (!Sym->isUndefined() && (!Sym->isAbsolute() || !allow_redef))
+    else if (!Sym->isUndefined() && (!Sym->isVariable() || !allow_redef))
       return Error(EqualLoc, "redefinition of '" + Name + "'");
     else if (!Sym->isVariable())
       return Error(EqualLoc, "invalid assignment to '" + Name + "'");
@@ -2241,7 +2341,7 @@ bool GenericAsmParser::ParseDirectiveLoc(StringRef, SMLoc DirectiveLoc) {
   }
 
   getStreamer().EmitDwarfLocDirective(FileNumber, LineNumber, ColumnPos, Flags,
-                                      Isa, Discriminator);
+                                      Isa, Discriminator, StringRef());
 
   return false;
 }
@@ -2253,17 +2353,52 @@ bool GenericAsmParser::ParseDirectiveStabs(StringRef Directive,
   return TokError("unsupported directive '" + Directive + "'");
 }
 
+/// ParseDirectiveCFISections
+/// ::= .cfi_sections section [, section]
+bool GenericAsmParser::ParseDirectiveCFISections(StringRef,
+                                                 SMLoc DirectiveLoc) {
+  StringRef Name;
+  bool EH = false;
+  bool Debug = false;
+
+  if (getParser().ParseIdentifier(Name))
+    return TokError("Expected an identifier");
+
+  if (Name == ".eh_frame")
+    EH = true;
+  else if (Name == ".debug_frame")
+    Debug = true;
+
+  if (getLexer().is(AsmToken::Comma)) {
+    Lex();
+
+    if (getParser().ParseIdentifier(Name))
+      return TokError("Expected an identifier");
+
+    if (Name == ".eh_frame")
+      EH = true;
+    else if (Name == ".debug_frame")
+      Debug = true;
+  }
+
+  getStreamer().EmitCFISections(EH, Debug);
+
+  return false;
+}
+
 /// ParseDirectiveCFIStartProc
 /// ::= .cfi_startproc
 bool GenericAsmParser::ParseDirectiveCFIStartProc(StringRef,
                                                   SMLoc DirectiveLoc) {
-  return getStreamer().EmitCFIStartProc();
+  getStreamer().EmitCFIStartProc();
+  return false;
 }
 
 /// ParseDirectiveCFIEndProc
 /// ::= .cfi_endproc
 bool GenericAsmParser::ParseDirectiveCFIEndProc(StringRef, SMLoc DirectiveLoc) {
-  return getStreamer().EmitCFIEndProc();
+  getStreamer().EmitCFIEndProc();
+  return false;
 }
 
 /// ParseRegisterOrRegisterNumber - parse register name or number.
@@ -2271,7 +2406,7 @@ bool GenericAsmParser::ParseRegisterOrRegisterNumber(int64_t &Register,
                                                      SMLoc DirectiveLoc) {
   unsigned RegNo;
 
-  if (getLexer().is(AsmToken::Percent)) {
+  if (getLexer().isNot(AsmToken::Integer)) {
     if (getParser().getTargetParser().ParseRegister(RegNo, DirectiveLoc,
       DirectiveLoc))
       return true;
@@ -2298,7 +2433,8 @@ bool GenericAsmParser::ParseDirectiveCFIDefCfa(StringRef,
   if (getParser().ParseAbsoluteExpression(Offset))
     return true;
 
-  return getStreamer().EmitCFIDefCfa(Register, Offset);
+  getStreamer().EmitCFIDefCfa(Register, Offset);
+  return false;
 }
 
 /// ParseDirectiveCFIDefCfaOffset
@@ -2309,7 +2445,20 @@ bool GenericAsmParser::ParseDirectiveCFIDefCfaOffset(StringRef,
   if (getParser().ParseAbsoluteExpression(Offset))
     return true;
 
-  return getStreamer().EmitCFIDefCfaOffset(Offset);
+  getStreamer().EmitCFIDefCfaOffset(Offset);
+  return false;
+}
+
+/// ParseDirectiveCFIAdjustCfaOffset
+/// ::= .cfi_adjust_cfa_offset adjustment
+bool GenericAsmParser::ParseDirectiveCFIAdjustCfaOffset(StringRef,
+                                                        SMLoc DirectiveLoc) {
+  int64_t Adjustment = 0;
+  if (getParser().ParseAbsoluteExpression(Adjustment))
+    return true;
+
+  getStreamer().EmitCFIAdjustCfaOffset(Adjustment);
+  return false;
 }
 
 /// ParseDirectiveCFIDefCfaRegister
@@ -2320,11 +2469,12 @@ bool GenericAsmParser::ParseDirectiveCFIDefCfaRegister(StringRef,
   if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
     return true;
 
-  return getStreamer().EmitCFIDefCfaRegister(Register);
+  getStreamer().EmitCFIDefCfaRegister(Register);
+  return false;
 }
 
 /// ParseDirectiveCFIOffset
-/// ::= .cfi_off register, offset
+/// ::= .cfi_offset register, offset
 bool GenericAsmParser::ParseDirectiveCFIOffset(StringRef, SMLoc DirectiveLoc) {
   int64_t Register = 0;
   int64_t Offset = 0;
@@ -2339,7 +2489,29 @@ bool GenericAsmParser::ParseDirectiveCFIOffset(StringRef, SMLoc DirectiveLoc) {
   if (getParser().ParseAbsoluteExpression(Offset))
     return true;
 
-  return getStreamer().EmitCFIOffset(Register, Offset);
+  getStreamer().EmitCFIOffset(Register, Offset);
+  return false;
+}
+
+/// ParseDirectiveCFIRelOffset
+/// ::= .cfi_rel_offset register, offset
+bool GenericAsmParser::ParseDirectiveCFIRelOffset(StringRef,
+                                                  SMLoc DirectiveLoc) {
+  int64_t Register = 0;
+
+  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+    return true;
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token in directive");
+  Lex();
+
+  int64_t Offset = 0;
+  if (getParser().ParseAbsoluteExpression(Offset))
+    return true;
+
+  getStreamer().EmitCFIRelOffset(Register, Offset);
+  return false;
 }
 
 static bool isValidEncoding(int64_t Encoding) {
@@ -2389,25 +2561,42 @@ bool GenericAsmParser::ParseDirectiveCFIPersonalityOrLsda(StringRef IDVal,
   MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
 
   if (IDVal == ".cfi_personality")
-    return getStreamer().EmitCFIPersonality(Sym, Encoding);
+    getStreamer().EmitCFIPersonality(Sym, Encoding);
   else {
     assert(IDVal == ".cfi_lsda");
-    return getStreamer().EmitCFILsda(Sym, Encoding);
+    getStreamer().EmitCFILsda(Sym, Encoding);
   }
+  return false;
 }
 
 /// ParseDirectiveCFIRememberState
 /// ::= .cfi_remember_state
 bool GenericAsmParser::ParseDirectiveCFIRememberState(StringRef IDVal,
                                                       SMLoc DirectiveLoc) {
-  return getStreamer().EmitCFIRememberState();
+  getStreamer().EmitCFIRememberState();
+  return false;
 }
 
 /// ParseDirectiveCFIRestoreState
 /// ::= .cfi_remember_state
 bool GenericAsmParser::ParseDirectiveCFIRestoreState(StringRef IDVal,
                                                      SMLoc DirectiveLoc) {
-  return getStreamer().EmitCFIRestoreState();
+  getStreamer().EmitCFIRestoreState();
+  return false;
+}
+
+/// ParseDirectiveCFISameValue
+/// ::= .cfi_same_value register
+bool GenericAsmParser::ParseDirectiveCFISameValue(StringRef IDVal,
+                                                  SMLoc DirectiveLoc) {
+  int64_t Register = 0;
+
+  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+    return true;
+
+  getStreamer().EmitCFISameValue(Register);
+
+  return false;
 }
 
 /// ParseDirectiveMacrosOnOff
@@ -2425,13 +2614,27 @@ bool GenericAsmParser::ParseDirectiveMacrosOnOff(StringRef Directive,
 }
 
 /// ParseDirectiveMacro
-/// ::= .macro name
+/// ::= .macro name [parameters]
 bool GenericAsmParser::ParseDirectiveMacro(StringRef Directive,
                                            SMLoc DirectiveLoc) {
   StringRef Name;
   if (getParser().ParseIdentifier(Name))
     return TokError("expected identifier in directive");
 
+  std::vector<StringRef> Parameters;
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for(;;) {
+      StringRef Parameter;
+      if (getParser().ParseIdentifier(Parameter))
+        return TokError("expected identifier in directive");
+      Parameters.push_back(Parameter);
+
+      if (getLexer().isNot(AsmToken::Comma))
+        break;
+      Lex();
+    }
+  }
+
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return TokError("unexpected token in '.macro' directive");
 
@@ -2469,7 +2672,7 @@ bool GenericAsmParser::ParseDirectiveMacro(StringRef Directive,
   const char *BodyStart = StartToken.getLoc().getPointer();
   const char *BodyEnd = EndToken.getLoc().getPointer();
   StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
-  getParser().MacroMap[Name] = new Macro(Name, Body);
+  getParser().MacroMap[Name] = new Macro(Name, Body, Parameters);
   return false;
 }
 
diff --git a/lib/MC/MCParser/COFFAsmParser.cpp b/lib/MC/MCParser/COFFAsmParser.cpp
index 5ecab03..64f6355 100644
--- a/lib/MC/MCParser/COFFAsmParser.cpp
+++ b/lib/MC/MCParser/COFFAsmParser.cpp
@@ -14,6 +14,9 @@
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetAsmParser.h"
 #include "llvm/Support/COFF.h"
 using namespace llvm;
 
@@ -41,6 +44,34 @@ class COFFAsmParser : public MCAsmParserExtension {
     AddDirectiveHandler<&COFFAsmParser::ParseDirectiveScl>(".scl");
     AddDirectiveHandler<&COFFAsmParser::ParseDirectiveType>(".type");
     AddDirectiveHandler<&COFFAsmParser::ParseDirectiveEndef>(".endef");
+
+    // Win64 EH directives.
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveStartProc>(
+                                                                   ".seh_proc");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndProc>(
+                                                                ".seh_endproc");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveStartChained>(
+                                                           ".seh_startchained");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndChained>(
+                                                             ".seh_endchained");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveHandler>(
+                                                                ".seh_handler");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveHandlerData>(
+                                                            ".seh_handlerdata");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectivePushReg>(
+                                                                ".seh_pushreg");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSetFrame>(
+                                                               ".seh_setframe");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveAllocStack>(
+                                                             ".seh_stackalloc");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSaveReg>(
+                                                                ".seh_savereg");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSaveXMM>(
+                                                                ".seh_savexmm");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectivePushFrame>(
+                                                              ".seh_pushframe");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndProlog>(
+                                                            ".seh_endprologue");
   }
 
   bool ParseSectionDirectiveText(StringRef, SMLoc) {
@@ -70,6 +101,23 @@ class COFFAsmParser : public MCAsmParserExtension {
   bool ParseDirectiveType(StringRef, SMLoc);
   bool ParseDirectiveEndef(StringRef, SMLoc);
 
+  // Win64 EH directives.
+  bool ParseSEHDirectiveStartProc(StringRef, SMLoc);
+  bool ParseSEHDirectiveEndProc(StringRef, SMLoc);
+  bool ParseSEHDirectiveStartChained(StringRef, SMLoc);
+  bool ParseSEHDirectiveEndChained(StringRef, SMLoc);
+  bool ParseSEHDirectiveHandler(StringRef, SMLoc);
+  bool ParseSEHDirectiveHandlerData(StringRef, SMLoc);
+  bool ParseSEHDirectivePushReg(StringRef, SMLoc);
+  bool ParseSEHDirectiveSetFrame(StringRef, SMLoc);
+  bool ParseSEHDirectiveAllocStack(StringRef, SMLoc);
+  bool ParseSEHDirectiveSaveReg(StringRef, SMLoc);
+  bool ParseSEHDirectiveSaveXMM(StringRef, SMLoc);
+  bool ParseSEHDirectivePushFrame(StringRef, SMLoc);
+  bool ParseSEHDirectiveEndProlog(StringRef, SMLoc);
+
+  bool ParseAtUnwindOrAtExcept(bool &unwind, bool &except);
+  bool ParseSEHRegisterNumber(unsigned &RegNo);
 public:
   COFFAsmParser() {}
 };
@@ -135,6 +183,256 @@ bool COFFAsmParser::ParseDirectiveEndef(StringRef, SMLoc) {
   return false;
 }
 
+bool COFFAsmParser::ParseSEHDirectiveStartProc(StringRef, SMLoc) {
+  StringRef SymbolID;
+  if (getParser().ParseIdentifier(SymbolID))
+    return true;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  MCSymbol *Symbol = getContext().GetOrCreateSymbol(SymbolID);
+
+  Lex();
+  getStreamer().EmitWin64EHStartProc(Symbol);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveEndProc(StringRef, SMLoc) {
+  Lex();
+  getStreamer().EmitWin64EHEndProc();
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveStartChained(StringRef, SMLoc) {
+  Lex();
+  getStreamer().EmitWin64EHStartChained();
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveEndChained(StringRef, SMLoc) {
+  Lex();
+  getStreamer().EmitWin64EHEndChained();
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveHandler(StringRef, SMLoc) {
+  StringRef SymbolID;
+  if (getParser().ParseIdentifier(SymbolID))
+    return true;
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("you must specify one or both of @unwind or @except");
+  Lex();
+  bool unwind = false, except = false;
+  if (ParseAtUnwindOrAtExcept(unwind, except))
+    return true;
+  if (getLexer().is(AsmToken::Comma)) {
+    Lex();
+    if (ParseAtUnwindOrAtExcept(unwind, except))
+      return true;
+  }
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  MCSymbol *handler = getContext().GetOrCreateSymbol(SymbolID);
+
+  Lex();
+  getStreamer().EmitWin64EHHandler(handler, unwind, except);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveHandlerData(StringRef, SMLoc) {
+  Lex();
+  getStreamer().EmitWin64EHHandlerData();
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectivePushReg(StringRef, SMLoc L) {
+  unsigned Reg;
+  if (ParseSEHRegisterNumber(Reg))
+    return true;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  Lex();
+  getStreamer().EmitWin64EHPushReg(Reg);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveSetFrame(StringRef, SMLoc L) {
+  unsigned Reg;
+  int64_t Off;
+  if (ParseSEHRegisterNumber(Reg))
+    return true;
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("you must specify a stack pointer offset");
+
+  Lex();
+  SMLoc startLoc = getLexer().getLoc();
+  if (getParser().ParseAbsoluteExpression(Off))
+    return true;
+
+  if (Off & 0x0F)
+    return Error(startLoc, "offset is not a multiple of 16");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  Lex();
+  getStreamer().EmitWin64EHSetFrame(Reg, Off);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveAllocStack(StringRef, SMLoc) {
+  int64_t Size;
+  SMLoc startLoc = getLexer().getLoc();
+  if (getParser().ParseAbsoluteExpression(Size))
+    return true;
+
+  if (Size & 7)
+    return Error(startLoc, "size is not a multiple of 8");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  Lex();
+  getStreamer().EmitWin64EHAllocStack(Size);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveSaveReg(StringRef, SMLoc L) {
+  unsigned Reg;
+  int64_t Off;
+  if (ParseSEHRegisterNumber(Reg))
+    return true;
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("you must specify an offset on the stack");
+
+  Lex();
+  SMLoc startLoc = getLexer().getLoc();
+  if (getParser().ParseAbsoluteExpression(Off))
+    return true;
+
+  if (Off & 7)
+    return Error(startLoc, "size is not a multiple of 8");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  Lex();
+  // FIXME: Err on %xmm* registers
+  getStreamer().EmitWin64EHSaveReg(Reg, Off);
+  return false;
+}
+
+// FIXME: This method is inherently x86-specific. It should really be in the
+// x86 backend.
+bool COFFAsmParser::ParseSEHDirectiveSaveXMM(StringRef, SMLoc L) {
+  unsigned Reg;
+  int64_t Off;
+  if (ParseSEHRegisterNumber(Reg))
+    return true;
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("you must specify an offset on the stack");
+
+  Lex();
+  SMLoc startLoc = getLexer().getLoc();
+  if (getParser().ParseAbsoluteExpression(Off))
+    return true;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  if (Off & 0x0F)
+    return Error(startLoc, "offset is not a multiple of 16");
+
+  Lex();
+  // FIXME: Err on non-%xmm* registers
+  getStreamer().EmitWin64EHSaveXMM(Reg, Off);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectivePushFrame(StringRef, SMLoc) {
+  bool Code = false;
+  StringRef CodeID;
+  if (getLexer().is(AsmToken::At)) {
+    SMLoc startLoc = getLexer().getLoc();
+    Lex();
+    if (!getParser().ParseIdentifier(CodeID)) {
+      if (CodeID != "code")
+        return Error(startLoc, "expected @code");
+      Code = true;
+    }
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  Lex();
+  getStreamer().EmitWin64EHPushFrame(Code);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveEndProlog(StringRef, SMLoc) {
+  Lex();
+  getStreamer().EmitWin64EHEndProlog();
+  return false;
+}
+
+bool COFFAsmParser::ParseAtUnwindOrAtExcept(bool &unwind, bool &except) {
+  StringRef identifier;
+  if (getLexer().isNot(AsmToken::At))
+    return TokError("a handler attribute must begin with '@'");
+  SMLoc startLoc = getLexer().getLoc();
+  Lex();
+  if (getParser().ParseIdentifier(identifier))
+    return Error(startLoc, "expected @unwind or @except");
+  if (identifier == "unwind")
+    unwind = true;
+  else if (identifier == "except")
+    except = true;
+  else
+    return Error(startLoc, "expected @unwind or @except");
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHRegisterNumber(unsigned &RegNo) {
+  SMLoc startLoc = getLexer().getLoc();
+  if (getLexer().is(AsmToken::Percent)) {
+    const TargetAsmInfo &asmInfo = getContext().getTargetAsmInfo();
+    SMLoc endLoc;
+    unsigned LLVMRegNo;
+    if (getParser().getTargetParser().ParseRegister(LLVMRegNo,startLoc,endLoc))
+      return true;
+
+    // Check that this is a non-volatile register.
+    const unsigned *NVRegs = asmInfo.getCalleeSavedRegs();
+    unsigned i;
+    for (i = 0; NVRegs[i] != 0; ++i)
+      if (NVRegs[i] == LLVMRegNo)
+        break;
+    if (NVRegs[i] == 0)
+      return Error(startLoc, "expected non-volatile register");
+
+    int SEHRegNo = asmInfo.getSEHRegNum(LLVMRegNo);
+    if (SEHRegNo < 0)
+      return Error(startLoc,"register can't be represented in SEH unwind info");
+    RegNo = SEHRegNo;
+  }
+  else {
+    int64_t n;
+    if (getParser().ParseAbsoluteExpression(n))
+      return true;
+    if (n > 15)
+      return Error(startLoc, "register number is too high");
+    RegNo = n;
+  }
+
+  return false;
+}
+
 namespace llvm {
 
 MCAsmParserExtension *createCOFFAsmParser() {
diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp
index 3c092cd..6f45068 100644
--- a/lib/MC/MCParser/DarwinAsmParser.cpp
+++ b/lib/MC/MCParser/DarwinAsmParser.cpp
@@ -369,11 +369,9 @@ bool DarwinAsmParser::ParseDirectiveDumpOrLoad(StringRef Directive,
   // FIXME: If/when .dump and .load are implemented they will be done in the
   // the assembly parser and not have any need for an MCStreamer API.
   if (IsDump)
-    Warning(IDLoc, "ignoring directive .dump for now");
+    return Warning(IDLoc, "ignoring directive .dump for now");
   else
-    Warning(IDLoc, "ignoring directive .load for now");
-
-  return false;
+    return Warning(IDLoc, "ignoring directive .load for now");
 }
 
 /// ParseDirectiveLsym
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index 1bd287b..ae3ed0f 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@@ -12,19 +12,48 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetAsmInfo.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Twine.h"
 #include <cstdlib>
 using namespace llvm;
 
-MCStreamer::MCStreamer(MCContext &Ctx) : Context(Ctx) {
+MCStreamer::MCStreamer(MCContext &Ctx) : Context(Ctx), EmitEHFrame(true),
+                                         EmitDebugFrame(false),
+                                         CurrentW64UnwindInfo(0) {
   const MCSection *section = NULL;
   SectionStack.push_back(std::make_pair(section, section));
 }
 
 MCStreamer::~MCStreamer() {
+  for (unsigned i = 0; i < getNumW64UnwindInfos(); ++i)
+    delete W64UnwindInfos[i];
+}
+
+const MCExpr *MCStreamer::BuildSymbolDiff(MCContext &Context,
+                                          const MCSymbol *A,
+                                          const MCSymbol *B) {
+  MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+  const MCExpr *ARef =
+    MCSymbolRefExpr::Create(A, Variant, Context);
+  const MCExpr *BRef =
+    MCSymbolRefExpr::Create(B, Variant, Context);
+  const MCExpr *AddrDelta =
+    MCBinaryExpr::Create(MCBinaryExpr::Sub, ARef, BRef, Context);
+  return AddrDelta;
+}
+
+const MCExpr *MCStreamer::ForceExpAbs(const MCExpr* Expr) {
+  if (Context.getAsmInfo().hasAggressiveSymbolFolding() ||
+      isa<MCSymbolRefExpr>(Expr))
+    return Expr;
+
+  MCSymbol *ABS = Context.CreateTempSymbol();
+  EmitAssignment(ABS, Expr);
+  return MCSymbolRefExpr::Create(ABS, Context);
 }
 
 raw_ostream &MCStreamer::GetCommentOS() {
@@ -52,9 +81,11 @@ void MCStreamer::EmitIntValue(uint64_t Value, unsigned Size,
   assert((isUIntN(8 * Size, Value) || isIntN(8 * Size, Value)) &&
          "Invalid size");
   char buf[8];
-  // FIXME: Endianness assumption.
-  for (unsigned i = 0; i != Size; ++i)
-    buf[i] = uint8_t(Value >> (i * 8));
+  const bool isLittleEndian = Context.getTargetAsmInfo().isLittleEndian();
+  for (unsigned i = 0; i != Size; ++i) {
+    unsigned index = isLittleEndian ? i : (Size - i - 1);
+    buf[i] = uint8_t(Value >> (index * 8));
+  }
   EmitBytes(StringRef(buf, Size), AddrSpace);
 }
 
@@ -78,42 +109,22 @@ void MCStreamer::EmitSLEB128IntValue(int64_t Value, unsigned AddrSpace) {
 
 void MCStreamer::EmitAbsValue(const MCExpr *Value, unsigned Size,
                               unsigned AddrSpace) {
-  if (getContext().getAsmInfo().hasAggressiveSymbolFolding()) {
-    EmitValue(Value, Size, AddrSpace);
-    return;
-  }
-  MCSymbol *ABS = getContext().CreateTempSymbol();
-  EmitAssignment(ABS, Value);
-  EmitSymbolValue(ABS, Size, AddrSpace);
+  const MCExpr *ABS = ForceExpAbs(Value);
+  EmitValue(ABS, Size, AddrSpace);
 }
 
 
 void MCStreamer::EmitValue(const MCExpr *Value, unsigned Size,
                            unsigned AddrSpace) {
-  EmitValueImpl(Value, Size, false, AddrSpace);
-}
-
-void MCStreamer::EmitPCRelValue(const MCExpr *Value, unsigned Size,
-                                unsigned AddrSpace) {
-  EmitValueImpl(Value, Size, true, AddrSpace);
+  EmitValueImpl(Value, Size, AddrSpace);
 }
 
 void MCStreamer::EmitSymbolValue(const MCSymbol *Sym, unsigned Size,
-                                 bool isPCRel, unsigned AddrSpace) {
-  EmitValueImpl(MCSymbolRefExpr::Create(Sym, getContext()), Size, isPCRel,
+                                  unsigned AddrSpace) {
+  EmitValueImpl(MCSymbolRefExpr::Create(Sym, getContext()), Size,
                 AddrSpace);
 }
 
-void MCStreamer::EmitSymbolValue(const MCSymbol *Sym, unsigned Size,
-                                 unsigned AddrSpace) {
-  EmitSymbolValue(Sym, Size, false, AddrSpace);
-}
-
-void MCStreamer::EmitPCRelSymbolValue(const MCSymbol *Sym, unsigned Size,
-                                      unsigned AddrSpace) {
-  EmitSymbolValue(Sym, Size, true, AddrSpace);
-}
-
 void MCStreamer::EmitGPRel32Value(const MCExpr *Value) {
   report_fatal_error("unsupported directive in streamer");
 }
@@ -135,7 +146,8 @@ bool MCStreamer::EmitDwarfFileDirective(unsigned FileNo,
 void MCStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
                                        unsigned Column, unsigned Flags,
                                        unsigned Isa,
-                                       unsigned Discriminator) {
+                                       unsigned Discriminator,
+                                       StringRef FileName) {
   getContext().setCurrentDwarfLoc(FileNo, Line, Column, Flags, Isa,
                                   Discriminator);
 }
@@ -152,28 +164,45 @@ void MCStreamer::EnsureValidFrame() {
     report_fatal_error("No open frame");
 }
 
-bool MCStreamer::EmitCFIStartProc() {
+void MCStreamer::EmitEHSymAttributes(const MCSymbol *Symbol,
+                                     MCSymbol *EHSymbol) {
+}
+
+void MCStreamer::EmitLabel(MCSymbol *Symbol) {
+  assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
+  assert(getCurrentSection() && "Cannot emit before setting section!");
+  Symbol->setSection(*getCurrentSection());
+
+  StringRef Prefix = getContext().getAsmInfo().getPrivateGlobalPrefix();
+  if (!Symbol->getName().startswith(Prefix))
+    LastNonPrivate = Symbol;
+}
+
+void MCStreamer::EmitCFISections(bool EH, bool Debug) {
+  assert(EH || Debug);
+  EmitEHFrame = EH;
+  EmitDebugFrame = Debug;
+}
+
+void MCStreamer::EmitCFIStartProc() {
   MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
-  if (CurFrame && !CurFrame->End) {
+  if (CurFrame && !CurFrame->End)
     report_fatal_error("Starting a frame before finishing the previous one!");
-    return true;
-  }
   MCDwarfFrameInfo Frame;
   Frame.Begin = getContext().CreateTempSymbol();
+  Frame.Function = LastNonPrivate;
   EmitLabel(Frame.Begin);
   FrameInfos.push_back(Frame);
-  return false;
 }
 
-bool MCStreamer::EmitCFIEndProc() {
+void MCStreamer::EmitCFIEndProc() {
   EnsureValidFrame();
   MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
   CurFrame->End = getContext().CreateTempSymbol();
   EmitLabel(CurFrame->End);
-  return false;
 }
 
-bool MCStreamer::EmitCFIDefCfa(int64_t Register, int64_t Offset) {
+void MCStreamer::EmitCFIDefCfa(int64_t Register, int64_t Offset) {
   EnsureValidFrame();
   MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
   MCSymbol *Label = getContext().CreateTempSymbol();
@@ -182,10 +211,9 @@ bool MCStreamer::EmitCFIDefCfa(int64_t Register, int64_t Offset) {
   MachineLocation Source(Register, -Offset);
   MCCFIInstruction Instruction(Label, Dest, Source);
   CurFrame->Instructions.push_back(Instruction);
-  return false;
 }
 
-bool MCStreamer::EmitCFIDefCfaOffset(int64_t Offset) {
+void MCStreamer::EmitCFIDefCfaOffset(int64_t Offset) {
   EnsureValidFrame();
   MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
   MCSymbol *Label = getContext().CreateTempSymbol();
@@ -194,10 +222,20 @@ bool MCStreamer::EmitCFIDefCfaOffset(int64_t Offset) {
   MachineLocation Source(MachineLocation::VirtualFP, -Offset);
   MCCFIInstruction Instruction(Label, Dest, Source);
   CurFrame->Instructions.push_back(Instruction);
-  return false;
 }
 
-bool MCStreamer::EmitCFIDefCfaRegister(int64_t Register) {
+void MCStreamer::EmitCFIAdjustCfaOffset(int64_t Adjustment) {
+  EnsureValidFrame();
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  EmitLabel(Label);
+  MachineLocation Dest(MachineLocation::VirtualFP);
+  MachineLocation Source(MachineLocation::VirtualFP, Adjustment);
+  MCCFIInstruction Instruction(MCCFIInstruction::RelMove, Label, Dest, Source);
+  CurFrame->Instructions.push_back(Instruction);
+}
+
+void MCStreamer::EmitCFIDefCfaRegister(int64_t Register) {
   EnsureValidFrame();
   MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
   MCSymbol *Label = getContext().CreateTempSymbol();
@@ -206,10 +244,9 @@ bool MCStreamer::EmitCFIDefCfaRegister(int64_t Register) {
   MachineLocation Source(MachineLocation::VirtualFP);
   MCCFIInstruction Instruction(Label, Dest, Source);
   CurFrame->Instructions.push_back(Instruction);
-  return false;
 }
 
-bool MCStreamer::EmitCFIOffset(int64_t Register, int64_t Offset) {
+void MCStreamer::EmitCFIOffset(int64_t Register, int64_t Offset) {
   EnsureValidFrame();
   MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
   MCSymbol *Label = getContext().CreateTempSymbol();
@@ -218,37 +255,44 @@ bool MCStreamer::EmitCFIOffset(int64_t Register, int64_t Offset) {
   MachineLocation Source(Register, Offset);
   MCCFIInstruction Instruction(Label, Dest, Source);
   CurFrame->Instructions.push_back(Instruction);
-  return false;
 }
 
-bool MCStreamer::EmitCFIPersonality(const MCSymbol *Sym,
+void MCStreamer::EmitCFIRelOffset(int64_t Register, int64_t Offset) {
+  EnsureValidFrame();
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  EmitLabel(Label);
+  MachineLocation Dest(Register, Offset);
+  MachineLocation Source(Register, Offset);
+  MCCFIInstruction Instruction(MCCFIInstruction::RelMove, Label, Dest, Source);
+  CurFrame->Instructions.push_back(Instruction);
+}
+
+void MCStreamer::EmitCFIPersonality(const MCSymbol *Sym,
                                     unsigned Encoding) {
   EnsureValidFrame();
   MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
   CurFrame->Personality = Sym;
   CurFrame->PersonalityEncoding = Encoding;
-  return false;
 }
 
-bool MCStreamer::EmitCFILsda(const MCSymbol *Sym, unsigned Encoding) {
+void MCStreamer::EmitCFILsda(const MCSymbol *Sym, unsigned Encoding) {
   EnsureValidFrame();
   MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
   CurFrame->Lsda = Sym;
   CurFrame->LsdaEncoding = Encoding;
-  return false;
 }
 
-bool MCStreamer::EmitCFIRememberState() {
+void MCStreamer::EmitCFIRememberState() {
   EnsureValidFrame();
   MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
   MCSymbol *Label = getContext().CreateTempSymbol();
   EmitLabel(Label);
   MCCFIInstruction Instruction(MCCFIInstruction::Remember, Label);
   CurFrame->Instructions.push_back(Instruction);
-  return false;
 }
 
-bool MCStreamer::EmitCFIRestoreState() {
+void MCStreamer::EmitCFIRestoreState() {
   // FIXME: Error if there is no matching cfi_remember_state.
   EnsureValidFrame();
   MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
@@ -256,7 +300,165 @@ bool MCStreamer::EmitCFIRestoreState() {
   EmitLabel(Label);
   MCCFIInstruction Instruction(MCCFIInstruction::Restore, Label);
   CurFrame->Instructions.push_back(Instruction);
-  return false;
+}
+
+void MCStreamer::EmitCFISameValue(int64_t Register) {
+  EnsureValidFrame();
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  EmitLabel(Label);
+  MCCFIInstruction Instruction(MCCFIInstruction::SameValue, Label, Register);
+  CurFrame->Instructions.push_back(Instruction);
+}
+
+void MCStreamer::setCurrentW64UnwindInfo(MCWin64EHUnwindInfo *Frame) {
+  W64UnwindInfos.push_back(Frame);
+  CurrentW64UnwindInfo = W64UnwindInfos.back();
+}
+
+void MCStreamer::EnsureValidW64UnwindInfo() {
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (!CurFrame || CurFrame->End)
+    report_fatal_error("No open Win64 EH frame function!");
+}
+
+void MCStreamer::EmitWin64EHStartProc(const MCSymbol *Symbol) {
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (CurFrame && !CurFrame->End)
+    report_fatal_error("Starting a function before ending the previous one!");
+  MCWin64EHUnwindInfo *Frame = new MCWin64EHUnwindInfo;
+  Frame->Begin = getContext().CreateTempSymbol();
+  Frame->Function = Symbol;
+  EmitLabel(Frame->Begin);
+  setCurrentW64UnwindInfo(Frame);
+}
+
+void MCStreamer::EmitWin64EHEndProc() {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (CurFrame->ChainedParent)
+    report_fatal_error("Not all chained regions terminated!");
+  CurFrame->End = getContext().CreateTempSymbol();
+  EmitLabel(CurFrame->End);
+}
+
+void MCStreamer::EmitWin64EHStartChained() {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *Frame = new MCWin64EHUnwindInfo;
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  Frame->Begin = getContext().CreateTempSymbol();
+  Frame->Function = CurFrame->Function;
+  Frame->ChainedParent = CurFrame;
+  EmitLabel(Frame->Begin);
+  setCurrentW64UnwindInfo(Frame);
+}
+
+void MCStreamer::EmitWin64EHEndChained() {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (!CurFrame->ChainedParent)
+    report_fatal_error("End of a chained region outside a chained region!");
+  CurFrame->End = getContext().CreateTempSymbol();
+  EmitLabel(CurFrame->End);
+  CurrentW64UnwindInfo = CurFrame->ChainedParent;
+}
+
+void MCStreamer::EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind,
+                                    bool Except) {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (CurFrame->ChainedParent)
+    report_fatal_error("Chained unwind areas can't have handlers!");
+  CurFrame->ExceptionHandler = Sym;
+  if (!Except && !Unwind)
+    report_fatal_error("Don't know what kind of handler this is!");
+  if (Unwind)
+    CurFrame->HandlesUnwind = true;
+  if (Except)
+    CurFrame->HandlesExceptions = true;
+}
+
+void MCStreamer::EmitWin64EHHandlerData() {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (CurFrame->ChainedParent)
+    report_fatal_error("Chained unwind areas can't have handlers!");
+}
+
+void MCStreamer::EmitWin64EHPushReg(unsigned Register) {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  MCWin64EHInstruction Inst(Win64EH::UOP_PushNonVol, Label, Register);
+  EmitLabel(Label);
+  CurFrame->Instructions.push_back(Inst);
+}
+
+void MCStreamer::EmitWin64EHSetFrame(unsigned Register, unsigned Offset) {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (CurFrame->LastFrameInst >= 0)
+    report_fatal_error("Frame register and offset already specified!");
+  if (Offset & 0x0F)
+    report_fatal_error("Misaligned frame pointer offset!");
+  MCWin64EHInstruction Inst(Win64EH::UOP_SetFPReg, NULL, Register, Offset);
+  CurFrame->LastFrameInst = CurFrame->Instructions.size();
+  CurFrame->Instructions.push_back(Inst);
+}
+
+void MCStreamer::EmitWin64EHAllocStack(unsigned Size) {
+  EnsureValidW64UnwindInfo();
+  if (Size & 7)
+    report_fatal_error("Misaligned stack allocation!");
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  MCWin64EHInstruction Inst(Label, Size);
+  EmitLabel(Label);
+  CurFrame->Instructions.push_back(Inst);
+}
+
+void MCStreamer::EmitWin64EHSaveReg(unsigned Register, unsigned Offset) {
+  EnsureValidW64UnwindInfo();
+  if (Offset & 7)
+    report_fatal_error("Misaligned saved register offset!");
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  MCWin64EHInstruction Inst(
+     Offset > 512*1024-8 ? Win64EH::UOP_SaveNonVolBig : Win64EH::UOP_SaveNonVol,
+                            Label, Register, Offset);
+  EmitLabel(Label);
+  CurFrame->Instructions.push_back(Inst);
+}
+
+void MCStreamer::EmitWin64EHSaveXMM(unsigned Register, unsigned Offset) {
+  EnsureValidW64UnwindInfo();
+  if (Offset & 0x0F)
+    report_fatal_error("Misaligned saved vector register offset!");
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  MCWin64EHInstruction Inst(
+    Offset > 512*1024-16 ? Win64EH::UOP_SaveXMM128Big : Win64EH::UOP_SaveXMM128,
+                            Label, Register, Offset);
+  EmitLabel(Label);
+  CurFrame->Instructions.push_back(Inst);
+}
+
+void MCStreamer::EmitWin64EHPushFrame(bool Code) {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (CurFrame->Instructions.size() > 0)
+    report_fatal_error("If present, PushMachFrame must be the first UOP");
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  MCWin64EHInstruction Inst(Win64EH::UOP_PushMachFrame, Label, Code);
+  EmitLabel(Label);
+  CurFrame->Instructions.push_back(Inst);
+}
+
+void MCStreamer::EmitWin64EHEndProlog() {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  CurFrame->PrologEnd = getContext().CreateTempSymbol();
+  EmitLabel(CurFrame->PrologEnd);
 }
 
 void MCStreamer::EmitFnStart() {
@@ -313,3 +515,21 @@ void MCStreamer::EmitRawText(const Twine &T) {
   T.toVector(Str);
   EmitRawText(Str.str());
 }
+
+void MCStreamer::EmitFrames(bool usingCFI) {
+  if (!getNumFrameInfos())
+    return;
+
+  if (EmitEHFrame)
+    MCDwarfFrameEmitter::Emit(*this, usingCFI, true);
+
+  if (EmitDebugFrame)
+    MCDwarfFrameEmitter::Emit(*this, usingCFI, false);
+}
+
+void MCStreamer::EmitW64Tables() {
+  if (!getNumW64UnwindInfos())
+    return;
+
+  MCWin64EHUnwindEmitter::Emit(*this);
+}
diff --git a/lib/MC/MCSymbol.cpp b/lib/MC/MCSymbol.cpp
index 1c71f26..c2fad167 100644
--- a/lib/MC/MCSymbol.cpp
+++ b/lib/MC/MCSymbol.cpp
@@ -58,9 +58,13 @@ void MCSymbol::setVariableValue(const MCExpr *Value) {
          "Invalid redefinition!");
   this->Value = Value;
 
-  // Mark the variable as absolute as appropriate.
-  if (isa<MCConstantExpr>(Value))
-    setAbsolute();
+  // Variables should always be marked as in the same "section" as the value.
+  const MCSection *Section = Value->FindAssociatedSection();
+  if (Section) {
+    setSection(*Section);
+  } else {
+    setUndefined();
+  }
 }
 
 void MCSymbol::print(raw_ostream &OS) const {
diff --git a/lib/MC/MCWin64EH.cpp b/lib/MC/MCWin64EH.cpp
new file mode 100644
index 0000000..9453f5c
--- /dev/null
+++ b/lib/MC/MCWin64EH.cpp
@@ -0,0 +1,258 @@
+//===- lib/MC/MCWin64EH.cpp - MCWin64EH implementation --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCWin64EH.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Target/TargetAsmInfo.h"
+
+namespace llvm {
+
+// NOTE: All relocations generated here are 4-byte image-relative.
+
+static uint8_t CountOfUnwindCodes(std::vector<MCWin64EHInstruction> &instArray){
+  uint8_t count = 0;
+  for (std::vector<MCWin64EHInstruction>::const_iterator I = instArray.begin(),
+       E = instArray.end(); I != E; ++I) {
+    switch (I->getOperation()) {
+    case Win64EH::UOP_PushNonVol:
+    case Win64EH::UOP_AllocSmall:
+    case Win64EH::UOP_SetFPReg:
+    case Win64EH::UOP_PushMachFrame:
+      count += 1;
+      break;
+    case Win64EH::UOP_SaveNonVol:
+    case Win64EH::UOP_SaveXMM128:
+      count += 2;
+      break;
+    case Win64EH::UOP_SaveNonVolBig:
+    case Win64EH::UOP_SaveXMM128Big:
+      count += 3;
+      break;
+    case Win64EH::UOP_AllocLarge:
+      if (I->getSize() > 512*1024-8)
+        count += 3;
+      else
+        count += 2;
+      break;
+    }
+  }
+  return count;
+}
+
+static void EmitAbsDifference(MCStreamer &streamer, MCSymbol *lhs,
+                              MCSymbol *rhs) {
+  MCContext &context = streamer.getContext();
+  const MCExpr *diff = MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(
+                                                                  lhs, context),
+                                               MCSymbolRefExpr::Create(
+                                                                  rhs, context),
+                                               context);
+  streamer.EmitAbsValue(diff, 1);
+
+}
+
+static void EmitUnwindCode(MCStreamer &streamer, MCSymbol *begin,
+                           MCWin64EHInstruction &inst) {
+  uint8_t b1, b2;
+  uint16_t w;
+  b2 = (inst.getOperation() & 0x0F);
+  switch (inst.getOperation()) {
+  case Win64EH::UOP_PushNonVol:
+    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    b2 |= (inst.getRegister() & 0x0F) << 4;
+    streamer.EmitIntValue(b2, 1);
+    break;
+  case Win64EH::UOP_AllocLarge:
+    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    if (inst.getSize() > 512*1024-8) {
+      b2 |= 0x10;
+      streamer.EmitIntValue(b2, 1);
+      w = inst.getSize() & 0xFFF8;
+      streamer.EmitIntValue(w, 2);
+      w = inst.getSize() >> 16;
+    } else {
+      streamer.EmitIntValue(b2, 1);
+      w = inst.getSize() >> 3;
+    }
+    streamer.EmitIntValue(w, 2);
+    break;
+  case Win64EH::UOP_AllocSmall:
+    b2 |= (((inst.getSize()-8) >> 3) & 0x0F) << 4;
+    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    streamer.EmitIntValue(b2, 1);
+    break;
+  case Win64EH::UOP_SetFPReg:
+    b1 = inst.getOffset() & 0xF0;
+    streamer.EmitIntValue(b1, 1);
+    streamer.EmitIntValue(b2, 1);
+    break;
+  case Win64EH::UOP_SaveNonVol:
+  case Win64EH::UOP_SaveXMM128:
+    b2 |= (inst.getRegister() & 0x0F) << 4;
+    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    streamer.EmitIntValue(b2, 1);
+    w = inst.getOffset() >> 3;
+    if (inst.getOperation() == Win64EH::UOP_SaveXMM128)
+      w >>= 1;
+    streamer.EmitIntValue(w, 2);
+    break;
+  case Win64EH::UOP_SaveNonVolBig:
+  case Win64EH::UOP_SaveXMM128Big:
+    b2 |= (inst.getRegister() & 0x0F) << 4;
+    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    streamer.EmitIntValue(b2, 1);
+    if (inst.getOperation() == Win64EH::UOP_SaveXMM128Big)
+      w = inst.getOffset() & 0xFFF0;
+    else
+      w = inst.getOffset() & 0xFFF8;
+    streamer.EmitIntValue(w, 2);
+    w = inst.getOffset() >> 16;
+    streamer.EmitIntValue(w, 2);
+    break;
+  case Win64EH::UOP_PushMachFrame:
+    if (inst.isPushCodeFrame())
+      b2 |= 0x10;
+    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    streamer.EmitIntValue(b2, 1);
+    break;
+  }
+}
+
+static void EmitRuntimeFunction(MCStreamer &streamer,
+                                const MCWin64EHUnwindInfo *info) {
+  MCContext &context = streamer.getContext();
+
+  streamer.EmitValueToAlignment(4);
+  streamer.EmitValue(MCSymbolRefExpr::Create(info->Begin, context), 4);
+  streamer.EmitValue(MCSymbolRefExpr::Create(info->End, context), 4);
+  streamer.EmitValue(MCSymbolRefExpr::Create(info->Symbol, context), 4);
+}
+
+static void EmitUnwindInfo(MCStreamer &streamer, MCWin64EHUnwindInfo *info) {
+  // If this UNWIND_INFO already has a symbol, it's already been emitted.
+  if (info->Symbol) return;
+
+  MCContext &context = streamer.getContext();
+  streamer.EmitValueToAlignment(4);
+  // Upper 3 bits are the version number (currently 1).
+  uint8_t flags = 0x01;
+  info->Symbol = context.CreateTempSymbol();
+  streamer.EmitLabel(info->Symbol);
+
+  if (info->ChainedParent)
+    flags |= Win64EH::UNW_ChainInfo << 3;
+  else {
+    if (info->HandlesUnwind)
+      flags |= Win64EH::UNW_TerminateHandler << 3;
+    if (info->HandlesExceptions)
+      flags |= Win64EH::UNW_ExceptionHandler << 3;
+  }
+  streamer.EmitIntValue(flags, 1);
+
+  if (info->PrologEnd)
+    EmitAbsDifference(streamer, info->PrologEnd, info->Begin);
+  else
+    streamer.EmitIntValue(0, 1);
+
+  uint8_t numCodes = CountOfUnwindCodes(info->Instructions);
+  streamer.EmitIntValue(numCodes, 1);
+
+  uint8_t frame = 0;
+  if (info->LastFrameInst >= 0) {
+    MCWin64EHInstruction &frameInst = info->Instructions[info->LastFrameInst];
+    assert(frameInst.getOperation() == Win64EH::UOP_SetFPReg);
+    frame = (frameInst.getRegister() & 0x0F) |
+            (frameInst.getOffset() & 0xF0);
+  }
+  streamer.EmitIntValue(frame, 1);
+
+  // Emit unwind instructions (in reverse order).
+  uint8_t numInst = info->Instructions.size();
+  for (uint8_t c = 0; c < numInst; ++c) {
+    MCWin64EHInstruction inst = info->Instructions.back();
+    info->Instructions.pop_back();
+    EmitUnwindCode(streamer, info->Begin, inst);
+  }
+
+  if (flags & (Win64EH::UNW_ChainInfo << 3))
+    EmitRuntimeFunction(streamer, info->ChainedParent);
+  else if (flags &
+           ((Win64EH::UNW_TerminateHandler|Win64EH::UNW_ExceptionHandler) << 3))
+    streamer.EmitValue(MCSymbolRefExpr::Create(info->ExceptionHandler, context),
+                       4);
+  else if (numCodes < 2) {
+    // The minimum size of an UNWIND_INFO struct is 8 bytes. If we're not
+    // a chained unwind info, if there is no handler, and if there are fewer
+    // than 2 slots used in the unwind code array, we have to pad to 8 bytes.
+    if (numCodes == 1)
+      streamer.EmitIntValue(0, 2);
+    else
+      streamer.EmitIntValue(0, 4);
+  }
+}
+
+StringRef MCWin64EHUnwindEmitter::GetSectionSuffix(const MCSymbol *func) {
+  if (!func || !func->isInSection()) return "";
+  const MCSection *section = &func->getSection();
+  const MCSectionCOFF *COFFSection;
+  if ((COFFSection = dyn_cast<MCSectionCOFF>(section))) {
+    StringRef name = COFFSection->getSectionName();
+    size_t dollar = name.find('$');
+    size_t dot = name.find('.', 1);
+    if (dollar == StringRef::npos && dot == StringRef::npos)
+      return "";
+    if (dot == StringRef::npos)
+      return name.substr(dollar);
+    if (dollar == StringRef::npos || dot < dollar)
+      return name.substr(dot);
+    return name.substr(dollar);
+  }
+  return "";
+}
+
+void MCWin64EHUnwindEmitter::EmitUnwindInfo(MCStreamer &streamer,
+                                            MCWin64EHUnwindInfo *info) {
+  // Switch sections (the static function above is meant to be called from
+  // here and from Emit().
+  MCContext &context = streamer.getContext();
+  const TargetAsmInfo &asmInfo = context.getTargetAsmInfo();
+  const MCSection *xdataSect =
+    asmInfo.getWin64EHTableSection(GetSectionSuffix(info->Function));
+  streamer.SwitchSection(xdataSect);
+
+  llvm::EmitUnwindInfo(streamer, info);
+}
+
+void MCWin64EHUnwindEmitter::Emit(MCStreamer &streamer) {
+  MCContext &context = streamer.getContext();
+  // Emit the unwind info structs first.
+  const TargetAsmInfo &asmInfo = context.getTargetAsmInfo();
+  for (unsigned i = 0; i < streamer.getNumW64UnwindInfos(); ++i) {
+    MCWin64EHUnwindInfo &info = streamer.getW64UnwindInfo(i);
+    const MCSection *xdataSect =
+      asmInfo.getWin64EHTableSection(GetSectionSuffix(info.Function));
+    streamer.SwitchSection(xdataSect);
+    llvm::EmitUnwindInfo(streamer, &info);
+  }
+  // Now emit RUNTIME_FUNCTION entries.
+  for (unsigned i = 0; i < streamer.getNumW64UnwindInfos(); ++i) {
+    MCWin64EHUnwindInfo &info = streamer.getW64UnwindInfo(i);
+    const MCSection *pdataSect =
+      asmInfo.getWin64EHFuncTableSection(GetSectionSuffix(info.Function));
+    streamer.SwitchSection(pdataSect);
+    EmitRuntimeFunction(streamer, &info);
+  }
+}
+
+} // End of namespace llvm
+
diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp
index 105506a..f049b1c 100644
--- a/lib/MC/MachObjectWriter.cpp
+++ b/lib/MC/MachObjectWriter.cpp
@@ -121,6 +121,33 @@ private:
   }
   uint64_t getSymbolAddress(const MCSymbolData* SD,
                             const MCAsmLayout &Layout) const {
+    const MCSymbol &S = SD->getSymbol();
+
+    // If this is a variable, then recursively evaluate now.
+    if (S.isVariable()) {
+      MCValue Target;
+      if (!S.getVariableValue()->EvaluateAsRelocatable(Target, Layout))
+        report_fatal_error("unable to evaluate offset for variable '" +
+                           S.getName() + "'");
+
+      // Verify that any used symbols are defined.
+      if (Target.getSymA() && Target.getSymA()->getSymbol().isUndefined())
+        report_fatal_error("unable to evaluate offset to undefined symbol '" +
+                           Target.getSymA()->getSymbol().getName() + "'");
+      if (Target.getSymB() && Target.getSymB()->getSymbol().isUndefined())
+        report_fatal_error("unable to evaluate offset to undefined symbol '" +
+                           Target.getSymB()->getSymbol().getName() + "'");
+
+      uint64_t Address = Target.getConstant();
+      if (Target.getSymA())
+        Address += getSymbolAddress(&Layout.getAssembler().getSymbolData(
+                                      Target.getSymA()->getSymbol()), Layout);
+      if (Target.getSymB())
+        Address += getSymbolAddress(&Layout.getAssembler().getSymbolData(
+                                      Target.getSymB()->getSymbol()), Layout);
+      return Address;
+    }
+
     return getSectionAddress(SD->getFragment()->getParent()) +
       Layout.getSymbolOffset(SD);
   }
@@ -440,7 +467,7 @@ public:
       // Compensate for the relocation offset, Darwin x86_64 relocations only
       // have the addend and appear to have attempted to define it to be the
       // actual expression addend without the PCrel bias. However, instructions
-      // with data following the relocation are not accomodated for (see comment
+      // with data following the relocation are not accommodated for (see comment
       // below regarding SIGNED{1,2,4}), so it isn't exactly that either.
       Value += 1LL << Log2Size;
     }
@@ -541,7 +568,7 @@ public:
       }
 
       // x86_64 almost always uses external relocations, except when there is no
-      // symbol to use as a base address (a local symbol with no preceeding
+      // symbol to use as a base address (a local symbol with no preceding
       // non-local symbol).
       if (Base) {
         Index = Base->getIndex();
@@ -550,7 +577,7 @@ public:
         // Add the local offset, if needed.
         if (Base != &SD)
           Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base);
-      } else if (Symbol->isInSection()) {
+      } else if (Symbol->isInSection() && !Symbol->isVariable()) {
         // The index is the section ordinal (1-based).
         Index = SD.getFragment()->getParent()->getOrdinal() + 1;
         IsExtern = 0;
@@ -1028,17 +1055,17 @@ public:
       // FIXME!
       report_fatal_error("FIXME: relocations to absolute targets "
                          "not yet implemented");
-    } else if (SD->getSymbol().isVariable()) {
-      int64_t Res;
-      if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute(
-            Res, Layout, SectionAddress)) {
-        FixedValue = Res;
-        return;
+    } else {
+      // Resolve constant variables.
+      if (SD->getSymbol().isVariable()) {
+        int64_t Res;
+        if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute(
+              Res, Layout, SectionAddress)) {
+          FixedValue = Res;
+          return;
+        }
       }
 
-      report_fatal_error("unsupported relocation of variable '" +
-                         SD->getSymbol().getName() + "'");
-    } else {
       // Check whether we need an external or internal relocation.
       if (doesSymbolRequireExternRelocation(SD)) {
         IsExtern = 1;
@@ -1050,8 +1077,10 @@ public:
           FixedValue -= Layout.getSymbolOffset(SD);
       } else {
         // The index is the section ordinal (1-based).
-        Index = SD->getFragment()->getParent()->getOrdinal() + 1;
-        FixedValue += getSectionAddress(SD->getFragment()->getParent());
+        const MCSectionData &SymSD = Asm.getSectionData(
+          SD->getSymbol().getSection());
+        Index = SymSD.getOrdinal() + 1;
+        FixedValue += getSectionAddress(&SymSD);
       }
       if (IsPCRel)
         FixedValue -= getSectionAddress(Fragment->getParent());
@@ -1127,17 +1156,17 @@ public:
       // FIXME: Currently, these are never generated (see code below). I cannot
       // find a case where they are actually emitted.
       Type = macho::RIT_Vanilla;
-    } else if (SD->getSymbol().isVariable()) {
-      int64_t Res;
-      if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute(
-            Res, Layout, SectionAddress)) {
-        FixedValue = Res;
-        return;
+    } else {
+      // Resolve constant variables.
+      if (SD->getSymbol().isVariable()) {
+        int64_t Res;
+        if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute(
+              Res, Layout, SectionAddress)) {
+          FixedValue = Res;
+          return;
+        }
       }
 
-      report_fatal_error("unsupported relocation of variable '" +
-                         SD->getSymbol().getName() + "'");
-    } else {
       // Check whether we need an external or internal relocation.
       if (doesSymbolRequireExternRelocation(SD)) {
         IsExtern = 1;
@@ -1149,8 +1178,10 @@ public:
           FixedValue -= Layout.getSymbolOffset(SD);
       } else {
         // The index is the section ordinal (1-based).
-        Index = SD->getFragment()->getParent()->getOrdinal() + 1;
-        FixedValue += getSectionAddress(SD->getFragment()->getParent());
+        const MCSectionData &SymSD = Asm.getSectionData(
+          SD->getSymbol().getSection());
+        Index = SymSD.getOrdinal() + 1;
+        FixedValue += getSectionAddress(&SymSD);
       }
       if (IsPCRel)
         FixedValue -= getSectionAddress(Fragment->getParent());
diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp
index 6ca5d37..101237a 100644
--- a/lib/MC/WinCOFFObjectWriter.cpp
+++ b/lib/MC/WinCOFFObjectWriter.cpp
@@ -647,22 +647,27 @@ void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm,
 
   COFFSection *coff_section = SectionMap[&SectionData->getSection()];
   COFFSymbol *coff_symbol = SymbolMap[&A_SD.getSymbol()];
+  const MCSymbolRefExpr *SymA = Target.getSymA();
+  const MCSymbolRefExpr *SymB = Target.getSymB();
+  const bool CrossSection = SymB &&
+    &SymA->getSymbol().getSection() != &SymB->getSymbol().getSection();
 
   if (Target.getSymB()) {
-    if (&Target.getSymA()->getSymbol().getSection()
-     != &Target.getSymB()->getSymbol().getSection()) {
-      llvm_unreachable("Symbol relative relocations are only allowed between "
-                       "symbols in the same section");
-    }
     const MCSymbol *B = &Target.getSymB()->getSymbol();
     MCSymbolData &B_SD = Asm.getSymbolData(*B);
 
-    FixedValue = Layout.getSymbolOffset(&A_SD) - Layout.getSymbolOffset(&B_SD);
+    // Offset of the symbol in the section
+    int64_t a = Layout.getSymbolOffset(&B_SD);
+
+    // Ofeset of the relocation in the section
+    int64_t b = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
 
+    FixedValue = b - a;
     // In the case where we have SymbA and SymB, we just need to store the delta
     // between the two symbols.  Update FixedValue to account for the delta, and
     // skip recording the relocation.
-    return;
+    if (!CrossSection)
+      return;
   } else {
     FixedValue = Target.getConstant();
   }
@@ -673,7 +678,7 @@ void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm,
   Reloc.Data.VirtualAddress = Layout.getFragmentOffset(Fragment);
 
   // Turn relocations for temporary symbols into section relocations.
-  if (coff_symbol->MCData->getSymbol().isTemporary()) {
+  if (coff_symbol->MCData->getSymbol().isTemporary() || CrossSection) {
     Reloc.Symb = coff_symbol->Section->Symbol;
     FixedValue += Layout.getFragmentOffset(coff_symbol->MCData->Fragment)
                 + coff_symbol->MCData->getOffset();
@@ -684,7 +689,12 @@ void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm,
 
   Reloc.Data.VirtualAddress += Fixup.getOffset();
 
-  switch ((unsigned)Fixup.getKind()) {
+  unsigned FixupKind = Fixup.getKind();
+
+  if (CrossSection)
+    FixupKind = FK_PCRel_4;
+
+  switch (FixupKind) {
   case FK_PCRel_4:
   case X86::reloc_riprel_4byte:
   case X86::reloc_riprel_4byte_movq_load:
diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp
index 46968e6..6c36c12 100644
--- a/lib/MC/WinCOFFStreamer.cpp
+++ b/lib/MC/WinCOFFStreamer.cpp
@@ -23,6 +23,7 @@
 #include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCWin64EH.h"
 #include "llvm/Target/TargetRegistry.h"
 #include "llvm/Target/TargetAsmBackend.h"
 #include "llvm/ADT/StringMap.h"
@@ -74,6 +75,7 @@ public:
                                  unsigned MaxBytesToEmit);
   virtual void EmitFileDirective(StringRef Filename);
   virtual void EmitInstruction(const MCInst &Instruction);
+  virtual void EmitWin64EHHandlerData();
   virtual void Finish();
 
 private:
@@ -377,7 +379,16 @@ void WinCOFFStreamer::EmitInstruction(const MCInst &Instruction) {
                                                 Fragment->getFixups());
 }
 
+void WinCOFFStreamer::EmitWin64EHHandlerData() {
+  MCStreamer::EmitWin64EHHandlerData();
+
+  // We have to emit the unwind info now, because this directive
+  // actually switches to the .xdata section!
+  MCWin64EHUnwindEmitter::EmitUnwindInfo(*this, getCurrentW64UnwindInfo());
+}
+
 void WinCOFFStreamer::Finish() {
+  EmitW64Tables();
   MCObjectStreamer::Finish();
 }
 
diff --git a/lib/Object/CMakeLists.txt b/lib/Object/CMakeLists.txt
index f28d2ec..703d385 100644
--- a/lib/Object/CMakeLists.txt
+++ b/lib/Object/CMakeLists.txt
@@ -1,7 +1,8 @@
 add_llvm_library(LLVMObject
+  COFFObjectFile.cpp
+  ELFObjectFile.cpp
   MachOObject.cpp
+  MachOObjectFile.cpp
   Object.cpp
   ObjectFile.cpp
-  COFFObjectFile.cpp
-  ELFObjectFile.cpp
   )
diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
index 30709f1..86bf44b 100644
--- a/lib/Object/COFFObjectFile.cpp
+++ b/lib/Object/COFFObjectFile.cpp
@@ -186,11 +186,8 @@ char COFFObjectFile::getSymbolNMTypeChar(DataRefImpl Symb) const {
     return ret;
 
   uint32_t Characteristics = 0;
-  uint32_t PointerToRawData = 0;
-  const coff_section *Section = getSection(symb->SectionNumber);
-  if (Section) {
+  if (const coff_section *Section = getSection(symb->SectionNumber)) {
     Characteristics = Section->Characteristics;
-    PointerToRawData = Section->PointerToRawData;
   }
 
   switch (symb->SectionNumber) {
diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp
new file mode 100644
index 0000000..877cbfb
--- /dev/null
+++ b/lib/Object/MachOObjectFile.cpp
@@ -0,0 +1,327 @@
+//===- MachOObjectFile.cpp - Mach-O object file binding ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MachOObjectFile class, which binds the MachOObject
+// class to the generic ObjectFile wrapper.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/Object/MachOFormat.h"
+#include "llvm/Object/MachOObject.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/MachO.h"
+
+#include <cctype>
+#include <cstring>
+#include <limits>
+
+using namespace llvm;
+using namespace object;
+
+namespace llvm {
+
+typedef MachOObject::LoadCommandInfo LoadCommandInfo;
+
+class MachOObjectFile : public ObjectFile {
+public:
+  MachOObjectFile(MemoryBuffer *Object, MachOObject *MOO)
+    : ObjectFile(Object),
+      MachOObj(MOO),
+      RegisteredStringTable(std::numeric_limits<uint32_t>::max()) {}
+
+  virtual symbol_iterator begin_symbols() const;
+  virtual symbol_iterator end_symbols() const;
+  virtual section_iterator begin_sections() const;
+  virtual section_iterator end_sections() const;
+
+  virtual uint8_t getBytesInAddress() const;
+  virtual StringRef getFileFormatName() const;
+  virtual unsigned getArch() const;
+
+protected:
+  virtual SymbolRef getSymbolNext(DataRefImpl Symb) const;
+  virtual StringRef getSymbolName(DataRefImpl Symb) const;
+  virtual uint64_t  getSymbolAddress(DataRefImpl Symb) const;
+  virtual uint64_t  getSymbolSize(DataRefImpl Symb) const;
+  virtual char      getSymbolNMTypeChar(DataRefImpl Symb) const;
+  virtual bool      isSymbolInternal(DataRefImpl Symb) const;
+
+  virtual SectionRef getSectionNext(DataRefImpl Sec) const;
+  virtual StringRef  getSectionName(DataRefImpl Sec) const;
+  virtual uint64_t   getSectionAddress(DataRefImpl Sec) const;
+  virtual uint64_t   getSectionSize(DataRefImpl Sec) const;
+  virtual StringRef  getSectionContents(DataRefImpl Sec) const;
+  virtual bool       isSectionText(DataRefImpl Sec) const;
+
+private:
+  MachOObject *MachOObj;
+  mutable uint32_t RegisteredStringTable;
+
+  void moveToNextSection(DataRefImpl &DRI) const;
+  void getSymbolTableEntry(DataRefImpl DRI,
+                           InMemoryStruct<macho::SymbolTableEntry> &Res) const;
+  void moveToNextSymbol(DataRefImpl &DRI) const;
+  void getSection(DataRefImpl DRI, InMemoryStruct<macho::Section> &Res) const;
+};
+
+ObjectFile *ObjectFile::createMachOObjectFile(MemoryBuffer *Buffer) {
+  std::string Err;
+  MachOObject *MachOObj = MachOObject::LoadFromBuffer(Buffer, &Err);
+  if (!MachOObj)
+    return NULL;
+  return new MachOObjectFile(Buffer, MachOObj);
+}
+
+/*===-- Symbols -----------------------------------------------------------===*/
+
+void MachOObjectFile::moveToNextSymbol(DataRefImpl &DRI) const {
+  uint32_t LoadCommandCount = MachOObj->getHeader().NumLoadCommands;
+  while (DRI.d.a < LoadCommandCount) {
+    LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a);
+    if (LCI.Command.Type == macho::LCT_Symtab) {
+      InMemoryStruct<macho::SymtabLoadCommand> SymtabLoadCmd;
+      MachOObj->ReadSymtabLoadCommand(LCI, SymtabLoadCmd);
+      if (DRI.d.b < SymtabLoadCmd->NumSymbolTableEntries)
+        return;
+    }
+
+    DRI.d.a++;
+    DRI.d.b = 0;
+  }
+}
+
+void MachOObjectFile::getSymbolTableEntry(DataRefImpl DRI,
+    InMemoryStruct<macho::SymbolTableEntry> &Res) const {
+  InMemoryStruct<macho::SymtabLoadCommand> SymtabLoadCmd;
+  LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a);
+  MachOObj->ReadSymtabLoadCommand(LCI, SymtabLoadCmd);
+
+  if (RegisteredStringTable != DRI.d.a) {
+    MachOObj->RegisterStringTable(*SymtabLoadCmd);
+    RegisteredStringTable = DRI.d.a;
+  }
+
+  MachOObj->ReadSymbolTableEntry(SymtabLoadCmd->SymbolTableOffset, DRI.d.b,
+                                 Res);
+}
+
+
+SymbolRef MachOObjectFile::getSymbolNext(DataRefImpl DRI) const {
+  DRI.d.b++;
+  moveToNextSymbol(DRI);
+  return SymbolRef(DRI, this);
+}
+
+StringRef MachOObjectFile::getSymbolName(DataRefImpl DRI) const {
+  InMemoryStruct<macho::SymbolTableEntry> Entry;
+  getSymbolTableEntry(DRI, Entry);
+  return MachOObj->getStringAtIndex(Entry->StringIndex);
+}
+
+uint64_t MachOObjectFile::getSymbolAddress(DataRefImpl DRI) const {
+  InMemoryStruct<macho::SymbolTableEntry> Entry;
+  getSymbolTableEntry(DRI, Entry);
+  return Entry->Value;
+}
+
+uint64_t MachOObjectFile::getSymbolSize(DataRefImpl DRI) const {
+  return UnknownAddressOrSize;
+}
+
+char MachOObjectFile::getSymbolNMTypeChar(DataRefImpl DRI) const {
+  InMemoryStruct<macho::SymbolTableEntry> Entry;
+  getSymbolTableEntry(DRI, Entry);
+
+  char Char;
+  switch (Entry->Type & macho::STF_TypeMask) {
+    case macho::STT_Undefined:
+      Char = 'u';
+      break;
+    case macho::STT_Absolute:
+    case macho::STT_Section:
+      Char = 's';
+      break;
+    default:
+      Char = '?';
+      break;
+  }
+
+  if (Entry->Flags & (macho::STF_External | macho::STF_PrivateExtern))
+    Char = toupper(Char);
+  return Char;
+}
+
+bool MachOObjectFile::isSymbolInternal(DataRefImpl DRI) const {
+  InMemoryStruct<macho::SymbolTableEntry> Entry;
+  getSymbolTableEntry(DRI, Entry);
+  return Entry->Flags & macho::STF_StabsEntryMask;
+}
+
+ObjectFile::symbol_iterator MachOObjectFile::begin_symbols() const {
+  // DRI.d.a = segment number; DRI.d.b = symbol index.
+  DataRefImpl DRI;
+  DRI.d.a = DRI.d.b = 0;
+  moveToNextSymbol(DRI);
+  return symbol_iterator(SymbolRef(DRI, this));
+}
+
+ObjectFile::symbol_iterator MachOObjectFile::end_symbols() const {
+  DataRefImpl DRI;
+  DRI.d.a = MachOObj->getHeader().NumLoadCommands;
+  DRI.d.b = 0;
+  return symbol_iterator(SymbolRef(DRI, this));
+}
+
+
+/*===-- Sections ----------------------------------------------------------===*/
+
+void MachOObjectFile::moveToNextSection(DataRefImpl &DRI) const {
+  uint32_t LoadCommandCount = MachOObj->getHeader().NumLoadCommands;
+  while (DRI.d.a < LoadCommandCount) {
+    LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a);
+    if (LCI.Command.Type == macho::LCT_Segment) {
+      InMemoryStruct<macho::SegmentLoadCommand> SegmentLoadCmd;
+      MachOObj->ReadSegmentLoadCommand(LCI, SegmentLoadCmd);
+      if (DRI.d.b < SegmentLoadCmd->NumSections)
+        return;
+    } else if (LCI.Command.Type == macho::LCT_Segment64) {
+      InMemoryStruct<macho::Segment64LoadCommand> Segment64LoadCmd;
+      MachOObj->ReadSegment64LoadCommand(LCI, Segment64LoadCmd);
+      if (DRI.d.b < Segment64LoadCmd->NumSections)
+        return;
+    }
+
+    DRI.d.a++;
+    DRI.d.b = 0;
+  }
+}
+
+SectionRef MachOObjectFile::getSectionNext(DataRefImpl DRI) const {
+  DRI.d.b++;
+  moveToNextSection(DRI);
+  return SectionRef(DRI, this);
+}
+
+void
+MachOObjectFile::getSection(DataRefImpl DRI,
+                            InMemoryStruct<macho::Section> &Res) const {
+  InMemoryStruct<macho::SegmentLoadCommand> SLC;
+  LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a);
+  MachOObj->ReadSegmentLoadCommand(LCI, SLC);
+  MachOObj->ReadSection(LCI, DRI.d.b, Res);
+}
+
+StringRef MachOObjectFile::getSectionName(DataRefImpl DRI) const {
+  InMemoryStruct<macho::SegmentLoadCommand> SLC;
+  LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a);
+  MachOObj->ReadSegmentLoadCommand(LCI, SLC);
+  InMemoryStruct<macho::Section> Sect;
+  MachOObj->ReadSection(LCI, DRI.d.b, Sect);
+
+  static char Result[34];
+  strcpy(Result, SLC->Name);
+  strcat(Result, ",");
+  strcat(Result, Sect->Name);
+  return StringRef(Result);
+}
+
+uint64_t MachOObjectFile::getSectionAddress(DataRefImpl DRI) const {
+  InMemoryStruct<macho::Section> Sect;
+  getSection(DRI, Sect);
+  return Sect->Address;
+}
+
+uint64_t MachOObjectFile::getSectionSize(DataRefImpl DRI) const {
+  InMemoryStruct<macho::Section> Sect;
+  getSection(DRI, Sect);
+  return Sect->Size;
+}
+
+StringRef MachOObjectFile::getSectionContents(DataRefImpl DRI) const {
+  InMemoryStruct<macho::Section> Sect;
+  getSection(DRI, Sect);
+  return MachOObj->getData(Sect->Offset, Sect->Size);
+}
+
+bool MachOObjectFile::isSectionText(DataRefImpl DRI) const {
+  InMemoryStruct<macho::SegmentLoadCommand> SLC;
+  LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a);
+  MachOObj->ReadSegmentLoadCommand(LCI, SLC);
+  return !strcmp(SLC->Name, "__TEXT");
+}
+
+ObjectFile::section_iterator MachOObjectFile::begin_sections() const {
+  DataRefImpl DRI;
+  DRI.d.a = DRI.d.b = 0;
+  moveToNextSection(DRI);
+  return section_iterator(SectionRef(DRI, this));
+}
+
+ObjectFile::section_iterator MachOObjectFile::end_sections() const {
+  DataRefImpl DRI;
+  DRI.d.a = MachOObj->getHeader().NumLoadCommands;
+  DRI.d.b = 0;
+  return section_iterator(SectionRef(DRI, this));
+}
+
+/*===-- Miscellaneous -----------------------------------------------------===*/
+
+uint8_t MachOObjectFile::getBytesInAddress() const {
+  return MachOObj->is64Bit() ? 8 : 4;
+}
+
+StringRef MachOObjectFile::getFileFormatName() const {
+  if (!MachOObj->is64Bit()) {
+    switch (MachOObj->getHeader().CPUType) {
+    case llvm::MachO::CPUTypeI386:
+      return "Mach-O 32-bit i386";
+    case llvm::MachO::CPUTypeARM:
+      return "Mach-O arm";
+    case llvm::MachO::CPUTypePowerPC:
+      return "Mach-O 32-bit ppc";
+    default:
+      assert((MachOObj->getHeader().CPUType & llvm::MachO::CPUArchABI64) == 0 &&
+             "64-bit object file when we're not 64-bit?");
+      return "Mach-O 32-bit unknown";
+    }
+  }
+
+  switch (MachOObj->getHeader().CPUType) {
+  case llvm::MachO::CPUTypeX86_64:
+    return "Mach-O 64-bit x86-64";
+  case llvm::MachO::CPUTypePowerPC64:
+    return "Mach-O 64-bit ppc64";
+  default:
+    assert((MachOObj->getHeader().CPUType & llvm::MachO::CPUArchABI64) == 1 &&
+           "32-bit object file when we're 64-bit?");
+    return "Mach-O 64-bit unknown";
+  }
+}
+
+unsigned MachOObjectFile::getArch() const {
+  switch (MachOObj->getHeader().CPUType) {
+  case llvm::MachO::CPUTypeI386:
+    return Triple::x86;
+  case llvm::MachO::CPUTypeX86_64:
+    return Triple::x86_64;
+  case llvm::MachO::CPUTypeARM:
+    return Triple::arm;
+  case llvm::MachO::CPUTypePowerPC:
+    return Triple::ppc;
+  case llvm::MachO::CPUTypePowerPC64:
+    return Triple::ppc64;
+  default:
+    return Triple::UnknownArch;
+  }
+}
+
+} // end namespace llvm
+
diff --git a/lib/Object/ObjectFile.cpp b/lib/Object/ObjectFile.cpp
index 161ae3a..47b6311 100644
--- a/lib/Object/ObjectFile.cpp
+++ b/lib/Object/ObjectFile.cpp
@@ -55,7 +55,7 @@ ObjectFile *ObjectFile::createObjectFile(MemoryBuffer *Object) {
     case sys::Mach_O_DynamicLinker_FileType:
     case sys::Mach_O_Bundle_FileType:
     case sys::Mach_O_DynamicallyLinkedSharedLibStub_FileType:
-      return 0;
+      return createMachOObjectFile(Object);
     case sys::COFF_FileType:
       return createCOFFObjectFile(Object);
     default:
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index 3a63258..c3169ac 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -3564,7 +3564,7 @@ void APFloat::toString(SmallVectorImpl<char> &Str,
 }
 
 bool APFloat::getExactInverse(APFloat *inv) const {
-  // We can only guarantee the existance of an exact inverse for IEEE floats.
+  // We can only guarantee the existence of an exact inverse for IEEE floats.
   if (semantics != &IEEEhalf && semantics != &IEEEsingle &&
       semantics != &IEEEdouble && semantics != &IEEEquad)
     return false;
diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp
index 5789721..76265d4 100644
--- a/lib/Support/APInt.cpp
+++ b/lib/Support/APInt.cpp
@@ -1375,7 +1375,7 @@ APInt APInt::sqrt() const {
                  uint64_t(::round(::sqrt(double(isSingleWord()?VAL:pVal[0])))));
 #else
     return APInt(BitWidth,
-                 uint64_t(::sqrt(double(isSingleWord()?VAL:pVal[0]))) + 0.5);
+                 uint64_t(::sqrt(double(isSingleWord()?VAL:pVal[0])) + 0.5));
 #endif
   }
 
@@ -1518,7 +1518,7 @@ APInt::ms APInt::magic() const {
 /// Requires that the divisor not be 0.  Taken from "Hacker's Delight", Henry
 /// S. Warren, Jr., chapter 10.
 /// LeadingZeros can be used to simplify the calculation if the upper bits
-/// of the devided value are known zero.
+/// of the divided value are known zero.
 APInt::mu APInt::magicu(unsigned LeadingZeros) const {
   const APInt& d = *this;
   unsigned p;
@@ -2164,12 +2164,33 @@ void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
 }
 
 void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
-                     bool Signed) const {
+                     bool Signed, bool formatAsCLiteral) const {
   assert((Radix == 10 || Radix == 8 || Radix == 16 || Radix == 2) &&
          "Radix should be 2, 8, 10, or 16!");
 
+  const char *Prefix = "";
+  if (formatAsCLiteral) {
+    switch (Radix) {
+      case 2:
+        // Binary literals are a non-standard extension added in gcc 4.3:
+        // http://gcc.gnu.org/onlinedocs/gcc-4.3.0/gcc/Binary-constants.html
+        Prefix = "0b";
+        break;
+      case 8:
+        Prefix = "0";
+        break;
+      case 16:
+        Prefix = "0x";
+        break;
+    }
+  }
+
   // First, check for a zero value and just short circuit the logic below.
   if (*this == 0) {
+    while (*Prefix) {
+      Str.push_back(*Prefix);
+      ++Prefix;
+    };
     Str.push_back('0');
     return;
   }
@@ -2193,6 +2214,11 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
       }
     }
 
+    while (*Prefix) {
+      Str.push_back(*Prefix);
+      ++Prefix;
+    };
+
     while (N) {
       *--BufPtr = Digits[N % Radix];
       N /= Radix;
@@ -2212,6 +2238,11 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
     Str.push_back('-');
   }
 
+  while (*Prefix) {
+    Str.push_back(*Prefix);
+    ++Prefix;
+  };
+
   // We insert the digits backward, then reverse them to get the right order.
   unsigned StartDig = Str.size();
 
@@ -2251,7 +2282,7 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
 /// to the methods above.
 std::string APInt::toString(unsigned Radix = 10, bool Signed = true) const {
   SmallString<40> S;
-  toString(S, Radix, Signed);
+  toString(S, Radix, Signed, /* formatAsCLiteral = */false);
   return S.str();
 }
 
@@ -2266,7 +2297,7 @@ void APInt::dump() const {
 
 void APInt::print(raw_ostream &OS, bool isSigned) const {
   SmallString<40> S;
-  this->toString(S, 10, isSigned);
+  this->toString(S, 10, isSigned, /* formatAsCLiteral = */false);
   OS << S.str();
 }
 
diff --git a/lib/Support/Allocator.cpp b/lib/Support/Allocator.cpp
index 5e27df6..215b0f2 100644
--- a/lib/Support/Allocator.cpp
+++ b/lib/Support/Allocator.cpp
@@ -136,6 +136,14 @@ unsigned BumpPtrAllocator::GetNumSlabs() const {
   return NumSlabs;
 }
 
+size_t BumpPtrAllocator::getTotalMemory() const {
+  size_t TotalMemory = 0;
+  for (MemSlab *Slab = CurSlab; Slab != 0; Slab = Slab->NextPtr) {
+    TotalMemory += Slab->Size;
+  }
+  return TotalMemory;
+}
+  
 void BumpPtrAllocator::PrintStats() const {
   unsigned NumSlabs = 0;
   size_t TotalMemory = 0;
diff --git a/lib/Support/BranchProbability.cpp b/lib/Support/BranchProbability.cpp
new file mode 100644
index 0000000..97342da
--- /dev/null
+++ b/lib/Support/BranchProbability.cpp
@@ -0,0 +1,44 @@
+//===-------------- lib/Support/BranchProbability.cpp -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Branch Probability class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+BranchProbability::BranchProbability(uint32_t n, uint32_t d) {
+  assert(d > 0 && "Denomiator cannot be 0!");
+  assert(n <= d && "Probability cannot be bigger than 1!");
+  N = n;
+  D = d;
+}
+
+raw_ostream &BranchProbability::print(raw_ostream &OS) const {
+  OS << N << " / " << D << " = " << ((double)N / D);
+  return OS;
+}
+
+void BranchProbability::dump() const {
+  print(dbgs());
+  dbgs() << "\n";
+}
+
+namespace llvm {
+
+raw_ostream &operator<<(raw_ostream &OS, const BranchProbability &Prob) {
+  Prob.print(OS);
+  return OS;
+}
+
+}
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index a0e997d..867d930 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -9,6 +9,7 @@ add_llvm_library(LLVMSupport
   APInt.cpp
   APSInt.cpp
   Allocator.cpp
+  BranchProbability.cpp
   circular_raw_ostream.cpp
   CommandLine.cpp
   ConstantRange.cpp
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp
index a1f2fce..2914337 100644
--- a/lib/Support/CommandLine.cpp
+++ b/lib/Support/CommandLine.cpp
@@ -186,12 +186,14 @@ static Option *LookupOption(StringRef &Arg, StringRef &Value,
 /// have already been stripped.
 static Option *LookupNearestOption(StringRef Arg,
                                    const StringMap<Option*> &OptionsMap,
-                                   const char *&NearestString) {
+                                   std::string &NearestString) {
   // Reject all dashes.
   if (Arg.empty()) return 0;
 
   // Split on any equal sign.
-  StringRef LHS = Arg.split('=').first;
+  std::pair<StringRef, StringRef> SplitArg = Arg.split('=');
+  StringRef &LHS = SplitArg.first;  // LHS == Arg when no '=' is present.
+  StringRef &RHS = SplitArg.second;
 
   // Find the closest match.
   Option *Best = 0;
@@ -204,14 +206,19 @@ static Option *LookupNearestOption(StringRef Arg,
     if (O->ArgStr[0])
       OptionNames.push_back(O->ArgStr);
 
+    bool PermitValue = O->getValueExpectedFlag() != cl::ValueDisallowed;
+    StringRef Flag = PermitValue ? LHS : Arg;
     for (size_t i = 0, e = OptionNames.size(); i != e; ++i) {
       StringRef Name = OptionNames[i];
       unsigned Distance = StringRef(Name).edit_distance(
-        Arg, /*AllowReplacements=*/true, /*MaxEditDistance=*/BestDistance);
+        Flag, /*AllowReplacements=*/true, /*MaxEditDistance=*/BestDistance);
       if (!Best || Distance < BestDistance) {
         Best = O;
-        NearestString = OptionNames[i];
         BestDistance = Distance;
+	if (RHS.empty() || !PermitValue)
+	  NearestString = OptionNames[i];
+	else
+	  NearestString = std::string(OptionNames[i]) + "=" + RHS.str();
       }
     }
   }
@@ -611,7 +618,7 @@ void cl::ParseCommandLineOptions(int argc, char **argv,
   for (int i = 1; i < argc; ++i) {
     Option *Handler = 0;
     Option *NearestHandler = 0;
-    const char *NearestHandlerString = 0;
+    std::string NearestHandlerString;
     StringRef Value;
     StringRef ArgName = "";
 
@@ -904,8 +911,8 @@ size_t alias::getOptionWidth() const {
 // Print out the option for the alias.
 void alias::printOptionInfo(size_t GlobalWidth) const {
   size_t L = std::strlen(ArgStr);
-  errs() << "  -" << ArgStr;
-  errs().indent(GlobalWidth-L-6) << " - " << HelpStr << "\n";
+  outs() << "  -" << ArgStr;
+  outs().indent(GlobalWidth-L-6) << " - " << HelpStr << "\n";
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Support/Dwarf.cpp b/lib/Support/Dwarf.cpp
index 9799ef5..0813321 100644
--- a/lib/Support/Dwarf.cpp
+++ b/lib/Support/Dwarf.cpp
@@ -203,6 +203,11 @@ const char *llvm::dwarf::AttributeString(unsigned Attribute) {
   case DW_AT_APPLE_major_runtime_vers:   return "DW_AT_APPLE_major_runtime_vers";
   case DW_AT_APPLE_runtime_class:        return "DW_AT_APPLE_runtime_class";
   case DW_AT_APPLE_omit_frame_ptr:       return "DW_AT_APPLE_omit_frame_ptr";
+  case DW_AT_APPLE_property_name:        return "DW_AT_APPLE_property_name";
+  case DW_AT_APPLE_property_getter:      return "DW_AT_APPLE_property_getter";
+  case DW_AT_APPLE_property_setter:      return "DW_AT_APPLE_property_setter";
+  case DW_AT_APPLE_property_attribute:   return "DW_AT_APPLE_property_attribute";
+  case DW_AT_APPLE_objc_complete_type:   return "DW_AT_APPLE_objc_complete_type";
   }
   return 0;
 }
@@ -391,6 +396,7 @@ const char *llvm::dwarf::OperationEncodingString(unsigned Encoding) {
   case DW_OP_call_ref:                   return "DW_OP_call_ref";
   case DW_OP_form_tls_address:           return "DW_OP_form_tls_address";
   case DW_OP_call_frame_cfa:             return "DW_OP_call_frame_cfa";
+  case DW_OP_bit_piece:                  return "DW_OP_bit_piece";
   case DW_OP_lo_user:                    return "DW_OP_lo_user";
   case DW_OP_hi_user:                    return "DW_OP_hi_user";
   }
diff --git a/lib/Support/ErrorHandling.cpp b/lib/Support/ErrorHandling.cpp
index 3579546..e6cc57d 100644
--- a/lib/Support/ErrorHandling.cpp
+++ b/lib/Support/ErrorHandling.cpp
@@ -32,7 +32,6 @@
 #endif
 
 using namespace llvm;
-using namespace std;
 
 static fatal_error_handler_t ErrorHandler = 0;
 static void *ErrorHandlerUserData = 0;
diff --git a/lib/Support/FileUtilities.cpp b/lib/Support/FileUtilities.cpp
index 5dbabee..4c8c0c6 100644
--- a/lib/Support/FileUtilities.cpp
+++ b/lib/Support/FileUtilities.cpp
@@ -198,7 +198,7 @@ int llvm::DiffFilesWithTolerance(const sys::PathWithStatus &FileA,
     return 1;
   }
 
-  // Now its safe to mmap the files into memory becasue both files
+  // Now its safe to mmap the files into memory because both files
   // have a non-zero size.
   error_code ec;
   OwningPtr<MemoryBuffer> F1;
diff --git a/lib/Support/FoldingSet.cpp b/lib/Support/FoldingSet.cpp
index a4f80a9..1568342 100644
--- a/lib/Support/FoldingSet.cpp
+++ b/lib/Support/FoldingSet.cpp
@@ -92,7 +92,7 @@ void FoldingSetNodeID::AddInteger(long long I) {
 }
 void FoldingSetNodeID::AddInteger(unsigned long long I) {
   AddInteger(unsigned(I));
-  if ((uint64_t)(int)I != I)
+  if ((uint64_t)(unsigned)I != I)
     Bits.push_back(unsigned(I >> 32));
 }
 
@@ -147,6 +147,11 @@ void FoldingSetNodeID::AddString(StringRef String) {
   Bits.push_back(V);
 }
 
+// AddNodeID - Adds the Bit data of another ID to *this.
+void FoldingSetNodeID::AddNodeID(const FoldingSetNodeID &ID) {
+  Bits.append(ID.Bits.begin(), ID.Bits.end());
+}
+
 /// ComputeHash - Compute a strong hash value for this FoldingSetNodeID, used to 
 /// lookup the node in the FoldingSetImpl.
 unsigned FoldingSetNodeID::ComputeHash() const {
diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index 911c64a..4299aa4 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -215,7 +215,8 @@ std::string sys::getHostCPUName() {
       case 37: // Intel Core i7, laptop version.
         return "corei7";
       case 42: // SandyBridge
-        return "sandybridge";
+      case 45:
+        return "corei7-avx";
 
       case 28: // Intel Atom processor. All processors are manufactured using
                // the 45 nm process
diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp
index ea72720..d264be9 100644
--- a/lib/Support/MemoryBuffer.cpp
+++ b/lib/Support/MemoryBuffer.cpp
@@ -67,7 +67,7 @@ static void CopyStringRef(char *Memory, StringRef Data) {
 
 /// GetNamedBuffer - Allocates a new MemoryBuffer with Name copied after it.
 template <typename T>
-static T* GetNamedBuffer(StringRef Buffer, StringRef Name,
+static T *GetNamedBuffer(StringRef Buffer, StringRef Name,
                          bool RequiresNullTerminator) {
   char *Mem = static_cast<char*>(operator new(sizeof(T) + Name.size() + 1));
   CopyStringRef(Mem + sizeof(T), Name);
@@ -86,11 +86,15 @@ public:
      // The name is stored after the class itself.
     return reinterpret_cast<const char*>(this + 1);
   }
+  
+  virtual BufferKind getBufferKind() const {
+    return MemoryBuffer_Malloc;
+  }
 };
 }
 
 /// getMemBuffer - Open the specified memory range as a MemoryBuffer.  Note
-/// that EndPtr[0] must be a null byte and be accessible!
+/// that InputData must be a null terminated if RequiresNullTerminator is true!
 MemoryBuffer *MemoryBuffer::getMemBuffer(StringRef InputData,
                                          StringRef BufferName,
                                          bool RequiresNullTerminator) {
@@ -191,6 +195,10 @@ public:
     sys::Path::UnMapFilePages(reinterpret_cast<const char*>(RealStart),
                               RealSize);
   }
+  
+  virtual BufferKind getBufferKind() const {
+    return MemoryBuffer_MMap;
+  }
 };
 }
 
@@ -213,9 +221,9 @@ error_code MemoryBuffer::getFile(const char *Filename,
   OpenFlags |= O_BINARY;  // Open input file in binary mode on win32.
 #endif
   int FD = ::open(Filename, OpenFlags);
-  if (FD == -1) {
+  if (FD == -1)
     return error_code(errno, posix_category());
-  }
+
   error_code ret = getOpenFile(FD, Filename, result, FileSize, FileSize,
                                0, RequiresNullTerminator);
   close(FD);
diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp
index cfe23d8..8fbaf2d 100644
--- a/lib/Support/Path.cpp
+++ b/lib/Support/Path.cpp
@@ -92,15 +92,21 @@ sys::IdentifyFileType(const char *magic, unsigned length) {
       }
       break;
 
+      // The two magic numbers for mach-o are:
+      // 0xfeedface - 32-bit mach-o
+      // 0xfeedfacf - 64-bit mach-o
     case 0xFE:
-    case 0xCE: {
+    case 0xCE:
+    case 0xCF: {
       uint16_t type = 0;
       if (magic[0] == char(0xFE) && magic[1] == char(0xED) &&
-          magic[2] == char(0xFA) && magic[3] == char(0xCE)) {
+          magic[2] == char(0xFA) && 
+          (magic[3] == char(0xCE) || magic[3] == char(0xCF))) {
         /* Native endian */
         if (length >= 16) type = magic[14] << 8 | magic[15];
-      } else if (magic[0] == char(0xCE) && magic[1] == char(0xFA) &&
-                 magic[2] == char(0xED) && magic[3] == char(0xFE)) {
+      } else if ((magic[0] == char(0xCE) || magic[0] == char(0xCF)) &&
+                 magic[1] == char(0xFA) && magic[2] == char(0xED) &&
+                 magic[3] == char(0xFE)) {
         /* Reverse endian */
         if (length >= 14) type = magic[13] << 8 | magic[12];
       }
diff --git a/lib/Support/PrettyStackTrace.cpp b/lib/Support/PrettyStackTrace.cpp
index a9f4709..082b701 100644
--- a/lib/Support/PrettyStackTrace.cpp
+++ b/lib/Support/PrettyStackTrace.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 //
 // This file defines some helpful functions for dealing with the possibility of
-// Unix signals occuring while your program is running.
+// Unix signals occurring while your program is running.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Support/Regex.cpp b/lib/Support/Regex.cpp
index 309ffb0..d293da0 100644
--- a/lib/Support/Regex.cpp
+++ b/lib/Support/Regex.cpp
@@ -82,7 +82,7 @@ bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches){
         Matches->push_back(StringRef());
         continue;
       }
-      assert(pm[i].rm_eo > pm[i].rm_so);
+      assert(pm[i].rm_eo >= pm[i].rm_so);
       Matches->push_back(StringRef(String.data()+pm[i].rm_so,
                                    pm[i].rm_eo-pm[i].rm_so));
     }
diff --git a/lib/Support/Signals.cpp b/lib/Support/Signals.cpp
index a3af37d..a117893 100644
--- a/lib/Support/Signals.cpp
+++ b/lib/Support/Signals.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 //
 // This file defines some helpful functions for dealing with the possibility of
-// Unix signals occuring while your program is running.
+// Unix signals occurring while your program is running.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp
index ef09916..de042a9f 100644
--- a/lib/Support/SourceMgr.cpp
+++ b/lib/Support/SourceMgr.cpp
@@ -49,14 +49,16 @@ SourceMgr::~SourceMgr() {
 /// directory or in one of the IncludeDirs.  If no file is found, this returns
 /// ~0, otherwise it returns the buffer ID of the stacked file.
 unsigned SourceMgr::AddIncludeFile(const std::string &Filename,
-                                   SMLoc IncludeLoc) {
+                                   SMLoc IncludeLoc,
+                                   std::string &IncludedFile) {
   OwningPtr<MemoryBuffer> NewBuf;
-  MemoryBuffer::getFile(Filename.c_str(), NewBuf);
+  IncludedFile = Filename;
+  MemoryBuffer::getFile(IncludedFile.c_str(), NewBuf);
 
   // If the file didn't exist directly, see if it's in an include path.
   for (unsigned i = 0, e = IncludeDirectories.size(); i != e && !NewBuf; ++i) {
-    std::string IncFile = IncludeDirectories[i] + "/" + Filename;
-    MemoryBuffer::getFile(IncFile.c_str(), NewBuf);
+    IncludedFile = IncludeDirectories[i] + "/" + Filename;
+    MemoryBuffer::getFile(IncludedFile.c_str(), NewBuf);
   }
 
   if (NewBuf == 0) return ~0U;
diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp
index 5398051..8c3fc09 100644
--- a/lib/Support/StringRef.cpp
+++ b/lib/Support/StringRef.cpp
@@ -131,7 +131,7 @@ unsigned StringRef::edit_distance(llvm::StringRef Other,
 
 /// find - Search for the first string \arg Str in the string.
 ///
-/// \return - The index of the first occurence of \arg Str, or npos if not
+/// \return - The index of the first occurrence of \arg Str, or npos if not
 /// found.
 size_t StringRef::find(StringRef Str, size_t From) const {
   size_t N = Str.size();
@@ -145,7 +145,7 @@ size_t StringRef::find(StringRef Str, size_t From) const {
 
 /// rfind - Search for the last string \arg Str in the string.
 ///
-/// \return - The index of the last occurence of \arg Str, or npos if not
+/// \return - The index of the last occurrence of \arg Str, or npos if not
 /// found.
 size_t StringRef::rfind(StringRef Str) const {
   size_t N = Str.size();
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index 53ca48f..dbdb303 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp
@@ -41,7 +41,8 @@ const char *Triple::getArchTypeName(ArchType Kind) {
   case x86_64:  return "x86_64";
   case xcore:   return "xcore";
   case mblaze:  return "mblaze";
-  case ptx:     return "ptx";
+  case ptx32:   return "ptx32";
+  case ptx64:   return "ptx64";
   }
 
   return "<invalid>";
@@ -74,7 +75,8 @@ const char *Triple::getArchTypePrefix(ArchType Kind) {
 
   case xcore:   return "xcore";
 
-  case ptx:     return "ptx";
+  case ptx32:   return "ptx";
+  case ptx64:   return "ptx";
   }
 }
 
@@ -99,8 +101,10 @@ const char *Triple::getOSTypeName(OSType Kind) {
   case Darwin: return "darwin";
   case DragonFly: return "dragonfly";
   case FreeBSD: return "freebsd";
+  case IOS: return "ios";
   case Linux: return "linux";
   case Lv2: return "lv2";
+  case MacOSX: return "macosx";
   case MinGW32: return "mingw32";
   case NetBSD: return "netbsd";
   case OpenBSD: return "openbsd";
@@ -163,8 +167,10 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) {
     return x86_64;
   if (Name == "xcore")
     return xcore;
-  if (Name == "ptx")
-    return ptx;
+  if (Name == "ptx32")
+    return ptx32;
+  if (Name == "ptx64")
+    return ptx64;
 
   return UnknownArch;
 }
@@ -203,15 +209,17 @@ Triple::ArchType Triple::getArchTypeForDarwinArchName(StringRef Str) {
       Str == "armv6" || Str == "armv7")
     return Triple::arm;
 
-  if (Str == "ptx")
-    return Triple::ptx;
+  if (Str == "ptx32")
+    return Triple::ptx32;
+  if (Str == "ptx64")
+    return Triple::ptx64;
 
   return Triple::UnknownArch;
 }
 
 // Returns architecture name that is understood by the target assembler.
 const char *Triple::getArchNameForAssembler() {
-  if (getOS() != Triple::Darwin && getVendor() != Triple::Apple)
+  if (!isOSDarwin() && getVendor() != Triple::Apple)
     return NULL;
 
   StringRef Str = getArchName();
@@ -236,8 +244,10 @@ const char *Triple::getArchNameForAssembler() {
     return "armv6";
   if (Str == "armv7" || Str == "thumbv7")
     return "armv7";
-  if (Str == "ptx")
-    return "ptx";
+  if (Str == "ptx32")
+    return "ptx32";
+  if (Str == "ptx64")
+    return "ptx64";
   return NULL;
 }
 
@@ -286,8 +296,10 @@ Triple::ArchType Triple::ParseArch(StringRef ArchName) {
     return tce;
   else if (ArchName == "xcore")
     return xcore;
-  else if (ArchName == "ptx")
-    return ptx;
+  else if (ArchName == "ptx32")
+    return ptx32;
+  else if (ArchName == "ptx64")
+    return ptx64;
   else
     return UnknownArch;
 }
@@ -314,10 +326,14 @@ Triple::OSType Triple::ParseOS(StringRef OSName) {
     return DragonFly;
   else if (OSName.startswith("freebsd"))
     return FreeBSD;
+  else if (OSName.startswith("ios"))
+    return IOS;
   else if (OSName.startswith("linux"))
     return Linux;
   else if (OSName.startswith("lv2"))
     return Lv2;
+  else if (OSName.startswith("macosx"))
+    return MacOSX;
   else if (OSName.startswith("mingw32"))
     return MinGW32;
   else if (OSName.startswith("netbsd"))
@@ -526,67 +542,44 @@ StringRef Triple::getOSAndEnvironmentName() const {
 
 static unsigned EatNumber(StringRef &Str) {
   assert(!Str.empty() && Str[0] >= '0' && Str[0] <= '9' && "Not a number");
-  unsigned Result = Str[0]-'0';
+  unsigned Result = 0;
 
-  // Eat the digit.
-  Str = Str.substr(1);
-
-  // Handle "darwin11".
-  if (Result == 1 && !Str.empty() && Str[0] >= '0' && Str[0] <= '9') {
+  do {
+    // Consume the leading digit.
     Result = Result*10 + (Str[0] - '0');
+
     // Eat the digit.
     Str = Str.substr(1);
-  }
+  } while (!Str.empty() && Str[0] >= '0' && Str[0] <= '9');
 
   return Result;
 }
 
-/// getDarwinNumber - Parse the 'darwin number' out of the specific target
-/// triple.  For example, if we have darwin8.5 return 8,5,0.  If any entry is
-/// not defined, return 0's.  This requires that the triple have an OSType of
-/// darwin before it is called.
-void Triple::getDarwinNumber(unsigned &Maj, unsigned &Min,
-                             unsigned &Revision) const {
-  assert(getOS() == Darwin && "Not a darwin target triple!");
+void Triple::getOSVersion(unsigned &Major, unsigned &Minor,
+                          unsigned &Micro) const {
   StringRef OSName = getOSName();
-  assert(OSName.startswith("darwin") && "Unknown darwin target triple!");
-
-  // Strip off "darwin".
-  OSName = OSName.substr(6);
-
-  Maj = Min = Revision = 0;
-
-  if (OSName.empty() || OSName[0] < '0' || OSName[0] > '9')
-    return;
 
-  // The major version is the first digit.
-  Maj = EatNumber(OSName);
-  if (OSName.empty()) return;
+  // Assume that the OS portion of the triple starts with the canonical name.
+  StringRef OSTypeName = getOSTypeName(getOS());
+  if (OSName.startswith(OSTypeName))
+    OSName = OSName.substr(OSTypeName.size());
 
-  // Handle minor version: 10.4.9 -> darwin8.9.
-  if (OSName[0] != '.')
-    return;
+  // Any unset version defaults to 0.
+  Major = Minor = Micro = 0;
 
-  // Eat the '.'.
-  OSName = OSName.substr(1);
-
-  if (OSName.empty() || OSName[0] < '0' || OSName[0] > '9')
-    return;
-
-  Min = EatNumber(OSName);
-  if (OSName.empty()) return;
-
-  // Handle revision darwin8.9.1
-  if (OSName[0] != '.')
-    return;
-
-  // Eat the '.'.
-  OSName = OSName.substr(1);
+  // Parse up to three components.
+  unsigned *Components[3] = { &Major, &Minor, &Micro };
+  for (unsigned i = 0; i != 3; ++i) {
+    if (OSName.empty() || OSName[0] < '0' || OSName[0] > '9')
+      break;
 
-  if (OSName.empty() || OSName[0] < '0' || OSName[0] > '9')
-    return;
+    // Consume the leading number.
+    *Components[i] = EatNumber(OSName);
 
-  Revision = EatNumber(OSName);
+    // Consume the separator, if present.
+    if (OSName.startswith("."))
+      OSName = OSName.substr(1);
+  }
 }
 
 void Triple::setTriple(const Twine &Str) {
diff --git a/lib/Support/Unix/Host.inc b/lib/Support/Unix/Host.inc
index c971955..a286a15 100644
--- a/lib/Support/Unix/Host.inc
+++ b/lib/Support/Unix/Host.inc
@@ -45,35 +45,6 @@ std::string sys::getHostTriple() {
   // Normalize the arch, since the host triple may not actually match the host.
   std::string Arch = ArchSplit.first;
 
-  // It would be nice to do this in terms of llvm::Triple, but that is in
-  // Support which is layered above us.
-#if defined(__x86_64__)
-  Arch = "x86_64";
-#elif defined(__i386__)
-  Arch = "i386";
-#elif defined(__ppc64__)
-  Arch = "powerpc64";
-#elif defined(__ppc__)
-  Arch = "powerpc";
-#elif defined(__arm__)
-
-  // FIXME: We need to pick the right ARM triple (which involves querying the
-  // chip). However, for now this is most important for LLVM arch selection, so
-  // we only need to make sure to distinguish ARM and Thumb.
-#  if defined(__thumb__)
-  Arch = "thumb";
-#  else
-  Arch = "arm";
-#  endif
-
-#else
-
-  // FIXME: When enough auto-detection is in place, this should just
-  // #error. Then at least the arch selection is done, and we only need the OS
-  // etc selection to kill off the use of LLVM_HOSTTRIPLE.
-
-#endif
-
   std::string Triple(Arch);
   Triple += '-';
   Triple += ArchSplit.second;
@@ -88,10 +59,7 @@ std::string sys::getHostTriple() {
   std::string::size_type DarwinDashIdx = Triple.find("-darwin");
   if (DarwinDashIdx != std::string::npos) {
     Triple.resize(DarwinDashIdx + strlen("-darwin"));
-
-    // Only add the major part of the os version.
-    std::string Version = getOSVersion();
-    Triple += Version.substr(0, Version.find('.'));
+    Triple += getOSVersion();
   }
 
   return Triple;
diff --git a/lib/Support/Unix/Program.inc b/lib/Support/Unix/Program.inc
index 6efc8cd..346baf1 100644
--- a/lib/Support/Unix/Program.inc
+++ b/lib/Support/Unix/Program.inc
@@ -236,7 +236,7 @@ Program::Execute(const Path &path, const char **args, const char **envp,
   // Create a child process.
   int child = fork();
   switch (child) {
-    // An error occured:  Return to the caller.
+    // An error occurred:  Return to the caller.
     case -1:
       MakeErrMsg(ErrMsg, "Couldn't fork");
       return false;
@@ -338,7 +338,7 @@ Program::Wait(const sys::Path &path,
       else
         MakeErrMsg(ErrMsg, "Child timed out", 0);
 
-      return -1;   // Timeout detected
+      return -2;   // Timeout detected
     } else if (errno != EINTR) {
       MakeErrMsg(ErrMsg, "Error waiting for child process");
       return -1;
@@ -382,7 +382,9 @@ Program::Wait(const sys::Path &path,
         *ErrMsg += " (core dumped)";
 #endif
     }
-    return -1;
+    // Return a special value to indicate that the process received an unhandled
+    // signal during execution as opposed to failing to execute.
+    return -2;
   }
   return result;
 #else
diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc
index 0a61759..e286869 100644
--- a/lib/Support/Unix/Signals.inc
+++ b/lib/Support/Unix/Signals.inc
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 //
 // This file defines some helpful functions for dealing with the possibility of
-// Unix signals occuring while your program is running.
+// Unix signals occurring while your program is running.
 //
 //===----------------------------------------------------------------------===//
 
@@ -274,6 +274,9 @@ void llvm::sys::PrintStackTraceOnErrorSignal() {
 
 #ifdef __APPLE__
 
+#include <signal.h>
+#include <pthread.h>
+
 int raise(int sig) {
   return pthread_kill(pthread_self(), sig);
 }
@@ -291,9 +294,6 @@ void __assert_rtn(const char *func,
   abort();
 }
 
-#include <signal.h>
-#include <pthread.h>
-
 void abort() {
   raise(SIGABRT);
   usleep(1000);
diff --git a/lib/Support/Windows/DynamicLibrary.inc b/lib/Support/Windows/DynamicLibrary.inc
index 2c14366..4227844 100644
--- a/lib/Support/Windows/DynamicLibrary.inc
+++ b/lib/Support/Windows/DynamicLibrary.inc
@@ -41,41 +41,12 @@ using namespace sys;
 
 static std::vector<HMODULE> OpenedHandles;
 
-#ifdef _WIN64
-  typedef DWORD64 ModuleBaseType;
-#else
-  typedef ULONG ModuleBaseType;
-#endif
-
 extern "C" {
-// Use old callback if:
-//  - Not using Visual Studio
-//  - Visual Studio 2005 or earlier but only if we are not using the Windows SDK
-//    or Windows SDK version is older than 6.0
-// Use new callback if:
-//  - Newer Visual Studio (comes with newer SDK).
-//  - Visual Studio 2005 with Windows SDK 6.0+
-#if defined(_MSC_VER)
-  #if _MSC_VER < 1500 && (!defined(VER_PRODUCTBUILD) || VER_PRODUCTBUILD < 6000)
-    #define OLD_ELM_CALLBACK_DECL 1
-  #endif
-#elif defined(__MINGW64__)
-  // Use new callback.
-#elif defined(__MINGW32__)
-  #define OLD_ELM_CALLBACK_DECL 1
-#endif
 
-#ifdef OLD_ELM_CALLBACK_DECL
-  static BOOL CALLBACK ELM_Callback(PSTR  ModuleName,
-                                    ModuleBaseType ModuleBase,
+  static BOOL CALLBACK ELM_Callback(WIN32_ELMCB_PCSTR ModuleName,
+                                    ULONG_PTR ModuleBase,
                                     ULONG ModuleSize,
                                     PVOID UserContext)
-#else
-  static BOOL CALLBACK ELM_Callback(PCSTR  ModuleName,
-                                    ModuleBaseType ModuleBase,
-                                    ULONG ModuleSize,
-                                    PVOID UserContext)
-#endif
   {
     // Ignore VC++ runtimes prior to 7.1.  Somehow some of them get loaded
     // into the process.
diff --git a/lib/Support/Windows/Program.inc b/lib/Support/Windows/Program.inc
index 350363c..e486e6e 100644
--- a/lib/Support/Windows/Program.inc
+++ b/lib/Support/Windows/Program.inc
@@ -349,7 +349,8 @@ Program::Wait(const Path &path,
   if (WaitForSingleObject(hProcess, millisecondsToWait) == WAIT_TIMEOUT) {
     if (!TerminateProcess(hProcess, 1)) {
       MakeErrMsg(ErrMsg, "Failed to terminate timed-out program.");
-      return -1;
+      // -2 indicates a crash or timeout as opposed to failure to execute.
+      return -2;
     }
     WaitForSingleObject(hProcess, INFINITE);
   }
@@ -362,7 +363,8 @@ Program::Wait(const Path &path,
   if (!rc) {
     SetLastError(err);
     MakeErrMsg(ErrMsg, "Failed getting status for program.");
-    return -1;
+    // -2 indicates a crash or timeout as opposed to failure to execute.
+    return -2;
   }
 
   return status;
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index e690e18..6af5f85 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -67,6 +67,14 @@ def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
 def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
                                              "Prefer 32-bit Thumb instrs">;
 
+/// Some instructions update CPSR partially, which can add false dependency for
+/// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is
+/// mapped to a separate physical register. Avoid partial CPSR update for these
+/// processors.
+def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr",
+                                               "AvoidCPSRPartialUpdate", "true",
+                                 "Avoid CPSR partial update for OOO execution">;
+
 // Multiprocessing extension.
 def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true",
                                  "Supports Multiprocessing extension">;
@@ -110,8 +118,9 @@ def ProcA8      : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8",
                                     FeatureT2XtPk]>;
 def ProcA9      : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9",
                                    "Cortex-A9 ARM processors",
-                                   [FeatureHasSlowFPVMLx, FeatureVMLxForwarding,
-                                    FeatureT2XtPk, FeatureFP16]>;
+                                   [FeatureVMLxForwarding,
+                                    FeatureT2XtPk, FeatureFP16,
+                                    FeatureAvoidPartialCPSR]>;
 
 class ProcNoItin<string Name, list<SubtargetFeature> Features>
  : Processor<Name, GenericItineraries, Features>;
@@ -178,6 +187,8 @@ def : Processor<"cortex-a8",        CortexA8Itineraries,
                                     [ArchV7A, ProcA8]>;
 def : Processor<"cortex-a9",        CortexA9Itineraries,
                                     [ArchV7A, ProcA9]>;
+def : Processor<"cortex-a9-mp",     CortexA9Itineraries,
+                                    [ArchV7A, ProcA9, FeatureMP]>;
 
 // V7M Processors.
 def : ProcNoItin<"cortex-m3",       [ArchV7M]>;
diff --git a/lib/Target/ARM/ARMAsmBackend.cpp b/lib/Target/ARM/ARMAsmBackend.cpp
index e972a08..db132da 100644
--- a/lib/Target/ARM/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/ARMAsmBackend.cpp
@@ -76,7 +76,7 @@ public:
 { "fixup_arm_thumb_blx",     7,            21,  MCFixupKindInfo::FKF_IsPCRel },
 { "fixup_arm_thumb_cb",      0,            16,  MCFixupKindInfo::FKF_IsPCRel },
 { "fixup_arm_thumb_cp",      1,             8,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_bcc",     1,             8,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_bcc",     0,             8,  MCFixupKindInfo::FKF_IsPCRel },
 // movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16 - 19.
 { "fixup_arm_movt_hi16",     0,            20,  0 },
 { "fixup_arm_movw_lo16",     0,            20,  0 },
@@ -164,23 +164,25 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
   case FK_Data_4:
     return Value;
   case ARM::fixup_arm_movt_hi16:
-  case ARM::fixup_arm_movt_hi16_pcrel:
     Value >>= 16;
     // Fallthrough
   case ARM::fixup_arm_movw_lo16:
+  case ARM::fixup_arm_movt_hi16_pcrel:
   case ARM::fixup_arm_movw_lo16_pcrel: {
     unsigned Hi4 = (Value & 0xF000) >> 12;
     unsigned Lo12 = Value & 0x0FFF;
+    assert ((((int64_t)Value) >= -0x8000) && (((int64_t)Value) <= 0x7fff) &&
+            "Out of range pc-relative fixup value!");
     // inst{19-16} = Hi4;
     // inst{11-0} = Lo12;
     Value = (Hi4 << 16) | (Lo12);
     return Value;
   }
   case ARM::fixup_t2_movt_hi16:
-  case ARM::fixup_t2_movt_hi16_pcrel:
     Value >>= 16;
     // Fallthrough
   case ARM::fixup_t2_movw_lo16:
+  case ARM::fixup_t2_movt_hi16_pcrel:
   case ARM::fixup_t2_movw_lo16_pcrel: {
     unsigned Hi4 = (Value & 0xF000) >> 12;
     unsigned i = (Value & 0x800) >> 11;
@@ -190,8 +192,9 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
     // inst{26} = i;
     // inst{14-12} = Mid3;
     // inst{7-0} = Lo8;
+    assert ((((int64_t)Value) >= -0x8000) && (((int64_t)Value) <= 0x7fff) &&
+            "Out of range pc-relative fixup value!");
     Value = (Hi4 << 16) | (i << 26) | (Mid3 << 12) | (Lo8);
-
     uint64_t swapped = (Value & 0xFFFF0000) >> 16;
     swapped |= (Value & 0x0000FFFF) << 16;
     return swapped;
@@ -305,7 +308,7 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
     //
     // Note that the halfwords are stored high first, low second; so we need
     // to transpose the fixup value here to map properly.
-    unsigned isNeg = (int64_t(Value) < 0) ? 1 : 0;
+    unsigned isNeg = (int64_t(Value - 4) < 0) ? 1 : 0;
     uint32_t Binary = 0;
     Value = 0x3fffff & ((Value - 4) >> 1);
     Binary  = (Value & 0x7ff) << 16;    // Low imm11 value.
@@ -323,7 +326,7 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
     //
     // Note that the halfwords are stored high first, low second; so we need
     // to transpose the fixup value here to map properly.
-    unsigned isNeg = (int64_t(Value) < 0) ? 1 : 0;
+    unsigned isNeg = (int64_t(Value-4) < 0) ? 1 : 0;
     uint32_t Binary = 0;
     Value = 0xfffff & ((Value - 2) >> 2);
     Binary  = (Value & 0x3ff) << 17;    // Low imm10L value.
@@ -404,7 +407,6 @@ void ELFARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data,
   if (!Value) return;           // Doesn't change encoding.
 
   unsigned Offset = Fixup.getOffset();
-  assert(Offset % NumBytes == 0 && "Offset mod NumBytes is nonzero!");
 
   // For each byte of the fragment that the fixup touches, mask in the bits from
   // the fixup value. The Value has been "split up" into the appropriate
@@ -501,18 +503,22 @@ void DarwinARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data,
 TargetAsmBackend *llvm::createARMAsmBackend(const Target &T,
                                             const std::string &TT) {
   Triple TheTriple(TT);
-  switch (TheTriple.getOS()) {
-  case Triple::Darwin: {
-    if (TheTriple.getArchName() == "armv6" ||
+
+  if (TheTriple.isOSDarwin()) {
+    if (TheTriple.getArchName() == "armv4t" ||
+        TheTriple.getArchName() == "thumbv4t")
+      return new DarwinARMAsmBackend(T, object::mach::CSARM_V4T);
+    else if (TheTriple.getArchName() == "armv5e" ||
+        TheTriple.getArchName() == "thumbv5e")
+      return new DarwinARMAsmBackend(T, object::mach::CSARM_V5TEJ);
+    else if (TheTriple.getArchName() == "armv6" ||
         TheTriple.getArchName() == "thumbv6")
       return new DarwinARMAsmBackend(T, object::mach::CSARM_V6);
     return new DarwinARMAsmBackend(T, object::mach::CSARM_V7);
   }
-  case Triple::MinGW32:
-  case Triple::Cygwin:
-  case Triple::Win32:
+
+  if (TheTriple.isOSWindows())
     assert(0 && "Windows not supported on ARM");
-  default:
-    return new ELFARMAsmBackend(T, Triple(TT).getOS());
-  }
+
+  return new ELFARMAsmBackend(T, Triple(TT).getOS());
 }
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 8eb1993..eb73902 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -172,10 +172,70 @@ getDebugValueLocation(const MachineInstr *MI) const {
   return Location;
 }
 
+/// EmitDwarfRegOp - Emit dwarf register operation.
+void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
+  const TargetRegisterInfo *RI = TM.getRegisterInfo();
+  if (RI->getDwarfRegNum(MLoc.getReg(), false) != -1)
+    AsmPrinter::EmitDwarfRegOp(MLoc);
+  else {
+    unsigned Reg = MLoc.getReg();
+    if (Reg >= ARM::S0 && Reg <= ARM::S31) {
+      assert(ARM::S0 + 31 == ARM::S31 && "Unexpected ARM S register numbering");
+      // S registers are described as bit-pieces of a register
+      // S[2x] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 0)
+      // S[2x+1] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 32)
+      
+      unsigned SReg = Reg - ARM::S0;
+      bool odd = SReg & 0x1;
+      unsigned Rx = 256 + (SReg >> 1);
+
+      OutStreamer.AddComment("DW_OP_regx for S register");
+      EmitInt8(dwarf::DW_OP_regx);
+
+      OutStreamer.AddComment(Twine(SReg));
+      EmitULEB128(Rx);
+
+      if (odd) {
+        OutStreamer.AddComment("DW_OP_bit_piece 32 32");
+        EmitInt8(dwarf::DW_OP_bit_piece);
+        EmitULEB128(32);
+        EmitULEB128(32);
+      } else {
+        OutStreamer.AddComment("DW_OP_bit_piece 32 0");
+        EmitInt8(dwarf::DW_OP_bit_piece);
+        EmitULEB128(32);
+        EmitULEB128(0);
+      }
+    } else if (Reg >= ARM::Q0 && Reg <= ARM::Q15) {
+      assert(ARM::Q0 + 15 == ARM::Q15 && "Unexpected ARM Q register numbering");
+      // Q registers Q0-Q15 are described by composing two D registers together.
+      // Qx = DW_OP_regx(256+2x) DW_OP_piece(8) DW_OP_regx(256+2x+1) DW_OP_piece(8)
+
+      unsigned QReg = Reg - ARM::Q0;
+      unsigned D1 = 256 + 2 * QReg;
+      unsigned D2 = D1 + 1;
+      
+      OutStreamer.AddComment("DW_OP_regx for Q register: D1");
+      EmitInt8(dwarf::DW_OP_regx);
+      EmitULEB128(D1);
+      OutStreamer.AddComment("DW_OP_piece 8");
+      EmitInt8(dwarf::DW_OP_piece);
+      EmitULEB128(8);
+
+      OutStreamer.AddComment("DW_OP_regx for Q register: D2");
+      EmitInt8(dwarf::DW_OP_regx);
+      EmitULEB128(D2);
+      OutStreamer.AddComment("DW_OP_piece 8");
+      EmitInt8(dwarf::DW_OP_piece);
+      EmitULEB128(8);
+    }
+  }
+}
+
 void ARMAsmPrinter::EmitFunctionEntryLabel() {
   if (AFI->isThumbFunction()) {
     OutStreamer.EmitAssemblerFlag(MCAF_Code16);
-    OutStreamer.EmitThumbFunc(Subtarget->isTargetDarwin()? CurrentFnSym : 0);
+    OutStreamer.EmitThumbFunc(CurrentFnSym);
   }
 
   OutStreamer.EmitLabel(CurrentFnSym);
@@ -305,10 +365,63 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
     case 'q': // Print a NEON quad precision register.
       printOperand(MI, OpNum, O);
       return false;
-    case 'Q':
-    case 'R':
-    case 'H':
-      // These modifiers are not yet supported.
+    case 'y': // Print a VFP single precision register as indexed double.
+      // This uses the ordering of the alias table to get the first 'd' register
+      // that overlaps the 's' register. Also, s0 is an odd register, hence the
+      // odd modulus check below.
+      if (MI->getOperand(OpNum).isReg()) {
+        unsigned Reg = MI->getOperand(OpNum).getReg();
+        const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+        O << ARMInstPrinter::getRegisterName(TRI->getAliasSet(Reg)[0]) <<
+        (((Reg % 2) == 1) ? "[0]" : "[1]");
+        return false;
+      }
+      return true;
+    case 'B': // Bitwise inverse of integer or symbol without a preceding #.
+      if (!MI->getOperand(OpNum).isImm())
+        return true;
+      O << ~(MI->getOperand(OpNum).getImm());
+      return false;
+    case 'L': // The low 16 bits of an immediate constant.
+      if (!MI->getOperand(OpNum).isImm())
+        return true;
+      O << (MI->getOperand(OpNum).getImm() & 0xffff);
+      return false;
+    case 'M': { // A register range suitable for LDM/STM.
+      if (!MI->getOperand(OpNum).isReg())
+        return true;
+      const MachineOperand &MO = MI->getOperand(OpNum);
+      unsigned RegBegin = MO.getReg();
+      // This takes advantage of the 2 operand-ness of ldm/stm and that we've
+      // already got the operands in registers that are operands to the
+      // inline asm statement.
+      
+      O << "{" << ARMInstPrinter::getRegisterName(RegBegin);
+      
+      // FIXME: The register allocator not only may not have given us the
+      // registers in sequence, but may not be in ascending registers. This
+      // will require changes in the register allocator that'll need to be
+      // propagated down here if the operands change.
+      unsigned RegOps = OpNum + 1;
+      while (MI->getOperand(RegOps).isReg()) {
+        O << ", " 
+          << ARMInstPrinter::getRegisterName(MI->getOperand(RegOps).getReg());
+        RegOps++;
+      }
+
+      O << "}";
+
+      return false;
+    }
+    // These modifiers are not yet supported.
+    case 'p': // The high single-precision register of a VFP double-precision
+              // register.
+    case 'e': // The low doubleword register of a NEON quad register.
+    case 'f': // The high doubleword register of a NEON quad register.
+    case 'h': // A range of VFP/NEON registers suitable for VLD1/VST1.
+    case 'Q': // The least significant register of a pair.
+    case 'R': // The most significant register of a pair.
+    case 'H': // The highest-numbered register of a pair.
       return true;
     }
   }
@@ -321,9 +434,21 @@ bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
                                           unsigned OpNum, unsigned AsmVariant,
                                           const char *ExtraCode,
                                           raw_ostream &O) {
-  if (ExtraCode && ExtraCode[0])
-    return true; // Unknown modifier.
-
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+    
+    switch (ExtraCode[0]) {
+      case 'A': // A memory operand for a VLD1/VST1 instruction.
+      default: return true;  // Unknown modifier.
+      case 'm': // The base register of a memory operand.
+        if (!MI->getOperand(OpNum).isReg())
+          return true;
+        O << ARMInstPrinter::getRegisterName(MI->getOperand(OpNum).getReg());
+        return false;
+    }
+  }
+  
   const MachineOperand &MO = MI->getOperand(OpNum);
   assert(MO.isReg() && "unexpected inline asm memory operand");
   O << "[" << ARMInstPrinter::getRegisterName(MO.getReg()) << "]";
@@ -489,6 +614,12 @@ void ARMAsmPrinter::emitAttributes() {
     //
 
     /// ADD additional Else-cases here!
+  } else if (CPUString == "xscale") {
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5TEJ);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use,
+                               ARMBuildAttrs::Allowed);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+                               ARMBuildAttrs::Allowed);
   } else if (CPUString == "generic") {
     // FIXME: Why these defaults?
     AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4T);
@@ -1077,6 +1208,26 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     }
     return;
   }
+  case ARM::tBXr9_CALL:
+  case ARM::tBX_CALL: {
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::tMOVr);
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::LR));
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::tBX);
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+      // Add predicate operands.
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    return;
+  }
   case ARM::BMOVPCRXr9_CALL:
   case ARM::BMOVPCRX_CALL: {
     {
@@ -1698,7 +1849,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     }
     {
       MCInst TmpInst;
-      TmpInst.setOpcode(ARM::tBX_RET_vararg);
+      TmpInst.setOpcode(ARM::tBX);
       TmpInst.addOperand(MCOperand::CreateReg(ScratchReg));
       // Predicate.
       TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
@@ -1708,7 +1859,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     return;
   }
   // Tail jump branches are really just branch instructions with additional
-  // code-gen attributes. Convert them to the cannonical form here.
+  // code-gen attributes. Convert them to the canonical form here.
   case ARM::TAILJMPd:
   case ARM::TAILJMPdND: {
     MCInst TmpInst, TmpInst2;
@@ -1727,7 +1878,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   case ARM::tTAILJMPdND: {
     MCInst TmpInst, TmpInst2;
     LowerARMMachineInstrToMCInst(MI, TmpInst2, *this);
-    TmpInst.setOpcode(ARM::tB);
+    // The Darwin toolchain doesn't support tail call relocations of 16-bit
+    // branches.
+    TmpInst.setOpcode(Opc == ARM::tTAILJMPd ? ARM::t2B : ARM::tB);
     TmpInst.addOperand(TmpInst2.getOperand(0));
     OutStreamer.AddComment("TAILCALL");
     OutStreamer.EmitInstruction(TmpInst);
diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h
index 9db139b..5f9169e 100644
--- a/lib/Target/ARM/ARMAsmPrinter.h
+++ b/lib/Target/ARM/ARMAsmPrinter.h
@@ -89,6 +89,9 @@ public:
 
   MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
 
+  /// EmitDwarfRegOp - Emit dwarf register operation.
+  virtual void EmitDwarfRegOp(const MachineLocation &MLoc) const;
+
   virtual unsigned getISAEncoding() {
     // ARM/Darwin adds ISA to the DWARF info for each function.
     if (!Subtarget->isTargetDarwin())
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 30148c2..44a3976 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1021,7 +1021,7 @@ reMaterialize(MachineBasicBlock &MBB,
     MachineInstrBuilder MIB = BuildMI(MBB, I, Orig->getDebugLoc(), get(Opcode),
                                       DestReg)
       .addConstantPoolIndex(CPI).addImm(PCLabelId);
-    (*MIB).setMemRefs(Orig->memoperands_begin(), Orig->memoperands_end());
+    MIB->setMemRefs(Orig->memoperands_begin(), Orig->memoperands_end());
     break;
   }
   }
@@ -1201,7 +1201,7 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
 }
 
 /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to
-/// determine (in conjuction with areLoadsFromSameBasePtr) if two loads should
+/// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
 /// be scheduled togther. On some targets if two loads are loading from
 /// addresses in the same cache line, it's better if they are scheduled
 /// together. This function takes two integers that represent the load offsets
@@ -1270,19 +1270,19 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
 }
 
 bool ARMBaseInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
-                                           unsigned NumCyles,
+                                           unsigned NumCycles,
                                            unsigned ExtraPredCycles,
                                            float Probability,
                                            float Confidence) const {
-  if (!NumCyles)
+  if (!NumCycles)
     return false;
 
   // Attempt to estimate the relative costs of predication versus branching.
-  float UnpredCost = Probability * NumCyles;
+  float UnpredCost = Probability * NumCycles;
   UnpredCost += 1.0; // The branch itself
   UnpredCost += (1.0 - Confidence) * Subtarget.getMispredictionPenalty();
 
-  return (float)(NumCyles + ExtraPredCycles) < UnpredCost;
+  return (float)(NumCycles + ExtraPredCycles) < UnpredCost;
 }
 
 bool ARMBaseInstrInfo::
@@ -1618,17 +1618,39 @@ OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask,
   // Set the "zero" bit in CPSR.
   switch (MI->getOpcode()) {
   default: break;
+  case ARM::RSBrr:
   case ARM::RSBri:
+  case ARM::RSCrr:
   case ARM::RSCri:
+  case ARM::ADDrr:
   case ARM::ADDri:
+  case ARM::ADCrr:
   case ARM::ADCri:
+  case ARM::SUBrr:
   case ARM::SUBri:
+  case ARM::SBCrr:
   case ARM::SBCri:
   case ARM::t2RSBri:
+  case ARM::t2ADDrr:
   case ARM::t2ADDri:
+  case ARM::t2ADCrr:
   case ARM::t2ADCri:
+  case ARM::t2SUBrr:
   case ARM::t2SUBri:
-  case ARM::t2SBCri: {
+  case ARM::t2SBCrr:
+  case ARM::t2SBCri:
+  case ARM::ANDrr:
+  case ARM::ANDri:
+  case ARM::t2ANDrr:
+  case ARM::t2ANDri:
+  case ARM::ORRrr:
+  case ARM::ORRri:
+  case ARM::t2ORRrr:
+  case ARM::t2ORRri:
+  case ARM::EORrr:
+  case ARM::EORri:
+  case ARM::t2EORrr:
+  case ARM::t2EORri: {
     // Scan forward for the use of CPSR, if it's a conditional code requires
     // checking of V bit, then this is not safe to do. If we can't find the
     // CPSR use (i.e. used in another block), then it's not safe to perform
@@ -1667,16 +1689,13 @@ OptimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, int CmpMask,
     if (!isSafe)
       return false;
 
-    // fallthrough
-  }
-  case ARM::ANDri:
-  case ARM::t2ANDri:
     // Toggle the optional operand to CPSR.
     MI->getOperand(5).setReg(ARM::CPSR);
     MI->getOperand(5).setIsDef(true);
     CmpInstr->eraseFromParent();
     return true;
   }
+  }
 
   return false;
 }
@@ -2203,6 +2222,101 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
     }
   }
 
+  if (DefAlign < 8 && Subtarget.isCortexA9())
+    switch (DefTID.getOpcode()) {
+    default: break;
+    case ARM::VLD1q8:
+    case ARM::VLD1q16:
+    case ARM::VLD1q32:
+    case ARM::VLD1q64:
+    case ARM::VLD1q8_UPD:
+    case ARM::VLD1q16_UPD:
+    case ARM::VLD1q32_UPD:
+    case ARM::VLD1q64_UPD:
+    case ARM::VLD2d8:
+    case ARM::VLD2d16:
+    case ARM::VLD2d32:
+    case ARM::VLD2q8:
+    case ARM::VLD2q16:
+    case ARM::VLD2q32:
+    case ARM::VLD2d8_UPD:
+    case ARM::VLD2d16_UPD:
+    case ARM::VLD2d32_UPD:
+    case ARM::VLD2q8_UPD:
+    case ARM::VLD2q16_UPD:
+    case ARM::VLD2q32_UPD:
+    case ARM::VLD3d8:
+    case ARM::VLD3d16:
+    case ARM::VLD3d32:
+    case ARM::VLD1d64T:
+    case ARM::VLD3d8_UPD:
+    case ARM::VLD3d16_UPD:
+    case ARM::VLD3d32_UPD:
+    case ARM::VLD1d64T_UPD:
+    case ARM::VLD3q8_UPD:
+    case ARM::VLD3q16_UPD:
+    case ARM::VLD3q32_UPD:
+    case ARM::VLD4d8:
+    case ARM::VLD4d16:
+    case ARM::VLD4d32:
+    case ARM::VLD1d64Q:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+    case ARM::VLD1d64Q_UPD:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+    case ARM::VLD1DUPq8:
+    case ARM::VLD1DUPq16:
+    case ARM::VLD1DUPq32:
+    case ARM::VLD1DUPq8_UPD:
+    case ARM::VLD1DUPq16_UPD:
+    case ARM::VLD1DUPq32_UPD:
+    case ARM::VLD2DUPd8:
+    case ARM::VLD2DUPd16:
+    case ARM::VLD2DUPd32:
+    case ARM::VLD2DUPd8_UPD:
+    case ARM::VLD2DUPd16_UPD:
+    case ARM::VLD2DUPd32_UPD:
+    case ARM::VLD4DUPd8:
+    case ARM::VLD4DUPd16:
+    case ARM::VLD4DUPd32:
+    case ARM::VLD4DUPd8_UPD:
+    case ARM::VLD4DUPd16_UPD:
+    case ARM::VLD4DUPd32_UPD:
+    case ARM::VLD1LNd8:
+    case ARM::VLD1LNd16:
+    case ARM::VLD1LNd32:
+    case ARM::VLD1LNd8_UPD:
+    case ARM::VLD1LNd16_UPD:
+    case ARM::VLD1LNd32_UPD:
+    case ARM::VLD2LNd8:
+    case ARM::VLD2LNd16:
+    case ARM::VLD2LNd32:
+    case ARM::VLD2LNq16:
+    case ARM::VLD2LNq32:
+    case ARM::VLD2LNd8_UPD:
+    case ARM::VLD2LNd16_UPD:
+    case ARM::VLD2LNd32_UPD:
+    case ARM::VLD2LNq16_UPD:
+    case ARM::VLD2LNq32_UPD:
+    case ARM::VLD4LNd8:
+    case ARM::VLD4LNd16:
+    case ARM::VLD4LNd32:
+    case ARM::VLD4LNq16:
+    case ARM::VLD4LNq32:
+    case ARM::VLD4LNd8_UPD:
+    case ARM::VLD4LNd16_UPD:
+    case ARM::VLD4LNd32_UPD:
+    case ARM::VLD4LNq16_UPD:
+    case ARM::VLD4LNq32_UPD:
+      // If the address is not 64-bit aligned, the latencies of these
+      // instructions increases by one.
+      ++Latency;
+      break;
+    }
+
   return Latency;
 }
 
@@ -2269,6 +2383,113 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
     }
   }
 
+  if (DefAlign < 8 && Subtarget.isCortexA9())
+    switch (DefTID.getOpcode()) {
+    default: break;
+    case ARM::VLD1q8Pseudo:
+    case ARM::VLD1q16Pseudo:
+    case ARM::VLD1q32Pseudo:
+    case ARM::VLD1q64Pseudo:
+    case ARM::VLD1q8Pseudo_UPD:
+    case ARM::VLD1q16Pseudo_UPD:
+    case ARM::VLD1q32Pseudo_UPD:
+    case ARM::VLD1q64Pseudo_UPD:
+    case ARM::VLD2d8Pseudo:
+    case ARM::VLD2d16Pseudo:
+    case ARM::VLD2d32Pseudo:
+    case ARM::VLD2q8Pseudo:
+    case ARM::VLD2q16Pseudo:
+    case ARM::VLD2q32Pseudo:
+    case ARM::VLD2d8Pseudo_UPD:
+    case ARM::VLD2d16Pseudo_UPD:
+    case ARM::VLD2d32Pseudo_UPD:
+    case ARM::VLD2q8Pseudo_UPD:
+    case ARM::VLD2q16Pseudo_UPD:
+    case ARM::VLD2q32Pseudo_UPD:
+    case ARM::VLD3d8Pseudo:
+    case ARM::VLD3d16Pseudo:
+    case ARM::VLD3d32Pseudo:
+    case ARM::VLD1d64TPseudo:
+    case ARM::VLD3d8Pseudo_UPD:
+    case ARM::VLD3d16Pseudo_UPD:
+    case ARM::VLD3d32Pseudo_UPD:
+    case ARM::VLD1d64TPseudo_UPD:
+    case ARM::VLD3q8Pseudo_UPD:
+    case ARM::VLD3q16Pseudo_UPD:
+    case ARM::VLD3q32Pseudo_UPD:
+    case ARM::VLD3q8oddPseudo:
+    case ARM::VLD3q16oddPseudo:
+    case ARM::VLD3q32oddPseudo:
+    case ARM::VLD3q8oddPseudo_UPD:
+    case ARM::VLD3q16oddPseudo_UPD:
+    case ARM::VLD3q32oddPseudo_UPD:
+    case ARM::VLD4d8Pseudo:
+    case ARM::VLD4d16Pseudo:
+    case ARM::VLD4d32Pseudo:
+    case ARM::VLD1d64QPseudo:
+    case ARM::VLD4d8Pseudo_UPD:
+    case ARM::VLD4d16Pseudo_UPD:
+    case ARM::VLD4d32Pseudo_UPD:
+    case ARM::VLD1d64QPseudo_UPD:
+    case ARM::VLD4q8Pseudo_UPD:
+    case ARM::VLD4q16Pseudo_UPD:
+    case ARM::VLD4q32Pseudo_UPD:
+    case ARM::VLD4q8oddPseudo:
+    case ARM::VLD4q16oddPseudo:
+    case ARM::VLD4q32oddPseudo:
+    case ARM::VLD4q8oddPseudo_UPD:
+    case ARM::VLD4q16oddPseudo_UPD:
+    case ARM::VLD4q32oddPseudo_UPD:
+    case ARM::VLD1DUPq8Pseudo:
+    case ARM::VLD1DUPq16Pseudo:
+    case ARM::VLD1DUPq32Pseudo:
+    case ARM::VLD1DUPq8Pseudo_UPD:
+    case ARM::VLD1DUPq16Pseudo_UPD:
+    case ARM::VLD1DUPq32Pseudo_UPD:
+    case ARM::VLD2DUPd8Pseudo:
+    case ARM::VLD2DUPd16Pseudo:
+    case ARM::VLD2DUPd32Pseudo:
+    case ARM::VLD2DUPd8Pseudo_UPD:
+    case ARM::VLD2DUPd16Pseudo_UPD:
+    case ARM::VLD2DUPd32Pseudo_UPD:
+    case ARM::VLD4DUPd8Pseudo:
+    case ARM::VLD4DUPd16Pseudo:
+    case ARM::VLD4DUPd32Pseudo:
+    case ARM::VLD4DUPd8Pseudo_UPD:
+    case ARM::VLD4DUPd16Pseudo_UPD:
+    case ARM::VLD4DUPd32Pseudo_UPD:
+    case ARM::VLD1LNq8Pseudo:
+    case ARM::VLD1LNq16Pseudo:
+    case ARM::VLD1LNq32Pseudo:
+    case ARM::VLD1LNq8Pseudo_UPD:
+    case ARM::VLD1LNq16Pseudo_UPD:
+    case ARM::VLD1LNq32Pseudo_UPD:
+    case ARM::VLD2LNd8Pseudo:
+    case ARM::VLD2LNd16Pseudo:
+    case ARM::VLD2LNd32Pseudo:
+    case ARM::VLD2LNq16Pseudo:
+    case ARM::VLD2LNq32Pseudo:
+    case ARM::VLD2LNd8Pseudo_UPD:
+    case ARM::VLD2LNd16Pseudo_UPD:
+    case ARM::VLD2LNd32Pseudo_UPD:
+    case ARM::VLD2LNq16Pseudo_UPD:
+    case ARM::VLD2LNq32Pseudo_UPD:
+    case ARM::VLD4LNd8Pseudo:
+    case ARM::VLD4LNd16Pseudo:
+    case ARM::VLD4LNd32Pseudo:
+    case ARM::VLD4LNq16Pseudo:
+    case ARM::VLD4LNq32Pseudo:
+    case ARM::VLD4LNd8Pseudo_UPD:
+    case ARM::VLD4LNd16Pseudo_UPD:
+    case ARM::VLD4LNd32Pseudo_UPD:
+    case ARM::VLD4LNq16Pseudo_UPD:
+    case ARM::VLD4LNq32Pseudo_UPD:
+      // If the address is not 64-bit aligned, the latencies of these
+      // instructions increases by one.
+      ++Latency;
+      break;
+    }
+
   return Latency;
 }
 
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 30095fe..96f0e76 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -291,8 +291,8 @@ public:
                                        int64_t &Offset1, int64_t &Offset2)const;
 
   /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to
-  /// determine (in conjuction with areLoadsFromSameBasePtr) if two loads should
-  /// be scheduled togther. On some targets if two loads are loading from
+  /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads
+  /// should be scheduled togther. On some targets if two loads are loading from
   /// addresses in the same cache line, it's better if they are scheduled
   /// together. This function takes two integers that represent the load offsets
   /// from the common base address. It returns true if it decides it's desirable
@@ -307,7 +307,7 @@ public:
                                     const MachineFunction &MF) const;
 
   virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB,
-                                   unsigned NumCyles, unsigned ExtraPredCycles,
+                                   unsigned NumCycles, unsigned ExtraPredCycles,
                                    float Prob, float Confidence) const;
 
   virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
@@ -317,10 +317,10 @@ public:
                                    float Probability, float Confidence) const;
 
   virtual bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
-                                         unsigned NumCyles,
+                                         unsigned NumCycles,
                                          float Probability,
                                          float Confidence) const {
-    return NumCyles == 1;
+    return NumCycles == 1;
   }
 
   /// AnalyzeCompare - For a comparison instruction, return the source register
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 1918fd9..2adcd2c 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -88,7 +88,7 @@ BitVector ARMBaseRegisterInfo::
 getReservedRegs(const MachineFunction &MF) const {
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
 
-  // FIXME: avoid re-calculating this everytime.
+  // FIXME: avoid re-calculating this every time.
   BitVector Reserved(getNumRegs());
   Reserved.set(ARM::SP);
   Reserved.set(ARM::PC);
@@ -342,6 +342,25 @@ ARMBaseRegisterInfo::canCombineSubRegIndices(const TargetRegisterClass *RC,
   return false;
 }
 
+const TargetRegisterClass*
+ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC)
+                                                                         const {
+  const TargetRegisterClass *Super = RC;
+  TargetRegisterClass::sc_iterator I = RC->superclasses_begin();
+  do {
+    switch (Super->getID()) {
+    case ARM::GPRRegClassID:
+    case ARM::SPRRegClassID:
+    case ARM::DPRRegClassID:
+    case ARM::QPRRegClassID:
+    case ARM::QQPRRegClassID:
+    case ARM::QQQQPRRegClassID:
+      return Super;
+    }
+    Super = *I++;
+  } while (Super);
+  return RC;
+}
 
 const TargetRegisterClass *
 ARMBaseRegisterInfo::getPointerRegClass(unsigned Kind) const {
@@ -368,12 +387,12 @@ ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
   }
 }
 
-/// getAllocationOrder - Returns the register allocation order for a specified
-/// register class in the form of a pair of TargetRegisterClass iterators.
-std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator>
-ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC,
-                                        unsigned HintType, unsigned HintReg,
-                                        const MachineFunction &MF) const {
+/// getRawAllocationOrder - Returns the register allocation order for a
+/// specified register class with a target-dependent hint.
+ArrayRef<unsigned>
+ARMBaseRegisterInfo::getRawAllocationOrder(const TargetRegisterClass *RC,
+                                           unsigned HintType, unsigned HintReg,
+                                           const MachineFunction &MF) const {
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
   // Alternative register allocation orders when favoring even / odd registers
   // of register pairs.
@@ -450,70 +469,54 @@ ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC,
 
   // We only support even/odd hints for GPR and rGPR.
   if (RC != ARM::GPRRegisterClass && RC != ARM::rGPRRegisterClass)
-    return std::make_pair(RC->allocation_order_begin(MF),
-                          RC->allocation_order_end(MF));
+    return RC->getRawAllocationOrder(MF);
 
   if (HintType == ARMRI::RegPairEven) {
     if (isPhysicalRegister(HintReg) && getRegisterPairEven(HintReg, MF) == 0)
       // It's no longer possible to fulfill this hint. Return the default
       // allocation order.
-      return std::make_pair(RC->allocation_order_begin(MF),
-                            RC->allocation_order_end(MF));
+      return RC->getRawAllocationOrder(MF);
 
     if (!TFI->hasFP(MF)) {
       if (!STI.isR9Reserved())
-        return std::make_pair(GPREven1,
-                              GPREven1 + (sizeof(GPREven1)/sizeof(unsigned)));
+        return ArrayRef<unsigned>(GPREven1);
       else
-        return std::make_pair(GPREven4,
-                              GPREven4 + (sizeof(GPREven4)/sizeof(unsigned)));
+        return ArrayRef<unsigned>(GPREven4);
     } else if (FramePtr == ARM::R7) {
       if (!STI.isR9Reserved())
-        return std::make_pair(GPREven2,
-                              GPREven2 + (sizeof(GPREven2)/sizeof(unsigned)));
+        return ArrayRef<unsigned>(GPREven2);
       else
-        return std::make_pair(GPREven5,
-                              GPREven5 + (sizeof(GPREven5)/sizeof(unsigned)));
+        return ArrayRef<unsigned>(GPREven5);
     } else { // FramePtr == ARM::R11
       if (!STI.isR9Reserved())
-        return std::make_pair(GPREven3,
-                              GPREven3 + (sizeof(GPREven3)/sizeof(unsigned)));
+        return ArrayRef<unsigned>(GPREven3);
       else
-        return std::make_pair(GPREven6,
-                              GPREven6 + (sizeof(GPREven6)/sizeof(unsigned)));
+        return ArrayRef<unsigned>(GPREven6);
     }
   } else if (HintType == ARMRI::RegPairOdd) {
     if (isPhysicalRegister(HintReg) && getRegisterPairOdd(HintReg, MF) == 0)
       // It's no longer possible to fulfill this hint. Return the default
       // allocation order.
-      return std::make_pair(RC->allocation_order_begin(MF),
-                            RC->allocation_order_end(MF));
+      return RC->getRawAllocationOrder(MF);
 
     if (!TFI->hasFP(MF)) {
       if (!STI.isR9Reserved())
-        return std::make_pair(GPROdd1,
-                              GPROdd1 + (sizeof(GPROdd1)/sizeof(unsigned)));
+        return ArrayRef<unsigned>(GPROdd1);
       else
-        return std::make_pair(GPROdd4,
-                              GPROdd4 + (sizeof(GPROdd4)/sizeof(unsigned)));
+        return ArrayRef<unsigned>(GPROdd4);
     } else if (FramePtr == ARM::R7) {
       if (!STI.isR9Reserved())
-        return std::make_pair(GPROdd2,
-                              GPROdd2 + (sizeof(GPROdd2)/sizeof(unsigned)));
+        return ArrayRef<unsigned>(GPROdd2);
       else
-        return std::make_pair(GPROdd5,
-                              GPROdd5 + (sizeof(GPROdd5)/sizeof(unsigned)));
+        return ArrayRef<unsigned>(GPROdd5);
     } else { // FramePtr == ARM::R11
       if (!STI.isR9Reserved())
-        return std::make_pair(GPROdd3,
-                              GPROdd3 + (sizeof(GPROdd3)/sizeof(unsigned)));
+        return ArrayRef<unsigned>(GPROdd3);
       else
-        return std::make_pair(GPROdd6,
-                              GPROdd6 + (sizeof(GPROdd6)/sizeof(unsigned)));
+        return ArrayRef<unsigned>(GPROdd6);
     }
   }
-  return std::make_pair(RC->allocation_order_begin(MF),
-                        RC->allocation_order_end(MF));
+  return RC->getRawAllocationOrder(MF);
 }
 
 /// ResolveRegAllocHint - Resolves the specified register allocation hint
@@ -554,6 +557,29 @@ ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
   }
 }
 
+bool
+ARMBaseRegisterInfo::avoidWriteAfterWrite(const TargetRegisterClass *RC) const {
+  // CortexA9 has a Write-after-write hazard for NEON registers.
+  if (!STI.isCortexA9())
+    return false;
+
+  switch (RC->getID()) {
+  case ARM::DPRRegClassID:
+  case ARM::DPR_8RegClassID:
+  case ARM::DPR_VFP2RegClassID:
+  case ARM::QPRRegClassID:
+  case ARM::QPR_8RegClassID:
+  case ARM::QPR_VFP2RegClassID:
+  case ARM::SPRRegClassID:
+  case ARM::SPR_8RegClassID:
+    // Avoid reusing S, D, and Q registers.
+    // Don't increase register pressure for QQ and QQQQ.
+    return true;
+  default:
+    return false;
+  }
+}
+
 bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -642,6 +668,10 @@ int ARMBaseRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   return ARMGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
 }
 
+int ARMBaseRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
+  return ARMGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
+}
+
 unsigned ARMBaseRegisterInfo::getRegisterPairEven(unsigned Reg,
                                               const MachineFunction &MF) const {
   switch (Reg) {
@@ -1069,8 +1099,11 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB,
   if (Ins != MBB->end())
     DL = Ins->getDebugLoc();
 
-  MachineInstrBuilder MIB =
-    BuildMI(*MBB, Ins, DL, TII.get(ADDriOpc), BaseReg)
+  const TargetInstrDesc &TID = TII.get(ADDriOpc);
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  MRI.constrainRegClass(BaseReg, TID.OpInfo[0].getRegClass(this));
+
+  MachineInstrBuilder MIB = BuildMI(*MBB, Ins, DL, TID, BaseReg)
     .addFrameIndex(FrameIdx).addImm(Offset);
 
   if (!AFI->isThumb1OnlyFunction())
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index 0507396..70b6f01 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -128,13 +128,15 @@ public:
 
   const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const;
 
+  const TargetRegisterClass*
+  getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
+
   unsigned getRegPressureLimit(const TargetRegisterClass *RC,
                                MachineFunction &MF) const;
 
-  std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator>
-  getAllocationOrder(const TargetRegisterClass *RC,
-                     unsigned HintType, unsigned HintReg,
-                     const MachineFunction &MF) const;
+  ArrayRef<unsigned> getRawAllocationOrder(const TargetRegisterClass *RC,
+                                           unsigned HintType, unsigned HintReg,
+                                           const MachineFunction &MF) const;
 
   unsigned ResolveRegAllocHint(unsigned Type, unsigned Reg,
                                const MachineFunction &MF) const;
@@ -142,6 +144,8 @@ public:
   void UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
                           MachineFunction &MF) const;
 
+  virtual bool avoidWriteAfterWrite(const TargetRegisterClass *RC) const;
+
   bool hasBasePointer(const MachineFunction &MF) const;
 
   bool canRealignStack(const MachineFunction &MF) const;
@@ -167,6 +171,7 @@ public:
   unsigned getEHHandlerRegister() const;
 
   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 
   bool isLowRegister(unsigned Reg) const;
 
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index 1e6b95e..d2981c0 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -23,7 +23,7 @@ class CCIfAlign<string Align, CCAction A>:
 def CC_ARM_APCS : CallingConv<[
 
   // Handles byval parameters.
-  CCIfByVal<CCPassByVal<8, 8>>,
+  CCIfByVal<CCPassByVal<4, 4>>,
     
   CCIfType<[i8, i16], CCPromoteToType<i32>>,
 
diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
index d381ce7..97bac88 100644
--- a/lib/Target/ARM/ARMCodeEmitter.cpp
+++ b/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -231,6 +231,9 @@ namespace {
       const { return 0; }
     unsigned getAddrMode6AddressOpValue(const MachineInstr &MI, unsigned Op)
       const { return 0; }
+    unsigned getAddrMode6OneLane32AddressOpValue(const MachineInstr &MI,
+                                                 unsigned Op)
+      const { return 0; }
     unsigned getAddrMode6DupAddressOpValue(const MachineInstr &MI, unsigned Op)
       const { return 0; }
     unsigned getAddrMode6OffsetOpValue(const MachineInstr &MI, unsigned Op)
@@ -239,6 +242,8 @@ namespace {
                                             unsigned Op) const { return 0; }
     unsigned getMsbOpValue(const MachineInstr &MI,
                            unsigned Op) const { return 0; }
+    unsigned getSsatBitPosValue(const MachineInstr &MI,
+                                unsigned Op) const { return 0; }
     uint32_t getLdStmModeOpValue(const MachineInstr &MI, unsigned OpIdx)
       const {return 0; }
     uint32_t getLdStSORegOpValue(const MachineInstr &MI, unsigned OpIdx)
@@ -1459,6 +1464,7 @@ void ARMCodeEmitter::emitMiscArithInstruction(const MachineInstr &MI) {
   // Set the conditional execution predicate
   Binary |= II->getPredicate(&MI) << ARMII::CondShift;
 
+  // PKH instructions are finished at this point
   if (TID.Opcode == ARM::PKHBT || TID.Opcode == ARM::PKHTB) {
     emitWordLE(Binary);
     return;
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index a14c952..b6b3c75 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -455,6 +455,10 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
   // Add an implicit def for the super-register.
   MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
   TransferImpOps(MI, MIB, MIB);
+
+  // Transfer memoperands.
+  MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
   MI.eraseFromParent();
 }
 
@@ -496,10 +500,13 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
   MIB.addOperand(MI.getOperand(OpIdx++));
   MIB.addOperand(MI.getOperand(OpIdx++));
 
-  if (SrcIsKill)
-    // Add an implicit kill for the super-reg.
-    (*MIB).addRegisterKilled(SrcReg, TRI, true);
+  if (SrcIsKill) // Add an implicit kill for the super-reg.
+    MIB->addRegisterKilled(SrcReg, TRI, true);
   TransferImpOps(MI, MIB, MIB);
+
+  // Transfer memoperands.
+  MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
   MI.eraseFromParent();
 }
 
@@ -622,9 +629,8 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
   MIB.addOperand(MI.getOperand(OpIdx++));
   MIB.addOperand(MI.getOperand(OpIdx++));
 
-  if (SrcIsKill)
-    // Add an implicit kill for the super-reg.
-    (*MIB).addRegisterKilled(SrcReg, TRI, true);
+  if (SrcIsKill)  // Add an implicit kill for the super-reg.
+    MIB->addRegisterKilled(SrcReg, TRI, true);
   TransferImpOps(MI, MIB, MIB);
   MI.eraseFromParent();
 }
@@ -655,8 +661,8 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
     unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal);
     LO16 = LO16.addImm(SOImmValV1);
     HI16 = HI16.addImm(SOImmValV2);
-    (*LO16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
-    (*HI16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+    LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+    HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
     LO16.addImm(Pred).addReg(PredReg).addReg(0);
     HI16.addImm(Pred).addReg(PredReg).addReg(0);
     TransferImpOps(MI, LO16, HI16);
@@ -692,8 +698,8 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
     HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16);
   }
 
-  (*LO16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
-  (*HI16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
   LO16.addImm(Pred).addReg(PredReg);
   HI16.addImm(Pred).addReg(PredReg);
 
@@ -856,7 +862,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
                 TII->get(ARM::BL))
         .addExternalSymbol("__aeabi_read_tp", 0);
 
-      (*MIB).setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
       TransferImpOps(MI, MIB, MIB);
       MI.eraseFromParent();
       return true;
@@ -871,7 +877,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
         AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
                                TII->get(NewLdOpc), DstReg)
                        .addOperand(MI.getOperand(1)));
-      (*MIB1).setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      MIB1->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
       MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
                                          TII->get(ARM::tPICADD))
         .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
@@ -927,7 +933,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
       if (isARM) {
         AddDefaultPred(MIB3);
         if (Opcode == ARM::MOV_ga_pcrel_ldr)
-          (*MIB2).setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+          MIB2->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
       }
       TransferImpOps(MI, MIB1, MIB3);
       MI.eraseFromParent();
@@ -1019,9 +1025,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
       unsigned D1 = TRI->getSubReg(SrcReg, ARM::dsub_1);
       MIB.addReg(D0).addReg(D1);
 
-      if (SrcIsKill)
-        // Add an implicit kill for the Q register.
-        (*MIB).addRegisterKilled(SrcReg, TRI, true);
+      if (SrcIsKill)      // Add an implicit kill for the Q register.
+        MIB->addRegisterKilled(SrcReg, TRI, true);
 
       TransferImpOps(MI, MIB, MIB);
       MI.eraseFromParent();
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 1e94ab6..5cf73c4 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -14,6 +14,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARM.h"
+#include "ARMAddressingModes.h"
 #include "ARMBaseInstrInfo.h"
 #include "ARMCallingConv.h"
 #include "ARMRegisterInfo.h"
@@ -26,6 +27,7 @@
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Module.h"
+#include "llvm/Operator.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FastISel.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
@@ -69,12 +71,10 @@ namespace {
     } Base;
 
     int Offset;
-    unsigned Scale;
-    unsigned PlusReg;
 
     // Innocuous defaults for our address.
     Address()
-     : BaseType(RegBase), Offset(0), Scale(0), PlusReg(0) {
+     : BaseType(RegBase), Offset(0) {
        Base.Reg = 0;
      }
   } Address;
@@ -136,6 +136,9 @@ class ARMFastISel : public FastISel {
     virtual unsigned FastEmitInst_i(unsigned MachineInstOpcode,
                                     const TargetRegisterClass *RC,
                                     uint64_t Imm);
+    virtual unsigned FastEmitInst_ii(unsigned MachineInstOpcode,
+                                     const TargetRegisterClass *RC,
+                                     uint64_t Imm1, uint64_t Imm2);
 
     virtual unsigned FastEmitInst_extractsubreg(MVT RetVT,
                                                 unsigned Op0, bool Op0IsKill,
@@ -164,6 +167,7 @@ class ARMFastISel : public FastISel {
     bool SelectCall(const Instruction *I);
     bool SelectSelect(const Instruction *I);
     bool SelectRet(const Instruction *I);
+    bool SelectIntCast(const Instruction *I);
 
     // Utility routines.
   private:
@@ -203,7 +207,8 @@ class ARMFastISel : public FastISel {
     bool DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR);
     const MachineInstrBuilder &AddOptionalDefs(const MachineInstrBuilder &MIB);
     void AddLoadStoreOperands(EVT VT, Address &Addr,
-                              const MachineInstrBuilder &MIB);
+                              const MachineInstrBuilder &MIB,
+                              unsigned Flags);
 };
 
 } // end anonymous namespace
@@ -230,16 +235,16 @@ bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) {
 
 bool ARMFastISel::isARMNEONPred(const MachineInstr *MI) {
   const TargetInstrDesc &TID = MI->getDesc();
-  
+
   // If we're a thumb2 or not NEON function we were handled via isPredicable.
   if ((TID.TSFlags & ARMII::DomainMask) != ARMII::DomainNEON ||
        AFI->isThumb2Function())
     return false;
-  
+
   for (unsigned i = 0, e = TID.getNumOperands(); i != e; ++i)
     if (TID.OpInfo[i].isPredicate())
       return true;
-  
+
   return false;
 }
 
@@ -257,7 +262,7 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) {
   // we're not predicable but add it anyways.
   if (TII.isPredicable(MI) || isARMNEONPred(MI))
     AddDefaultPred(MIB);
-  
+
   // Do we optionally set a predicate?  Preds is size > 0 iff the predicate
   // defines CPSR. All other OptionalDefines in ARM are the CCR register.
   bool CPSR = false;
@@ -433,6 +438,26 @@ unsigned ARMFastISel::FastEmitInst_i(unsigned MachineInstOpcode,
   return ResultReg;
 }
 
+unsigned ARMFastISel::FastEmitInst_ii(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass *RC,
+                                      uint64_t Imm1, uint64_t Imm2) {
+  unsigned ResultReg = createResultReg(RC);
+  const TargetInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1)
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+                    .addImm(Imm1).addImm(Imm2));
+  else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+                    .addImm(Imm1).addImm(Imm2));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                            TII.get(TargetOpcode::COPY),
+                            ResultReg)
+                    .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
 unsigned ARMFastISel::FastEmitInst_extractsubreg(MVT RetVT,
                                                  unsigned Op0, bool Op0IsKill,
                                                  uint32_t Idx) {
@@ -552,9 +577,6 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) {
 
   Reloc::Model RelocM = TM.getRelocationModel();
 
-  // TODO: No external globals for now.
-  if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) return 0;
-
   // TODO: Need more magic for ARM PIC.
   if (!isThumb && (RelocM == Reloc::PIC_)) return 0;
 
@@ -589,6 +611,23 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) {
           .addImm(0);
   }
   AddOptionalDefs(MIB);
+
+  if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) {
+    unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
+    if (isThumb)
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::t2LDRi12),
+                    NewDestReg)
+            .addReg(DestReg)
+            .addImm(0);
+    else
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRi12),
+                    NewDestReg)
+            .addReg(DestReg)
+            .addImm(0);
+    DestReg = NewDestReg;
+    AddOptionalDefs(MIB);
+  }
+
   return DestReg;
 }
 
@@ -727,7 +766,7 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
                  FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()]
                  == FuncInfo.MBB) &&
                 isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) {
-              // An add (in the same block) with a constant operand. Fold the 
+              // An add (in the same block) with a constant operand. Fold the
               // constant.
               ConstantInt *CI =
               cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
@@ -735,7 +774,7 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
               // Iterate on the other operand.
               Op = cast<AddOperator>(Op)->getOperand(0);
               continue;
-            } 
+            }
             // Unsupported
             goto unsupported_gep;
           }
@@ -821,37 +860,21 @@ void ARMFastISel::ARMSimplifyAddress(Address &Addr, EVT VT) {
   // Since the offset is too large for the load/store instruction
   // get the reg+offset into a register.
   if (needsLowering) {
-    ARMCC::CondCodes Pred = ARMCC::AL;
-    unsigned PredReg = 0;
-
-    TargetRegisterClass *RC = isThumb ? ARM::tGPRRegisterClass :
-      ARM::GPRRegisterClass;
-    unsigned BaseReg = createResultReg(RC);
-
-    if (!isThumb)
-      emitARMRegPlusImmediate(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                              BaseReg, Addr.Base.Reg, Addr.Offset,
-                              Pred, PredReg,
-                              static_cast<const ARMBaseInstrInfo&>(TII));
-    else {
-      assert(AFI->isThumb2Function());
-      emitT2RegPlusImmediate(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                             BaseReg, Addr.Base.Reg, Addr.Offset, Pred, PredReg,
-                             static_cast<const ARMBaseInstrInfo&>(TII));
-    }
+    Addr.Base.Reg = FastEmit_ri_(MVT::i32, ISD::ADD, Addr.Base.Reg,
+                                 /*Op0IsKill*/false, Addr.Offset, MVT::i32);
     Addr.Offset = 0;
-    Addr.Base.Reg = BaseReg;
   }
 }
 
 void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr,
-                                       const MachineInstrBuilder &MIB) {
+                                       const MachineInstrBuilder &MIB,
+                                       unsigned Flags) {
   // addrmode5 output depends on the selection dag addressing dividing the
   // offset by 4 that it then later multiplies. Do this here as well.
   if (VT.getSimpleVT().SimpleTy == MVT::f32 ||
       VT.getSimpleVT().SimpleTy == MVT::f64)
     Addr.Offset /= 4;
-    
+
   // Frame base works a bit differently. Handle it separately.
   if (Addr.BaseType == Address::FrameIndexBase) {
     int FI = Addr.Base.FI;
@@ -859,7 +882,7 @@ void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr,
     MachineMemOperand *MMO =
           FuncInfo.MF->getMachineMemOperand(
                                   MachinePointerInfo::getFixedStack(FI, Offset),
-                                  MachineMemOperand::MOLoad,
+                                  Flags,
                                   MFI.getObjectSize(FI),
                                   MFI.getObjectAlignment(FI));
     // Now add the rest of the operands.
@@ -873,7 +896,7 @@ void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr,
   } else {
     // Now add the rest of the operands.
     MIB.addReg(Addr.Base.Reg);
-  
+
     // ARM halfword load/stores need an additional operand.
     if (!isThumb && VT.getSimpleVT().SimpleTy == MVT::i16) MIB.addReg(0);
 
@@ -918,7 +941,7 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr) {
   ResultReg = createResultReg(RC);
   MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                                     TII.get(Opc), ResultReg);
-  AddLoadStoreOperands(VT, Addr, MIB);
+  AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOLoad);
   return true;
 }
 
@@ -977,7 +1000,7 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr) {
   MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                                     TII.get(StrOpc))
                             .addReg(SrcReg, getKillRegState(true));
-  AddLoadStoreOperands(VT, Addr, MIB);
+  AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOStore);
   return true;
 }
 
@@ -1061,18 +1084,16 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
   // behavior.
   // TODO: Factor this out.
   if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
-    if (CI->hasOneUse() && (CI->getParent() == I->getParent())) {
-      MVT VT;
-      const Type *Ty = CI->getOperand(0)->getType();
-      if (!isTypeLegal(Ty, VT))
-        return false;
-
+    MVT SourceVT;
+    const Type *Ty = CI->getOperand(0)->getType();
+    if (CI->hasOneUse() && (CI->getParent() == I->getParent())
+        && isTypeLegal(Ty, SourceVT)) {
       bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy());
       if (isFloat && !Subtarget->hasVFP2())
         return false;
 
       unsigned CmpOpc;
-      switch (VT.SimpleTy) {
+      switch (SourceVT.SimpleTy) {
         default: return false;
         // TODO: Verify compares.
         case MVT::f32:
@@ -1087,7 +1108,14 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
       }
 
       // Get the compare predicate.
-      ARMCC::CondCodes ARMPred = getComparePred(CI->getPredicate());
+      // Try to take advantage of fallthrough opportunities.
+      CmpInst::Predicate Predicate = CI->getPredicate();
+      if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+        std::swap(TBB, FBB);
+        Predicate = CmpInst::getInversePredicate(Predicate);
+      }
+
+      ARMCC::CondCodes ARMPred = getComparePred(Predicate);
 
       // We may not handle every CC for now.
       if (ARMPred == ARMCC::AL) return false;
@@ -1115,19 +1143,55 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
       FuncInfo.MBB->addSuccessor(TBB);
       return true;
     }
+  } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
+    MVT SourceVT;
+    if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
+        (isLoadTypeLegal(TI->getOperand(0)->getType(), SourceVT))) {
+      unsigned TstOpc = isThumb ? ARM::t2TSTri : ARM::TSTri;
+      unsigned OpReg = getRegForValue(TI->getOperand(0));
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                              TII.get(TstOpc))
+                      .addReg(OpReg).addImm(1));
+
+      unsigned CCMode = ARMCC::NE;
+      if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+        std::swap(TBB, FBB);
+        CCMode = ARMCC::EQ;
+      }
+
+      unsigned BrOpc = isThumb ? ARM::t2Bcc : ARM::Bcc;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc))
+      .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR);
+
+      FastEmitBranch(FBB, DL);
+      FuncInfo.MBB->addSuccessor(TBB);
+      return true;
+    }
   }
 
   unsigned CmpReg = getRegForValue(BI->getCondition());
   if (CmpReg == 0) return false;
 
-  // Re-set the flags just in case.
-  unsigned CmpOpc = isThumb ? ARM::t2CMPri : ARM::CMPri;
-  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc))
-                  .addReg(CmpReg).addImm(0));
+  // We've been divorced from our compare!  Our block was split, and
+  // now our compare lives in a predecessor block.  We musn't
+  // re-compare here, as the children of the compare aren't guaranteed
+  // live across the block boundary (we *could* check for this).
+  // Regardless, the compare has been done in the predecessor block,
+  // and it left a value for us in a virtual register.  Ergo, we test
+  // the one-bit value left in the virtual register.
+  unsigned TstOpc = isThumb ? ARM::t2TSTri : ARM::TSTri;
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TstOpc))
+                  .addReg(CmpReg).addImm(1));
+
+  unsigned CCMode = ARMCC::NE;
+  if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+    std::swap(TBB, FBB);
+    CCMode = ARMCC::EQ;
+  }
 
   unsigned BrOpc = isThumb ? ARM::t2Bcc : ARM::Bcc;
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc))
-                  .addMBB(TBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+                  .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR);
   FastEmitBranch(FBB, DL);
   FuncInfo.MBB->addSuccessor(TBB);
   return true;
@@ -1249,6 +1313,10 @@ bool ARMFastISel::SelectSIToFP(const Instruction *I) {
   if (!isTypeLegal(Ty, DstVT))
     return false;
 
+  // FIXME: Handle sign-extension where necessary.
+  if (!I->getOperand(0)->getType()->isIntegerTy(32))
+    return false;
+
   unsigned Op = getRegForValue(I->getOperand(0));
   if (Op == 0) return false;
 
@@ -1474,7 +1542,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
                                   CallingConv::ID CC,
                                   unsigned &NumBytes) {
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CC, false, TM, ArgLocs, *Context);
+  CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context);
   CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC, false));
 
   // Get a count of how many bytes are to be pushed on the stack.
@@ -1587,7 +1655,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
   // Now the return value.
   if (RetVT != MVT::isVoid) {
     SmallVector<CCValAssign, 16> RVLocs;
-    CCState CCInfo(CC, false, TM, RVLocs, *Context);
+    CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context);
     CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true));
 
     // Copy all of the result registers out of their specified physreg.
@@ -1643,7 +1711,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
 
     // Analyze operands of the call, assigning locations to each operand.
     SmallVector<CCValAssign, 16> ValLocs;
-    CCState CCInfo(CC, F.isVarArg(), TM, ValLocs, I->getContext());
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, I->getContext());
     CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */));
 
     const Value *RV = Ret->getOperand(0);
@@ -1717,9 +1785,6 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
   else if (!isTypeLegal(RetTy, RetVT))
     return false;
 
-  // For now we're using BLX etc on the assumption that we have v5t ops.
-  if (!Subtarget->hasV5TOps()) return false;
-
   // TODO: For now if we have long calls specified we don't handle the call.
   if (EnableARMLongCalls) return false;
 
@@ -1757,7 +1822,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
   if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes))
     return false;
 
-  // Issue the call, BLXr9 for darwin, BLX otherwise. This uses V5 ops.
+  // Issue the call, BLr9 for darwin, BL otherwise.
   // TODO: Turn this into the table of arm call ops.
   MachineInstrBuilder MIB;
   unsigned CallOpc = ARMSelectCallOp(NULL);
@@ -1793,9 +1858,9 @@ bool ARMFastISel::SelectCall(const Instruction *I) {
   // Can't handle inline asm or worry about intrinsics yet.
   if (isa<InlineAsm>(Callee) || isa<IntrinsicInst>(CI)) return false;
 
-  // Only handle global variable Callees that are direct calls.
+  // Only handle global variable Callees.
   const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
-  if (!GV || Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel()))
+  if (!GV)
     return false;
 
   // Check the calling convention.
@@ -1818,13 +1883,9 @@ bool ARMFastISel::SelectCall(const Instruction *I) {
   else if (!isTypeLegal(RetTy, RetVT))
     return false;
 
-  // For now we're using BLX etc on the assumption that we have v5t ops.
-  // TODO: Maybe?
-  if (!Subtarget->hasV5TOps()) return false;
-
   // TODO: For now if we have long calls specified we don't handle the call.
   if (EnableARMLongCalls) return false;
-  
+
   // Set up the argument vectors.
   SmallVector<Value*, 8> Args;
   SmallVector<unsigned, 8> ArgRegs;
@@ -1873,7 +1934,7 @@ bool ARMFastISel::SelectCall(const Instruction *I) {
   if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes))
     return false;
 
-  // Issue the call, BLXr9 for darwin, BLX otherwise. This uses V5 ops.
+  // Issue the call, BLr9 for darwin, BL otherwise.
   // TODO: Turn this into the table of arm call ops.
   MachineInstrBuilder MIB;
   unsigned CallOpc = ARMSelectCallOp(GV);
@@ -1888,7 +1949,7 @@ bool ARMFastISel::SelectCall(const Instruction *I) {
     MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                          TII.get(CallOpc))
           .addGlobalAddress(GV, 0, 0));
-  
+
   // Add implicit physical register uses to the call.
   for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
     MIB.addReg(RegArgs[i]);
@@ -1904,6 +1965,79 @@ bool ARMFastISel::SelectCall(const Instruction *I) {
 
 }
 
+bool ARMFastISel::SelectIntCast(const Instruction *I) {
+  // On ARM, in general, integer casts don't involve legal types; this code
+  // handles promotable integers.  The high bits for a type smaller than
+  // the register size are assumed to be undefined.
+  const Type *DestTy = I->getType();
+  Value *Op = I->getOperand(0);
+  const Type *SrcTy = Op->getType();
+
+  EVT SrcVT, DestVT;
+  SrcVT = TLI.getValueType(SrcTy, true);
+  DestVT = TLI.getValueType(DestTy, true);
+
+  if (isa<TruncInst>(I)) {
+    if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8)
+      return false;
+    if (DestVT != MVT::i16 && DestVT != MVT::i8 && DestVT != MVT::i1)
+      return false;
+
+    unsigned SrcReg = getRegForValue(Op);
+    if (!SrcReg) return false;
+
+    // Because the high bits are undefined, a truncate doesn't generate
+    // any code.
+    UpdateValueMap(I, SrcReg);
+    return true;
+  }
+  if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8)
+    return false;
+
+  unsigned Opc;
+  bool isZext = isa<ZExtInst>(I);
+  bool isBoolZext = false;
+  if (!SrcVT.isSimple())
+    return false;
+  switch (SrcVT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i16:
+    if (isZext)
+      Opc = isThumb ? ARM::t2UXTHr : ARM::UXTHr;
+    else
+      Opc = isThumb ? ARM::t2SXTHr : ARM::SXTHr;
+    break;
+  case MVT::i8:
+    if (isZext)
+      Opc = isThumb ? ARM::t2UXTBr : ARM::UXTBr;
+    else
+      Opc = isThumb ? ARM::t2SXTBr : ARM::SXTBr;
+    break;
+  case MVT::i1:
+    if (isZext) {
+      Opc = isThumb ? ARM::t2ANDri : ARM::ANDri;
+      isBoolZext = true;
+      break;
+    }
+    return false;
+  }
+
+  // FIXME: We could save an instruction in many cases by special-casing
+  // load instructions.
+  unsigned SrcReg = getRegForValue(Op);
+  if (!SrcReg) return false;
+
+  unsigned DestReg = createResultReg(TLI.getRegClassFor(MVT::i32));
+  MachineInstrBuilder MIB;
+  MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+        .addReg(SrcReg);
+  if (isBoolZext)
+    MIB.addImm(1);
+  AddOptionalDefs(MIB);
+  UpdateValueMap(I, DestReg);
+  return true;
+}
+
 // TODO: SoftFP support.
 bool ARMFastISel::TargetSelectInstruction(const Instruction *I) {
 
@@ -1941,6 +2075,10 @@ bool ARMFastISel::TargetSelectInstruction(const Instruction *I) {
       return SelectSelect(I);
     case Instruction::Ret:
       return SelectRet(I);
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      return SelectIntCast(I);
     default: break;
   }
   return false;
diff --git a/lib/Target/ARM/ARMFixupKinds.h b/lib/Target/ARM/ARMFixupKinds.h
index 3d175e3..350c92d 100644
--- a/lib/Target/ARM/ARMFixupKinds.h
+++ b/lib/Target/ARM/ARMFixupKinds.h
@@ -56,7 +56,7 @@ enum Fixups {
   // fixup_arm_thumb_br - 12-bit fixup for Thumb B instructions.
   fixup_arm_thumb_br,
 
-  // fixup_arm_thumb_blx - Fixup for Thumb BL instructions.
+  // fixup_arm_thumb_bl - Fixup for Thumb BL instructions.
   fixup_arm_thumb_bl,
 
   // fixup_arm_thumb_blx - Fixup for Thumb BLX instructions.
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index c99259d..4ef2666 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -427,6 +427,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
 
     // Delete the pseudo instruction TCRETURN.
     MBB.erase(MBBI);
+    MBBI = NewMI;
   }
 
   if (VARegSaveSize)
@@ -445,8 +446,7 @@ ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
 
 int
 ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
-                                             int FI,
-                                             unsigned &FrameReg,
+                                             int FI, unsigned &FrameReg,
                                              int SPAdj) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   const ARMBaseRegisterInfo *RegInfo =
@@ -490,19 +490,23 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
       return FPOffset;
     } else if (MFI->hasVarSizedObjects()) {
       assert(RegInfo->hasBasePointer(MF) && "missing base pointer!");
-      // Try to use the frame pointer if we can, else use the base pointer
-      // since it's available. This is handy for the emergency spill slot, in
-      // particular.
       if (AFI->isThumb2Function()) {
+        // Try to use the frame pointer if we can, else use the base pointer
+        // since it's available. This is handy for the emergency spill slot, in
+        // particular.
         if (FPOffset >= -255 && FPOffset < 0) {
           FrameReg = RegInfo->getFrameRegister(MF);
           return FPOffset;
         }
-      } else
-        FrameReg = RegInfo->getBaseRegister();
+      }
     } else if (AFI->isThumb2Function()) {
+      // Use  add <rd>, sp, #<imm8> 
+      //      ldr <rd>, [sp, #<imm8>]
+      // if at all possible to save space.
+      if (Offset >= 0 && (Offset & 3) == 0 && Offset <= 1020)
+        return Offset;
       // In Thumb2 mode, the negative offset is very limited. Try to avoid
-      // out of range references.
+      // out of range references. ldr <rt>,[<rn>, #-<imm8>]
       if (FPOffset >= -255 && FPOffset < 0) {
         FrameReg = RegInfo->getFrameRegister(MF);
         return FPOffset;
@@ -838,9 +842,14 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
     if (AFI->getVarArgsRegSaveSize() > 0)
       MF.getRegInfo().setPhysRegUsed(ARM::LR);
 
-    // Spill R4 if Thumb1 epilogue has to restore SP from FP since 
+    // Spill R4 if Thumb1 epilogue has to restore SP from FP. We don't know
+    // for sure what the stack size will be, but for this, an estimate is good
+    // enough. If there anything changes it, it'll be a spill, which implies
+    // we've used all the registers and so R4 is already used, so not marking
+    // it here will be OK.
     // FIXME: It will be better just to find spare register here.
-    if (MFI->hasVarSizedObjects())
+    unsigned StackSize = estimateStackSize(MF);
+    if (MFI->hasVarSizedObjects() || StackSize > 508)
       MF.getRegInfo().setPhysRegUsed(ARM::R4);
   }
 
diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h
index a7b7f15..61bb8af 100644
--- a/lib/Target/ARM/ARMFrameLowering.h
+++ b/lib/Target/ARM/ARMFrameLowering.h
@@ -51,7 +51,8 @@ public:
   bool canSimplifyCallFramePseudos(const MachineFunction &MF) const;
   int getFrameIndexReference(const MachineFunction &MF, int FI,
                              unsigned &FrameReg) const;
-  int ResolveFrameIndexReference(const MachineFunction &MF, int FI,
+  int ResolveFrameIndexReference(const MachineFunction &MF,
+                                 int FI,
                                  unsigned &FrameReg, int SPAdj) const;
   int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
 
diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp
index e97ce50..517bba8 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -49,6 +49,8 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
       const TargetInstrDesc &LastTID = LastMI->getDesc();
       // Skip over one non-VFP / NEON instruction.
       if (!LastTID.isBarrier() &&
+          // On A9, AGU and NEON/FPU are muxed.
+          !(STI.isCortexA9() && (LastTID.mayLoad() || LastTID.mayStore())) &&
           (LastTID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) {
         MachineBasicBlock::iterator I = LastMI;
         if (I != LastMI->getParent()->begin()) {
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 1201d91..6f57a04 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -45,7 +45,7 @@ DisableShifterOp("disable-shifter-op", cl::Hidden,
 static cl::opt<bool>
 CheckVMLxHazard("check-vmlx-hazard", cl::Hidden,
   cl::desc("Check fp vmla / vmls hazard at isel time"),
-  cl::init(false));
+  cl::init(true));
 
 //===--------------------------------------------------------------------===//
 /// ARMDAGToDAGISel - ARM specific code to select ARM machine
@@ -179,16 +179,6 @@ public:
     return ARM_AM::getT2SOImmVal(~Imm) != -1;
   }
 
-  inline bool Pred_so_imm(SDNode *inN) const {
-    ConstantSDNode *N = cast<ConstantSDNode>(inN);
-    return is_so_imm(N->getZExtValue());
-  }
-
-  inline bool Pred_t2_so_imm(SDNode *inN) const {
-    ConstantSDNode *N = cast<ConstantSDNode>(inN);
-    return is_t2_so_imm(N->getZExtValue());
-  }
-
   // Include the pieces autogenerated from the target description.
 #include "ARMGenDAGISel.inc"
 
@@ -1364,30 +1354,34 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
 ///
 SDNode *ARMDAGToDAGISel::PairSRegs(EVT VT, SDValue V0, SDValue V1) {
   DebugLoc dl = V0.getNode()->getDebugLoc();
+  SDValue RegClass =
+    CurDAG->getTargetConstant(ARM::DPR_VFP2RegClassID, MVT::i32);
   SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32);
   SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, MVT::i32);
-  const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 };
-  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 5);
 }
 
 /// PairDRegs - Form a quad register from a pair of D registers.
 ///
 SDNode *ARMDAGToDAGISel::PairDRegs(EVT VT, SDValue V0, SDValue V1) {
   DebugLoc dl = V0.getNode()->getDebugLoc();
+  SDValue RegClass = CurDAG->getTargetConstant(ARM::QPRRegClassID, MVT::i32);
   SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32);
   SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32);
-  const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 };
-  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 5);
 }
 
 /// PairQRegs - Form 4 consecutive D registers from a pair of Q registers.
 ///
 SDNode *ARMDAGToDAGISel::PairQRegs(EVT VT, SDValue V0, SDValue V1) {
   DebugLoc dl = V0.getNode()->getDebugLoc();
+  SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, MVT::i32);
   SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, MVT::i32);
   SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, MVT::i32);
-  const SDValue Ops[] = { V0, SubReg0, V1, SubReg1 };
-  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 4);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 5);
 }
 
 /// QuadSRegs - Form 4 consecutive S registers.
@@ -1395,12 +1389,15 @@ SDNode *ARMDAGToDAGISel::PairQRegs(EVT VT, SDValue V0, SDValue V1) {
 SDNode *ARMDAGToDAGISel::QuadSRegs(EVT VT, SDValue V0, SDValue V1,
                                    SDValue V2, SDValue V3) {
   DebugLoc dl = V0.getNode()->getDebugLoc();
+  SDValue RegClass =
+    CurDAG->getTargetConstant(ARM::QPR_VFP2RegClassID, MVT::i32);
   SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32);
   SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, MVT::i32);
   SDValue SubReg2 = CurDAG->getTargetConstant(ARM::ssub_2, MVT::i32);
   SDValue SubReg3 = CurDAG->getTargetConstant(ARM::ssub_3, MVT::i32);
-  const SDValue Ops[] = { V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3 };
-  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 8);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1,
+                                    V2, SubReg2, V3, SubReg3 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 9);
 }
 
 /// QuadDRegs - Form 4 consecutive D registers.
@@ -1408,12 +1405,14 @@ SDNode *ARMDAGToDAGISel::QuadSRegs(EVT VT, SDValue V0, SDValue V1,
 SDNode *ARMDAGToDAGISel::QuadDRegs(EVT VT, SDValue V0, SDValue V1,
                                    SDValue V2, SDValue V3) {
   DebugLoc dl = V0.getNode()->getDebugLoc();
+  SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, MVT::i32);
   SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32);
   SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32);
   SDValue SubReg2 = CurDAG->getTargetConstant(ARM::dsub_2, MVT::i32);
   SDValue SubReg3 = CurDAG->getTargetConstant(ARM::dsub_3, MVT::i32);
-  const SDValue Ops[] = { V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3 };
-  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 8);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1,
+                                    V2, SubReg2, V3, SubReg3 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 9);
 }
 
 /// QuadQRegs - Form 4 consecutive Q registers.
@@ -1421,12 +1420,14 @@ SDNode *ARMDAGToDAGISel::QuadDRegs(EVT VT, SDValue V0, SDValue V1,
 SDNode *ARMDAGToDAGISel::QuadQRegs(EVT VT, SDValue V0, SDValue V1,
                                    SDValue V2, SDValue V3) {
   DebugLoc dl = V0.getNode()->getDebugLoc();
+  SDValue RegClass = CurDAG->getTargetConstant(ARM::QQQQPRRegClassID, MVT::i32);
   SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, MVT::i32);
   SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, MVT::i32);
   SDValue SubReg2 = CurDAG->getTargetConstant(ARM::qsub_2, MVT::i32);
   SDValue SubReg3 = CurDAG->getTargetConstant(ARM::qsub_3, MVT::i32);
-  const SDValue Ops[] = { V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3 };
-  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 8);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1,
+                                    V2, SubReg2, V3, SubReg3 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 9);
 }
 
 /// GetVLDSTAlign - Get the alignment (in bytes) for the alignment operand
@@ -1553,6 +1554,11 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
                                  Ops.data(), Ops.size());
   }
 
+  // Transfer memoperands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1);
+
   if (NumVecs == 1)
     return VLd;
 
@@ -1582,6 +1588,9 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
   if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
     return NULL;
 
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+
   SDValue Chain = N->getOperand(0);
   EVT VT = N->getOperand(Vec0Idx).getValueType();
   bool is64BitVector = VT.is64BitVector();
@@ -1654,7 +1663,13 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
     Ops.push_back(Pred);
     Ops.push_back(Reg0);
     Ops.push_back(Chain);
-    return CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size());
+    SDNode *VSt =
+      CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size());
+
+    // Transfer memoperands.
+    cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1);
+
+    return VSt;
   }
 
   // Otherwise, quad registers are stored with two separate instructions,
@@ -1675,6 +1690,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
   SDNode *VStA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl,
                                         MemAddr.getValueType(),
                                         MVT::Other, OpsA, 7);
+  cast<MachineSDNode>(VStA)->setMemRefs(MemOp, MemOp + 1);
   Chain = SDValue(VStA, 1);
 
   // Store the odd D registers.
@@ -1691,8 +1707,10 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
   Ops.push_back(Pred);
   Ops.push_back(Reg0);
   Ops.push_back(Chain);
-  return CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys,
-                                Ops.data(), Ops.size());
+  SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys,
+                                        Ops.data(), Ops.size());
+  cast<MachineSDNode>(VStB)->setMemRefs(MemOp, MemOp + 1);
+  return VStB;
 }
 
 SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
@@ -1708,6 +1726,9 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
   if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
     return NULL;
 
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+
   SDValue Chain = N->getOperand(0);
   unsigned Lane =
     cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue();
@@ -1794,6 +1815,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
                                   QOpcodes[OpcodeIndex]);
   SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys,
                                          Ops.data(), Ops.size());
+  cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1);
   if (!IsLoad)
     return VLdLn;
 
@@ -1820,6 +1842,9 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
   if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align))
     return NULL;
 
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+
   SDValue Chain = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
@@ -1864,12 +1889,13 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
 
   unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
   std::vector<EVT> ResTys;
-  ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts));
+  ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,ResTyElts));
   if (isUpdating)
     ResTys.push_back(MVT::i32);
   ResTys.push_back(MVT::Other);
   SDNode *VLdDup =
     CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size());
+  cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1);
   SuperReg = SDValue(VLdDup, 0);
 
   // Extract the subregisters.
@@ -2676,6 +2702,111 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     default:
       break;
 
+    case Intrinsic::arm_ldrexd: {
+      SDValue MemAddr = N->getOperand(2);
+      DebugLoc dl = N->getDebugLoc();
+      SDValue Chain = N->getOperand(0);
+
+      unsigned NewOpc = ARM::LDREXD;
+      if (Subtarget->isThumb() && Subtarget->hasThumb2())
+        NewOpc = ARM::t2LDREXD;
+
+      // arm_ldrexd returns a i64 value in {i32, i32}
+      std::vector<EVT> ResTys;
+      ResTys.push_back(MVT::i32);
+      ResTys.push_back(MVT::i32);
+      ResTys.push_back(MVT::Other);
+
+      // place arguments in the right order
+      SmallVector<SDValue, 7> Ops;
+      Ops.push_back(MemAddr);
+      Ops.push_back(getAL(CurDAG));
+      Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+      Ops.push_back(Chain);
+      SDNode *Ld = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops.data(),
+                                          Ops.size());
+      // Transfer memoperands.
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+      cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
+
+      // Until there's support for specifing explicit register constraints
+      // like the use of even/odd register pair, hardcode ldrexd to always
+      // use the pair [R0, R1] to hold the load result.
+      Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ARM::R0,
+                                   SDValue(Ld, 0), SDValue(0,0));
+      Chain = CurDAG->getCopyToReg(Chain, dl, ARM::R1,
+                                   SDValue(Ld, 1), Chain.getValue(1));
+
+      // Remap uses.
+      SDValue Glue = Chain.getValue(1);
+      if (!SDValue(N, 0).use_empty()) {
+        SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                                ARM::R0, MVT::i32, Glue);
+        Glue = Result.getValue(2);
+        ReplaceUses(SDValue(N, 0), Result);
+      }
+      if (!SDValue(N, 1).use_empty()) {
+        SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                                ARM::R1, MVT::i32, Glue);
+        Glue = Result.getValue(2);
+        ReplaceUses(SDValue(N, 1), Result);
+      }
+
+      ReplaceUses(SDValue(N, 2), SDValue(Ld, 2));
+      return NULL;
+    }
+
+    case Intrinsic::arm_strexd: {
+      DebugLoc dl = N->getDebugLoc();
+      SDValue Chain = N->getOperand(0);
+      SDValue Val0 = N->getOperand(2);
+      SDValue Val1 = N->getOperand(3);
+      SDValue MemAddr = N->getOperand(4);
+
+      // Until there's support for specifing explicit register constraints
+      // like the use of even/odd register pair, hardcode strexd to always
+      // use the pair [R2, R3] to hold the i64 (i32, i32) value to be stored.
+      Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ARM::R2, Val0,
+                                   SDValue(0, 0));
+      Chain = CurDAG->getCopyToReg(Chain, dl, ARM::R3, Val1, Chain.getValue(1));
+
+      SDValue Glue = Chain.getValue(1);
+      Val0 = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                    ARM::R2, MVT::i32, Glue);
+      Glue = Val0.getValue(1);
+      Val1 = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                    ARM::R3, MVT::i32, Glue);
+
+      // Store exclusive double return a i32 value which is the return status
+      // of the issued store.
+      std::vector<EVT> ResTys;
+      ResTys.push_back(MVT::i32);
+      ResTys.push_back(MVT::Other);
+
+      // place arguments in the right order
+      SmallVector<SDValue, 7> Ops;
+      Ops.push_back(Val0);
+      Ops.push_back(Val1);
+      Ops.push_back(MemAddr);
+      Ops.push_back(getAL(CurDAG));
+      Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+      Ops.push_back(Chain);
+
+      unsigned NewOpc = ARM::STREXD;
+      if (Subtarget->isThumb() && Subtarget->hasThumb2())
+        NewOpc = ARM::t2STREXD;
+
+      SDNode *St = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops.data(),
+                                          Ops.size());
+      // Transfer memoperands.
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+      cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+      return St;
+    }
+
     case Intrinsic::arm_neon_vld1: {
       unsigned DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16,
                               ARM::VLD1d32, ARM::VLD1d64 };
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 7c456ed..7c44c10 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -72,10 +72,25 @@ ARMInterworking("arm-interworking", cl::Hidden,
   cl::desc("Enable / disable ARM interworking (for debugging only)"),
   cl::init(true));
 
-static cl::opt<std::string>
-TrapFuncName("arm-trap-func", cl::Hidden,
-  cl::desc("Emit a call to trap function rather than a trap instruction"),
-  cl::init(""));
+namespace llvm {
+  class ARMCCState : public CCState {
+  public:
+    ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+               const TargetMachine &TM, SmallVector<CCValAssign, 16> &locs,
+               LLVMContext &C, ParmContext PC)
+        : CCState(CC, isVarArg, MF, TM, locs, C) {
+      assert(((PC == Call) || (PC == Prologue)) &&
+             "ARMCCState users must specify whether their context is call"
+             "or prologue generation.");
+      CallOrPrologue = PC;
+    }
+  };
+}
+
+// The APCS parameter registers.
+static const unsigned GPRArgRegs[] = {
+  ARM::R0, ARM::R1, ARM::R2, ARM::R3
+};
 
 void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT,
                                        EVT PromotedBitwiseVT) {
@@ -396,11 +411,12 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setLibcallCallingConv(RTLIB::UDIV_I8, CallingConv::ARM_AAPCS);
     setLibcallCallingConv(RTLIB::UDIV_I16, CallingConv::ARM_AAPCS);
     setLibcallCallingConv(RTLIB::UDIV_I32, CallingConv::ARM_AAPCS);
-  }
 
-  if (HasDivModLibcall) {
-    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
-    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
+    // Memory operations
+    // RTABI chapter 4.3.4
+    setLibcallName(RTLIB::MEMCPY,  "__aeabi_memcpy");
+    setLibcallName(RTLIB::MEMMOVE, "__aeabi_memmove");
+    setLibcallName(RTLIB::MEMSET,  "__aeabi_memset");
   }
 
   if (Subtarget->isThumb1Only())
@@ -516,18 +532,15 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   }
 
   // i64 operation support.
+  setOperationAction(ISD::MUL,     MVT::i64, Expand);
+  setOperationAction(ISD::MULHU,   MVT::i32, Expand);
   if (Subtarget->isThumb1Only()) {
-    setOperationAction(ISD::MUL,     MVT::i64, Expand);
-    setOperationAction(ISD::MULHU,   MVT::i32, Expand);
-    setOperationAction(ISD::MULHS,   MVT::i32, Expand);
     setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
     setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
-  } else {
-    setOperationAction(ISD::MUL,     MVT::i64, Expand);
-    setOperationAction(ISD::MULHU,   MVT::i32, Expand);
-    if (!Subtarget->hasV6Ops())
-      setOperationAction(ISD::MULHS, MVT::i32, Expand);
   }
+  if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops())
+    setOperationAction(ISD::MULHS, MVT::i32, Expand);
+
   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
@@ -562,10 +575,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
   setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
 
-  if (TrapFuncName.size())
-    setOperationAction(ISD::TRAP, MVT::Other, Custom);
-  else
-    setOperationAction(ISD::TRAP, MVT::Other, Legal);
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
 
   // Use the default implementation.
   setOperationAction(ISD::VASTART,            MVT::Other, Custom);
@@ -614,6 +624,18 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i8,  Expand);
     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i16, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i8,  Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i16, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i8,  Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i16, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i8,  Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i16, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i8,  Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i16, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
     // Since the libcalls include locking, fold in the fences
     setShouldFoldAtomicFences(true);
   }
@@ -649,6 +671,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
     setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
     setOperationAction(ISD::EH_SJLJ_DISPATCHSETUP, MVT::Other, Custom);
+    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
   }
 
   setOperationAction(ISD::SETCC,     MVT::i32, Expand);
@@ -723,6 +746,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setMinStackArgumentAlignment(4);
 
   benefitFromCodePlacementOpt = true;
+
+  setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
 }
 
 // FIXME: It might make sense to define the representative register class as the
@@ -733,7 +758,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
 // pressure of the register class's representative and all of it's super
 // classes' representatives transitively. We have not implemented this because
 // of the difficulty prior to coalescing of modeling operand register classes
-// due to the common occurence of cross class copies and subregister insertions
+// due to the common occurrence of cross class copies and subregister insertions
 // and extractions.
 std::pair<const TargetRegisterClass*, uint8_t>
 ARMTargetLowering::findRepresentativeClass(EVT VT) const{
@@ -924,11 +949,6 @@ ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const {
   return ARM::createFastISel(funcInfo);
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned ARMTargetLowering::getFunctionAlignment(const Function *F) const {
-  return getTargetMachine().getSubtarget<ARMSubtarget>().isThumb() ? 1 : 2;
-}
-
 /// getMaximalGlobalOffset - Returns the maximal possible offset which can
 /// be used for loads / stores from the global.
 unsigned ARMTargetLowering::getMaximalGlobalOffset() const {
@@ -1066,8 +1086,8 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                    getTargetMachine(), RVLocs, *DAG.getContext(), Call);
   CCInfo.AnalyzeCallResult(Ins,
                            CCAssignFnForNode(CallConv, /* Return*/ true,
                                              isVarArg));
@@ -1128,22 +1148,6 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   return Chain;
 }
 
-/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
-/// by "Src" to address "Dst" of size "Size".  Alignment information is
-/// specified by the specific parameter attribute.  The copy will be passed as
-/// a byval function parameter.
-/// Sometimes what we are copying is the end of a larger object, the part that
-/// does not fit in registers.
-static SDValue
-CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
-                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
-                          DebugLoc dl) {
-  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
-  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
-                       /*isVolatile=*/false, /*AlwaysInline=*/false,
-                       MachinePointerInfo(0), MachinePointerInfo(0));
-}
-
 /// LowerMemOpCallTo - Store the argument to the stack.
 SDValue
 ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
@@ -1154,9 +1158,6 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
   unsigned LocMemOffset = VA.getLocMemOffset();
   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
-  if (Flags.isByVal())
-    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
-
   return DAG.getStore(Chain, dl, Arg, PtrOff,
                       MachinePointerInfo::getStack(LocMemOffset),
                       false, false, 0);
@@ -1220,8 +1221,8 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
-                 *DAG.getContext());
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext(), Call);
   CCInfo.AnalyzeCallOperands(Outs,
                              CCAssignFnForNode(CallConv, /* Return*/ false,
                                                isVarArg));
@@ -1298,7 +1299,44 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
       }
     } else if (VA.isRegLoc()) {
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
-    } else if (!IsSibCall || isByVal) {
+    } else if (isByVal) {
+      assert(VA.isMemLoc());
+      unsigned offset = 0;
+
+      // True if this byval aggregate will be split between registers
+      // and memory.
+      if (CCInfo.isFirstByValRegValid()) {
+        EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+        unsigned int i, j;
+        for (i = 0, j = CCInfo.getFirstByValReg(); j < ARM::R4; i++, j++) {
+          SDValue Const = DAG.getConstant(4*i, MVT::i32);
+          SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
+          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
+                                     MachinePointerInfo(),
+                                     false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(j, Load));
+        }
+        offset = ARM::R4 - CCInfo.getFirstByValReg();
+        CCInfo.clearFirstByValReg();
+      }
+
+      unsigned LocMemOffset = VA.getLocMemOffset();
+      SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset);
+      SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
+                                StkPtrOff);
+      SDValue SrcOffset = DAG.getIntPtrConstant(4*offset);
+      SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset);
+      SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset,
+                                         MVT::i32);
+      MemOpChains.push_back(DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode,
+                                          Flags.getByValAlign(),
+                                          /*isVolatile=*/false,
+                                          /*AlwaysInline=*/false,
+                                          MachinePointerInfo(0),
+                                          MachinePointerInfo(0)));
+
+    } else if (!IsSibCall) {
       assert(VA.isMemLoc());
 
       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
@@ -1331,7 +1369,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     // than necessary, because it means that each store effectively depends
     // on every argument instead of just those arguments it would clobber.
 
-    // Do not flag preceeding copytoreg stuff together with the following stuff.
+    // Do not flag preceding copytoreg stuff together with the following stuff.
     InFlag = SDValue();
     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
@@ -1492,14 +1530,32 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 }
 
 /// HandleByVal - Every parameter *after* a byval parameter is passed
-/// on the stack.  Confiscate all the parameter registers to insure
+/// on the stack.  Remember the next parameter register to allocate,
+/// and then confiscate the rest of the parameter registers to insure
 /// this.
 void
-llvm::ARMTargetLowering::HandleByVal(CCState *State) const {
-  static const unsigned RegList1[] = {
-    ARM::R0, ARM::R1, ARM::R2, ARM::R3
-  };
-  do {} while (State->AllocateReg(RegList1, 4));
+llvm::ARMTargetLowering::HandleByVal(CCState *State, unsigned &size) const {
+  unsigned reg = State->AllocateReg(GPRArgRegs, 4);
+  assert((State->getCallOrPrologue() == Prologue ||
+          State->getCallOrPrologue() == Call) &&
+         "unhandled ParmContext");
+  if ((!State->isFirstByValRegValid()) &&
+      (ARM::R0 <= reg) && (reg <= ARM::R3)) {
+    State->setFirstByValReg(reg);
+    // At a call site, a byval parameter that is split between
+    // registers and memory needs its size truncated here.  In a
+    // function prologue, such byval parameters are reassembled in
+    // memory, and are not truncated.
+    if (State->getCallOrPrologue() == Call) {
+      unsigned excess = 4 * (ARM::R4 - reg);
+      assert(size >= excess && "expected larger existing stack allocation");
+      size -= excess;
+    }
+  }
+  // Confiscate any remaining parameter registers to preclude their
+  // assignment to subsequent parameters.
+  while (State->AllocateReg(GPRArgRegs, 4))
+    ;
 }
 
 /// MatchingStackOffset - Return true if the given stack call argument is
@@ -1596,13 +1652,13 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   // results are returned in the same way as what the caller expects.
   if (!CCMatch) {
     SmallVector<CCValAssign, 16> RVLocs1;
-    CCState CCInfo1(CalleeCC, false, getTargetMachine(),
-                    RVLocs1, *DAG.getContext());
+    ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
+                       getTargetMachine(), RVLocs1, *DAG.getContext(), Call);
     CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg));
 
     SmallVector<CCValAssign, 16> RVLocs2;
-    CCState CCInfo2(CallerCC, false, getTargetMachine(),
-                    RVLocs2, *DAG.getContext());
+    ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
+                       getTargetMachine(), RVLocs2, *DAG.getContext(), Call);
     CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg));
 
     if (RVLocs1.size() != RVLocs2.size())
@@ -1628,8 +1684,8 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
     // Check if stack adjustment is needed. For now, do not do this if any
     // argument is passed on the stack.
     SmallVector<CCValAssign, 16> ArgLocs;
-    CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(),
-                   ArgLocs, *DAG.getContext());
+    ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+                      getTargetMachine(), ArgLocs, *DAG.getContext(), Call);
     CCInfo.AnalyzeCallOperands(Outs,
                                CCAssignFnForNode(CalleeCC, false, isVarArg));
     if (CCInfo.getNextStackOffset()) {
@@ -1688,8 +1744,8 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slots.
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
-                 *DAG.getContext());
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                    getTargetMachine(), RVLocs, *DAG.getContext(), Call);
 
   // Analyze outgoing return values.
   CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true,
@@ -2041,7 +2097,8 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
   MachineFunction &MF = DAG.getMachineFunction();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
 
-  if (Subtarget->useMovt()) {
+  // FIXME: Enable this for static codegen when tool issues are fixed.
+  if (Subtarget->useMovt() && RelocM != Reloc::Static) {
     ++NumMovwMovt;
     // FIXME: Once remat is capable of dealing with instructions with register
     // operands, expand this into two nodes.
@@ -2116,7 +2173,7 @@ ARMTargetLowering::LowerEH_SJLJ_DISPATCHSETUP(SDValue Op, SelectionDAG &DAG)
   const {
   DebugLoc dl = Op.getDebugLoc();
   return DAG.getNode(ARMISD::EH_SJLJ_DISPATCHSETUP, dl, MVT::Other,
-                     Op.getOperand(0));
+                     Op.getOperand(0), Op.getOperand(1));
 }
 
 SDValue
@@ -2224,12 +2281,13 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
     // ARMv7 with MP extension has PLDW.
     return Op.getOperand(0);
 
-  if (Subtarget->isThumb())
+  unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+  if (Subtarget->isThumb()) {
     // Invert the bits.
     isRead = ~isRead & 1;
-  unsigned isData = Subtarget->isThumb() ? 0 : 1;
+    isData = ~isData & 1;
+  }
 
-  // Currently there is no intrinsic that matches pli.
   return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
                      Op.getOperand(1), DAG.getConstant(isRead, MVT::i32),
                      DAG.getConstant(isData, MVT::i32));
@@ -2284,6 +2342,88 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
 }
 
+void
+ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
+                                  unsigned &VARegSize, unsigned &VARegSaveSize)
+  const {
+  unsigned NumGPRs;
+  if (CCInfo.isFirstByValRegValid())
+    NumGPRs = ARM::R4 - CCInfo.getFirstByValReg();
+  else {
+    unsigned int firstUnalloced;
+    firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs,
+                                                sizeof(GPRArgRegs) /
+                                                sizeof(GPRArgRegs[0]));
+    NumGPRs = (firstUnalloced <= 3) ? (4 - firstUnalloced) : 0;
+  }
+
+  unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
+  VARegSize = NumGPRs * 4;
+  VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1);
+}
+
+// The remaining GPRs hold either the beginning of variable-argument
+// data, or the beginning of an aggregate passed by value (usuall
+// byval).  Either way, we allocate stack slots adjacent to the data
+// provided by our caller, and store the unallocated registers there.
+// If this is a variadic function, the va_list pointer will begin with
+// these values; otherwise, this reassembles a (byval) structure that
+// was split between registers and memory.
+void
+ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
+                                        DebugLoc dl, SDValue &Chain,
+                                        unsigned ArgOffset) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned firstRegToSaveIndex;
+  if (CCInfo.isFirstByValRegValid())
+    firstRegToSaveIndex = CCInfo.getFirstByValReg() - ARM::R0;
+  else {
+    firstRegToSaveIndex = CCInfo.getFirstUnallocated
+      (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0]));
+  }
+
+  unsigned VARegSize, VARegSaveSize;
+  computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize);
+  if (VARegSaveSize) {
+    // If this function is vararg, store any remaining integer argument regs
+    // to their spots on the stack so that they may be loaded by deferencing
+    // the result of va_next.
+    AFI->setVarArgsRegSaveSize(VARegSaveSize);
+    AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(VARegSaveSize,
+                                                     ArgOffset + VARegSaveSize
+                                                     - VARegSize,
+                                                     false));
+    SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(),
+                                    getPointerTy());
+
+    SmallVector<SDValue, 4> MemOps;
+    for (; firstRegToSaveIndex < 4; ++firstRegToSaveIndex) {
+      TargetRegisterClass *RC;
+      if (AFI->isThumb1OnlyFunction())
+        RC = ARM::tGPRRegisterClass;
+      else
+        RC = ARM::GPRRegisterClass;
+
+      unsigned VReg = MF.addLiveIn(GPRArgRegs[firstRegToSaveIndex], RC);
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+      SDValue Store =
+        DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                 MachinePointerInfo::getFixedStack(AFI->getVarArgsFrameIndex()),
+                     false, false, 0);
+      MemOps.push_back(Store);
+      FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
+                        DAG.getConstant(4, getPointerTy()));
+    }
+    if (!MemOps.empty())
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                          &MemOps[0], MemOps.size());
+  } else
+    // This will point to the next argument passed via stack.
+    AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true));
+}
+
 SDValue
 ARMTargetLowering::LowerFormalArguments(SDValue Chain,
                                         CallingConv::ID CallConv, bool isVarArg,
@@ -2292,7 +2432,6 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
                                         DebugLoc dl, SelectionDAG &DAG,
                                         SmallVectorImpl<SDValue> &InVals)
                                           const {
-
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
 
@@ -2300,8 +2439,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
-                 *DAG.getContext());
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                    getTargetMachine(), ArgLocs, *DAG.getContext(), Prologue);
   CCInfo.AnalyzeFormalArguments(Ins,
                                 CCAssignFnForNode(CallConv, /* Return*/ false,
                                                   isVarArg));
@@ -2399,14 +2538,19 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
       if (index != lastInsIndex)
         {
           ISD::ArgFlagsTy Flags = Ins[index].Flags;
-          // FIXME: For now, all byval parameter objects are marked mutable. This can be
-          // changed with more analysis.
-          // In case of tail call optimization mark all arguments mutable. Since they
-          // could be overwritten by lowering of arguments in case of a tail call.
+          // FIXME: For now, all byval parameter objects are marked mutable.
+          // This can be changed with more analysis.
+          // In case of tail call optimization mark all arguments mutable.
+          // Since they could be overwritten by lowering of arguments in case of
+          // a tail call.
           if (Flags.isByVal()) {
-            unsigned Bytes = Flags.getByValSize();
+            unsigned VARegSize, VARegSaveSize;
+            computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize);
+            VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 0);
+            unsigned Bytes = Flags.getByValSize() - VARegSize;
             if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
-            int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), false);
+            int FI = MFI->CreateFixedObject(Bytes,
+                                            VA.getLocMemOffset(), false);
             InVals.push_back(DAG.getFrameIndex(FI, getPointerTy()));
           } else {
             int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
@@ -2424,55 +2568,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
   }
 
   // varargs
-  if (isVarArg) {
-    static const unsigned GPRArgRegs[] = {
-      ARM::R0, ARM::R1, ARM::R2, ARM::R3
-    };
-
-    unsigned NumGPRs = CCInfo.getFirstUnallocated
-      (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0]));
-
-    unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
-    unsigned VARegSize = (4 - NumGPRs) * 4;
-    unsigned VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1);
-    unsigned ArgOffset = CCInfo.getNextStackOffset();
-    if (VARegSaveSize) {
-      // If this function is vararg, store any remaining integer argument regs
-      // to their spots on the stack so that they may be loaded by deferencing
-      // the result of va_next.
-      AFI->setVarArgsRegSaveSize(VARegSaveSize);
-      AFI->setVarArgsFrameIndex(
-        MFI->CreateFixedObject(VARegSaveSize,
-                               ArgOffset + VARegSaveSize - VARegSize,
-                               false));
-      SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(),
-                                      getPointerTy());
-
-      SmallVector<SDValue, 4> MemOps;
-      for (; NumGPRs < 4; ++NumGPRs) {
-        TargetRegisterClass *RC;
-        if (AFI->isThumb1OnlyFunction())
-          RC = ARM::tGPRRegisterClass;
-        else
-          RC = ARM::GPRRegisterClass;
-
-        unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC);
-        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
-        SDValue Store =
-          DAG.getStore(Val.getValue(1), dl, Val, FIN,
-               MachinePointerInfo::getFixedStack(AFI->getVarArgsFrameIndex()),
-                       false, false, 0);
-        MemOps.push_back(Store);
-        FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
-                          DAG.getConstant(4, getPointerTy()));
-      }
-      if (!MemOps.empty())
-        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                            &MemOps[0], MemOps.size());
-    } else
-      // This will point to the next argument passed via stack.
-      AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true));
-  }
+  if (isVarArg)
+    VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset());
 
   return Chain;
 }
@@ -2618,10 +2715,11 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
       }
 
       if (True.getNode() && False.getNode()) {
-        EVT VT = Cond.getValueType();
+        EVT VT = Op.getValueType();
         SDValue ARMcc = Cond.getOperand(2);
         SDValue CCR = Cond.getOperand(3);
         SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
+        assert(True.getValueType() == VT);
         return DAG.getNode(ARMISD::CMOV, dl, VT, True, False, ARMcc, CCR, Cmp);
       }
     }
@@ -2960,7 +3058,10 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
         Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
                            DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
                            DAG.getConstant(32, MVT::i32));
-    }
+    } else if (VT == MVT::f32)
+      Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
+                         DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
+                         DAG.getConstant(32, MVT::i32));
     Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
     Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
 
@@ -4104,7 +4205,16 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
   switch (OpNum) {
   default: llvm_unreachable("Unknown shuffle opcode!");
   case OP_VREV:
-    return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
+    // VREV divides the vector in half and swaps within the half.
+    if (VT.getVectorElementType() == MVT::i32 ||
+        VT.getVectorElementType() == MVT::f32)
+      return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
+    // vrev <4 x i16> -> VREV32
+    if (VT.getVectorElementType() == MVT::i16)
+      return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
+    // vrev <4 x i8> -> VREV16
+    assert(VT.getVectorElementType() == MVT::i8);
+    return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
   case OP_VDUP0:
   case OP_VDUP1:
   case OP_VDUP2:
@@ -4575,10 +4685,10 @@ LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) {
   // Because short has a smaller range than ushort, we can actually get away
   // with only a single newton step.  This requires that we use a weird bias
   // of 89, however (again, this has been exhaustively tested).
-  // float4 result = as_float4(as_int4(xf*recip) + 89);
+  // float4 result = as_float4(as_int4(xf*recip) + 0x89);
   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
-  N1 = DAG.getConstant(89, MVT::i32);
+  N1 = DAG.getConstant(0x89, MVT::i32);
   N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
@@ -4665,26 +4775,26 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
   N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
   N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
-  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
+  SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
 
   // Use reciprocal estimate and two refinement steps.
   // float4 recip = vrecpeq_f32(yf);
   // recip *= vrecpsq_f32(yf, recip);
   // recip *= vrecpsq_f32(yf, recip);
   N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
-                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), BN1);
   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                    DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
-                   N1, N2);
+                   BN1, N2);
   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                    DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
-                   N1, N2);
+                   BN1, N2);
   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
   // Simply multiplying by the reciprocal estimate can leave us a few ulps
   // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
   // and that it will never cause us to return an answer too large).
-  // float4 result = as_float4(as_int4(xf*recip) + 89);
+  // float4 result = as_float4(as_int4(xf*recip) + 2);
   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
   N1 = DAG.getConstant(2, MVT::i32);
@@ -4698,19 +4808,6 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
   return N0;
 }
 
-static SDValue LowerTrap(SDValue Op, SelectionDAG &DAG) {
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  TargetLowering::ArgListTy Args;
-  std::pair<SDValue, SDValue> CallResult =
-    TLI.LowerCallTo(Op.getOperand(0), Type::getVoidTy(*DAG.getContext()),
-                false, false, false, false, 0, CallingConv::C,
-                /*isTailCall=*/false,
-                /*isReturnValueUsed=*/true,
-                DAG.getExternalSymbol(TrapFuncName.c_str(), TLI.getPointerTy()),
-                Args, DAG, Op.getDebugLoc());
-  return CallResult.second;
-}
-
 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: llvm_unreachable("Don't know how to custom lower this!");
@@ -4757,7 +4854,6 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::MUL:           return LowerMUL(Op, DAG);
   case ISD::SDIV:          return LowerSDIV(Op, DAG);
   case ISD::UDIV:          return LowerUDIV(Op, DAG);
-  case ISD::TRAP:          return LowerTrap(Op, DAG);
   }
   return SDValue();
 }
@@ -4796,12 +4892,21 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
   unsigned ptr     = MI->getOperand(1).getReg();
   unsigned oldval  = MI->getOperand(2).getReg();
   unsigned newval  = MI->getOperand(3).getReg();
-  unsigned scratch = BB->getParent()->getRegInfo()
-    .createVirtualRegister(ARM::GPRRegisterClass);
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   DebugLoc dl = MI->getDebugLoc();
   bool isThumb2 = Subtarget->isThumb2();
 
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  unsigned scratch =
+    MRI.createVirtualRegister(isThumb2 ? ARM::rGPRRegisterClass
+                                       : ARM::GPRRegisterClass);
+
+  if (isThumb2) {
+    MRI.constrainRegClass(dest, ARM::rGPRRegisterClass);
+    MRI.constrainRegClass(oldval, ARM::rGPRRegisterClass);
+    MRI.constrainRegClass(newval, ARM::rGPRRegisterClass);
+  }
+
   unsigned ldrOpc, strOpc;
   switch (Size) {
   default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
@@ -4893,8 +4998,14 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   unsigned ptr = MI->getOperand(1).getReg();
   unsigned incr = MI->getOperand(2).getReg();
   DebugLoc dl = MI->getDebugLoc();
-
   bool isThumb2 = Subtarget->isThumb2();
+
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  if (isThumb2) {
+    MRI.constrainRegClass(dest, ARM::rGPRRegisterClass);
+    MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass);
+  }
+
   unsigned ldrOpc, strOpc;
   switch (Size) {
   default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
@@ -4923,10 +5034,10 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
                   BB->end());
   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 
-  MachineRegisterInfo &RegInfo = MF->getRegInfo();
-  unsigned scratch = RegInfo.createVirtualRegister(ARM::GPRRegisterClass);
-  unsigned scratch2 = (!BinOpcode) ? incr :
-    RegInfo.createVirtualRegister(ARM::GPRRegisterClass);
+  TargetRegisterClass *TRC =
+    isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass;
+  unsigned scratch = MRI.createVirtualRegister(TRC);
+  unsigned scratch2 = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);
 
   //  thisMBB:
   //   ...
@@ -4971,6 +5082,116 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   return BB;
 }
 
+MachineBasicBlock *
+ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
+                                          MachineBasicBlock *BB,
+                                          unsigned Size,
+                                          bool signExtend,
+                                          ARMCC::CondCodes Cond) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction *MF = BB->getParent();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  unsigned dest = MI->getOperand(0).getReg();
+  unsigned ptr = MI->getOperand(1).getReg();
+  unsigned incr = MI->getOperand(2).getReg();
+  unsigned oldval = dest;
+  DebugLoc dl = MI->getDebugLoc();
+  bool isThumb2 = Subtarget->isThumb2();
+
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  if (isThumb2) {
+    MRI.constrainRegClass(dest, ARM::rGPRRegisterClass);
+    MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass);
+  }
+
+  unsigned ldrOpc, strOpc, extendOpc;
+  switch (Size) {
+  default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
+  case 1:
+    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
+    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
+    extendOpc = isThumb2 ? ARM::t2SXTBr : ARM::SXTBr;
+    break;
+  case 2:
+    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
+    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
+    extendOpc = isThumb2 ? ARM::t2SXTHr : ARM::SXTHr;
+    break;
+  case 4:
+    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
+    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
+    extendOpc = 0;
+    break;
+  }
+
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  TargetRegisterClass *TRC =
+    isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass;
+  unsigned scratch = MRI.createVirtualRegister(TRC);
+  unsigned scratch2 = MRI.createVirtualRegister(TRC);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+
+  //  loopMBB:
+  //   ldrex dest, ptr
+  //   (sign extend dest, if required)
+  //   cmp dest, incr
+  //   cmov.cond scratch2, dest, incr
+  //   strex scratch, scratch2, ptr
+  //   cmp scratch, #0
+  //   bne- loopMBB
+  //   fallthrough --> exitMBB
+  BB = loopMBB;
+  AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr));
+
+  // Sign extend the value, if necessary.
+  if (signExtend && extendOpc) {
+    oldval = MRI.createVirtualRegister(ARM::GPRRegisterClass);
+    AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval).addReg(dest));
+  }
+
+  // Build compare and cmov instructions.
+  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
+                 .addReg(oldval).addReg(incr));
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr), scratch2)
+         .addReg(oldval).addReg(incr).addImm(Cond).addReg(ARM::CPSR);
+
+  AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2)
+                 .addReg(ptr));
+  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+                 .addReg(scratch).addImm(0));
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
+
 static
 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
   for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
@@ -4980,6 +5201,72 @@ MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
   llvm_unreachable("Expecting a BB with two successors!");
 }
 
+// FIXME: This opcode table should obviously be expressed in the target
+// description. We probably just need a "machine opcode" value in the pseudo
+// instruction. But the ideal solution maybe to simply remove the "S" version
+// of the opcode altogether.
+struct AddSubFlagsOpcodePair {
+  unsigned PseudoOpc;
+  unsigned MachineOpc;
+};
+
+static AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
+  {ARM::ADCSri, ARM::ADCri},
+  {ARM::ADCSrr, ARM::ADCrr},
+  {ARM::ADCSrs, ARM::ADCrs},
+  {ARM::SBCSri, ARM::SBCri},
+  {ARM::SBCSrr, ARM::SBCrr},
+  {ARM::SBCSrs, ARM::SBCrs},
+  {ARM::RSBSri, ARM::RSBri},
+  {ARM::RSBSrr, ARM::RSBrr},
+  {ARM::RSBSrs, ARM::RSBrs},
+  {ARM::RSCSri, ARM::RSCri},
+  {ARM::RSCSrs, ARM::RSCrs},
+  {ARM::t2ADCSri, ARM::t2ADCri},
+  {ARM::t2ADCSrr, ARM::t2ADCrr},
+  {ARM::t2ADCSrs, ARM::t2ADCrs},
+  {ARM::t2SBCSri, ARM::t2SBCri},
+  {ARM::t2SBCSrr, ARM::t2SBCrr},
+  {ARM::t2SBCSrs, ARM::t2SBCrs},
+  {ARM::t2RSBSri, ARM::t2RSBri},
+  {ARM::t2RSBSrs, ARM::t2RSBrs},
+};
+
+// Convert and Add or Subtract with Carry and Flags to a generic opcode with
+// CPSR<def> operand. e.g. ADCS (...) -> ADC (... CPSR<def>).
+//
+// FIXME: Somewhere we should assert that CPSR<def> is in the correct
+// position to be recognized by the target descrition as the 'S' bit.
+bool ARMTargetLowering::RemapAddSubWithFlags(MachineInstr *MI,
+                                             MachineBasicBlock *BB) const {
+  unsigned OldOpc = MI->getOpcode();
+  unsigned NewOpc = 0;
+
+  // This is only called for instructions that need remapping, so iterating over
+  // the tiny opcode table is not costly.
+  static const int NPairs =
+    sizeof(AddSubFlagsOpcodeMap) / sizeof(AddSubFlagsOpcodePair);
+  for (AddSubFlagsOpcodePair *Pair = &AddSubFlagsOpcodeMap[0],
+         *End = &AddSubFlagsOpcodeMap[NPairs]; Pair != End; ++Pair) {
+    if (OldOpc == Pair->PseudoOpc) {
+      NewOpc = Pair->MachineOpc;
+      break;
+    }
+  }
+  if (!NewOpc)
+    return false;
+
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
+  for (unsigned i = 0; i < MI->getNumOperands(); ++i)
+    MIB.addOperand(MI->getOperand(i));
+  AddDefaultPred(MIB);
+  MIB.addReg(ARM::CPSR, RegState::Define); // S bit
+  MI->eraseFromParent();
+  return true;
+}
+
 MachineBasicBlock *
 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                MachineBasicBlock *BB) const {
@@ -4987,10 +5274,13 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   DebugLoc dl = MI->getDebugLoc();
   bool isThumb2 = Subtarget->isThumb2();
   switch (MI->getOpcode()) {
-  default:
+  default: {
+    if (RemapAddSubWithFlags(MI, BB))
+      return BB;
+
     MI->dump();
     llvm_unreachable("Unexpected instr type to insert");
-
+  }
   case ARM::ATOMIC_LOAD_ADD_I8:
      return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
   case ARM::ATOMIC_LOAD_ADD_I16:
@@ -5033,6 +5323,34 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case ARM::ATOMIC_LOAD_SUB_I32:
      return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
 
+  case ARM::ATOMIC_LOAD_MIN_I8:
+     return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::LT);
+  case ARM::ATOMIC_LOAD_MIN_I16:
+     return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::LT);
+  case ARM::ATOMIC_LOAD_MIN_I32:
+     return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::LT);
+
+  case ARM::ATOMIC_LOAD_MAX_I8:
+     return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::GT);
+  case ARM::ATOMIC_LOAD_MAX_I16:
+     return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::GT);
+  case ARM::ATOMIC_LOAD_MAX_I32:
+     return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::GT);
+
+  case ARM::ATOMIC_LOAD_UMIN_I8:
+     return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::LO);
+  case ARM::ATOMIC_LOAD_UMIN_I16:
+     return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::LO);
+  case ARM::ATOMIC_LOAD_UMIN_I32:
+     return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::LO);
+
+  case ARM::ATOMIC_LOAD_UMAX_I8:
+     return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::HI);
+  case ARM::ATOMIC_LOAD_UMAX_I16:
+     return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::HI);
+  case ARM::ATOMIC_LOAD_UMAX_I32:
+     return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::HI);
+
   case ARM::ATOMIC_SWAP_I8:  return EmitAtomicBinary(MI, BB, 1, 0);
   case ARM::ATOMIC_SWAP_I16: return EmitAtomicBinary(MI, BB, 2, 0);
   case ARM::ATOMIC_SWAP_I32: return EmitAtomicBinary(MI, BB, 4, 0);
@@ -5041,68 +5359,6 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2);
   case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4);
 
-  case ARM::ADCSSri:
-  case ARM::ADCSSrr:
-  case ARM::ADCSSrs:
-  case ARM::SBCSSri:
-  case ARM::SBCSSrr:
-  case ARM::SBCSSrs:
-  case ARM::RSBSri:
-  case ARM::RSBSrr:
-  case ARM::RSBSrs:
-  case ARM::RSCSri:
-  case ARM::RSCSrs: {
-    unsigned OldOpc = MI->getOpcode();
-    unsigned Opc = 0;
-    switch (OldOpc) {
-      case ARM::ADCSSrr:
-        Opc = ARM::ADCrr;
-        break;
-      case ARM::ADCSSri:
-        Opc = ARM::ADCri;
-        break;
-      case ARM::ADCSSrs:
-        Opc = ARM::ADCrs;
-        break;
-      case ARM::SBCSSrr:
-        Opc = ARM::SBCrr;
-        break;
-      case ARM::SBCSSri:
-        Opc = ARM::SBCri;
-        break;
-      case ARM::SBCSSrs:
-        Opc = ARM::SBCrs;
-        break;
-      case ARM::RSBSri:
-        Opc = ARM::RSBri;
-        break;
-      case ARM::RSBSrr:
-        Opc = ARM::RSBrr;
-        break;
-      case ARM::RSBSrs:
-        Opc = ARM::RSBrs;
-        break;
-      case ARM::RSCSri:
-        Opc = ARM::RSCri;
-        break;
-      case ARM::RSCSrs:
-        Opc = ARM::RSCrs;
-        break;
-      default:
-        llvm_unreachable("Unknown opcode?");
-    }
-
-    MachineInstrBuilder MIB =
-      BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(Opc));
-    for (unsigned i = 0; i < MI->getNumOperands(); ++i)
-      MIB.addOperand(MI->getOperand(i));
-    AddDefaultPred(MIB);
-    MIB.addReg(ARM::CPSR, RegState::Define); // S bit
-    MI->eraseFromParent();
-    return BB;
-  }
-
-
   case ARM::tMOVCCr_pseudo: {
     // To "insert" a SELECT_CC instruction, we actually have to insert the
     // diamond control-flow pattern.  The incoming instruction knows the
@@ -5267,12 +5523,109 @@ SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
   return SDValue();
 }
 
+// AddCombineToVPADDL- For pair-wise add on neon, use the vpaddl instruction 
+// (only after legalization).
+static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+
+  // Only perform optimization if after legalize, and if NEON is available. We
+  // also expected both operands to be BUILD_VECTORs.
+  if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
+      || N0.getOpcode() != ISD::BUILD_VECTOR
+      || N1.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  // Check output type since VPADDL operand elements can only be 8, 16, or 32.
+  EVT VT = N->getValueType(0);
+  if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
+    return SDValue();
+
+  // Check that the vector operands are of the right form.
+  // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
+  // operands, where N is the size of the formed vector.
+  // Each EXTRACT_VECTOR should have the same input vector and odd or even
+  // index such that we have a pair wise add pattern.
+
+  // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
+  if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+  SDValue Vec = N0->getOperand(0)->getOperand(0);
+  SDNode *V = Vec.getNode();
+  unsigned nextIndex = 0;
+
+  // For each operands to the ADD which are BUILD_VECTORs, 
+  // check to see if each of their operands are an EXTRACT_VECTOR with
+  // the same vector and appropriate index.
+  for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
+    if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
+        && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+      
+      SDValue ExtVec0 = N0->getOperand(i);
+      SDValue ExtVec1 = N1->getOperand(i);
+      
+      // First operand is the vector, verify its the same.
+      if (V != ExtVec0->getOperand(0).getNode() ||
+          V != ExtVec1->getOperand(0).getNode())
+        return SDValue();
+      
+      // Second is the constant, verify its correct.
+      ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
+      ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
+      
+      // For the constant, we want to see all the even or all the odd.
+      if (!C0 || !C1 || C0->getZExtValue() != nextIndex
+          || C1->getZExtValue() != nextIndex+1)
+        return SDValue();
+
+      // Increment index.
+      nextIndex+=2;
+    } else 
+      return SDValue();
+  }
+
+  // Create VPADDL node.
+  SelectionDAG &DAG = DCI.DAG;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  DebugLoc DL = N->getDebugLoc();
+
+  // Build operand list.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls,
+                                TLI.getPointerTy()));
+
+  // Input is the vector.
+  Ops.push_back(Vec);
+  
+  // Get widened type and narrowed type.
+  MVT widenType;
+  unsigned numElem = VT.getVectorNumElements();
+  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+    case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
+    case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
+    case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
+    default:
+      assert(0 && "Invalid vector element type for padd optimization.");
+  }
+
+  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(),
+                            widenType, &Ops[0], Ops.size());
+  return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, tmp);
+}
+
 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
 /// operands N0 and N1.  This is a helper for PerformADDCombine that is
 /// called with the default operands, and if that fails, with commuted
 /// operands.
 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
-                                         TargetLowering::DAGCombinerInfo &DCI) {
+                                          TargetLowering::DAGCombinerInfo &DCI,
+                                          const ARMSubtarget *Subtarget){
+
+  // Attempt to create vpaddl for this add.
+  SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget);
+  if (Result.getNode())
+    return Result;
+  
   // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
   if (N0.getOpcode() == ISD::SELECT && N0.getNode()->hasOneUse()) {
     SDValue Result = combineSelectAndUse(N, N0, N1, DCI);
@@ -5284,17 +5637,18 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
 ///
 static SDValue PerformADDCombine(SDNode *N,
-                                 TargetLowering::DAGCombinerInfo &DCI) {
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
   // First try with the default operand order.
-  SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI);
+  SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget);
   if (Result.getNode())
     return Result;
 
   // If that didn't work, try again with the operands commuted.
-  return PerformADDCombineWithOperands(N, N1, N0, DCI);
+  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
 }
 
 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
@@ -5333,7 +5687,7 @@ static SDValue PerformVMULCombine(SDNode *N,
   unsigned Opcode = N0.getOpcode();
   if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
       Opcode != ISD::FADD && Opcode != ISD::FSUB) {
-    Opcode = N0.getOpcode();
+    Opcode = N1.getOpcode();
     if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
         Opcode != ISD::FADD && Opcode != ISD::FSUB)
       return SDValue();
@@ -5414,7 +5768,7 @@ static SDValue PerformANDCombine(SDNode *N,
 
   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
     return SDValue();
-  
+
   APInt SplatBits, SplatUndef;
   unsigned SplatBitSize;
   bool HasAnyUndefs;
@@ -5450,7 +5804,7 @@ static SDValue PerformORCombine(SDNode *N,
 
   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
     return SDValue();
-  
+
   APInt SplatBits, SplatUndef;
   unsigned SplatBitSize;
   bool HasAnyUndefs;
@@ -5496,7 +5850,7 @@ static SDValue PerformORCombine(SDNode *N,
         EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
         SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
                                      N0->getOperand(1), N0->getOperand(0),
-                                     N1->getOperand(1));
+                                     N1->getOperand(0));
         return DAG.getNode(ISD::BITCAST, dl, VT, Result);
       }
     }
@@ -5619,8 +5973,8 @@ static SDValue PerformORCombine(SDNode *N,
   return SDValue();
 }
 
-/// PerformBFICombine - (bfi A, (and B, C1), C2) -> (bfi A, B, C2) iff
-/// C1 & C2 == C1.
+/// PerformBFICombine - (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
+/// the bits being cleared by the AND are not demanded by the BFI.
 static SDValue PerformBFICombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
   SDValue N1 = N->getOperand(1);
@@ -5628,9 +5982,12 @@ static SDValue PerformBFICombine(SDNode *N,
     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
     if (!N11C)
       return SDValue();
-    unsigned Mask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+    unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+    unsigned LSB = CountTrailingZeros_32(~InvMask);
+    unsigned Width = (32 - CountLeadingZeros_32(~InvMask)) - LSB;
+    unsigned Mask = (1 << Width)-1;
     unsigned Mask2 = N11C->getZExtValue();
-    if ((Mask & Mask2) == Mask2)
+    if ((Mask & (~Mask2)) == 0)
       return DCI.DAG.getNode(ARMISD::BFI, N->getDebugLoc(), N->getValueType(0),
                              N->getOperand(0), N1.getOperand(0),
                              N->getOperand(2));
@@ -5706,8 +6063,28 @@ static SDValue PerformSTORECombine(SDNode *N,
   // Otherwise, the i64 value will be legalized to a pair of i32 values.
   StoreSDNode *St = cast<StoreSDNode>(N);
   SDValue StVal = St->getValue();
-  if (!ISD::isNormalStore(St) || St->isVolatile() ||
-      StVal.getValueType() != MVT::i64 ||
+  if (!ISD::isNormalStore(St) || St->isVolatile())
+    return SDValue();
+
+  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
+      StVal.getNode()->hasOneUse() && !St->isVolatile()) {
+    SelectionDAG  &DAG = DCI.DAG;
+    DebugLoc DL = St->getDebugLoc();
+    SDValue BasePtr = St->getBasePtr();
+    SDValue NewST1 = DAG.getStore(St->getChain(), DL,
+                                  StVal.getNode()->getOperand(0), BasePtr,
+                                  St->getPointerInfo(), St->isVolatile(),
+                                  St->isNonTemporal(), St->getAlignment());
+
+    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
+                                    DAG.getConstant(4, MVT::i32));
+    return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(1),
+                        OffsetPtr, St->getPointerInfo(), St->isVolatile(),
+                        St->isNonTemporal(),
+                        std::min(4U, St->getAlignment() / 2));
+  }
+
+  if (StVal.getValueType() != MVT::i64 ||
       StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
     return SDValue();
 
@@ -6479,7 +6856,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   switch (N->getOpcode()) {
   default: break;
-  case ISD::ADD:        return PerformADDCombine(N, DCI);
+  case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
   case ISD::SUB:        return PerformSUBCombine(N, DCI);
   case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
   case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
@@ -6753,6 +7130,14 @@ bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
   return Imm >= 0 && Imm <= 255;
 }
 
+/// isLegalAddImmediate - Return true if the specified immediate is legal
+/// add immediate, that is the target has add instructions which can add
+/// a register with the immediate without having to materialize the
+/// immediate into a register.
+bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
+  return ARM_AM::getSOImmVal(Imm) != -1;
+}
+
 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
                                       bool isSEXTLoad, SDValue &Base,
                                       SDValue &Offset, bool &isInc,
@@ -6995,6 +7380,9 @@ ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
     case 'l': return C_RegisterClass;
     case 'w': return C_RegisterClass;
     }
+  } else {
+    if (Constraint == "Uv")
+      return C_Memory;
   }
   return TargetLowering::getConstraintType(Constraint);
 }
@@ -7106,12 +7494,16 @@ getRegClassForInlineAsmConstraint(const std::string &Constraint,
 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
 /// vector.  If it is invalid, don't add anything to Ops.
 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
-                                                     char Constraint,
+                                                     std::string &Constraint,
                                                      std::vector<SDValue>&Ops,
                                                      SelectionDAG &DAG) const {
   SDValue Result(0, 0);
 
-  switch (Constraint) {
+  // Currently only support length 1 constraints.
+  if (Constraint.length() != 1) return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
   default: break;
   case 'I': case 'J': case 'K': case 'L':
   case 'M': case 'N': case 'O':
@@ -7126,7 +7518,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
     if (CVal != CVal64)
       return;
 
-    switch (Constraint) {
+    switch (ConstraintLetter) {
       case 'I':
         if (Subtarget->isThumb1Only()) {
           // This must be a constant between 0 and 255, for ADD
@@ -7389,6 +7781,28 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.writeMem = true;
     return true;
   }
+  case Intrinsic::arm_strexd: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i64;
+    Info.ptrVal = I.getArgOperand(2);
+    Info.offset = 0;
+    Info.align = 8;
+    Info.vol = true;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  case Intrinsic::arm_ldrexd: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i64;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = 8;
+    Info.vol = true;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
   default:
     break;
   }
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index e37855d..21a9a3a 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -88,7 +88,7 @@ namespace llvm {
       MEMBARRIER_MCR, // Memory barrier (MCR)
 
       PRELOAD,      // Preload
-      
+
       VCEQ,         // Vector compare equal.
       VCEQZ,        // Vector compare equal to zero.
       VCGE,         // Vector compare greater than or equal.
@@ -173,7 +173,7 @@ namespace llvm {
 
       // Bit-field insert
       BFI,
-      
+
       // Vector OR with immediate
       VORRIMM,
       // Vector AND with NOT of immediate
@@ -264,6 +264,12 @@ namespace llvm {
     /// the immediate into a register.
     virtual bool isLegalICmpImmediate(int64_t Imm) const;
 
+    /// isLegalAddImmediate - Return true if the specified immediate is legal
+    /// add immediate, that is the target has add instructions which can
+    /// add a register and the immediate without having to materialize
+    /// the immediate into a register.
+    virtual bool isLegalAddImmediate(int64_t Imm) const;
+
     /// getPreIndexedAddressParts - returns true by value, base pointer and
     /// offset pointer and addressing mode by reference if the node's address
     /// can be legally represented as pre-indexed load / store address.
@@ -309,7 +315,7 @@ namespace llvm {
     /// true it means one of the asm constraint of the inline asm instruction
     /// being processed is 'm'.
     virtual void LowerAsmOperandForConstraint(SDValue Op,
-                                              char ConstraintLetter,
+                                              std::string &Constraint,
                                               std::vector<SDValue> &Ops,
                                               SelectionDAG &DAG) const;
 
@@ -321,9 +327,6 @@ namespace llvm {
     /// specified value type.
     virtual TargetRegisterClass *getRegClassFor(EVT VT) const;
 
-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const;
-
     /// getMaximalGlobalOffset - Returns the maximal possible offset which can
     /// be used for loads / stores from the global.
     virtual unsigned getMaximalGlobalOffset() const;
@@ -408,7 +411,7 @@ namespace llvm {
     SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, 
+    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                               const ARMSubtarget *ST) const;
 
     SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
@@ -426,6 +429,13 @@ namespace llvm {
                            DebugLoc dl, SelectionDAG &DAG,
                            SmallVectorImpl<SDValue> &InVals) const;
 
+    void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
+                              DebugLoc dl, SDValue &Chain, unsigned ArgOffset)
+      const;
+
+    void computeRegArea(CCState &CCInfo, MachineFunction &MF,
+                        unsigned &VARegSize, unsigned &VARegSaveSize) const;
+
     virtual SDValue
       LowerCall(SDValue Chain, SDValue Callee,
                 CallingConv::ID CallConv, bool isVarArg,
@@ -437,7 +447,7 @@ namespace llvm {
                 SmallVectorImpl<SDValue> &InVals) const;
 
     /// HandleByVal - Target-specific cleanup for ByVal support.
-    virtual void HandleByVal(CCState *) const;
+    virtual void HandleByVal(CCState *, unsigned &) const;
 
     /// IsEligibleForTailCallOptimization - Check whether the call is eligible
     /// for tail call optimization. Targets which want to do tail call
@@ -477,16 +487,22 @@ namespace llvm {
                                         MachineBasicBlock *BB,
                                         unsigned Size,
                                         unsigned BinOpcode) const;
+    MachineBasicBlock * EmitAtomicBinaryMinMax(MachineInstr *MI,
+                                               MachineBasicBlock *BB,
+                                               unsigned Size,
+                                               bool signExtend,
+                                               ARMCC::CondCodes Cond) const;
 
+    bool RemapAddSubWithFlags(MachineInstr *MI, MachineBasicBlock *BB) const;
   };
-  
+
   enum NEONModImmType {
     VMOVModImm,
     VMVNModImm,
     OtherModImm
   };
-  
-  
+
+
   namespace ARM {
     FastISel *createFastISel(FunctionLoweringInfo &funcInfo);
   }
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index f5fb98e..897d8a5 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -860,6 +860,9 @@ class APKHI<bits<8> opcod, bit tb, dag oops, dag iops, InstrItinClass itin,
 class ARMPat<dag pattern, dag result> : Pat<pattern, result> {
   list<Predicate> Predicates = [IsARM];
 }
+class ARMV5TPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM, HasV5T];
+}
 class ARMV5TEPat<dag pattern, dag result> : Pat<pattern, result> {
   list<Predicate> Predicates = [IsARM, HasV5TE];
 }
@@ -1020,6 +1023,10 @@ class T1LoadStore<bits<4> opA, bits<3> opB> : Encoding16 {
 }
 class T1LdStSP<bits<3> opB>   : T1LoadStore<0b1001, opB>; // SP relative
 
+class T1BranchCond<bits<4> opcode> : Encoding16 {
+  let Inst{15-12} = opcode;
+}
+
 // Helper classes to encode Thumb1 loads and stores. For immediates, the
 // following bits are used for "opA" (see A6.2.4):
 //
@@ -1208,6 +1215,11 @@ class T1Pat<dag pattern, dag result> : Pat<pattern, result> {
   list<Predicate> Predicates = [IsThumb, IsThumb1Only];
 }
 
+// T2v6Pat - Same as Pat<>, but requires V6T2 Thumb2 mode.
+class T2v6Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb2, HasV6T2];
+}
+
 // T2Pat - Same as Pat<>, but requires that the compiler be in Thumb2 mode.
 class T2Pat<dag pattern, dag result> : Pat<pattern, result> {
   list<Predicate> Predicates = [IsThumb2];
@@ -1742,9 +1754,10 @@ class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
 
 // NEON 3 vector register format.
 
-class N3VCommon<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
-          dag oops, dag iops, Format f, InstrItinClass itin,
-          string opc, string dt, string asm, string cstr, list<dag> pattern>
+class N3VCommon<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, dag oops, dag iops, Format f, InstrItinClass itin,
+                string opc, string dt, string asm, string cstr,
+                list<dag> pattern>
   : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
   let Inst{24}    = op24;
   let Inst{23}    = op23;
@@ -1773,9 +1786,10 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
   let Inst{5}     = Vm{4};
 }
 
-class N3VLane32<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
-          dag oops, dag iops, Format f, InstrItinClass itin,
-          string opc, string dt, string asm, string cstr, list<dag> pattern>
+class N3VLane32<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, dag oops, dag iops, Format f, InstrItinClass itin,
+                string opc, string dt, string asm, string cstr,
+                list<dag> pattern>
   : N3VCommon<op24, op23, op21_20, op11_8, op6, op4,
               oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
 
@@ -1793,9 +1807,10 @@ class N3VLane32<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bi
   let Inst{5}     = lane;
 }
 
-class N3VLane16<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
-          dag oops, dag iops, Format f, InstrItinClass itin,
-          string opc, string dt, string asm, string cstr, list<dag> pattern>
+class N3VLane16<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, dag oops, dag iops, Format f, InstrItinClass itin,
+                string opc, string dt, string asm, string cstr,
+                list<dag> pattern>
   : N3VCommon<op24, op23, op21_20, op11_8, op6, op4,
               oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
 
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index b787d35..2537fc3 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -58,10 +58,13 @@ def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>,
                                                  SDTCisInt<2>]>;
 def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>;
 
-def SDT_ARMEH_SJLJ_DispatchSetup: SDTypeProfile<0, 0, []>;
+def SDT_ARMEH_SJLJ_DispatchSetup: SDTypeProfile<0, 1, [SDTCisInt<0>]>;
 
 def SDT_ARMMEMBARRIER     : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
 
+def SDT_ARMPREFETCH : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisSameAs<1, 2>,
+                                           SDTCisInt<1>]>;
+
 def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
 
 def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
@@ -130,7 +133,7 @@ def ARMMemBarrier     : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIER,
                                [SDNPHasChain]>;
 def ARMMemBarrierMCR  : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER,
                                [SDNPHasChain]>;
-def ARMPreload        : SDNode<"ARMISD::PRELOAD", SDTPrefetch,
+def ARMPreload        : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH,
                                [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
 
 def ARMrbit          : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>;
@@ -203,13 +206,13 @@ def so_imm_not_XFORM : SDNodeXForm<imm, [{
 }]>;
 
 /// imm1_15 predicate - True if the 32-bit immediate is in the range [1,15].
-def imm1_15 : PatLeaf<(i32 imm), [{
-  return (int32_t)N->getZExtValue() >= 1 && (int32_t)N->getZExtValue() < 16;
+def imm1_15 : ImmLeaf<i32, [{
+  return (int32_t)Imm >= 1 && (int32_t)Imm < 16;
 }]>;
 
 /// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31].
-def imm16_31 : PatLeaf<(i32 imm), [{
-  return (int32_t)N->getZExtValue() >= 16 && (int32_t)N->getZExtValue() < 32;
+def imm16_31 : ImmLeaf<i32, [{
+  return (int32_t)Imm >= 16 && (int32_t)Imm < 32;
 }]>;
 
 def so_imm_neg :
@@ -239,8 +242,8 @@ def lo16AllZero : PatLeaf<(i32 imm), [{
 
 /// imm0_65535 predicate - True if the 32-bit immediate is in the range
 /// [0.65535].
-def imm0_65535 : PatLeaf<(i32 imm), [{
-  return (uint32_t)N->getZExtValue() < 65536;
+def imm0_65535 : ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 65536;
 }]>;
 
 class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
@@ -375,8 +378,8 @@ def neon_vcvt_imm32 : Operand<i32> {
 }
 
 // rot_imm: An integer that encodes a rotate amount. Must be 8, 16, or 24.
-def rot_imm : Operand<i32>, PatLeaf<(i32 imm), [{
-    int32_t v = (int32_t)N->getZExtValue();
+def rot_imm : Operand<i32>, ImmLeaf<i32, [{
+    int32_t v = (int32_t)Imm;
     return v == 8 || v == 16 || v == 24; }]> {
   let EncoderMethod = "getRotImmOpValue";
 }
@@ -412,7 +415,9 @@ def shift_so_reg : Operand<i32>,    // reg reg imm
 
 // so_imm - Match a 32-bit shifter_operand immediate operand, which is an
 // 8-bit immediate rotated by an arbitrary number of bits.
-def so_imm : Operand<i32>, PatLeaf<(imm), [{ return Pred_so_imm(N); }]> {
+def so_imm : Operand<i32>, ImmLeaf<i32, [{
+    return ARM_AM::getSOImmVal(Imm) != -1;
+  }]> {
   let EncoderMethod = "getSOImmOpValue";
   let PrintMethod = "printSOImmOperand";
 }
@@ -433,13 +438,13 @@ def arm_i32imm : PatLeaf<(imm), [{
 }]>;
 
 /// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
-def imm0_31 : Operand<i32>, PatLeaf<(imm), [{
-  return (int32_t)N->getZExtValue() < 32;
+def imm0_31 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 32;
 }]>;
 
 /// imm0_31_m1 - Matches and prints like imm0_31, but encodes as 'value - 1'.
-def imm0_31_m1 : Operand<i32>, PatLeaf<(imm), [{
-  return (int32_t)N->getZExtValue() < 32;
+def imm0_31_m1 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 32;
 }]> {
   let EncoderMethod = "getImmMinusOneOpValue";
 }
@@ -462,17 +467,23 @@ def bf_inv_mask_imm : Operand<i32>,
 }
 
 /// lsb_pos_imm - position of the lsb bit, used by BFI4p and t2BFI4p
-def lsb_pos_imm : Operand<i32>, PatLeaf<(imm), [{
-  return isInt<5>(N->getSExtValue());
+def lsb_pos_imm : Operand<i32>, ImmLeaf<i32, [{
+  return isInt<5>(Imm);
 }]>;
 
 /// width_imm - number of bits to be copied, used by BFI4p and t2BFI4p
-def width_imm : Operand<i32>, PatLeaf<(imm), [{
-  return N->getSExtValue() > 0 &&  N->getSExtValue() <= 32;
+def width_imm : Operand<i32>, ImmLeaf<i32, [{
+  return Imm > 0 &&  Imm <= 32;
 }] > {
   let EncoderMethod = "getMsbOpValue";
 }
 
+def ssat_imm : Operand<i32>, ImmLeaf<i32, [{
+  return Imm > 0 && Imm <= 32;
+}]> {
+  let EncoderMethod = "getSsatBitPosValue";
+}
+
 // Define ARM specific addressing modes.
 
 def MemMode2AsmOperand : AsmOperandClass {
@@ -586,6 +597,15 @@ def am6offset : Operand<i32>,
   let EncoderMethod = "getAddrMode6OffsetOpValue";
 }
 
+// Special version of addrmode6 to handle alignment encoding for VST1/VLD1
+// (single element from one lane) for size 32.
+def addrmode6oneL32 : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm);
+  let EncoderMethod = "getAddrMode6OneLane32AddressOpValue";
+}
+
 // Special version of addrmode6 to handle alignment encoding for VLD-dup
 // instructions, specifically VLD4-dup.
 def addrmode6dup : Operand<i32>,
@@ -934,22 +954,25 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
     let Inst{19-16} = Rn;
   }
 }
+}
+
 // Carry setting variants
 // NOTE: CPSR def omitted because it will be handled by the custom inserter.
 let usesCustomInserter = 1 in {
 multiclass AI1_adde_sube_s_irs<PatFrag opnode, bit Commutable = 0> {
-  def Sri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
-                Size4Bytes, IIC_iALUi,
+  def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
+               Size4Bytes, IIC_iALUi,
                [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]>;
-  def Srr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
-                Size4Bytes, IIC_iALUr,
-               [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>;
-  def Srs : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift),
-                Size4Bytes, IIC_iALUsr,
+  def rr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+               Size4Bytes, IIC_iALUr,
+               [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]> {
+    let isCommutable = Commutable;
+  }
+  def rs : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift),
+               Size4Bytes, IIC_iALUsr,
                [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]>;
 }
 }
-}
 
 let canFoldAsLoad = 1, isReMaterializable = 1 in {
 multiclass AI_ldr1<bit isByte, string opc, InstrItinClass iii,
@@ -1299,6 +1322,15 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
     let Inst{3-0}  = dst;
   }
 
+  // For disassembly only.
+  def BX_pred : AXI<(outs), (ins GPR:$dst, pred:$p), BrMiscFrm, IIC_Br,
+                  "bx$p\t$dst", [/* pattern left blank */]>,
+              Requires<[IsARM, HasV4T]> {
+    bits<4> dst;
+    let Inst{27-4} = 0b000100101111111111110001;
+    let Inst{3-0}  = dst;
+  }
+
   // ARMV4 only
   // FIXME: We would really like to define this as a vanilla ARMPat like:
   // ARMPat<(brind GPR:$dst), (MOVr PC, GPR:$dst)>
@@ -1316,10 +1348,7 @@ let isCall = 1,
   // FIXME:  Do we really need a non-predicated version? If so, it should
   // at least be a pseudo instruction expanding to the predicated version
   // at MC lowering time.
-  Defs = [R0,  R1,  R2,  R3,  R12, LR,
-          D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
-          D16, D17, D18, D19, D20, D21, D22, D23,
-          D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR],
+  Defs = [R0,  R1,  R2,  R3,  R12, LR, QQQQ0, QQQQ2, QQQQ3, CPSR, FPSCR],
   Uses = [SP] in {
   def BL  : ABXI<0b1011, (outs), (ins bl_target:$func, variable_ops),
                 IIC_Br, "bl\t$func",
@@ -1373,10 +1402,7 @@ let isCall = 1,
   // On Darwin R9 is call-clobbered.
   // R7 is marked as a use to prevent frame-pointer assignments from being
   // moved above / below calls.
-  Defs = [R0,  R1,  R2,  R3,  R9,  R12, LR,
-          D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
-          D16, D17, D18, D19, D20, D21, D22, D23,
-          D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR],
+  Defs = [R0,  R1,  R2,  R3,  R9,  R12, LR, QQQQ0, QQQQ2, QQQQ3, CPSR, FPSCR],
   Uses = [R7, SP] in {
   def BLr9  : ARMPseudoInst<(outs), (ins bltarget:$func, variable_ops),
                 Size4Bytes, IIC_Br,
@@ -1415,10 +1441,7 @@ let isCall = 1,
 // FIXME: The Thumb versions of these should live in ARMInstrThumb.td
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
   // Darwin versions.
-  let Defs = [R0, R1, R2, R3, R9, R12,
-              D0, D1, D2, D3, D4, D5, D6, D7,
-              D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26,
-              D27, D28, D29, D30, D31, PC],
+  let Defs = [R0, R1, R2, R3, R9, R12, QQQQ0, QQQQ2, QQQQ3, PC],
       Uses = [SP] in {
     def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst, variable_ops),
                        IIC_Br, []>, Requires<[IsDarwin]>;
@@ -1444,10 +1467,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
   }
 
   // Non-Darwin versions (the difference is R9).
-  let Defs = [R0, R1, R2, R3, R12,
-              D0, D1, D2, D3, D4, D5, D6, D7,
-              D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26,
-              D27, D28, D29, D30, D31, PC],
+  let Defs = [R0, R1, R2, R3, R12, QQQQ0, QQQQ2, QQQQ3, PC],
       Uses = [SP] in {
     def TCRETURNdiND : PseudoInst<(outs), (ins i32imm:$dst, variable_ops),
                        IIC_Br, []>, Requires<[IsNotDarwin]>;
@@ -1629,7 +1649,7 @@ def LDRSB : AI3ld<0b1101, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
                    IIC_iLoad_bh_r, "ldrsb", "\t$Rt, $addr",
                    [(set GPR:$Rt, (sextloadi8 addrmode3:$addr))]>;
 
-let mayLoad = 1, neverHasSideEffects = 1 in {
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
 // Load doubleword
 def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rd, GPR:$dst2),
                  (ins addrmode3:$addr), LdMiscFrm,
@@ -1704,6 +1724,7 @@ let mayLoad = 1, neverHasSideEffects = 1 in {
 defm LDRH  : AI3_ldridx<0b1011, 1, "ldrh", IIC_iLoad_bh_ru>;
 defm LDRSH : AI3_ldridx<0b1111, 1, "ldrsh", IIC_iLoad_bh_ru>;
 defm LDRSB : AI3_ldridx<0b1101, 1, "ldrsb", IIC_iLoad_bh_ru>;
+let hasExtraDefRegAllocReq = 1 in {
 def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
                           (ins addrmode3:$addr), IndexModePre,
                           LdMiscFrm, IIC_iLoad_d_ru,
@@ -1729,6 +1750,7 @@ def LDRD_POST: AI3ldstidx<0b1101, 0, 1, 0, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
   let Inst{11-8}  = offset{7-4};    // imm7_4/zero
   let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
 }
+} // hasExtraDefRegAllocReq = 1
 } // mayLoad = 1, neverHasSideEffects = 1
 
 // LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT are for disassembly only.
@@ -1797,45 +1819,52 @@ def STRD : AI3str<0b1111, (outs), (ins GPR:$Rt, GPR:$src2, addrmode3:$addr),
 def STR_PRE  : AI2stridx<0, 1, (outs GPR:$Rn_wb),
                      (ins GPR:$Rt, GPR:$Rn, am2offset:$offset),
                      IndexModePre, StFrm, IIC_iStore_ru,
-                     "str", "\t$Rt, [$Rn, $offset]!", "$Rn = $Rn_wb",
+                     "str", "\t$Rt, [$Rn, $offset]!",
+                     "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
                      [(set GPR:$Rn_wb,
                       (pre_store GPR:$Rt, GPR:$Rn, am2offset:$offset))]>;
 
 def STR_POST : AI2stridx<0, 0, (outs GPR:$Rn_wb),
                      (ins GPR:$Rt, GPR:$Rn, am2offset:$offset),
                      IndexModePost, StFrm, IIC_iStore_ru,
-                     "str", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb",
+                     "str", "\t$Rt, [$Rn], $offset",
+                     "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
                      [(set GPR:$Rn_wb,
                       (post_store GPR:$Rt, GPR:$Rn, am2offset:$offset))]>;
 
 def STRB_PRE : AI2stridx<1, 1, (outs GPR:$Rn_wb),
                      (ins GPR:$Rt, GPR:$Rn, am2offset:$offset),
                      IndexModePre, StFrm, IIC_iStore_bh_ru,
-                     "strb", "\t$Rt, [$Rn, $offset]!", "$Rn = $Rn_wb",
+                     "strb", "\t$Rt, [$Rn, $offset]!",
+                     "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
                      [(set GPR:$Rn_wb, (pre_truncsti8 GPR:$Rt,
                                         GPR:$Rn, am2offset:$offset))]>;
 def STRB_POST: AI2stridx<1, 0, (outs GPR:$Rn_wb),
                      (ins GPR:$Rt, GPR:$Rn, am2offset:$offset),
                      IndexModePost, StFrm, IIC_iStore_bh_ru,
-                     "strb", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb",
+                     "strb", "\t$Rt, [$Rn], $offset",
+                     "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
                      [(set GPR:$Rn_wb, (post_truncsti8 GPR:$Rt,
                                         GPR:$Rn, am2offset:$offset))]>;
 
 def STRH_PRE : AI3stridx<0b1011, 0, 1, (outs GPR:$Rn_wb),
                      (ins GPR:$Rt, GPR:$Rn, am3offset:$offset),
                      IndexModePre, StMiscFrm, IIC_iStore_ru,
-                     "strh", "\t$Rt, [$Rn, $offset]!", "$Rn = $Rn_wb",
+                     "strh", "\t$Rt, [$Rn, $offset]!",
+                     "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
                      [(set GPR:$Rn_wb,
                       (pre_truncsti16 GPR:$Rt, GPR:$Rn, am3offset:$offset))]>;
 
 def STRH_POST: AI3stridx<0b1011, 0, 0, (outs GPR:$Rn_wb),
                      (ins GPR:$Rt, GPR:$Rn, am3offset:$offset),
                      IndexModePost, StMiscFrm, IIC_iStore_bh_ru,
-                     "strh", "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb",
+                     "strh", "\t$Rt, [$Rn], $offset",
+                     "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
                      [(set GPR:$Rn_wb, (post_truncsti16 GPR:$Rt,
                                         GPR:$Rn, am3offset:$offset))]>;
 
 // For disassembly only
+let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
 def STRD_PRE : AI3stdpr<(outs GPR:$base_wb),
                      (ins GPR:$src1, GPR:$src2, GPR:$base, am3offset:$offset),
                      StMiscFrm, IIC_iStore_d_ru,
@@ -1848,6 +1877,7 @@ def STRD_POST: AI3stdpo<(outs GPR:$base_wb),
                      StMiscFrm, IIC_iStore_d_ru,
                      "strd", "\t$src1, $src2, [$base], $offset",
                      "$base = $base_wb", []>;
+} // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1
 
 // STRT, STRBT, and STRHT are for disassembly only.
 
@@ -2317,8 +2347,10 @@ def : ARMPat<(addc   GPR:$src, so_imm_neg:$imm),
 // The with-carry-in form matches bitwise not instead of the negation.
 // Effectively, the inverse interpretation of the carry flag already accounts
 // for part of the negation.
-def : ARMPat<(adde   GPR:$src, so_imm_not:$imm),
+def : ARMPat<(adde_dead_carry   GPR:$src, so_imm_not:$imm),
              (SBCri  GPR:$src, so_imm_not:$imm)>;
+def : ARMPat<(adde_live_carry   GPR:$src, so_imm_not:$imm),
+             (SBCSri GPR:$src, so_imm_not:$imm)>;
 
 // Note: These are implemented in C++ code, because they have to generate
 // ADD/SUBrs instructions, which use a complex pattern that a xform function
@@ -2432,7 +2464,7 @@ def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
 
 // Signed/Unsigned saturate -- for disassembly only
 
-def SSAT : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$a, shift_imm:$sh),
+def SSAT : AI<(outs GPR:$Rd), (ins ssat_imm:$sat_imm, GPR:$a, shift_imm:$sh),
               SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $a$sh",
               [/* For disassembly only; pattern left blank */]> {
   bits<4> Rd;
@@ -2448,7 +2480,7 @@ def SSAT : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$a, shift_imm:$sh),
   let Inst{3-0} = Rn;
 }
 
-def SSAT16 : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$Rn), SatFrm,
+def SSAT16 : AI<(outs GPR:$Rd), (ins ssat_imm:$sat_imm, GPR:$Rn), SatFrm,
                 NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn",
                 [/* For disassembly only; pattern left blank */]> {
   bits<4> Rd;
@@ -2641,9 +2673,9 @@ def MUL  : AsMul1I32<0b0000000, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
 
 let Constraints = "@earlyclobber $Rd" in
 def MLAv5: ARMPseudoInst<(outs GPR:$Rd),
-                         (ins GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s),
-                         Size4Bytes, IIC_iMAC32,
-                         [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))]>,
+                        (ins GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s),
+                        Size4Bytes, IIC_iMAC32,
+                        [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))]>,
                         Requires<[IsARM, NoV6]> {
   bits<4> Ra;
   let Inst{15-12} = Ra;
@@ -2997,6 +3029,10 @@ def : ARMV6Pat<(sext_inreg (or (srl (and GPR:$Rm, 0xFF00), (i32 8)),
                                (shl GPR:$Rm, (i32 8))), i16),
                (REVSH GPR:$Rm)>;
 
+def : ARMV6Pat<(or (sra (shl GPR:$Rm, (i32 24)), (i32 16)),
+                   (and (srl GPR:$Rm, (i32 8)), 0xFF)),
+               (REVSH GPR:$Rm)>;
+
 // Need the AddedComplexity or else MOVs + REV would be chosen.
 let AddedComplexity = 5 in
 def : ARMV6Pat<(sra (bswap GPR:$Rm), (i32 16)), (REVSH GPR:$Rm)>;
@@ -3006,8 +3042,8 @@ def lsl_shift_imm : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(Sh, MVT::i32);
 }]>;
 
-def lsl_amt : PatLeaf<(i32 imm), [{
-  return (N->getZExtValue() < 32);
+def lsl_amt : ImmLeaf<i32, [{
+  return Imm > 0 && Imm < 32;
 }], lsl_shift_imm>;
 
 def PKHBT : APKHI<0b01101000, 0, (outs GPR:$Rd),
@@ -3029,8 +3065,8 @@ def asr_shift_imm : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(Sh, MVT::i32);
 }]>;
 
-def asr_amt : PatLeaf<(i32 imm), [{
-  return (N->getZExtValue() <= 32);
+def asr_amt : ImmLeaf<i32, [{
+  return Imm > 0 && Imm <= 32;
 }], asr_shift_imm>;
 
 // Note: Shifts of 1-15 bits will be transformed to srl instead of sra and
@@ -3241,6 +3277,18 @@ let usesCustomInserter = 1 in {
     def ATOMIC_LOAD_NAND_I8 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
       [(set GPR:$dst, (atomic_load_nand_8 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_MIN_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_min_8 GPR:$ptr, GPR:$val))]>;
+    def ATOMIC_LOAD_MAX_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_max_8 GPR:$ptr, GPR:$val))]>;
+    def ATOMIC_LOAD_UMIN_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_min_8 GPR:$ptr, GPR:$val))]>;
+    def ATOMIC_LOAD_UMAX_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_max_8 GPR:$ptr, GPR:$val))]>;
     def ATOMIC_LOAD_ADD_I16 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
       [(set GPR:$dst, (atomic_load_add_16 GPR:$ptr, GPR:$incr))]>;
@@ -3259,6 +3307,18 @@ let usesCustomInserter = 1 in {
     def ATOMIC_LOAD_NAND_I16 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
       [(set GPR:$dst, (atomic_load_nand_16 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_MIN_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_min_16 GPR:$ptr, GPR:$val))]>;
+    def ATOMIC_LOAD_MAX_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_max_16 GPR:$ptr, GPR:$val))]>;
+    def ATOMIC_LOAD_UMIN_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_min_16 GPR:$ptr, GPR:$val))]>;
+    def ATOMIC_LOAD_UMAX_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_max_16 GPR:$ptr, GPR:$val))]>;
     def ATOMIC_LOAD_ADD_I32 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
       [(set GPR:$dst, (atomic_load_add_32 GPR:$ptr, GPR:$incr))]>;
@@ -3277,6 +3337,18 @@ let usesCustomInserter = 1 in {
     def ATOMIC_LOAD_NAND_I32 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
       [(set GPR:$dst, (atomic_load_nand_32 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_MIN_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_min_32 GPR:$ptr, GPR:$val))]>;
+    def ATOMIC_LOAD_MAX_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_max_32 GPR:$ptr, GPR:$val))]>;
+    def ATOMIC_LOAD_UMIN_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_min_32 GPR:$ptr, GPR:$val))]>;
+    def ATOMIC_LOAD_UMAX_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_max_32 GPR:$ptr, GPR:$val))]>;
 
     def ATOMIC_SWAP_I8 : PseudoInst<
       (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
@@ -3307,8 +3379,9 @@ def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addrmode7:$addr), NoItinerary,
                     "ldrexh", "\t$Rt, $addr", []>;
 def LDREX  : AIldrex<0b00, (outs GPR:$Rt), (ins addrmode7:$addr), NoItinerary,
                     "ldrex", "\t$Rt, $addr", []>;
-def LDREXD : AIldrex<0b01, (outs GPR:$Rt, GPR:$Rt2), (ins addrmode7:$addr),
-                    NoItinerary, "ldrexd", "\t$Rt, $Rt2, $addr", []>;
+let hasExtraDefRegAllocReq = 1 in
+  def LDREXD : AIldrex<0b01, (outs GPR:$Rt, GPR:$Rt2), (ins addrmode7:$addr),
+                      NoItinerary, "ldrexd", "\t$Rt, $Rt2, $addr", []>;
 }
 
 let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
@@ -3318,10 +3391,12 @@ def STREXH : AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addrmode7:$addr),
                     NoItinerary, "strexh", "\t$Rd, $Rt, $addr", []>;
 def STREX  : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addrmode7:$addr),
                     NoItinerary, "strex", "\t$Rd, $Rt, $addr", []>;
+}
+
+let hasExtraSrcRegAllocReq = 1, Constraints = "@earlyclobber $Rd" in
 def STREXD : AIstrex<0b01, (outs GPR:$Rd),
                     (ins GPR:$Rt, GPR:$Rt2, addrmode7:$addr),
                     NoItinerary, "strexd", "\t$Rd, $Rt, $Rt2, $addr", []>;
-}
 
 // Clear-Exclusive is for disassembly only.
 def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex",
@@ -3345,7 +3420,8 @@ def SWPB : AIswp<1, (outs GPR:$Rt), (ins GPR:$Rt2, GPR:$Rn), "swpb",
 def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1,
             c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2),
             NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
-            [/* For disassembly only; pattern left blank */]> {
+            [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
+                          imm:$CRm, imm:$opc2)]> {
   bits<4> opc1;
   bits<4> CRn;
   bits<4> CRd;
@@ -3365,7 +3441,8 @@ def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1,
 def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1,
                c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2),
                NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
-               [/* For disassembly only; pattern left blank */]> {
+               [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
+                              imm:$CRm, imm:$opc2)]> {
   let Inst{31-28} = 0b1111;
   bits<4> opc1;
   bits<4> CRn;
@@ -3489,10 +3566,10 @@ defm STC2 : LdStCop<0b1111,    0, (ins),         "stc2", "">;
 // Move between coprocessor and ARM core register -- for disassembly only
 //
 
-class MovRCopro<string opc, bit direction, dag oops, dag iops>
+class MovRCopro<string opc, bit direction, dag oops, dag iops,
+                list<dag> pattern>
   : ABI<0b1110, oops, iops, NoItinerary, opc,
-        "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2",
-        [/* For disassembly only; pattern left blank */]> {
+        "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2", pattern> {
   let Inst{20} = direction;
   let Inst{4} = 1;
 
@@ -3512,17 +3589,23 @@ class MovRCopro<string opc, bit direction, dag oops, dag iops>
 }
 
 def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */,
-                    (outs), (ins p_imm:$cop, i32imm:$opc1,
-                                 GPR:$Rt, c_imm:$CRn, c_imm:$CRm,
-                                 i32imm:$opc2)>;
+                    (outs),
+                    (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, c_imm:$CRn,
+                         c_imm:$CRm, i32imm:$opc2),
+                    [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+                                  imm:$CRm, imm:$opc2)]>;
 def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */,
-                    (outs GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1,
-                                         c_imm:$CRn, c_imm:$CRm, i32imm:$opc2)>;
+                    (outs GPR:$Rt),
+                    (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, c_imm:$CRm,
+                         i32imm:$opc2), []>;
+
+def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
+             (MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
 
-class MovRCopro2<string opc, bit direction, dag oops, dag iops>
+class MovRCopro2<string opc, bit direction, dag oops, dag iops,
+                 list<dag> pattern>
   : ABXI<0b1110, oops, iops, NoItinerary,
-         !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"),
-         [/* For disassembly only; pattern left blank */]> {
+         !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), pattern> {
   let Inst{31-28} = 0b1111;
   let Inst{20} = direction;
   let Inst{4} = 1;
@@ -3543,19 +3626,25 @@ class MovRCopro2<string opc, bit direction, dag oops, dag iops>
 }
 
 def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */,
-                      (outs), (ins p_imm:$cop, i32imm:$opc1,
-                                   GPR:$Rt, c_imm:$CRn, c_imm:$CRm,
-                                   i32imm:$opc2)>;
+                      (outs),
+                      (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, c_imm:$CRn,
+                           c_imm:$CRm, i32imm:$opc2),
+                      [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+                                     imm:$CRm, imm:$opc2)]>;
 def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */,
-                      (outs GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1,
-                                           c_imm:$CRn, c_imm:$CRm,
-                                           i32imm:$opc2)>;
+                      (outs GPR:$Rt),
+                      (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, c_imm:$CRm,
+                           i32imm:$opc2), []>;
+
+def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn,
+                              imm:$CRm, imm:$opc2),
+                (MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
 
-class MovRRCopro<string opc, bit direction>
+class MovRRCopro<string opc, bit direction,
+                 list<dag> pattern = [/* For disassembly only */]>
   : ABI<0b1100, (outs), (ins p_imm:$cop, i32imm:$opc1,
         GPR:$Rt, GPR:$Rt2, c_imm:$CRm),
-        NoItinerary, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm",
-        [/* For disassembly only; pattern left blank */]> {
+        NoItinerary, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", pattern> {
   let Inst{23-21} = 0b010;
   let Inst{20} = direction;
 
@@ -3572,14 +3661,16 @@ class MovRRCopro<string opc, bit direction>
   let Inst{3-0}   = CRm;
 }
 
-def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */>;
+def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */,
+                      [(int_arm_mcrr imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2,
+                                     imm:$CRm)]>;
 def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>;
 
-class MovRRCopro2<string opc, bit direction>
+class MovRRCopro2<string opc, bit direction,
+                  list<dag> pattern = [/* For disassembly only */]>
   : ABXI<0b1100, (outs), (ins p_imm:$cop, i32imm:$opc1,
-         GPR:$Rt, GPR:$Rt2, c_imm:$CRm),
-         NoItinerary, !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"),
-         [/* For disassembly only; pattern left blank */]> {
+         GPR:$Rt, GPR:$Rt2, c_imm:$CRm), NoItinerary,
+         !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> {
   let Inst{31-28} = 0b1111;
   let Inst{23-21} = 0b010;
   let Inst{20} = direction;
@@ -3597,7 +3688,9 @@ class MovRRCopro2<string opc, bit direction>
   let Inst{3-0}   = CRm;
 }
 
-def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */>;
+def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */,
+                        [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2,
+                                        imm:$CRm)]>;
 def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */>;
 
 //===----------------------------------------------------------------------===//
@@ -3677,7 +3770,7 @@ let isCall = 1,
 //   here, and we're using the stack frame for the containing function to
 //   save/restore registers, we can't keep anything live in regs across
 //   the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
-//   when we get here from a longjmp(). We force everthing out of registers
+//   when we get here from a longjmp(). We force everything out of registers
 //   except for our own input by listing the relevant registers in Defs. By
 //   doing so, we also cause the prologue/epilogue code to actively preserve
 //   all of the callee-saved resgisters, which is exactly what we want.
@@ -3686,10 +3779,8 @@ let isCall = 1,
 // These are pseudo-instructions and are lowered to individual MC-insts, so
 // no encoding information is necessary.
 let Defs =
-  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR,  D0,
-    D1,  D2,  D3,  D4,  D5,  D6,  D7,  D8,  D9,  D10, D11, D12, D13, D14, D15,
-    D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30,
-    D31 ], hasSideEffects = 1, isBarrier = 1 in {
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR,
+    QQQQ0, QQQQ1, QQQQ2, QQQQ3 ], hasSideEffects = 1, isBarrier = 1 in {
   def Int_eh_sjlj_setjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$val),
                                NoItinerary,
                          [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>,
@@ -3697,7 +3788,7 @@ let Defs =
 }
 
 let Defs =
-  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR ],
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR ],
   hasSideEffects = 1, isBarrier = 1 in {
   def Int_eh_sjlj_setjmp_nofp : PseudoInst<(outs), (ins GPR:$src, GPR:$val),
                                    NoItinerary,
@@ -3720,8 +3811,8 @@ def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch),
 // that need the instruction size).
 let isBarrier = 1, hasSideEffects = 1 in
 def Int_eh_sjlj_dispatchsetup :
- PseudoInst<(outs), (ins), NoItinerary,
-            [(ARMeh_sjlj_dispatchsetup)]>,
+ PseudoInst<(outs), (ins GPR:$src), NoItinerary,
+            [(ARMeh_sjlj_dispatchsetup GPR:$src)]>,
               Requires<[IsDarwin]>;
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index e34d69a..977139f 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -175,7 +175,7 @@ class VLDQQWBPseudo<InstrItinClass itin>
                 (ins addrmode6:$addr, am6offset:$offset), itin,
                 "$addr.addr = $wb">;
 class VLDQQQQPseudo<InstrItinClass itin>
-  : PseudoNLdSt<(outs QQQQPR:$dst), (ins addrmode6:$addr, QQQQPR:$src), itin,"">;
+  : PseudoNLdSt<(outs QQQQPR:$dst), (ins addrmode6:$addr, QQQQPR:$src),itin,"">;
 class VLDQQQQWBPseudo<InstrItinClass itin>
   : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb),
                 (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin,
@@ -531,6 +531,17 @@ class VLD1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
                                          imm:$lane))]> {
   let Rm = 0b1111;
 }
+class VLD1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+             PatFrag LoadOp>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd),
+          (ins addrmode6oneL32:$Rn, DPR:$src, nohash_imm:$lane),
+          IIC_VLD1ln, "vld1", Dt, "\\{$Vd[$lane]\\}, $Rn",
+          "$src = $Vd",
+          [(set DPR:$Vd, (vector_insert (Ty DPR:$src),
+                                         (i32 (LoadOp addrmode6oneL32:$Rn)),
+                                         imm:$lane))]> {
+  let Rm = 0b1111;
+}
 class VLD1QLNPseudo<ValueType Ty, PatFrag LoadOp> : VLDQLNPseudo<IIC_VLD1ln> {
   let Pattern = [(set QPR:$dst, (vector_insert (Ty QPR:$src),
                                                (i32 (LoadOp addrmode6:$addr)),
@@ -544,7 +555,7 @@ def VLD1LNd16 : VLD1LN<0b0100, {?,?,0,?}, "16", v4i16, extloadi16> {
   let Inst{7-6} = lane{1-0};
   let Inst{4}   = Rn{4};
 }
-def VLD1LNd32 : VLD1LN<0b1000, {?,0,?,?}, "32", v2i32, load> {
+def VLD1LNd32 : VLD1LN32<0b1000, {?,0,?,?}, "32", v2i32, load> {
   let Inst{7} = lane{0};
   let Inst{5} = Rn{4};
   let Inst{4} = Rn{4};
@@ -1371,6 +1382,14 @@ class VST1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
           [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), addrmode6:$Rn)]> {
   let Rm = 0b1111;
 }
+class VST1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+             PatFrag StoreOp, SDNode ExtractOp>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6oneL32:$Rn, DPR:$Vd, nohash_imm:$lane),
+          IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "",
+          [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), addrmode6oneL32:$Rn)]>{
+  let Rm = 0b1111;
+}
 class VST1QLNPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
   : VSTQLNPseudo<IIC_VST1ln> {
   let Pattern = [(StoreOp (ExtractOp (Ty QPR:$src), imm:$lane),
@@ -1386,7 +1405,8 @@ def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16,
   let Inst{7-6} = lane{1-0};
   let Inst{4}   = Rn{5};
 }
-def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt> {
+
+def VST1LNd32 : VST1LN32<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt> {
   let Inst{7}   = lane{0};
   let Inst{5-4} = Rn{5-4};
 }
@@ -3773,7 +3793,8 @@ def  VBSLd    : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
                      (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
                      N3RegFrm, IIC_VCNTiD,
                      "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
-                     [(set DPR:$Vd, (v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>;
+                     [(set DPR:$Vd,
+                           (v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>;
 
 def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd),
                      (and DPR:$Vm, (vnotd DPR:$Vd)))),
@@ -3783,7 +3804,8 @@ def  VBSLq    : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
                      (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
                      N3RegFrm, IIC_VCNTiQ,
                      "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
-                     [(set QPR:$Vd, (v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>;
+                     [(set QPR:$Vd,
+                           (v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>;
 
 def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd),
                      (and QPR:$Vm, (vnotq QPR:$Vd)))),
@@ -4683,8 +4705,9 @@ def VEXTd32 : VEXTd<"vext", "32", v2i32> {
   let Inst{9-8}    = 0b00;
 }
 def VEXTdf  : VEXTd<"vext", "32", v2f32> {
-  let Inst{11}    = index{0};
-  let Inst{10-8}  = 0b000;
+  let Inst{11-10}    = index{1-0};
+  let Inst{9-8}  = 0b00;
+
 }
 
 def VEXTq8  : VEXTq<"vext", "8",  v16i8> {
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 15ab455..8430aa3 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -27,22 +27,22 @@ def imm_comp_XFORM : SDNodeXForm<imm, [{
 }]>;
 
 /// imm0_7 predicate - True if the 32-bit immediate is in the range [0,7].
-def imm0_7 : PatLeaf<(i32 imm), [{
-  return (uint32_t)N->getZExtValue() < 8;
+def imm0_7 : ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 8;
 }]>;
 def imm0_7_neg : PatLeaf<(i32 imm), [{
   return (uint32_t)-N->getZExtValue() < 8;
 }], imm_neg_XFORM>;
 
-def imm0_255 : PatLeaf<(i32 imm), [{
-  return (uint32_t)N->getZExtValue() < 256;
+def imm0_255 : ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 256;
 }]>;
 def imm0_255_comp : PatLeaf<(i32 imm), [{
   return ~((uint32_t)N->getZExtValue()) < 256;
 }]>;
 
-def imm8_255 : PatLeaf<(i32 imm), [{
-  return (uint32_t)N->getZExtValue() >= 8 && (uint32_t)N->getZExtValue() < 256;
+def imm8_255 : ImmLeaf<i32, [{
+  return Imm >= 8 && Imm < 256;
 }]>;
 def imm8_255_neg : PatLeaf<(i32 imm), [{
   unsigned Val = -N->getZExtValue();
@@ -383,6 +383,14 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
 
 // Indirect branches
 let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+  def tBX : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bx${p}\t$Rm", []>,
+            T1Special<{1,1,0,?}> {
+    // A6.2.3 & A8.6.25
+    bits<4> Rm;
+    let Inst{6-3} = Rm;
+    let Inst{2-0} = 0b000;
+  }
+
   def tBRIND : TI<(outs), (ins GPR:$Rm),
                   IIC_Br,
                   "mov\tpc, $Rm",
@@ -414,10 +422,7 @@ def tPOP_RET : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
 // potentially appearing dead.
 let isCall = 1,
   // On non-Darwin platforms R9 is callee-saved.
-  Defs = [R0,  R1,  R2,  R3,  R12, LR,
-          D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
-          D16, D17, D18, D19, D20, D21, D22, D23,
-          D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR],
+  Defs = [R0,  R1,  R2,  R3,  R12, LR, QQQQ0, QQQQ2, QQQQ3, CPSR, FPSCR],
   Uses = [SP] in {
   // Also used for Thumb2
   def tBL  : TIx2<0b11110, 0b11, 1,
@@ -451,14 +456,15 @@ let isCall = 1,
                   "blx\t$func",
                   [(ARMtcall GPR:$func)]>,
               Requires<[IsThumb, HasV5T, IsNotDarwin]>,
-              T1Special<{1,1,1,?}>; // A6.2.3 & A8.6.24;
+              T1Special<{1,1,1,?}> { // A6.2.3 & A8.6.24;
+    bits<4> func;
+    let Inst{6-3} = func;
+    let Inst{2-0} = 0b000;
+  }
 
   // ARMv4T
-  // FIXME: Should be a pseudo.
-  let isCodeGenOnly = 1 in
-  def tBX : TIx2<{?,?,?,?,?}, {?,?}, ?,
-                  (outs), (ins tGPR:$func, variable_ops), IIC_Br,
-                  "mov\tlr, pc\n\tbx\t$func",
+  def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func, variable_ops),
+                  Size4Bytes, IIC_Br,
                   [(ARMcall_nolink tGPR:$func)]>,
             Requires<[IsThumb, IsThumb1Only, IsNotDarwin]>;
 }
@@ -467,10 +473,7 @@ let isCall = 1,
   // On Darwin R9 is call-clobbered.
   // R7 is marked as a use to prevent frame-pointer assignments from being
   // moved above / below calls.
-  Defs = [R0,  R1,  R2,  R3,  R9,  R12, LR,
-          D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
-          D16, D17, D18, D19, D20, D21, D22, D23,
-          D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR],
+  Defs = [R0,  R1,  R2,  R3,  R9,  R12, LR, QQQQ0, QQQQ2, QQQQ3, CPSR, FPSCR],
   Uses = [R7, SP] in {
   // Also used for Thumb2
   def tBLr9 : TIx2<0b11110, 0b11, 1,
@@ -512,11 +515,8 @@ let isCall = 1,
   }
 
   // ARMv4T
-  let isCodeGenOnly = 1 in
-  // FIXME: Should be a pseudo.
-  def tBXr9 : TIx2<{?,?,?,?,?}, {?,?}, ?,
-                   (outs), (ins tGPR:$func, variable_ops), IIC_Br,
-                   "mov\tlr, pc\n\tbx\t$func",
+  def tBXr9_CALL : tPseudoInst<(outs), (ins tGPR:$func, variable_ops),
+                   Size4Bytes, IIC_Br,
                    [(ARMcall_nolink tGPR:$func)]>,
               Requires<[IsThumb, IsThumb1Only, IsDarwin]>;
 }
@@ -551,7 +551,7 @@ let isBranch = 1, isTerminator = 1 in
   def tBcc : T1I<(outs), (ins t_bcctarget:$target, pred:$p), IIC_Br,
                  "b${p}\t$target",
                  [/*(ARMbrcond bb:$target, imm:$cc)*/]>,
-             T1Encoding<{1,1,0,1,?,?}> {
+             T1BranchCond<{1,1,0,1}> {
   bits<4> p;
   bits<8> target;
   let Inst{11-8} = p;
@@ -597,7 +597,7 @@ def tSVC : T1pI<(outs), (ins i32imm:$imm), IIC_Br,
 
 // The assembler uses 0xDEFE for a trap instruction.
 let isBarrier = 1, isTerminator = 1 in
-def tTRAP : TI<(outs), (ins), IIC_Br, 
+def tTRAP : TI<(outs), (ins), IIC_Br,
                "trap", [(trap)]>, Encoding16 {
   let Inst = 0xdefe;
 }
@@ -712,6 +712,19 @@ def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
   let Inst{7-0}  = addr;
 }
 
+// FIXME: Remove this entry when the above ldr.n workaround is fixed.
+// For disassembly use only.
+def tLDRpciDIS : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
+                       "ldr", "\t$Rt, $addr",
+                       [/* disassembly only */]>,
+                 T1Encoding<{0,1,0,0,1,?}> {
+  // A6.2 & A8.6.59
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-8} = Rt;
+  let Inst{7-0}  = addr;
+}
+
 // A8.6.194 & A8.6.192
 defm tSTR  : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rrs4,
                                 t_addrmode_is4, AddrModeT1_4,
@@ -726,9 +739,9 @@ defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rrs1,
 
 // A8.6.207 & A8.6.205
 defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rrs2,
-                                t_addrmode_is2, AddrModeT1_2,
-                                IIC_iStore_bh_r, IIC_iStore_bh_i, "strh",
-                                BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
+                               t_addrmode_is2, AddrModeT1_2,
+                               IIC_iStore_bh_r, IIC_iStore_bh_i, "strh",
+                               BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
 
 
 def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i,
@@ -791,7 +804,7 @@ defm tLDM : thumb_ldst_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu,
 let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
 defm tSTM : thumb_ldst_mult<"stm", IIC_iStore_m, IIC_iStore_mu,
                             {1,1,0,0,0,?}, 0>;
- 
+
 } // neverHasSideEffects
 
 let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in
@@ -898,7 +911,8 @@ def tADC :                      // A8.6.2
 
 // Add immediate
 def tADDi3 :                    // A8.6.4 T1
-  T1sIGenEncodeImm<0b01110, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm3), IIC_iALUi,
+  T1sIGenEncodeImm<0b01110, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm3),
+                   IIC_iALUi,
                    "add", "\t$Rd, $Rm, $imm3",
                    [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7:$imm3))]> {
   bits<3> imm3;
@@ -1175,10 +1189,18 @@ def tREVSH :                    // A8.6.136
                  "revsh", "\t$Rd, $Rm",
                  [(set tGPR:$Rd,
                        (sext_inreg
-                         (or (srl (and tGPR:$Rm, 0xFF00), (i32 8)),
+                         (or (srl tGPR:$Rm, (i32 8)),
                              (shl tGPR:$Rm, (i32 8))), i16))]>,
                  Requires<[IsThumb, IsThumb1Only, HasV6]>;
 
+def : T1Pat<(sext_inreg (or (srl (and tGPR:$Rm, 0xFF00), (i32 8)),
+                            (shl tGPR:$Rm, (i32 8))), i16),
+            (tREVSH tGPR:$Rm)>,
+      Requires<[IsThumb, IsThumb1Only, HasV6]>;
+
+def : T1Pat<(sra (bswap tGPR:$Rm), (i32 16)), (tREVSH tGPR:$Rm)>,
+      Requires<[IsThumb, IsThumb1Only, HasV6]>;
+
 // Rotate right register
 def tROR :                      // A8.6.139
   T1sItDPEncode<0b0111, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
@@ -1322,9 +1344,10 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
 // Move between coprocessor and ARM core register -- for disassembly only
 //
 
-class tMovRCopro<string opc, bit direction, dag oops, dag iops>
+class tMovRCopro<string opc, bit direction, dag oops, dag iops,
+                 list<dag> pattern>
   : T1Cop<oops, iops, !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"),
-          [/* For disassembly only; pattern left blank */]> {
+          pattern> {
   let Inst{27-24} = 0b1110;
   let Inst{20} = direction;
   let Inst{4} = 1;
@@ -1345,16 +1368,24 @@ class tMovRCopro<string opc, bit direction, dag oops, dag iops>
 }
 
 def tMCR : tMovRCopro<"mcr", 0 /* from ARM core register to coprocessor */,
-           (outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, c_imm:$CRn,
-                        c_imm:$CRm, i32imm:$opc2)>;
+           (outs),
+           (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, c_imm:$CRn,
+                c_imm:$CRm, i32imm:$opc2),
+           [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+                         imm:$CRm, imm:$opc2)]>;
 def tMRC : tMovRCopro<"mrc", 1 /* from coprocessor to ARM core register */,
-           (outs GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn,
-                                c_imm:$CRm, i32imm:$opc2)>;
+           (outs GPR:$Rt),
+           (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2),
+           []>;
+
+def : Pat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
+          (tMRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>,
+          Requires<[IsThumb, HasV6T2]>;
 
-class tMovRRCopro<string opc, bit direction>
+class tMovRRCopro<string opc, bit direction,
+                  list<dag> pattern = [/* For disassembly only */]>
   : T1Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm),
-          !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"),
-          [/* For disassembly only; pattern left blank */]> {
+          !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> {
   let Inst{27-24} = 0b1100;
   let Inst{23-21} = 0b010;
   let Inst{20} = direction;
@@ -1372,7 +1403,9 @@ class tMovRRCopro<string opc, bit direction>
   let Inst{3-0}   = CRm;
 }
 
-def tMCRR : tMovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */>;
+def tMCRR : tMovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */,
+                        [(int_arm_mcrr imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2,
+                                       imm:$CRm)]>;
 def tMRRC : tMovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>;
 
 //===----------------------------------------------------------------------===//
@@ -1381,7 +1414,8 @@ def tMRRC : tMovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>;
 def tCDP : T1Cop<(outs), (ins p_imm:$cop, i32imm:$opc1,
                  c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2),
                  "cdp\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
-                 [/* For disassembly only; pattern left blank */]> {
+                 [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
+                               imm:$CRm, imm:$opc2)]> {
   let Inst{27-24} = 0b1110;
 
   bits<4> opc1;
@@ -1415,19 +1449,19 @@ def tTPsoft : TIx2<0b11110, 0b11, 1, (outs), (ins), IIC_Br,
 
 //===----------------------------------------------------------------------===//
 // SJLJ Exception handling intrinsics
-// 
+//
 
 // eh_sjlj_setjmp() is an instruction sequence to store the return address and
 // save #0 in R0 for the non-longjmp case.  Since by its nature we may be coming
 // from some other function to get here, and we're using the stack frame for the
 // containing function to save/restore registers, we can't keep anything live in
 // regs across the eh_sjlj_setjmp(), else it will almost certainly have been
-// tromped upon when we get here from a longjmp(). We force everthing out of
+// tromped upon when we get here from a longjmp(). We force everything out of
 // registers except for our own input by listing the relevant registers in
 // Defs. By doing so, we also cause the prologue/epilogue code to actively
 // preserve all of the callee-saved resgisters, which is exactly what we want.
 // $val is a scratch register for our use.
-let Defs = [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7, R12 ],
+let Defs = [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7, R12, CPSR ],
     hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in
 def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val),
                                   AddrModeNone, SizeSpecial, NoItinerary, "","",
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 6794e75..53b9cec 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -44,7 +44,9 @@ def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{
 // t2_so_imm - Match a 32-bit immediate operand, which is an
 // 8-bit immediate rotated by an arbitrary number of bits, or an 8-bit
 // immediate splatted into multiple bytes of the word.
-def t2_so_imm : Operand<i32>, PatLeaf<(imm), [{ return Pred_t2_so_imm(N); }]> {
+def t2_so_imm : Operand<i32>, ImmLeaf<i32, [{
+    return ARM_AM::getT2SOImmVal(Imm) != -1;
+  }]> {
   let EncoderMethod = "getT2SOImmOpValue";
 }
 
@@ -62,14 +64,14 @@ def t2_so_imm_neg : Operand<i32>,
 }], t2_so_imm_neg_XFORM>;
 
 /// imm1_31 predicate - True if the 32-bit immediate is in the range [1,31].
-def imm1_31 : PatLeaf<(i32 imm), [{
-  return (int32_t)N->getZExtValue() >= 1 && (int32_t)N->getZExtValue() < 32;
+def imm1_31 : ImmLeaf<i32, [{
+  return (int32_t)Imm >= 1 && (int32_t)Imm < 32;
 }]>;
 
 /// imm0_4095 predicate - True if the 32-bit immediate is in the range [0.4095].
 def imm0_4095 : Operand<i32>,
-                PatLeaf<(i32 imm), [{
-  return (uint32_t)N->getZExtValue() < 4096;
+                ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 4096;
 }]>;
 
 def imm0_4095_neg : PatLeaf<(i32 imm), [{
@@ -84,6 +86,11 @@ def imm0_255_not : PatLeaf<(i32 imm), [{
   return (uint32_t)(~N->getZExtValue()) < 255;
 }], imm_comp_XFORM>;
 
+def lo5AllOne : PatLeaf<(i32 imm), [{
+  // Returns true if all low 5-bits are 1.
+  return (((uint32_t)N->getZExtValue()) & 0x1FUL) == 0x1FUL;
+}]>;
+
 // Define Thumb2 specific addressing modes.
 
 // t2addrmode_imm12  := reg + imm12
@@ -151,7 +158,7 @@ def t2addrmode_so_reg : Operand<i32>,
 //
 def t2addrmode_reg : Operand<i32> {
   let PrintMethod = "printAddrMode7Operand";
-  let MIOperandInfo = (ops tGPR);
+  let MIOperandInfo = (ops GPR);
   let ParserMatchClass = MemMode7AsmOperand;
 }
 
@@ -681,49 +688,27 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
      let Inst{24-21} = opcod;
    }
 }
+}
 
 // Carry setting variants
-let isCodeGenOnly = 1, Defs = [CPSR] in {
-multiclass T2I_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode,
-                               bit Commutable = 0> {
+// NOTE: CPSR def omitted because it will be handled by the custom inserter.
+let usesCustomInserter = 1 in {
+multiclass T2I_adde_sube_s_irs<PatFrag opnode, bit Commutable = 0> {
    // shifted imm
-   def ri : T2sTwoRegImm<
-                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi,
-                 opc, "\t$Rd, $Rn, $imm",
-                 [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>,
-                 Requires<[IsThumb2]> {
-     let Inst{31-27} = 0b11110;
-     let Inst{25} = 0;
-     let Inst{24-21} = opcod;
-     let Inst{20} = 1; // The S bit.
-     let Inst{15} = 0;
-   }
+   def ri : t2PseudoInst<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm),
+                Size4Bytes, IIC_iALUi,
+                [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>;
    // register
-   def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr,
-                 opc, ".w\t$Rd, $Rn, $Rm",
-                 [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>,
-                 Requires<[IsThumb2]> {
+   def rr : t2PseudoInst<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
+                Size4Bytes, IIC_iALUr,
+                [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]> {
      let isCommutable = Commutable;
-     let Inst{31-27} = 0b11101;
-     let Inst{26-25} = 0b01;
-     let Inst{24-21} = opcod;
-     let Inst{20} = 1; // The S bit.
-     let Inst{14-12} = 0b000; // imm3
-     let Inst{7-6} = 0b00; // imm2
-     let Inst{5-4} = 0b00; // type
    }
    // shifted register
-   def rs : T2sTwoRegShiftedReg<
-                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
-                 IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
-                 [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>,
-                 Requires<[IsThumb2]> {
-     let Inst{31-27} = 0b11101;
-     let Inst{26-25} = 0b01;
-     let Inst{24-21} = opcod;
-     let Inst{20} = 1; // The S bit.
-   }
-}
+   def rs : t2PseudoInst<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
+                Size4Bytes, IIC_iALUsi,
+                [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>;
 }
 }
 
@@ -845,6 +830,7 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
     let Inst{15-12} = Rt;
 
     bits<17> addr;
+    let addr{12}    = 1;           // add = TRUE
     let Inst{19-16} = addr{16-13}; // Rn
     let Inst{23}    = addr{12};    // U
     let Inst{11-0}  = addr{11-0};  // imm
@@ -925,6 +911,7 @@ multiclass T2I_st<bits<2> opcod, string opc,
     let Inst{15-12} = Rt;
 
     bits<17> addr;
+    let addr{12}    = 1;           // add = TRUE
     let Inst{19-16} = addr{16-13}; // Rn
     let Inst{23}    = addr{12};    // U
     let Inst{11-0}  = addr{11-0};  // imm
@@ -1097,7 +1084,7 @@ multiclass T2I_exta_rrot_DO<bits<3> opcod, string opc> {
      let Inst{7} = 1;
      let Inst{5-4} = 0b00; // rotate
    }
-  def rr_rot : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, i32imm:$rot),
+  def rr_rot :T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, i32imm:$rot),
                   IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm, ror $rot", []> {
      let Inst{31-27} = 0b11111;
      let Inst{26-23} = 0b0100;
@@ -1379,7 +1366,7 @@ def t2LDRSH_POST : T2Iidxldst<1, 0b01, 1, 0, (outs GPR:$dst, GPR:$Rn),
 // for disassembly only.
 // Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4
 class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii>
-  : T2Ii8<(outs GPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc,
+  : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc,
           "\t$Rt, $addr", []> {
   let Inst{31-27} = 0b11111;
   let Inst{26-25} = 0b00;
@@ -1421,42 +1408,48 @@ def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs),
 def t2STR_PRE  : T2Iidxldst<0, 0b10, 0, 1, (outs GPR:$base_wb),
                             (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
-                         "str", "\t$Rt, [$Rn, $addr]!", "$Rn = $base_wb",
+                         "str", "\t$Rt, [$Rn, $addr]!",
+                         "$Rn = $base_wb,@earlyclobber $base_wb",
              [(set GPR:$base_wb,
                    (pre_store GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
 
 def t2STR_POST : T2Iidxldst<0, 0b10, 0, 0, (outs GPR:$base_wb),
                             (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
                             AddrModeT2_i8, IndexModePost, IIC_iStore_iu,
-                          "str", "\t$Rt, [$Rn], $addr", "$Rn = $base_wb",
+                          "str", "\t$Rt, [$Rn], $addr",
+                          "$Rn = $base_wb,@earlyclobber $base_wb",
              [(set GPR:$base_wb,
                   (post_store GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
 
 def t2STRH_PRE  : T2Iidxldst<0, 0b01, 0, 1, (outs GPR:$base_wb),
                             (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
-                        "strh", "\t$Rt, [$Rn, $addr]!", "$Rn = $base_wb",
+                        "strh", "\t$Rt, [$Rn, $addr]!",
+                        "$Rn = $base_wb,@earlyclobber $base_wb",
         [(set GPR:$base_wb,
               (pre_truncsti16 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
 
 def t2STRH_POST : T2Iidxldst<0, 0b01, 0, 0, (outs GPR:$base_wb),
                             (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
                             AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu,
-                         "strh", "\t$Rt, [$Rn], $addr", "$Rn = $base_wb",
+                         "strh", "\t$Rt, [$Rn], $addr",
+                         "$Rn = $base_wb,@earlyclobber $base_wb",
        [(set GPR:$base_wb,
              (post_truncsti16 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
 
 def t2STRB_PRE  : T2Iidxldst<0, 0b00, 0, 1, (outs GPR:$base_wb),
                             (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iStore_bh_iu,
-                        "strb", "\t$Rt, [$Rn, $addr]!", "$Rn = $base_wb",
+                        "strb", "\t$Rt, [$Rn, $addr]!",
+                        "$Rn = $base_wb,@earlyclobber $base_wb",
          [(set GPR:$base_wb,
                (pre_truncsti8 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
 
 def t2STRB_POST : T2Iidxldst<0, 0b00, 0, 0, (outs GPR:$base_wb),
                             (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
                             AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu,
-                         "strb", "\t$Rt, [$Rn], $addr", "$Rn = $base_wb",
+                         "strb", "\t$Rt, [$Rn], $addr",
+                         "$Rn = $base_wb,@earlyclobber $base_wb",
         [(set GPR:$base_wb,
               (post_truncsti8 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
 
@@ -1464,7 +1457,7 @@ def t2STRB_POST : T2Iidxldst<0, 0b00, 0, 0, (outs GPR:$base_wb),
 // only.
 // Ref: A8.6.193 STR (immediate, Thumb) Encoding T4
 class T2IstT<bits<2> type, string opc, InstrItinClass ii>
-  : T2Ii8<(outs GPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc,
+  : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc,
           "\t$Rt, $addr", []> {
   let Inst{31-27} = 0b11111;
   let Inst{26-25} = 0b00;
@@ -1489,20 +1482,20 @@ def t2STRHT  : T2IstT<0b01, "strht", IIC_iStore_bh_i>;
 // ldrd / strd pre / post variants
 // For disassembly only.
 
-def t2LDRD_PRE  : T2Ii8s4<1, 1, 1, (outs GPR:$Rt, GPR:$Rt2),
+def t2LDRD_PRE  : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2),
                  (ins GPR:$base, t2am_imm8s4_offset:$imm), IIC_iLoad_d_ru,
                  "ldrd", "\t$Rt, $Rt2, [$base, $imm]!", []>;
 
-def t2LDRD_POST : T2Ii8s4<0, 1, 1, (outs GPR:$Rt, GPR:$Rt2),
+def t2LDRD_POST : T2Ii8s4<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2),
                  (ins GPR:$base, t2am_imm8s4_offset:$imm), IIC_iLoad_d_ru,
                  "ldrd", "\t$Rt, $Rt2, [$base], $imm", []>;
 
 def t2STRD_PRE  : T2Ii8s4<1, 1, 0, (outs),
-                 (ins GPR:$Rt, GPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm),
+                 (ins rGPR:$Rt, rGPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm),
                  IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, [$base, $imm]!", []>;
 
 def t2STRD_POST : T2Ii8s4<0, 1, 0, (outs),
-                 (ins GPR:$Rt, GPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm),
+                 (ins rGPR:$Rt, rGPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm),
                  IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, [$base], $imm", []>;
 
 // T2Ipl (Preload Data/Instruction) signals the memory system of possible future
@@ -1522,6 +1515,7 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
     let Inst{15-12} = 0b1111;
 
     bits<17> addr;
+    let addr{12}    = 1;           // add = TRUE
     let Inst{19-16} = addr{16-13}; // Rn
     let Inst{23}    = addr{12};    // U
     let Inst{11-0}  = addr{11-0};  // imm12
@@ -1794,10 +1788,10 @@ defm t2ADC  : T2I_adde_sube_irs<0b1010, "adc",
                           BinOpFrag<(adde_dead_carry node:$LHS, node:$RHS)>, 1>;
 defm t2SBC  : T2I_adde_sube_irs<0b1011, "sbc",
                           BinOpFrag<(sube_dead_carry node:$LHS, node:$RHS)>>;
-defm t2ADCS : T2I_adde_sube_s_irs<0b1010, "adc",
-                          BinOpFrag<(adde_live_carry node:$LHS, node:$RHS)>, 1>;
-defm t2SBCS : T2I_adde_sube_s_irs<0b1011, "sbc",
-                          BinOpFrag<(sube_live_carry node:$LHS, node:$RHS)>>;
+defm t2ADCS : T2I_adde_sube_s_irs<BinOpFrag<(adde_live_carry node:$LHS,
+                                                             node:$RHS)>, 1>;
+defm t2SBCS : T2I_adde_sube_s_irs<BinOpFrag<(sube_live_carry node:$LHS,
+                                                             node:$RHS)>>;
 
 // RSB
 defm t2RSB  : T2I_rbin_irs  <0b1110, "rsb",
@@ -1828,9 +1822,14 @@ def : T2Pat<(addc       rGPR:$src, t2_so_imm_neg:$imm),
 // Effectively, the inverse interpretation of the carry flag already accounts
 // for part of the negation.
 let AddedComplexity = 1 in
-def : T2Pat<(adde       rGPR:$src, imm0_255_not:$imm),
+def : T2Pat<(adde_dead_carry       rGPR:$src, imm0_255_not:$imm),
+            (t2SBCri    rGPR:$src, imm0_255_not:$imm)>;
+def : T2Pat<(adde_dead_carry       rGPR:$src, t2_so_imm_not:$imm),
+            (t2SBCri    rGPR:$src, t2_so_imm_not:$imm)>;
+let AddedComplexity = 1 in
+def : T2Pat<(adde_live_carry       rGPR:$src, imm0_255_not:$imm),
             (t2SBCSri   rGPR:$src, imm0_255_not:$imm)>;
-def : T2Pat<(adde       rGPR:$src, t2_so_imm_not:$imm),
+def : T2Pat<(adde_live_carry       rGPR:$src, t2_so_imm_not:$imm),
             (t2SBCSri   rGPR:$src, t2_so_imm_not:$imm)>;
 
 // Select Bytes -- for disassembly only
@@ -1976,9 +1975,9 @@ class T2SatI<dag oops, dag iops, InstrItinClass itin,
 }
 
 def t2SSAT: T2SatI<
-                (outs rGPR:$Rd), (ins i32imm:$sat_imm, rGPR:$Rn, shift_imm:$sh),
-                NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh",
-                [/* For disassembly only; pattern left blank */]> {
+              (outs rGPR:$Rd), (ins ssat_imm:$sat_imm, rGPR:$Rn, shift_imm:$sh),
+              NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh",
+              [/* For disassembly only; pattern left blank */]> {
   let Inst{31-27} = 0b11110;
   let Inst{25-22} = 0b1100;
   let Inst{20} = 0;
@@ -1986,9 +1985,9 @@ def t2SSAT: T2SatI<
 }
 
 def t2SSAT16: T2SatI<
-                   (outs rGPR:$Rd), (ins i32imm:$sat_imm, rGPR:$Rn), NoItinerary,
-                   "ssat16", "\t$Rd, $sat_imm, $Rn",
-                   [/* For disassembly only; pattern left blank */]> {
+                (outs rGPR:$Rd), (ins ssat_imm:$sat_imm, rGPR:$Rn), NoItinerary,
+                "ssat16", "\t$Rd, $sat_imm, $Rn",
+                [/* For disassembly only; pattern left blank */]> {
   let Inst{31-27} = 0b11110;
   let Inst{25-22} = 0b1100;
   let Inst{20} = 0;
@@ -2008,10 +2007,10 @@ def t2USAT: T2SatI<
   let Inst{15} = 0;
 }
 
-def t2USAT16: T2SatI<
-                    (outs rGPR:$dst), (ins i32imm:$sat_imm, rGPR:$Rn), NoItinerary,
-                   "usat16", "\t$dst, $sat_imm, $Rn",
-                   [/* For disassembly only; pattern left blank */]> {
+def t2USAT16: T2SatI<(outs rGPR:$dst), (ins i32imm:$sat_imm, rGPR:$Rn),
+                     NoItinerary,
+                     "usat16", "\t$dst, $sat_imm, $Rn",
+                     [/* For disassembly only; pattern left blank */]> {
   let Inst{31-27} = 0b11110;
   let Inst{25-22} = 0b1110;
   let Inst{20} = 0;
@@ -2033,6 +2032,10 @@ defm t2LSR  : T2I_sh_ir<0b01, "lsr", BinOpFrag<(srl  node:$LHS, node:$RHS)>>;
 defm t2ASR  : T2I_sh_ir<0b10, "asr", BinOpFrag<(sra  node:$LHS, node:$RHS)>>;
 defm t2ROR  : T2I_sh_ir<0b11, "ror", BinOpFrag<(rotr node:$LHS, node:$RHS)>>;
 
+// (rotr x, (and y, 0x...1f)) ==> (ROR x, y)
+def : Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)),
+          (t2RORrr rGPR:$lhs, rGPR:$rhs)>;
+
 let Uses = [CPSR] in {
 def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
                    "rrx", "\t$Rd, $Rm",
@@ -2121,10 +2124,12 @@ def t2BFC : T2BitFI<(outs rGPR:$Rd), (ins rGPR:$src, bf_inv_mask_imm:$imm),
                 IIC_iUNAsi, "bfc", "\t$Rd, $imm",
                 [(set rGPR:$Rd, (and rGPR:$src, bf_inv_mask_imm:$imm))]> {
   let Inst{31-27} = 0b11110;
+  let Inst{26} = 0; // should be 0.
   let Inst{25} = 1;
   let Inst{24-20} = 0b10110;
   let Inst{19-16} = 0b1111; // Rn
   let Inst{15} = 0;
+  let Inst{5} = 0; // should be 0.
 
   bits<10> imm;
   let msb{4-0} = imm{9-5};
@@ -2157,9 +2162,11 @@ let Constraints = "$src = $Rd" in {
                   [(set rGPR:$Rd, (ARMbfi rGPR:$src, rGPR:$Rn,
                                    bf_inv_mask_imm:$imm))]> {
     let Inst{31-27} = 0b11110;
+    let Inst{26} = 0; // should be 0.
     let Inst{25} = 1;
     let Inst{24-20} = 0b10110;
     let Inst{15} = 0;
+    let Inst{5} = 0; // should be 0.
 
     bits<10> imm;
     let msb{4-0} = imm{9-5};
@@ -2174,9 +2181,11 @@ let Constraints = "$src = $Rd" in {
                   IIC_iBITi, "bfi", "\t$Rd, $Rn, $lsbit, $width",
                   []> {
     let Inst{31-27} = 0b11110;
+    let Inst{26} = 0; // should be 0.
     let Inst{25} = 1;
     let Inst{24-20} = 0b10110;
     let Inst{15} = 0;
+    let Inst{5} = 0; // should be 0.
 
     bits<5> lsbit;
     bits<5> width;
@@ -2595,6 +2604,10 @@ def : T2Pat<(sext_inreg (or (srl (and rGPR:$Rm, 0xFF00), (i32 8)),
                             (shl rGPR:$Rm, (i32 8))), i16),
             (t2REVSH rGPR:$Rm)>;
 
+def : T2Pat<(or (sra (shl rGPR:$Rm, (i32 24)), (i32 16)),
+                   (and (srl rGPR:$Rm, (i32 8)), 0xFF)),
+            (t2REVSH rGPR:$Rm)>;
+
 def : T2Pat<(sra (bswap rGPR:$Rm), (i32 16)), (t2REVSH rGPR:$Rm)>;
 
 def t2PKHBT : T2ThreeReg<
@@ -2854,16 +2867,15 @@ class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, SizeFlagVal sz,
 }
 
 let mayLoad = 1 in {
-def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$Rt), (ins t2addrmode_reg:$addr), AddrModeNone,
-                         Size4Bytes, NoItinerary, "ldrexb", "\t$Rt, $addr",
-                         "", []>;
-def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$Rt), (ins t2addrmode_reg:$addr), AddrModeNone,
-                         Size4Bytes, NoItinerary, "ldrexh", "\t$Rt, $addr",
-                         "", []>;
-def t2LDREX  : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_reg:$addr), AddrModeNone,
-                       Size4Bytes, NoItinerary,
-                       "ldrex", "\t$Rt, $addr", "",
-                      []> {
+def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$Rt), (ins t2addrmode_reg:$addr),
+                         AddrModeNone, Size4Bytes, NoItinerary,
+                         "ldrexb", "\t$Rt, $addr", "", []>;
+def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$Rt), (ins t2addrmode_reg:$addr),
+                         AddrModeNone, Size4Bytes, NoItinerary,
+                         "ldrexh", "\t$Rt, $addr", "", []>;
+def t2LDREX  : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_reg:$addr),
+                       AddrModeNone, Size4Bytes, NoItinerary,
+                       "ldrex", "\t$Rt, $addr", "", []> {
   let Inst{31-27} = 0b11101;
   let Inst{26-20} = 0b0000101;
   let Inst{11-8} = 0b1111;
@@ -2874,7 +2886,9 @@ def t2LDREX  : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_reg:$addr), AddrModeNone
   let Inst{19-16} = addr;
   let Inst{15-12} = Rt;
 }
-def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2), (ins t2addrmode_reg:$addr),
+let hasExtraDefRegAllocReq = 1 in
+def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2),
+                         (ins t2addrmode_reg:$addr),
                          AddrModeNone, Size4Bytes, NoItinerary,
                          "ldrexd", "\t$Rt, $Rt2, $addr", "",
                          [], {?, ?, ?, ?}> {
@@ -2884,12 +2898,14 @@ def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2), (ins t2addrmode_reg:$
 }
 
 let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
-def t2STREXB : T2I_strex<0b00, (outs rGPR:$Rd), (ins rGPR:$Rt, t2addrmode_reg:$addr),
-                  AddrModeNone, Size4Bytes, NoItinerary,
-                  "strexb", "\t$Rd, $Rt, $addr", "", []>;
-def t2STREXH : T2I_strex<0b01, (outs rGPR:$Rd), (ins rGPR:$Rt, t2addrmode_reg:$addr),
-                  AddrModeNone, Size4Bytes, NoItinerary,
-                  "strexh", "\t$Rd, $Rt, $addr", "", []>;
+def t2STREXB : T2I_strex<0b00, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, t2addrmode_reg:$addr),
+                         AddrModeNone, Size4Bytes, NoItinerary,
+                         "strexb", "\t$Rd, $Rt, $addr", "", []>;
+def t2STREXH : T2I_strex<0b01, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, t2addrmode_reg:$addr),
+                         AddrModeNone, Size4Bytes, NoItinerary,
+                         "strexh", "\t$Rd, $Rt, $addr", "", []>;
 def t2STREX  : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, t2addrmode_reg:$addr),
                   AddrModeNone, Size4Bytes, NoItinerary,
                   "strex", "\t$Rd, $Rt, $addr", "",
@@ -2905,6 +2921,9 @@ def t2STREX  : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, t2addrmode_reg:$addr),
   let Inst{19-16} = addr;
   let Inst{15-12} = Rt;
 }
+}
+
+let hasExtraSrcRegAllocReq = 1, Constraints = "@earlyclobber $Rd" in
 def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd),
                          (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_reg:$addr),
                          AddrModeNone, Size4Bytes, NoItinerary,
@@ -2913,7 +2932,6 @@ def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd),
   bits<4> Rt2;
   let Inst{11-8} = Rt2;
 }
-}
 
 // Clear-Exclusive is for disassembly only.
 def t2CLREX : T2XI<(outs), (ins), NoItinerary, "clrex",
@@ -2952,16 +2970,15 @@ let isCall = 1,
 //   here, and we're using the stack frame for the containing function to
 //   save/restore registers, we can't keep anything live in regs across
 //   the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
-//   when we get here from a longjmp(). We force everthing out of registers
+//   when we get here from a longjmp(). We force everything out of registers
 //   except for our own input by listing the relevant registers in Defs. By
 //   doing so, we also cause the prologue/epilogue code to actively preserve
 //   all of the callee-saved resgisters, which is exactly what we want.
 //   $val is a scratch register for our use.
 let Defs =
-  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR,  D0,
-    D1,  D2,  D3,  D4,  D5,  D6,  D7,  D8,  D9,  D10, D11, D12, D13, D14, D15,
-    D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30,
-    D31 ], hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in {
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR,
+    QQQQ0, QQQQ1, QQQQ2, QQQQ3 ],
+  hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in {
   def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val),
                                AddrModeNone, SizeSpecial, NoItinerary, "", "",
                           [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>,
@@ -2969,7 +2986,7 @@ let Defs =
 }
 
 let Defs =
-  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR ],
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR ],
   hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in {
   def t2Int_eh_sjlj_setjmp_nofp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val),
                                AddrModeNone, SizeSpecial, NoItinerary, "", "",
@@ -3225,19 +3242,20 @@ class T2RFE<bits<12> op31_20, dag oops, dag iops, InstrItinClass itin,
 
   bits<4> Rn;
   let Inst{19-16} = Rn;
+  let Inst{15-0} = 0xc000;
 }
 
 def t2RFEDBW : T2RFE<0b111010000011,
-                   (outs), (ins rGPR:$Rn), NoItinerary, "rfedb", "\t$Rn!",
+                   (outs), (ins GPR:$Rn), NoItinerary, "rfedb", "\t$Rn!",
                    [/* For disassembly only; pattern left blank */]>;
 def t2RFEDB  : T2RFE<0b111010000001,
-                   (outs), (ins rGPR:$Rn), NoItinerary, "rfeab", "\t$Rn",
+                   (outs), (ins GPR:$Rn), NoItinerary, "rfedb", "\t$Rn",
                    [/* For disassembly only; pattern left blank */]>;
 def t2RFEIAW : T2RFE<0b111010011011,
-                   (outs), (ins rGPR:$Rn), NoItinerary, "rfeia", "\t$Rn!",
+                   (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn!",
                    [/* For disassembly only; pattern left blank */]>;
 def t2RFEIA  : T2RFE<0b111010011001,
-                   (outs), (ins rGPR:$Rn), NoItinerary, "rfeia", "\t$Rn",
+                   (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn",
                    [/* For disassembly only; pattern left blank */]>;
 
 //===----------------------------------------------------------------------===//
@@ -3339,9 +3357,10 @@ def t2MSR : T2SpecialReg<0b111100111000 /* op31-20 */, 0b10 /* op15-14 */,
 // Move between coprocessor and ARM core register -- for disassembly only
 //
 
-class t2MovRCopro<string opc, bit direction, dag oops, dag iops>
+class t2MovRCopro<string opc, bit direction, dag oops, dag iops,
+                  list<dag> pattern>
   : T2Cop<oops, iops, !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"),
-          [/* For disassembly only; pattern left blank */]> {
+          pattern> {
   let Inst{27-24} = 0b1110;
   let Inst{20} = direction;
   let Inst{4} = 1;
@@ -3363,15 +3382,21 @@ class t2MovRCopro<string opc, bit direction, dag oops, dag iops>
 
 def t2MCR2 : t2MovRCopro<"mcr2", 0 /* from ARM core register to coprocessor */,
              (outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, c_imm:$CRn,
-                          c_imm:$CRm, i32imm:$opc2)>;
+                          c_imm:$CRm, i32imm:$opc2),
+             [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+                            imm:$CRm, imm:$opc2)]>;
 def t2MRC2 : t2MovRCopro<"mrc2", 1 /* from coprocessor to ARM core register */,
              (outs GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn,
-                                  c_imm:$CRm, i32imm:$opc2)>;
+                                  c_imm:$CRm, i32imm:$opc2), []>;
+
+def : T2v6Pat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn,
+                            imm:$CRm, imm:$opc2),
+              (t2MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
 
-class t2MovRRCopro<string opc, bit direction>
+class t2MovRRCopro<string opc, bit direction,
+                   list<dag> pattern = [/* For disassembly only */]>
   : T2Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm),
-          !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"),
-          [/* For disassembly only; pattern left blank */]> {
+          !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> {
   let Inst{27-24} = 0b1100;
   let Inst{23-21} = 0b010;
   let Inst{20} = direction;
@@ -3390,7 +3415,9 @@ class t2MovRRCopro<string opc, bit direction>
 }
 
 def t2MCRR2 : t2MovRRCopro<"mcrr2",
-                           0 /* from ARM core register to coprocessor */>;
+                           0 /* from ARM core register to coprocessor */,
+                           [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt,
+                                           GPR:$Rt2, imm:$CRm)]>;
 def t2MRRC2 : t2MovRRCopro<"mrrc2",
                            1 /* from coprocessor to ARM core register */>;
 
@@ -3401,7 +3428,8 @@ def t2MRRC2 : t2MovRRCopro<"mrrc2",
 def t2CDP2 : T2Cop<(outs), (ins p_imm:$cop, i32imm:$opc1,
                    c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2),
                    "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
-                   [/* For disassembly only; pattern left blank */]> {
+                   [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
+                                  imm:$CRm, imm:$opc2)]> {
   let Inst{27-24} = 0b1110;
 
   bits<4> opc1;
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index a731731..b4c3239 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -94,7 +94,8 @@ multiclass vfp_ldst_mult<string asm, bit L_bit,
     let Inst{20}    = L_bit;
   }
   def DIA_UPD :
-    AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+    AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs,
+                               variable_ops),
           IndexModeUpd, itin_upd,
           !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
     let Inst{24-23} = 0b01;       // Increment After
@@ -102,7 +103,8 @@ multiclass vfp_ldst_mult<string asm, bit L_bit,
     let Inst{20}    = L_bit;
   }
   def DDB_UPD :
-    AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+    AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs,
+                               variable_ops),
           IndexModeUpd, itin_upd,
           !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
     let Inst{24-23} = 0b10;       // Decrement Before
@@ -124,7 +126,8 @@ multiclass vfp_ldst_mult<string asm, bit L_bit,
     let D = VFPNeonDomain;
   }
   def SIA_UPD :
-    AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops),
+    AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs,
+                               variable_ops),
           IndexModeUpd, itin_upd,
           !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
     let Inst{24-23} = 0b01;       // Increment After
@@ -136,7 +139,8 @@ multiclass vfp_ldst_mult<string asm, bit L_bit,
     let D = VFPNeonDomain;
   }
   def SDB_UPD :
-    AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops),
+    AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs,
+                               variable_ops),
           IndexModeUpd, itin_upd,
           !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
     let Inst{24-23} = 0b10;       // Decrement Before
@@ -447,6 +451,10 @@ def VMOVRS : AVConv2I<0b11100001, 0b1010,
 
   let Inst{6-5}   = 0b00;
   let Inst{3-0}   = 0b0000;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
 }
 
 def VMOVSR : AVConv4I<0b11100000, 0b1010,
@@ -464,6 +472,10 @@ def VMOVSR : AVConv4I<0b11100000, 0b1010,
 
   let Inst{6-5}   = 0b00;
   let Inst{3-0}   = 0b0000;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
 }
 
 let neverHasSideEffects = 1 in {
@@ -483,6 +495,10 @@ def VMOVRRD  : AVConv3I<0b11000101, 0b1011,
   let Inst{19-16} = Rt2;
 
   let Inst{7-6} = 0b00;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
 }
 
 def VMOVRRS  : AVConv3I<0b11000101, 0b1010,
@@ -490,6 +506,10 @@ def VMOVRRS  : AVConv3I<0b11000101, 0b1010,
                  IIC_fpMOVDI, "vmov", "\t$wb, $dst2, $src1, $src2",
                  [/* For disassembly only; pattern left blank */]> {
   let Inst{7-6} = 0b00;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
 }
 } // neverHasSideEffects
 
@@ -512,6 +532,10 @@ def VMOVDRR : AVConv5I<0b11000100, 0b1011,
   let Inst{19-16} = Rt2;
 
   let Inst{7-6}   = 0b00;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
 }
 
 let neverHasSideEffects = 1 in
@@ -520,6 +544,10 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010,
                 IIC_fpMOVID, "vmov", "\t$dst1, $dst2, $src1, $src2",
                 [/* For disassembly only; pattern left blank */]> {
   let Inst{7-6} = 0b00;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
 }
 
 // FMRDH: SPR -> GPR
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index ac5cbfe..f4645f1 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -761,7 +761,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
     MIB.addOperand(MI->getOperand(OpNum));
 
   // Transfer memoperands.
-  (*MIB).setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
 
   MBB.erase(MBBI);
   return true;
@@ -947,8 +947,8 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
   return true;
 }
 
-/// isMemoryOp - Returns true if instruction is a memory operations (that this
-/// pass is capable of operating on).
+/// isMemoryOp - Returns true if instruction is a memory operation that this
+/// pass is capable of operating on.
 static bool isMemoryOp(const MachineInstr *MI) {
   // When no memory operands are present, conservatively assume unaligned,
   // volatile, unfoldable.
@@ -1287,14 +1287,14 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
         MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
                      CurrPred, CurrPredReg, Scratch, MemOps, Merges);
 
-        // Try folding preceeding/trailing base inc/dec into the generated
+        // Try folding preceding/trailing base inc/dec into the generated
         // LDM/STM ops.
         for (unsigned i = 0, e = Merges.size(); i < e; ++i)
           if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
             ++NumMerges;
         NumMerges += Merges.size();
 
-        // Try folding preceeding/trailing base inc/dec into those load/store
+        // Try folding preceding/trailing base inc/dec into those load/store
         // that were not merged to form LDM/STM ops.
         for (unsigned i = 0; i != NumMemOps; ++i)
           if (!MemOps[i].Merged)
@@ -1304,7 +1304,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
         // RS may be pointing to an instruction that's deleted.
         RS->skipTo(prior(MBBI));
       } else if (NumMemOps == 1) {
-        // Try folding preceeding/trailing base inc/dec into the single
+        // Try folding preceding/trailing base inc/dec into the single
         // load/store.
         if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
           ++NumMerges;
@@ -1334,7 +1334,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
 }
 
 /// MergeReturnIntoLDM - If this is a exit BB, try merging the return ops
-/// ("bx lr" and "mov pc, lr") into the preceeding stack restore so it
+/// ("bx lr" and "mov pc, lr") into the preceding stack restore so it
 /// directly restore the value of LR into pc.
 ///   ldmfd sp!, {..., lr}
 ///   bx lr
@@ -1672,10 +1672,14 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
           Ops.pop_back();
           Ops.pop_back();
 
+          const TargetInstrDesc &TID = TII->get(NewOpc);
+          const TargetRegisterClass *TRC = TID.OpInfo[0].getRegClass(TRI);
+          MRI->constrainRegClass(EvenReg, TRC);
+          MRI->constrainRegClass(OddReg, TRC);
+
           // Form the pair instruction.
           if (isLd) {
-            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
-                                              dl, TII->get(NewOpc))
+            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, TID)
               .addReg(EvenReg, RegState::Define)
               .addReg(OddReg, RegState::Define)
               .addReg(BaseReg);
@@ -1687,8 +1691,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
             MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
             ++NumLDRDFormed;
           } else {
-            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
-                                              dl, TII->get(NewOpc))
+            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, TID)
               .addReg(EvenReg)
               .addReg(OddReg)
               .addReg(BaseReg);
diff --git a/lib/Target/ARM/ARMMCAsmInfo.cpp b/lib/Target/ARM/ARMMCAsmInfo.cpp
index a3f89e9..53b4c95 100644
--- a/lib/Target/ARM/ARMMCAsmInfo.cpp
+++ b/lib/Target/ARM/ARMMCAsmInfo.cpp
@@ -70,8 +70,6 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo() {
   WeakRefDirective = "\t.weak\t";
   HasLCOMMDirective = true;
 
-  DwarfRequiresFrameSection = false;
-
   SupportsDebugInformation = true;
 
   // Exceptions handling
diff --git a/lib/Target/ARM/ARMMCCodeEmitter.cpp b/lib/Target/ARM/ARMMCCodeEmitter.cpp
index 10607b1..c5f727d 100644
--- a/lib/Target/ARM/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/ARMMCCodeEmitter.cpp
@@ -269,10 +269,15 @@ public:
   unsigned getMsbOpValue(const MCInst &MI, unsigned Op,
                          SmallVectorImpl<MCFixup> &Fixups) const;
 
+  unsigned getSsatBitPosValue(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+
   unsigned getRegisterListOpValue(const MCInst &MI, unsigned Op,
                                   SmallVectorImpl<MCFixup> &Fixups) const;
   unsigned getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
                                       SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
+                                        SmallVectorImpl<MCFixup> &Fixups) const;
   unsigned getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
                                         SmallVectorImpl<MCFixup> &Fixups) const;
   unsigned getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
@@ -1122,6 +1127,13 @@ getMsbOpValue(const MCInst &MI, unsigned Op,
 }
 
 unsigned ARMMCCodeEmitter::
+getSsatBitPosValue(const MCInst &MI, unsigned Op,
+                   SmallVectorImpl<MCFixup> &Fixups) const {
+  // For ssat instructions, the bit position should be encoded decremented by 1
+  return MI.getOperand(Op).getImm()-1;
+}
+
+unsigned ARMMCCodeEmitter::
 getRegisterListOpValue(const MCInst &MI, unsigned Op,
                        SmallVectorImpl<MCFixup> &Fixups) const {
   // VLDM/VSTM:
@@ -1178,6 +1190,30 @@ getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
   return RegNo | (Align << 4);
 }
 
+/// getAddrMode6OneLane32AddressOpValue - Encode an addrmode6 register number
+/// along  with the alignment operand for use in VST1 and VLD1 with size 32.
+unsigned ARMMCCodeEmitter::
+getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
+                                    SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &Reg = MI.getOperand(Op);
+  const MCOperand &Imm = MI.getOperand(Op + 1);
+
+  unsigned RegNo = getARMRegisterNumbering(Reg.getReg());
+  unsigned Align = 0;
+
+  switch (Imm.getImm()) {
+  default: break;
+  case 2:
+  case 4:
+  case 8:
+  case 16: Align = 0x00; break;
+  case 32: Align = 0x03; break;
+  }
+
+  return RegNo | (Align << 4);
+}
+
+
 /// getAddrMode6DupAddressOpValue - Encode an addrmode6 register number and
 /// alignment operand for use in VLD-dup instructions.  This is the same as
 /// getAddrMode6AddressOpValue except for the alignment encoding, which is
diff --git a/lib/Target/ARM/ARMMCExpr.h b/lib/Target/ARM/ARMMCExpr.h
index d42f766..0a2e883 100644
--- a/lib/Target/ARM/ARMMCExpr.h
+++ b/lib/Target/ARM/ARMMCExpr.h
@@ -60,6 +60,9 @@ public:
   bool EvaluateAsRelocatableImpl(MCValue &Res,
                                  const MCAsmLayout *Layout) const;
   void AddValueSymbols(MCAssembler *) const;
+  const MCSection *FindAssociatedSection() const {
+    return getSubExpr()->FindAssociatedSection();
+  }
 
   static bool classof(const MCExpr *E) {
     return E->getKind() == MCExpr::Target;
diff --git a/lib/Target/ARM/ARMPerfectShuffle.h b/lib/Target/ARM/ARMPerfectShuffle.h
index edecc4b..18e1620 100644
--- a/lib/Target/ARM/ARMPerfectShuffle.h
+++ b/lib/Target/ARM/ARMPerfectShuffle.h
@@ -21,6566 +21,6566 @@
 
 // This table is 6561*4 = 26244 bytes in size.
 static const unsigned PerfectShuffleTable[6561+1] = {
-   135053414U,  // <0,0,0,0>: Cost 1 vdup0 LHS
-  1543503974U,  // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS
-  2618572962U,  // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0>
-  2568054923U,  // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
-  1476398390U,  // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
-  2550140624U,  // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3>
-  2550141434U,  // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3>
-  2591945711U,  // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
-   135053414U,  // <0,0,0,u>: Cost 1 vdup0 LHS
-  2886516736U,  // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0>
-  1812775014U,  // <0,0,1,1>: Cost 2 vzipl LHS, LHS
-  1618133094U,  // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
-  2625209292U,  // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0>
-  2886558034U,  // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5>
-  2617246864U,  // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7>
-  3659723031U,  // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1>
-  2591953904U,  // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1>
-  1812775581U,  // <0,0,1,u>: Cost 2 vzipl LHS, LHS
-  3020734464U,  // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0>
-  3020734474U,  // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1>
-  1946992742U,  // <0,0,2,2>: Cost 2 vtrnl LHS, LHS
-  2631181989U,  // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0>
-  3020734668U,  // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6>
-  3826550569U,  // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6>
-  2617247674U,  // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7>
-  2591962097U,  // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
-  1946992796U,  // <0,0,2,u>: Cost 2 vtrnl LHS, LHS
-  2635163787U,  // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
-  2686419196U,  // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0>
-  2686492933U,  // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0>
-  2617248156U,  // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3>
-  2617248258U,  // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6>
-  3826551298U,  // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6>
-  3690990200U,  // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7>
-  3713551042U,  // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0>
-  2635163787U,  // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
-  2617248658U,  // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1>
-  2888450150U,  // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
-  3021570150U,  // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
-  3641829519U,  // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4>
-  3021570252U,  // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6>
-  1543507254U,  // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS
-  2752810294U,  // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS
-  3786998152U,  // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5>
-  1543507497U,  // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS
-  2684354972U,  // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7>
-  2617249488U,  // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3>
-  3765617070U,  // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7>
-  3635865780U,  // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5>
-  2617249734U,  // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6>
-  2617249796U,  // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5>
-  2718712274U,  // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
-  2617249960U,  // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7>
-  2720039396U,  // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7>
-  2684355053U,  // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7>
-  3963609190U,  // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS
-  2617250298U,  // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3>
-  3796435464U,  // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7>
-  3659762998U,  // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS
-  3659763810U,  // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>
-  2617250616U,  // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6>
-  2657727309U,  // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0>
-  2658390942U,  // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0>
-  2659054575U,  // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
-  3635880854U,  // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0>
-  3635881401U,  // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7>
-  3734787298U,  // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0>
-  2617251174U,  // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6>
-  3659772002U,  // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0>
-  3659772189U,  // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7>
-  2617251436U,  // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7>
-  2659054575U,  // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
-   135053414U,  // <0,0,u,0>: Cost 1 vdup0 LHS
-  1817419878U,  // <0,0,u,1>: Cost 2 vzipl LHS, LHS
-  1947435110U,  // <0,0,u,2>: Cost 2 vtrnl LHS, LHS
-  2568120467U,  // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u>
-  1476463926U,  // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS
-  1543510170U,  // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS
-  2752813210U,  // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS
-  2592011255U,  // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u>
-   135053414U,  // <0,0,u,u>: Cost 1 vdup0 LHS
-  2618581002U,  // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1>
-  1557446758U,  // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS
-  2618581155U,  // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1>
-  2690548468U,  // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0>
-  2626543954U,  // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5>
-  4094985216U,  // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7>
-  2592019278U,  // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1>
-  2592019448U,  // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0>
-  1557447325U,  // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS
-  1476476938U,  // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1>
-  2886517556U,  // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1>
-  2886517654U,  // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0>
-  2886517720U,  // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3>
-  1476480310U,  // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
-  2886558864U,  // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7>
-  2550223354U,  // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3>
-  2550223856U,  // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1>
-  1476482862U,  // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS
-  1494401126U,  // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS
-  3020735284U,  // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1>
-  2562172349U,  // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2>
-      835584U,  // <0,1,2,3>: Cost 0 copy LHS
-  1494404406U,  // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS
-  3020735488U,  // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7>
-  2631190458U,  // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7>
-  1518294010U,  // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2>
-      835584U,  // <0,1,2,u>: Cost 0 copy LHS
-  2692318156U,  // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0>
-  2691875800U,  // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3>
-  2691875806U,  // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0>
-  2692539367U,  // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0>
-  2562182454U,  // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS
-  2691875840U,  // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7>
-  2692760578U,  // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0>
-  2639817411U,  // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1>
-  2691875863U,  // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3>
-  2568159334U,  // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS
-  4095312692U,  // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1>
-  2568160934U,  // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1>
-  2568161432U,  // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4>
-  2568162614U,  // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS
-  1557450038U,  // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS
-  2754235702U,  // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS
-  2592052220U,  // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4>
-  1557450281U,  // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS
-  3765617775U,  // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1>
-  2647781007U,  // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1>
-  3704934138U,  // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0>
-  2691875984U,  // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7>
-  2657734598U,  // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6>
-  2650435539U,  // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1>
-  2651099172U,  // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1>
-  2651762805U,  // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1>
-  2691876029U,  // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7>
-  2592063590U,  // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS
-  3765617871U,  // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7>
-  2654417337U,  // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1>
-  3765617889U,  // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7>
-  2592066870U,  // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS
-  3765617907U,  // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7>
-  2657071869U,  // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1>
-  1583993678U,  // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1>
-  1584657311U,  // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1>
-  2657735672U,  // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0>
-  2657735808U,  // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1>
-  2631193772U,  // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0>
-  2661053667U,  // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1>
-  2657736038U,  // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6>
-  3721524621U,  // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0>
-  2657736158U,  // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0>
-  2657736300U,  // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7>
-  2657736322U,  // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2>
-  1494450278U,  // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS
-  1557452590U,  // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS
-  2754238254U,  // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS
-      835584U,  // <0,1,u,3>: Cost 0 copy LHS
-  1494453558U,  // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS
-  1557452954U,  // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS
-  2754238618U,  // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS
-  1518343168U,  // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u>
-      835584U,  // <0,1,u,u>: Cost 0 copy LHS
-  2752299008U,  // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0>
-  1544847462U,  // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS
-  1678557286U,  // <0,2,0,2>: Cost 2 vuzpl LHS, LHS
-  2696521165U,  // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0>
-  2752340172U,  // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6>
-  2691876326U,  // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7>
-  2618589695U,  // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7>
-  2592093185U,  // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0>
-  1678557340U,  // <0,2,0,u>: Cost 2 vuzpl LHS, LHS
-  2618589942U,  // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2>
-  2752299828U,  // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1>
-  2886518376U,  // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2>
-  2752299766U,  // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
-  2550295862U,  // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS
-  2752340992U,  // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7>
-  2886559674U,  // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7>
-  3934208106U,  // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7>
-  2752340771U,  // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2>
-  1476558868U,  // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2>
-  2226628029U,  // <0,2,2,1>: Cost 3 vrev <2,0,1,2>
-  2752300648U,  // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2>
-  3020736114U,  // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3>
-  1476562230U,  // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS
-  2550304464U,  // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3>
-  2618591162U,  // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7>
-  2550305777U,  // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2>
-  1476564782U,  // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS
-  2618591382U,  // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2>
-  2752301206U,  // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
-  3826043121U,  // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3>
-  2752301468U,  // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
-  2618591746U,  // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6>
-  2752301570U,  // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
-  3830688102U,  // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3>
-  2698807012U,  // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0>
-  2752301269U,  // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
-  2562261094U,  // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS
-  4095313828U,  // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3>
-  2226718152U,  // <0,2,4,2>: Cost 3 vrev <2,0,2,4>
-  2568235169U,  // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4>
-  2562264374U,  // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS
-  1544850742U,  // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
-  1678560566U,  // <0,2,4,6>: Cost 2 vuzpl LHS, RHS
-  2592125957U,  // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
-  1678560584U,  // <0,2,4,u>: Cost 2 vuzpl LHS, RHS
-  2691876686U,  // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7>
-  2618592976U,  // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3>
-  3765618528U,  // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7>
-  3765618536U,  // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6>
-  2618593222U,  // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6>
-  2752303108U,  // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5>
-  2618593378U,  // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0>
-  2824785206U,  // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS
-  2824785207U,  // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS
-  2752303950U,  // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1>
-  3830690081U,  // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2>
-  2618593786U,  // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3>
-  2691876794U,  // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7>
-  2752303990U,  // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5>
-  3830690445U,  // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6>
-  2752303928U,  // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6>
-  2657743695U,  // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2>
-  2691876839U,  // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7>
-  2659070961U,  // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
-  2659734594U,  // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2>
-  3734140051U,  // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2>
-  2701166596U,  // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0>
-  2662389094U,  // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6>
-  2662389126U,  // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2>
-  3736794583U,  // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2>
-  2752304748U,  // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7>
-  2659070961U,  // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
-  1476608026U,  // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u>
-  1544853294U,  // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS
-  1678563118U,  // <0,2,u,2>: Cost 2 vuzpl LHS, LHS
-  3021178482U,  // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3>
-  1476611382U,  // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS
-  1544853658U,  // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS
-  1678563482U,  // <0,2,u,6>: Cost 2 vuzpl LHS, RHS
-  2824785449U,  // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS
-  1678563172U,  // <0,2,u,u>: Cost 2 vuzpl LHS, LHS
-  2556329984U,  // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0>
-  2686421142U,  // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2>
-  2562303437U,  // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0>
-  4094986652U,  // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3>
-  2556333366U,  // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
-  4094986754U,  // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6>
-  3798796488U,  // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7>
-  3776530634U,  // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0>
-  2556335918U,  // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS
-  2886518934U,  // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2>
-  2556338933U,  // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1>
-  2691877105U,  // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3>
-  2886519196U,  // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3>
-  2886519298U,  // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6>
-  4095740418U,  // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6>
-  3659944242U,  // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1>
-  3769600286U,  // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3>
-  2886519582U,  // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2>
-  1482604646U,  // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
-  1482605302U,  // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2>
-  2556348008U,  // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2>
-  3020736924U,  // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3>
-  1482607926U,  // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
-  3020737026U,  // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6>
-  2598154746U,  // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3>
-  2598155258U,  // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2>
-  1482610478U,  // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS
-  3692341398U,  // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2>
-  2635851999U,  // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3>
-  3636069840U,  // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3>
-  2691877276U,  // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3>
-  3961522690U,  // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6>
-  3826797058U,  // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6>
-  3703622282U,  // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7>
-  3769600452U,  // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7>
-  2640497430U,  // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3>
-  3962194070U,  // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2>
-  2232617112U,  // <0,3,4,1>: Cost 3 vrev <3,0,1,4>
-  2232690849U,  // <0,3,4,2>: Cost 3 vrev <3,0,2,4>
-  4095314332U,  // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3>
-  3962194434U,  // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6>
-  2691877378U,  // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6>
-  3826765110U,  // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS
-  3665941518U,  // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4>
-  2691877405U,  // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6>
-  3630112870U,  // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS
-  3630113526U,  // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2>
-  4035199734U,  // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2>
-  3769600578U,  // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7>
-  2232846516U,  // <0,3,5,4>: Cost 3 vrev <3,0,4,5>
-  3779037780U,  // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7>
-  2718714461U,  // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7>
-  2706106975U,  // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0>
-  2233141464U,  // <0,3,5,u>: Cost 3 vrev <3,0,u,5>
-  2691877496U,  // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7>
-  3727511914U,  // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3>
-  3765619338U,  // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7>
-  3765619347U,  // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7>
-  3765987996U,  // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7>
-  3306670270U,  // <0,3,6,5>: Cost 4 vrev <3,0,5,6>
-  3792456365U,  // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6>
-  2706770608U,  // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0>
-  2706844345U,  // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0>
-  3769600707U,  // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1>
-  2659742787U,  // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
-  3636102612U,  // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7>
-  3769600740U,  // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7>
-  3769600747U,  // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5>
-  3769600758U,  // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7>
-  3659993400U,  // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7>
-  3781176065U,  // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0>
-  2664388218U,  // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3>
-  1482653798U,  // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS
-  1482654460U,  // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u>
-  2556397160U,  // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2>
-  3021179292U,  // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3>
-  1482657078U,  // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS
-  3021179394U,  // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6>
-  2598203898U,  // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3>
-  2708097874U,  // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0>
-  1482659630U,  // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS
-  2617278468U,  // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4>
-  2618605670U,  // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS
-  2618605734U,  // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4>
-  3642091695U,  // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0>
-  2753134796U,  // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6>
-  2718714770U,  // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1>
-  3021245750U,  // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
-  3665982483U,  // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0>
-  3021245768U,  // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS
-  2568355942U,  // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS
-  3692348212U,  // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1>
-  3692348310U,  // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0>
-  2568358064U,  // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1>
-  2568359222U,  // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS
-  1812778294U,  // <0,4,1,5>: Cost 2 vzipl LHS, RHS
-  3022671158U,  // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS
-  2592248852U,  // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
-  1812778537U,  // <0,4,1,u>: Cost 2 vzipl LHS, RHS
-  2568364134U,  // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS
-  2238573423U,  // <0,4,2,1>: Cost 3 vrev <4,0,1,2>
-  3692349032U,  // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2>
-  2631214761U,  // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4>
-  2568367414U,  // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS
-  2887028022U,  // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS
-  1946996022U,  // <0,4,2,6>: Cost 2 vtrnl LHS, RHS
-  2592257045U,  // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
-  1946996040U,  // <0,4,2,u>: Cost 2 vtrnl LHS, RHS
-  3692349590U,  // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2>
-  3826878614U,  // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2>
-  3826878625U,  // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4>
-  3692349852U,  // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3>
-  3692349954U,  // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6>
-  3826878978U,  // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6>
-  4095200566U,  // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS
-  3713583814U,  // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4>
-  3692350238U,  // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2>
-  2550464552U,  // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4>
-  3962194914U,  // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0>
-  3693677631U,  // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3>
-  3642124467U,  // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4>
-  2718715088U,  // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4>
-  2618608950U,  // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS
-  2753137974U,  // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS
-  3666015255U,  // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4>
-  2618609193U,  // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS
-  2568388710U,  // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
-  2568389526U,  // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
-  3636159963U,  // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5>
-  2568390836U,  // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5>
-  2568391990U,  // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
-  2718715180U,  // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6>
-  1618136374U,  // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
-  2592281624U,  // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5>
-  1618136392U,  // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
-  2550480938U,  // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6>
-  3826880801U,  // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2>
-  2562426332U,  // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6>
-  3786190181U,  // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0>
-  2718715252U,  // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6>
-  3826881165U,  // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6>
-  2712669568U,  // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0>
-  2657760081U,  // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4>
-  2718715284U,  // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2>
-  3654090854U,  // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS
-  3934229326U,  // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1>
-  3734156437U,  // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4>
-  3734820070U,  // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4>
-  3654094134U,  // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS
-  2713259464U,  // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
-  2713333201U,  // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0>
-  3654095866U,  // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2>
-  2713259464U,  // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
-  2568413286U,  // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS
-  2618611502U,  // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS
-  2753140526U,  // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS
-  2568415415U,  // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u>
-  2568416566U,  // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS
-  1817423158U,  // <0,4,u,5>: Cost 2 vzipl LHS, RHS
-  1947438390U,  // <0,4,u,6>: Cost 2 vtrnl LHS, RHS
-  2592306203U,  // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u>
-  1947438408U,  // <0,4,u,u>: Cost 2 vtrnl LHS, RHS
-  3630219264U,  // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0>
-  2625912934U,  // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS
-  3692355748U,  // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2>
-  3693019384U,  // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5>
-  3630222646U,  // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS
-  3699655062U,  // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1>
-  2718715508U,  // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1>
-  3087011126U,  // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS
-  2625913501U,  // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS
-  1500659814U,  // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
-  2886520528U,  // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3>
-  2574403176U,  // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2>
-  2574403734U,  // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2>
-  1500662674U,  // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1>
-  2886520836U,  // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5>
-  2886520930U,  // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0>
-  2718715600U,  // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3>
-  1500665646U,  // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS
-  2556493926U,  // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS
-  2244546120U,  // <0,5,2,1>: Cost 3 vrev <5,0,1,2>
-  3692357256U,  // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7>
-  2568439994U,  // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2>
-  2556497206U,  // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS
-  3020738564U,  // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5>
-  4027877161U,  // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6>
-  3093220662U,  // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS
-  3093220663U,  // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS
-  3699656854U,  // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2>
-  3699656927U,  // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3>
-  3699657006U,  // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1>
-  3699657116U,  // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3>
-  2637859284U,  // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
-  3790319453U,  // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0>
-  3699657354U,  // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7>
-  2716725103U,  // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0>
-  2716798840U,  // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0>
-  2661747602U,  // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1>
-  3630252810U,  // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4>
-  3636225507U,  // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4>
-  3716910172U,  // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5>
-  3962195892U,  // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6>
-  2625916214U,  // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS
-  3718901071U,  // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5>
-  2718715846U,  // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6>
-  2625916457U,  // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS
-  3791278034U,  // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0>
-  3791351771U,  // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0>
-  3318386260U,  // <0,5,5,2>: Cost 4 vrev <5,0,2,5>
-  3791499245U,  // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0>
-  3318533734U,  // <0,5,5,4>: Cost 4 vrev <5,0,4,5>
-  2718715908U,  // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5>
-  2657767522U,  // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
-  2718715928U,  // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7>
-  2718715937U,  // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7>
-  2592358502U,  // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS
-  3792015404U,  // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0>
-  3731509754U,  // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3>
-  3785748546U,  // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4>
-  2592361782U,  // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS
-  2592362594U,  // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0>
-  3785748576U,  // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7>
-  1644974178U,  // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0>
-  1645047915U,  // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0>
-  2562506854U,  // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
-  2562507670U,  // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
-  2562508262U,  // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
-  3636250774U,  // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2>
-  2562510134U,  // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
-  2718716072U,  // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
-  2718716074U,  // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
-  2719379635U,  // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0>
-  2562512686U,  // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS
-  1500717158U,  // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
-  2625918766U,  // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS
-  2719674583U,  // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0>
-  2568489152U,  // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u>
-  1500720025U,  // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u>
-  2625919130U,  // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS
-  2586407243U,  // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u>
-  1646301444U,  // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0>
-  1646375181U,  // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0>
-  2586411110U,  // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS
-  2619949158U,  // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS
-  2619949220U,  // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2>
-  3785748789U,  // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4>
-  2619949386U,  // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
-  2586415202U,  // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
-  2586415436U,  // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0>
-  2952793398U,  // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS
-  2619949725U,  // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS
-  2562531430U,  // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS
-  3693691700U,  // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1>
-  2886521338U,  // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3>
-  3693691864U,  // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3>
-  2562534710U,  // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS
-  2580450932U,  // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1>
-  2886521656U,  // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6>
-  2966736182U,  // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
-  2966736183U,  // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS
-  1500741734U,  // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS
-  2250518817U,  // <0,6,2,1>: Cost 3 vrev <6,0,1,2>
-  2574485096U,  // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2>
-  2631894694U,  // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1>
-  1500744604U,  // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2>
-  2574487248U,  // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3>
-  3020739384U,  // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6>
-  2954136886U,  // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS
-  1500747566U,  // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS
-  3693693078U,  // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2>
-  3705637136U,  // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7>
-  3705637192U,  // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0>
-  3693693340U,  // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3>
-  2637867477U,  // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6>
-  3705637424U,  // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7>
-  3666154056U,  // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0>
-  2722697800U,  // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0>
-  2722771537U,  // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0>
-  2562556006U,  // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS
-  4095316257U,  // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2>
-  2562557420U,  // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4>
-  3636299926U,  // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2>
-  2562559286U,  // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
-  2619952438U,  // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS
-  2723287696U,  // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0>
-  4027895094U,  // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS
-  2619952681U,  // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS
-  2718716594U,  // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
-  3648250774U,  // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0>
-  3792458436U,  // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7>
-  3705638767U,  // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0>
-  3648252831U,  // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5>
-  3797619416U,  // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0>
-  3792458472U,  // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7>
-  4035202358U,  // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS
-  2718716594U,  // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
-  3786412796U,  // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0>
-  3792458504U,  // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3>
-  3728200126U,  // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6>
-  3798135575U,  // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0>
-  3786412836U,  // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4>
-  3792458543U,  // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6>
-  2718716728U,  // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6>
-  2718716738U,  // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7>
-  2718716747U,  // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7>
-  2718716750U,  // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1>
-  2724909910U,  // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0>
-  3636323823U,  // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7>
-  2725057384U,  // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0>
-  2718716790U,  // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5>
-  2718716800U,  // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6>
-  3792458629U,  // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2>
-  2725352332U,  // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0>
-  2718716822U,  // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1>
-  1500790886U,  // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS
-  2619954990U,  // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS
-  2562590192U,  // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u>
-  2725721017U,  // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0>
-  1500793762U,  // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u>
-  2619955354U,  // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS
-  2725942228U,  // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0>
-  2954186038U,  // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS
-  1500796718U,  // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS
-  2256401391U,  // <0,7,0,0>: Cost 3 vrev <7,0,0,0>
-  2632564838U,  // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS
-  2256548865U,  // <0,7,0,2>: Cost 3 vrev <7,0,2,0>
-  3700998396U,  // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0>
-  2718716952U,  // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5>
-  2718716962U,  // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6>
-  2621284845U,  // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7>
-  3904685542U,  // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7>
-  2632565405U,  // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS
-  2256409584U,  // <0,7,1,0>: Cost 3 vrev <7,0,0,1>
-  3706307380U,  // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1>
-  2632565654U,  // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0>
-  3769603168U,  // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5>
-  2256704532U,  // <0,7,1,4>: Cost 3 vrev <7,0,4,1>
-  3769603184U,  // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3>
-  3700999366U,  // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7>
-  2886522476U,  // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7>
-  2256999480U,  // <0,7,1,u>: Cost 3 vrev <7,0,u,1>
-  2586501222U,  // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS
-  1182749690U,  // <0,7,2,1>: Cost 2 vrev <7,0,1,2>
-  3636356595U,  // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2>
-  2727711916U,  // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0>
-  2586504502U,  // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS
-  2632566606U,  // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7>
-  2586505559U,  // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2>
-  3020740204U,  // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7>
-  1183265849U,  // <0,7,2,u>: Cost 2 vrev <7,0,u,2>
-  3701000342U,  // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2>
-  3706308849U,  // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3>
-  3330315268U,  // <0,7,3,2>: Cost 4 vrev <7,0,2,3>
-  3706309020U,  // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3>
-  3706309122U,  // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6>
-  3712281127U,  // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7>
-  2639202936U,  // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
-  3802412321U,  // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0>
-  2640530202U,  // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7>
-  3654287462U,  // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS
-  2256507900U,  // <0,7,4,1>: Cost 3 vrev <7,0,1,4>
-  2256581637U,  // <0,7,4,2>: Cost 3 vrev <7,0,2,4>
-  3660262008U,  // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7>
-  3786413405U,  // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6>
-  2632568118U,  // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS
-  3718917457U,  // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7>
-  3787003255U,  // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5>
-  2632568361U,  // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS
-  3706310268U,  // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0>
-  3792459156U,  // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7>
-  3330331654U,  // <0,7,5,2>: Cost 4 vrev <7,0,2,5>
-  3722899255U,  // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7>
-  2256737304U,  // <0,7,5,4>: Cost 3 vrev <7,0,4,5>
-  3724226521U,  // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7>
-  2718717377U,  // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7>
-  2729997763U,  // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0>
-  2720044499U,  // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7>
-  3712946517U,  // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0>
-  2256524286U,  // <0,7,6,1>: Cost 3 vrev <7,0,1,6>
-  3792459246U,  // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7>
-  3796440567U,  // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7>
-  3654307126U,  // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS
-  2656457394U,  // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7>
-  3792459281U,  // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6>
-  2730661396U,  // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0>
-  2658448293U,  // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7>
-  3787003431U,  // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1>
-  3654312854U,  // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0>
-  3654313446U,  // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7>
-  3804771905U,  // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0>
-  3654315318U,  // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS
-  3654315651U,  // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7>
-  3660288348U,  // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7>
-  2718717548U,  // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7>
-  2664420990U,  // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7>
-  2256466935U,  // <0,7,u,0>: Cost 3 vrev <7,0,0,u>
-  1182798848U,  // <0,7,u,1>: Cost 2 vrev <7,0,1,u>
-  2256614409U,  // <0,7,u,2>: Cost 3 vrev <7,0,2,u>
-  2731693714U,  // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0>
-  2256761883U,  // <0,7,u,4>: Cost 3 vrev <7,0,4,u>
-  2632571034U,  // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS
-  2669066421U,  // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7>
-  2731988662U,  // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0>
-  1183315007U,  // <0,7,u,u>: Cost 2 vrev <7,0,u,u>
-   135053414U,  // <0,u,0,0>: Cost 1 vdup0 LHS
-  1544896614U,  // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS
-  1678999654U,  // <0,u,0,2>: Cost 2 vuzpl LHS, LHS
-  2691880677U,  // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2>
-  1476988214U,  // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS
-  2718791419U,  // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6>
-  3021248666U,  // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
-  2592535607U,  // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0>
-   135053414U,  // <0,u,0,u>: Cost 1 vdup0 LHS
-  1476993097U,  // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1>
-  1812780846U,  // <0,u,1,1>: Cost 2 vzipl LHS, LHS
-  1618138926U,  // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
-  2752742134U,  // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
-  1476996406U,  // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS
-  1812781210U,  // <0,u,1,5>: Cost 2 vzipl LHS, RHS
-  2887006416U,  // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7>
-  2966736200U,  // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
-  1812781413U,  // <0,u,1,u>: Cost 2 vzipl LHS, LHS
-  1482973286U,  // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS
-  1482973987U,  // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2>
-  1946998574U,  // <0,u,2,2>: Cost 2 vtrnl LHS, LHS
-      835584U,  // <0,u,2,3>: Cost 0 copy LHS
-  1482976566U,  // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS
-  3020781631U,  // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6>
-  1946998938U,  // <0,u,2,6>: Cost 2 vtrnl LHS, RHS
-  1518810169U,  // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2>
-      835584U,  // <0,u,2,u>: Cost 0 copy LHS
-  2618640534U,  // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2>
-  2752743574U,  // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
-  2636556597U,  // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u>
-  2752743836U,  // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
-  2618640898U,  // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6>
-  2752743938U,  // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
-  2639202936U,  // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
-  2639874762U,  // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u>
-  2752743637U,  // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
-  2562703462U,  // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS
-  2888455982U,  // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
-  3021575982U,  // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
-  2568677591U,  // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4>
-  2562706742U,  // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS
-  1544899894U,  // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS
-  1679002934U,  // <0,u,4,6>: Cost 2 vuzpl LHS, RHS
-  2718718033U,  // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6>
-  1679002952U,  // <0,u,4,u>: Cost 2 vuzpl LHS, RHS
-  2568683622U,  // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS
-  2568684438U,  // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0>
-  3765622902U,  // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7>
-  2691881087U,  // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7>
-  2568686902U,  // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS
-  2650492890U,  // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u>
-  1618139290U,  // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
-  2824834358U,  // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS
-  1618139308U,  // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
-  2592579686U,  // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS
-  2262496983U,  // <0,u,6,1>: Cost 3 vrev <u,0,1,6>
-  2654474688U,  // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u>
-  2691881168U,  // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7>
-  2592582966U,  // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS
-  2656465587U,  // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u>
-  2657129220U,  // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u>
-  1584051029U,  // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u>
-  1584714662U,  // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u>
-  2562728038U,  // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS
-  2562728854U,  // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0>
-  2562729473U,  // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7>
-  2661111018U,  // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u>
-  2562731318U,  // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS
-  2718718258U,  // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6>
-  2586620261U,  // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7>
-  2657793644U,  // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7>
-  2562733870U,  // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS
-   135053414U,  // <0,u,u,0>: Cost 1 vdup0 LHS
-  1544902446U,  // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS
-  1679005486U,  // <0,u,u,2>: Cost 2 vuzpl LHS, LHS
-      835584U,  // <0,u,u,3>: Cost 0 copy LHS
-  1483025718U,  // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS
-  1544902810U,  // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS
-  1679005850U,  // <0,u,u,6>: Cost 2 vuzpl LHS, RHS
-  1518859327U,  // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u>
-      835584U,  // <0,u,u,u>: Cost 0 copy LHS
-  2689744896U,  // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0>
-  1610694666U,  // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1>
-  2689744916U,  // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2>
-  2619310332U,  // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0>
-  2684657701U,  // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1>
-  2620637598U,  // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0>
-  3708977654U,  // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7>
-  3666351168U,  // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0>
-  1611210825U,  // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1>
-  2556780646U,  // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS
-  2556781355U,  // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1>
-  1616003174U,  // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
-  3693052888U,  // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3>
-  2556783926U,  // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS
-  2580672143U,  // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1>
-  2724839566U,  // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7>
-  3654415354U,  // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2>
-  1616003228U,  // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS
-  2685690019U,  // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1>
-  2685763756U,  // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1>
-  2698297524U,  // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0>
-  2685911230U,  // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1>
-  2689745100U,  // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6>
-  3764814038U,  // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7>
-  2724839640U,  // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0>
-  2592625658U,  // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2>
-  2686279915U,  // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1>
-  3087843328U,  // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0>
-  3087843338U,  // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1>
-    67944550U,  // <1,0,3,2>: Cost 1 vrev LHS
-  2568743135U,  // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3>
-  2562772278U,  // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS
-  4099850454U,  // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7>
-  3704998538U,  // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7>
-  2592633923U,  // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3>
-    68386972U,  // <1,0,3,u>: Cost 1 vrev LHS
-  2620640146U,  // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1>
-  2689745234U,  // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5>
-  2689745244U,  // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6>
-  3760980320U,  // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1>
-  3761054057U,  // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1>
-  2619313462U,  // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS
-  3761201531U,  // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1>
-  3666383940U,  // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4>
-  2619313705U,  // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS
-  4029300736U,  // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0>
-  2895249510U,  // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS
-  3028287590U,  // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
-  3642501345U,  // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5>
-  2215592058U,  // <1,0,5,4>: Cost 3 vrev <0,1,4,5>
-  3724242907U,  // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0>
-  3724906540U,  // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0>
-  3911118134U,  // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS
-  3028287644U,  // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS
-  3762086375U,  // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1>
-  2698297846U,  // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7>
-  3760022015U,  // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7>
-  3642509538U,  // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6>
-  3762381323U,  // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1>
-  3730215604U,  // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0>
-  3730879237U,  // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0>
-  2657801046U,  // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0>
-  2658464679U,  // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0>
-  2659128312U,  // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0>
-  4047898278U,  // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1>
-  2215460970U,  // <1,0,7,2>: Cost 3 vrev <0,1,2,7>
-  3734861035U,  // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0>
-  3731543398U,  // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6>
-  3736188301U,  // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0>
-  2663110110U,  // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0>
-  3731543660U,  // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7>
-  2664437376U,  // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0>
-  3087884288U,  // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0>
-  1616003730U,  // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1>
-    67985515U,  // <1,0,u,2>: Cost 1 vrev LHS
-  2689893028U,  // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1>
-  2689745586U,  // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6>
-  2619316378U,  // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS
-  2669082807U,  // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0>
-  2592674888U,  // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u>
-    68427937U,  // <1,0,u,u>: Cost 1 vrev LHS
-  1543585802U,  // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1>
-  1548894310U,  // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS
-  2618654892U,  // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1>
-  2689745654U,  // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2>
-  2622636370U,  // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5>
-  2620645791U,  // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1>
-  3696378367U,  // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7>
-  3666424905U,  // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0>
-  1548894866U,  // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1>
-  1483112550U,  // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
-   202162278U,  // <1,1,1,1>: Cost 1 vdup1 LHS
-  2622636950U,  // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0>
-  2622637016U,  // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3>
-  1483115830U,  // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
-  2622637200U,  // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7>
-  2622637263U,  // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7>
-  2592691274U,  // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
-   202162278U,  // <1,1,1,u>: Cost 1 vdup1 LHS
-  2550890588U,  // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2>
-  2617329183U,  // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1>
-  2622637672U,  // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2>
-  2622637734U,  // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1>
-  2550893878U,  // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS
-  3696379744U,  // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7>
-  2622638010U,  // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7>
-  3804554170U,  // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0>
-  2622638139U,  // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1>
-  2622638230U,  // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2>
-  3087844148U,  // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1>
-  4161585244U,  // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2>
-  2014101606U,  // <1,1,3,3>: Cost 2 vtrnr LHS, LHS
-  2622638594U,  // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6>
-  2689745920U,  // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7>
-  3763487753U,  // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7>
-  2592707660U,  // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3>
-  2014101611U,  // <1,1,3,u>: Cost 2 vtrnr LHS, LHS
-  2556878950U,  // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS
-  2221335351U,  // <1,1,4,1>: Cost 3 vrev <1,1,1,4>
-  3696380988U,  // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0>
-  3763487805U,  // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5>
-  2556882230U,  // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS
-  1548897590U,  // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS
-  2758184246U,  // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS
-  3666457677U,  // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4>
-  1548897833U,  // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS
-  2693653615U,  // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1>
-  2617331408U,  // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3>
-  4029302934U,  // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2>
-  2689746064U,  // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7>
-  2221564755U,  // <1,1,5,4>: Cost 3 vrev <1,1,4,5>
-  2955559250U,  // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5>
-  2617331810U,  // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0>
-  2825293110U,  // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS
-  2689746109U,  // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7>
-  3696382241U,  // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2>
-  2689746127U,  // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7>
-  2617332218U,  // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3>
-  3763487969U,  // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7>
-  3696382605U,  // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6>
-  4029309266U,  // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5>
-  2617332536U,  // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6>
-  2724840702U,  // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0>
-  2725504263U,  // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0>
-  2617332720U,  // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1>
-  2659800138U,  // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
-  3691074717U,  // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3>
-  4167811174U,  // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS
-  2617333094U,  // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6>
-  3295396702U,  // <1,1,7,5>: Cost 4 vrev <1,1,5,7>
-  3803891014U,  // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0>
-  2617333356U,  // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7>
-  2659800138U,  // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
-  1483112550U,  // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS
-   202162278U,  // <1,1,u,1>: Cost 1 vdup1 LHS
-  2622642056U,  // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3>
-  2014142566U,  // <1,1,u,3>: Cost 2 vtrnr LHS, LHS
-  1483115830U,  // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
-  1548900506U,  // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
-  2622642384U,  // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7>
-  2825293353U,  // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS
-   202162278U,  // <1,1,u,u>: Cost 1 vdup1 LHS
-  2635251712U,  // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0>
-  1561509990U,  // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
-  2618663085U,  // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2>
-  2696529358U,  // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1>
-  2635252050U,  // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5>
-  3769533926U,  // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7>
-  2621317617U,  // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2>
-  2659140170U,  // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
-  1561510557U,  // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
-  2623308516U,  // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2>
-  2635252532U,  // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1>
-  2631271318U,  // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0>
-  2958180454U,  // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS
-  2550959414U,  // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS
-  2635252880U,  // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
-  2635252952U,  // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7>
-  3732882731U,  // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0>
-  2958180459U,  // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS
-  2629281213U,  // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2>
-  2635253280U,  // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2>
-  2618664552U,  // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2>
-  2689746546U,  // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3>
-  3764815485U,  // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5>
-  3760023176U,  // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7>
-  2635253690U,  // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7>
-  2659141610U,  // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1>
-  2689746591U,  // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3>
-   403488870U,  // <1,2,3,0>: Cost 1 vext1 LHS, LHS
-  1477231350U,  // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
-  1477232232U,  // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
-  1477233052U,  // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
-   403492150U,  // <1,2,3,4>: Cost 1 vext1 LHS, RHS
-  1525010128U,  // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
-  1525010938U,  // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
-  1525011450U,  // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
-   403494702U,  // <1,2,3,u>: Cost 1 vext1 LHS, LHS
-  2641226607U,  // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2>
-  3624723446U,  // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6>
-  3301123609U,  // <1,2,4,2>: Cost 4 vrev <2,1,2,4>
-  2598759198U,  // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2>
-  2659142864U,  // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4>
-  1561513270U,  // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
-  2659143028U,  // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6>
-  2659143112U,  // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
-  1561513513U,  // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
-  2550988902U,  // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS
-  2550989824U,  // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
-  3624732264U,  // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2>
-  2955559014U,  // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
-  2550992182U,  // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS
-  2659143684U,  // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
-  2659143778U,  // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0>
-  2659143848U,  // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7>
-  2550994734U,  // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS
-  2700289945U,  // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1>
-  2635256232U,  // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2>
-  2659144186U,  // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
-  2689746874U,  // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7>
-  3763488705U,  // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5>
-  3763488716U,  // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7>
-  2659144504U,  // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6>
-  2657817432U,  // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2>
-  2689746919U,  // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7>
-  1585402874U,  // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
-  2659144770U,  // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2>
-  3708998858U,  // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3>
-  2635257059U,  // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1>
-  2659145062U,  // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6>
-  3732886916U,  // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0>
-  3732886998U,  // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1>
-  2659145255U,  // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
-  1590711938U,  // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
-   403529835U,  // <1,2,u,0>: Cost 1 vext1 LHS, LHS
-  1477272310U,  // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
-  1477273192U,  // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
-  1477273750U,  // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
-   403533110U,  // <1,2,u,4>: Cost 1 vext1 LHS, RHS
-  1561516186U,  // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
-  1525051898U,  // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3>
-  1525052410U,  // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
-   403535662U,  // <1,2,u,u>: Cost 1 vext1 LHS, LHS
-  2819407872U,  // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0>
-  1551564902U,  // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS
-  2819408630U,  // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2>
-  2619334911U,  // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3>
-  2625306962U,  // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5>
-  3832725879U,  // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6>
-  3699048959U,  // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7>
-  3776538827U,  // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1>
-  1551565469U,  // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS
-  2618671862U,  // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2>
-  2819408692U,  // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1>
-  2624643975U,  // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3>
-  1745666150U,  // <1,3,1,3>: Cost 2 vuzpr LHS, LHS
-  2557005110U,  // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS
-  2625307792U,  // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7>
-  3698386127U,  // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7>
-  2592838748U,  // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1>
-  1745666155U,  // <1,3,1,u>: Cost 2 vuzpr LHS, LHS
-  2819408790U,  // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
-  2625308193U,  // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3>
-  2819408036U,  // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
-  2819851890U,  // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
-  2819408794U,  // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
-  3893149890U,  // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5>
-  2819408076U,  // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
-  3772041583U,  // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3>
-  2819408042U,  // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
-  1483276390U,  // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
-  1483277128U,  // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3>
-  2557019752U,  // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2>
-  2819408856U,  // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3>
-  1483279670U,  // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
-  2819409614U,  // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
-  2598826490U,  // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3>
-  3087844352U,  // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7>
-  1483282222U,  // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS
-  2568970342U,  // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
-  2568971224U,  // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3>
-  3832761290U,  // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3>
-  2233428219U,  // <1,3,4,3>: Cost 3 vrev <3,1,3,4>
-  2568973622U,  // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS
-  1551568182U,  // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS
-  2819410434U,  // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6>
-  3666605151U,  // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4>
-  1551568425U,  // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS
-  2563006566U,  // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS
-  2568979456U,  // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7>
-  2563008035U,  // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5>
-  2233436412U,  // <1,3,5,3>: Cost 3 vrev <3,1,3,5>
-  2563009846U,  // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS
-  2867187716U,  // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5>
-  2655834214U,  // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4>
-  1745669430U,  // <1,3,5,7>: Cost 2 vuzpr LHS, RHS
-  1745669431U,  // <1,3,5,u>: Cost 2 vuzpr LHS, RHS
-  2867187810U,  // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0>
-  3699052931U,  // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1>
-  2654507460U,  // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3>
-  3766291091U,  // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7>
-  2655834726U,  // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3>
-  3923384562U,  // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5>
-  2657161992U,  // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3>
-  2819852218U,  // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
-  2819852219U,  // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
-  2706926275U,  // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
-  2659816524U,  // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3>
-  3636766245U,  // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7>
-  2867187903U,  // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3>
-  2625312102U,  // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6>
-  2867188598U,  // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5>
-  3728250344U,  // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1>
-  2867187880U,  // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7>
-  2707516171U,  // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1>
-  1483317350U,  // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS
-  1483318093U,  // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u>
-  2819410718U,  // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2>
-  1745666717U,  // <1,3,u,3>: Cost 2 vuzpr LHS, LHS
-  1483320630U,  // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS
-  1551571098U,  // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS
-  2819410758U,  // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6>
-  1745669673U,  // <1,3,u,7>: Cost 2 vuzpr LHS, RHS
-  1745666722U,  // <1,3,u,u>: Cost 2 vuzpr LHS, LHS
-  2617352205U,  // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4>
-  2619342950U,  // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
-  3692421295U,  // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4>
-  2619343104U,  // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
-  2617352530U,  // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5>
-  1634880402U,  // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1>
-  2713930652U,  // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2>
-  3732898396U,  // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1>
-  1635101613U,  // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1>
-  3693085430U,  // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2>
-  2623988535U,  // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4>
-  3693085590U,  // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0>
-  3692422134U,  // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6>
-  3693085726U,  // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1>
-  2892401974U,  // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS
-  3026619702U,  // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
-  3800206324U,  // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0>
-  2892402217U,  // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
-  3966978927U,  // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
-  3966979018U,  // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
-  3693086312U,  // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
-  2635269798U,  // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
-  3966979280U,  // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
-  2893204790U,  // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
-  3693086650U,  // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
-  3666662502U,  // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
-  2893205033U,  // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
-  2563063910U,  // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
-  2563064730U,  // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
-  2563065386U,  // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
-  3693087132U,  // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
-  2619345410U,  // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
-  3087843666U,  // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
-  3087843676U,  // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
-  3666670695U,  // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
-  3087843669U,  // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
-  2620672914U,  // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
-  3630842706U,  // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
-  3313069003U,  // <1,4,4,2>: Cost 4 vrev <4,1,2,4>
-  3642788100U,  // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
-  2713930960U,  // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
-  2619346230U,  // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
-  2713930980U,  // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
-  3736882642U,  // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
-  2619346473U,  // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
-  2557108326U,  // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
-  2557109075U,  // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
-  2598913774U,  // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1>
-  3630852246U,  // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
-  2557111606U,  // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
-  2895252790U,  // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
-  1616006454U,  // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
-  3899059510U,  // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
-  1616006472U,  // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
-  2557116518U,  // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
-  2557117236U,  // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
-  3630859880U,  // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
-  2569062550U,  // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
-  2557119798U,  // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
-  3763490174U,  // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
-  3763490183U,  // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
-  2712751498U,  // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
-  2557122350U,  // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
-  2659161084U,  // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
-  3732903040U,  // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
-  3734230174U,  // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
-  3734893807U,  // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
-  3660729654U,  // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
-  3786493384U,  // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
-  2713341394U,  // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
-  3660731386U,  // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
-  2664470148U,  // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
-  2557132902U,  // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
-  2619348782U,  // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
-  2563106351U,  // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
-  2713783816U,  // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
-  2622666815U,  // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6>
-  1640189466U,  // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
-  1616006697U,  // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
-  2712751498U,  // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
-  1616006715U,  // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
-  2620014592U,  // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
-  1546272870U,  // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
-  2618687664U,  // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
-  3693093120U,  // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
-  1546273106U,  // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
-  2620678563U,  // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
-  2714668660U,  // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
-  3772042877U,  // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
-  1546273437U,  // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
-  2620015350U,  // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
-  2620015412U,  // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
-  2620015510U,  // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
-  2618688512U,  // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
-  2620015677U,  // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
-  2620015727U,  // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
-  2620015859U,  // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
-  3093728566U,  // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
-  2620015981U,  // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
-  3692430816U,  // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
-  2620016163U,  // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
-  2620016232U,  // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
-  2620016294U,  // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
-  3693758221U,  // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
-  3692431209U,  // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
-  2620016570U,  // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
-  4173598006U,  // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
-  2620016699U,  // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
-  2620016790U,  // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
-  2569110672U,  // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
-  3693758785U,  // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
-  2620017052U,  // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
-  2620017154U,  // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
-  3135623172U,  // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
-  4161587048U,  // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
-  2014104886U,  // <1,5,3,7>: Cost 2 vtrnr LHS, RHS
-  2014104887U,  // <1,5,3,u>: Cost 2 vtrnr LHS, RHS
-  2620017554U,  // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
-  2620017634U,  // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
-  3693759551U,  // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
-  3642861837U,  // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
-  2575092710U,  // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
-  1546276150U,  // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
-  2759855414U,  // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
-  2713931718U,  // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
-  1546276393U,  // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
-  2557182054U,  // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
-  2557182812U,  // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
-  3630925347U,  // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
-  4029301675U,  // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
-  2557185334U,  // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
-  2713931780U,  // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
-  2667794530U,  // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0>
-  2713931800U,  // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
-  2557187886U,  // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
-  2718208036U,  // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
-  2620019115U,  // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
-  2667794938U,  // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3>
-  3787673666U,  // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
-  3693761165U,  // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
-  3319279297U,  // <1,5,6,5>: Cost 4 vrev <5,1,5,6>
-  2667795256U,  // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6>
-  2713931874U,  // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
-  2713931883U,  // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
-  2557198438U,  // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
-  2557199156U,  // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
-  2569143974U,  // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
-  2569144592U,  // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
-  2557201718U,  // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
-  2713931944U,  // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
-  3787673770U,  // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
-  2719387828U,  // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
-  2557204270U,  // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
-  2620020435U,  // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2>
-  1546278702U,  // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
-  2620020616U,  // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3>
-  2620020668U,  // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1>
-  1594054682U,  // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
-  1546279066U,  // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
-  2620020944U,  // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7>
-  2014145846U,  // <1,5,u,7>: Cost 2 vtrnr LHS, RHS
-  2014145847U,  // <1,5,u,u>: Cost 2 vtrnr LHS, RHS
-  3692437504U,  // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
-  2618695782U,  // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
-  2618695857U,  // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
-  3794161970U,  // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
-  2620023122U,  // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
-  2620686756U,  // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
-  2621350389U,  // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
-  4028599606U,  // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
-  2618696349U,  // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
-  3692438262U,  // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
-  2625995572U,  // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
-  3692438422U,  // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
-  3692438488U,  // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
-  2625995820U,  // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
-  3692438672U,  // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
-  3692438720U,  // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
-  2958183734U,  // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
-  2958183735U,  // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
-  2721526201U,  // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
-  3692439097U,  // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
-  3692439144U,  // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
-  3692439206U,  // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
-  3636948278U,  // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
-  3787674092U,  // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
-  2618697658U,  // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
-  2970799414U,  // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
-  2970799415U,  // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
-  2563211366U,  // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
-  3699738854U,  // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
-  2563212860U,  // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
-  3692439964U,  // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
-  2563214646U,  // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
-  4191820018U,  // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5>
-  2587103648U,  // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
-  3087845306U,  // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
-  3087845307U,  // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
-  3693767570U,  // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
-  3693767650U,  // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
-  3636962877U,  // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
-  3325088134U,  // <1,6,4,3>: Cost 4 vrev <6,1,3,4>
-  3693767898U,  // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
-  2618699062U,  // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
-  3833670966U,  // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
-  4028632374U,  // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
-  2618699305U,  // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
-  3693768264U,  // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
-  3630998373U,  // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
-  3636971070U,  // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
-  3642943767U,  // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
-  3693768628U,  // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
-  3732918276U,  // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
-  2620690530U,  // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
-  2955562294U,  // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
-  2955562295U,  // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
-  2724180733U,  // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
-  3631006566U,  // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
-  3631007674U,  // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
-  3692442184U,  // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
-  3631009078U,  // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
-  3787674416U,  // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
-  2713932600U,  // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
-  2713932610U,  // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
-  2713932619U,  // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
-  1651102542U,  // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
-  2724918103U,  // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
-  2698302306U,  // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
-  3642960153U,  // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
-  2713932662U,  // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
-  2725213051U,  // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
-  2724844426U,  // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
-  4035956022U,  // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
-  1651692438U,  // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
-  1651766175U,  // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
-  2618701614U,  // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
-  3135663508U,  // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
-  3692443580U,  // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1>
-  2713932743U,  // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
-  2618701978U,  // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
-  2622683344U,  // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7>
-  3087886266U,  // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
-  1652356071U,  // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
-  2726171632U,  // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
-  2626666598U,  // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
-  3695100067U,  // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
-  3707044102U,  // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
-  2726466580U,  // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
-  3654921933U,  // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
-  2621358582U,  // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
-  2622022215U,  // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
-  2626667165U,  // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
-  2593128550U,  // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
-  2626667316U,  // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
-  3700409238U,  // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
-  2257294428U,  // <1,7,1,3>: Cost 3 vrev <7,1,3,1>
-  2593131830U,  // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
-  2626667646U,  // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
-  2627331279U,  // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
-  2593133696U,  // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
-  2628658545U,  // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
-  2587164774U,  // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
-  3701073445U,  // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
-  3700409960U,  // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
-  2638612134U,  // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
-  2587168054U,  // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
-  3706382167U,  // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
-  2587169192U,  // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
-  3660911610U,  // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
-  2587170606U,  // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
-  1507459174U,  // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
-  2569257984U,  // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
-  2581202536U,  // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
-  2569259294U,  // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
-  1507462454U,  // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
-  1507462864U,  // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
-  2581205498U,  // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
-  2581206010U,  // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
-  1507465006U,  // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
-  2728826164U,  // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
-  3654951732U,  // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
-  3330987094U,  // <1,7,4,2>: Cost 4 vrev <7,1,2,4>
-  3331060831U,  // <1,7,4,3>: Cost 4 vrev <7,1,3,4>
-  3787674971U,  // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
-  2626669878U,  // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
-  3785979241U,  // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
-  3787085176U,  // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
-  2626670121U,  // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
-  2569273446U,  // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
-  2569274368U,  // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
-  3643016808U,  // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
-  2569275680U,  // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
-  2569276726U,  // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
-  4102034790U,  // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
-  2651222067U,  // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
-  3899378998U,  // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
-  2569279278U,  // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
-  2730153430U,  // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
-  2724845022U,  // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
-  3643025338U,  // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
-  3643025697U,  // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
-  3643026742U,  // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
-  3654971091U,  // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
-  3787675153U,  // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
-  2724845076U,  // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
-  2725508637U,  // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
-  2730817063U,  // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
-  3631088436U,  // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
-  3660949158U,  // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
-  3801904705U,  // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
-  3631090998U,  // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
-  2662503828U,  // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
-  3660951981U,  // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
-  2713933420U,  // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
-  2731406959U,  // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
-  1507500134U,  // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
-  2626672430U,  // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
-  2581243496U,  // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
-  2569300259U,  // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
-  1507503414U,  // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
-  1507503829U,  // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
-  2581246458U,  // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
-  2581246970U,  // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
-  1507505966U,  // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
-  1543643153U,  // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
-  1546297446U,  // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
-  2819448852U,  // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
-  2619375876U,  // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
-  1546297685U,  // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
-  1658771190U,  // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1>
-  2736789248U,  // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2>
-  2659189376U,  // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
-  1546298013U,  // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
-  1483112550U,  // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
-   202162278U,  // <1,u,1,1>: Cost 1 vdup1 LHS
-  1616009006U,  // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
-  1745707110U,  // <1,u,1,3>: Cost 2 vuzpr LHS, LHS
-  1483115830U,  // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
-  2620040336U,  // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
-  3026622618U,  // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
-  2958183752U,  // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
-   202162278U,  // <1,u,1,u>: Cost 1 vdup1 LHS
-  2819449750U,  // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
-  2893207342U,  // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
-  2819448996U,  // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
-  2819450482U,  // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
-  2819449754U,  // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
-  2893207706U,  // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
-  2819449036U,  // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
-  2970799432U,  // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
-  2819449002U,  // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
-   403931292U,  // <1,u,3,0>: Cost 1 vext1 LHS, LHS
-  1477673718U,  // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
-   115726126U,  // <1,u,3,2>: Cost 1 vrev LHS
-  2014102173U,  // <1,u,3,3>: Cost 2 vtrnr LHS, LHS
-   403934518U,  // <1,u,3,4>: Cost 1 vext1 LHS, RHS
-  1507536601U,  // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
-  1525453306U,  // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
-  2014105129U,  // <1,u,3,7>: Cost 2 vtrnr LHS, RHS
-   403937070U,  // <1,u,3,u>: Cost 1 vext1 LHS, LHS
-  2620042157U,  // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
-  2620042237U,  // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
-  2263217967U,  // <1,u,4,2>: Cost 3 vrev <u,1,2,4>
-  2569341224U,  // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
-  2569342262U,  // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
-  1546300726U,  // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
-  2819449180U,  // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
-  2724845649U,  // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6>
-  1546300969U,  // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
-  2551431270U,  // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
-  2551432192U,  // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
-  3028293422U,  // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
-  2955559068U,  // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
-  2551434550U,  // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
-  2895255706U,  // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
-  1616009370U,  // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
-  1745710390U,  // <1,u,5,7>: Cost 2 vuzpr LHS, RHS
-  1745710391U,  // <1,u,5,u>: Cost 2 vuzpr LHS, RHS
-  2653221159U,  // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
-  2725509303U,  // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0>
-  2659193338U,  // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
-  2689751248U,  // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7>
-  2867228774U,  // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
-  3764820194U,  // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7>
-  2657202957U,  // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
-  2819450810U,  // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
-  2819450811U,  // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
-  1585452032U,  // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
-  2557420340U,  // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
-  2569365158U,  // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
-  2569365803U,  // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
-  2557422902U,  // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
-  2662512021U,  // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
-  2724845884U,  // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7>
-  2659194476U,  // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
-  1590761096U,  // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
-   403972257U,  // <1,u,u,0>: Cost 1 vext1 LHS, LHS
-   202162278U,  // <1,u,u,1>: Cost 1 vdup1 LHS
-   115767091U,  // <1,u,u,2>: Cost 1 vrev LHS
-  1745707677U,  // <1,u,u,3>: Cost 2 vuzpr LHS, LHS
-   403975478U,  // <1,u,u,4>: Cost 1 vext1 LHS, RHS
-  1546303642U,  // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
-  1616009613U,  // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
-  1745710633U,  // <1,u,u,7>: Cost 2 vuzpr LHS, RHS
-   403978030U,  // <1,u,u,u>: Cost 1 vext1 LHS, LHS
-  2551463936U,  // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
-  2685698058U,  // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
-  1610776596U,  // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
-  2619384069U,  // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
-  2551467318U,  // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
-  3899836596U,  // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
-  2621374968U,  // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
-  4168271334U,  // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
-  1611219018U,  // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
-  2551472138U,  // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
-  2690564186U,  // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
-  1611956326U,  // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
-  2826092646U,  // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
-  2551475510U,  // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
-  3692463248U,  // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
-  2587308473U,  // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
-  3661050874U,  // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
-  1611956380U,  // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
-  1477738598U,  // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
-  2551481078U,  // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
-  2551481796U,  // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
-  2551482518U,  // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
-  1477741878U,  // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
-  2551484112U,  // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
-  2551484759U,  // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
-  2551485434U,  // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
-  1477744430U,  // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
-  2953625600U,  // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
-  2953627302U,  // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
-  2953625764U,  // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
-  4027369695U,  // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
-  3625233718U,  // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
-  3899836110U,  // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
-  4032012618U,  // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
-  3899835392U,  // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
-  2953625770U,  // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
-  2551496806U,  // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
-  2685698386U,  // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
-  2685698396U,  // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
-  3625240726U,  // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
-  2551500086U,  // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
-  2618723638U,  // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
-  2765409590U,  // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
-  3799990664U,  // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
-  2685698450U,  // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
-  3625246822U,  // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
-  3289776304U,  // <2,0,5,1>: Cost 4 vrev <0,2,1,5>
-  2690564526U,  // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
-  3289923778U,  // <2,0,5,3>: Cost 4 vrev <0,2,3,5>
-  2216255691U,  // <2,0,5,4>: Cost 3 vrev <0,2,4,5>
-  3726307332U,  // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
-  3726307426U,  // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
-  2826095926U,  // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
-  2216550639U,  // <2,0,5,u>: Cost 3 vrev <0,2,u,5>
-  4162420736U,  // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
-  2901885030U,  // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
-  2685698559U,  // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
-  3643173171U,  // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
-  2216263884U,  // <2,0,6,4>: Cost 3 vrev <0,2,4,6>
-  3730289341U,  // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
-  3726308152U,  // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
-  3899836346U,  // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
-  2216558832U,  // <2,0,6,u>: Cost 3 vrev <0,2,u,6>
-  2659202049U,  // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
-  3726308437U,  // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
-  2726249034U,  // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
-  3734934772U,  // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
-  3726308710U,  // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
-  3726308814U,  // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
-  3736925671U,  // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
-  3726308972U,  // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
-  2659202049U,  // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
-  1477787750U,  // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
-  2953668262U,  // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
-  1611956893U,  // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
-  2551531670U,  // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
-  1477791030U,  // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
-  2618726554U,  // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
-  2765412506U,  // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
-  2826096169U,  // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
-  1611956947U,  // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
-  2569453670U,  // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
-  2619392102U,  // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
-  3759440619U,  // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
-  1616823030U,  // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
-  2569456950U,  // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
-  2690712328U,  // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
-  3661115841U,  // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
-  2622046794U,  // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
-  1617191715U,  // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
-  2551545958U,  // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
-  2685698868U,  // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
-  2628682646U,  // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
-  2685698888U,  // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
-  2551549238U,  // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
-  3693134992U,  // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
-  3661124034U,  // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
-  3625292794U,  // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
-  2685698933U,  // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
-  2551554150U,  // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
-  3893649571U,  // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
-  2551555688U,  // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
-  2685698966U,  // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
-  2551557430U,  // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
-  3763422123U,  // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
-  3693135802U,  // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
-  2726249402U,  // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
-  2685699011U,  // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
-  2551562342U,  // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
-  2953625610U,  // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
-  2953627798U,  // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
-  2953626584U,  // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
-  2551565622U,  // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
-  2953625938U,  // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
-  2587398596U,  // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
-  4032013519U,  // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
-  2953625617U,  // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
-  2690565154U,  // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
-  3625313270U,  // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
-  3771532340U,  // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
-  1148404634U,  // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
-  3625315638U,  // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
-  2619395382U,  // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
-  3837242678U,  // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
-  3799991394U,  // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
-  1148773319U,  // <2,1,4,u>: Cost 2 vrev <1,2,u,4>
-  2551578726U,  // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
-  2551579648U,  // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
-  3625321952U,  // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
-  2685699216U,  // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
-  2551582006U,  // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
-  3740913668U,  // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
-  3661156806U,  // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
-  3893652790U,  // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
-  2685699261U,  // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
-  2551586918U,  // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
-  3625329398U,  // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
-  2551588794U,  // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
-  3088679014U,  // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
-  2551590198U,  // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
-  4029382994U,  // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
-  3625333560U,  // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
-  3731624800U,  // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
-  2551592750U,  // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
-  2622051322U,  // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
-  3733615699U,  // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
-  3795125538U,  // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
-  2222171037U,  // <2,1,7,3>: Cost 3 vrev <1,2,3,7>
-  3740915046U,  // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
-  3296060335U,  // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
-  3736933864U,  // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
-  3805300055U,  // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
-  2669827714U,  // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
-  2551603302U,  // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
-  2953666570U,  // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
-  2953668758U,  // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
-  1148437406U,  // <2,1,u,3>: Cost 2 vrev <1,2,3,u>
-  2551606582U,  // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
-  2953666898U,  // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
-  2587398596U,  // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
-  2669828370U,  // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
-  1148806091U,  // <2,1,u,u>: Cost 2 vrev <1,2,u,u>
-  1543667732U,  // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
-  1548976230U,  // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
-  2685699524U,  // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
-  2685699535U,  // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
-  2551614774U,  // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
-  3704422830U,  // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
-  3893657642U,  // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
-  3770574323U,  // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
-  1548976796U,  // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
-  2622718710U,  // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
-  2622718772U,  // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
-  2622718870U,  // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
-  2819915878U,  // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
-  3625364790U,  // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
-  2622719120U,  // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
-  3760031292U,  // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
-  3667170468U,  // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
-  2819915883U,  // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
-  1489829990U,  // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
-  2563572470U,  // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
-   269271142U,  // <2,2,2,2>: Cost 1 vdup2 LHS
-  2685699698U,  // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
-  1489833270U,  // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
-  2685699720U,  // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
-  2622719930U,  // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
-  2593436837U,  // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
-   269271142U,  // <2,2,2,u>: Cost 1 vdup2 LHS
-  2685699750U,  // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
-  2690565806U,  // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
-  2953627240U,  // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
-  1879883878U,  // <2,2,3,3>: Cost 2 vzipr LHS, LHS
-  2685699790U,  // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
-  3893659342U,  // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
-  2958270812U,  // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
-  2593445030U,  // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
-  1879883883U,  // <2,2,3,u>: Cost 2 vzipr LHS, LHS
-  2551644262U,  // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
-  3625386742U,  // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
-  2551645902U,  // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
-  3759441686U,  // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
-  2551647542U,  // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
-  1548979510U,  // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
-  2764901686U,  // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
-  3667195047U,  // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
-  1548979753U,  // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
-  3696463432U,  // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
-  2617413328U,  // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
-  2685699936U,  // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
-  4027383910U,  // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
-  2228201085U,  // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
-  2617413636U,  // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
-  2617413730U,  // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
-  2819919158U,  // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
-  2819919159U,  // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
-  3625402554U,  // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
-  3760031652U,  // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
-  2617414138U,  // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
-  2685700026U,  // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
-  3625405750U,  // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
-  3760031692U,  // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
-  3088679116U,  // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
-  2657891169U,  // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
-  2685700071U,  // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
-  2726250474U,  // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
-  3704427616U,  // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
-  2660545701U,  // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
-  4030718054U,  // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
-  2617415014U,  // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
-  3302033032U,  // <2,2,7,5>: Cost 4 vrev <2,2,5,7>
-  3661246929U,  // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
-  2617415276U,  // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
-  2731558962U,  // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
-  1489829990U,  // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
-  1548982062U,  // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
-   269271142U,  // <2,2,u,2>: Cost 1 vdup2 LHS
-  1879924838U,  // <2,2,u,3>: Cost 2 vzipr LHS, LHS
-  1489833270U,  // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
-  1548982426U,  // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
-  2953666908U,  // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
-  2819919401U,  // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
-   269271142U,  // <2,2,u,u>: Cost 1 vdup2 LHS
-  1544339456U,  // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
-   470597734U,  // <2,3,0,1>: Cost 1 vext2 LHS, LHS
-  1548984484U,  // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
-  2619408648U,  // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
-  1548984658U,  // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
-  2665857454U,  // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
-  2622726655U,  // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
-  2593494188U,  // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
-   470598301U,  // <2,3,0,u>: Cost 1 vext2 LHS, LHS
-  1544340214U,  // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
-  1544340276U,  // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
-  1544340374U,  // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
-  1548985304U,  // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
-  2551696694U,  // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
-  1548985488U,  // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
-  2622727375U,  // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
-  2665858347U,  // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
-  1548985709U,  // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
-  2622727613U,  // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
-  2622727711U,  // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
-  1544341096U,  // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
-  1544341158U,  // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
-  2622727958U,  // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
-  2622728032U,  // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
-  1548986298U,  // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
-  2665859050U,  // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
-  1548986427U,  // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
-  1548986518U,  // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
-  2622728415U,  // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
-  1489913458U,  // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
-  1544341916U,  // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
-  1548986882U,  // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
-  2665859632U,  // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
-  2234304870U,  // <2,3,3,6>: Cost 3 vrev <3,2,6,3>
-  2958271632U,  // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
-  1548987166U,  // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
-  1483948134U,  // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
-  1483948954U,  // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
-  2622729276U,  // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
-  2557692054U,  // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
-  1483951414U,  // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
-   470601014U,  // <2,3,4,5>: Cost 1 vext2 LHS, RHS
-  1592118644U,  // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
-  2593526960U,  // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
-   470601257U,  // <2,3,4,u>: Cost 1 vext2 LHS, RHS
-  2551726182U,  // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
-  1592118992U,  // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
-  2665860862U,  // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
-  2551728642U,  // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
-  1592119238U,  // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
-  1592119300U,  // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
-  1592119394U,  // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
-  1592119464U,  // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
-  1592119545U,  // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
-  2622730529U,  // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
-  2557707164U,  // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
-  1592119802U,  // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
-  2665861682U,  // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
-  2622730893U,  // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
-  2665861810U,  // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
-  1592120120U,  // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
-  1592120142U,  // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
-  1592120223U,  // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
-  1592120314U,  // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
-  2659890261U,  // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
-  2660553894U,  // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
-  2665862371U,  // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
-  1592120678U,  // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
-  2665862534U,  // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
-  2665862614U,  // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
-  1592120940U,  // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
-  1592120962U,  // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
-  1548990163U,  // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
-   470603566U,  // <2,3,u,1>: Cost 1 vext2 LHS, LHS
-  1548990341U,  // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
-  1548990396U,  // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
-  1548990527U,  // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
-   470603930U,  // <2,3,u,5>: Cost 1 vext2 LHS, RHS
-  1548990672U,  // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
-  1592121600U,  // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
-   470604133U,  // <2,3,u,u>: Cost 1 vext2 LHS, LHS
-  2617425942U,  // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
-  2618753126U,  // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
-  2618753208U,  // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
-  2619416841U,  // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
-  2587593628U,  // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
-  2712832914U,  // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
-  1634962332U,  // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
-  3799993252U,  // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
-  1634962332U,  // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
-  2619417334U,  // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
-  3692495668U,  // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
-  2625389466U,  // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
-  2826125414U,  // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
-  3699794995U,  // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
-  3692496016U,  // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
-  3763424238U,  // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
-  3667317942U,  // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
-  2826125419U,  // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
-  2629371336U,  // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
-  3699131946U,  // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
-  2630698602U,  // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
-  2618754766U,  // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
-  2826126234U,  // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
-  2899119414U,  // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
-  3033337142U,  // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
-  3800214597U,  // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
-  2899119657U,  // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
-  2635344033U,  // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
-  4032012325U,  // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
-  3692497228U,  // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
-  3692497308U,  // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
-  3001404624U,  // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
-  2953627342U,  // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
-  2953625804U,  // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
-  3899868160U,  // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
-  2953625806U,  // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
-  2710916266U,  // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
-  3899869648U,  // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
-  3899869658U,  // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
-  3899868930U,  // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
-  2712833232U,  // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
-  2618756406U,  // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
-  2765737270U,  // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
-  4168304426U,  // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
-  2618756649U,  // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
-  2551800011U,  // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
-  2569716470U,  // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
-  2563745405U,  // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
-  2569718102U,  // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
-  2551803190U,  // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
-  3625545732U,  // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
-  1611959606U,  // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
-  2826128694U,  // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
-  1611959624U,  // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
-  1478066278U,  // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
-  2551808758U,  // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
-  2551809516U,  // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
-  2551810198U,  // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
-  1478069558U,  // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
-  2901888310U,  // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
-  2551812920U,  // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
-  2726251914U,  // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
-  1478072110U,  // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
-  2659234821U,  // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
-  3786722726U,  // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
-  3734303911U,  // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
-  3734967544U,  // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
-  3727005030U,  // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
-  2726251976U,  // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
-  2726251986U,  // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
-  3727005292U,  // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
-  2659234821U,  // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
-  1478082662U,  // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
-  2618758958U,  // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
-  2551826024U,  // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
-  2551826582U,  // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
-  1478085942U,  // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
-  2953668302U,  // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
-  1611959849U,  // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
-  2826128937U,  // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
-  1611959867U,  // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
-  3691839488U,  // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
-  2618097766U,  // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
-  2620088484U,  // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
-  2619425034U,  // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
-  2620088667U,  // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
-  2620752300U,  // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
-  3693830655U,  // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
-  3094531382U,  // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
-  2618098333U,  // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
-  3691840246U,  // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
-  3691840308U,  // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
-  2626061206U,  // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
-  2618098688U,  // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
-  2626061364U,  // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
-  3691840656U,  // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
-  3789082310U,  // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
-  2712833744U,  // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
-  2628715896U,  // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
-  3693831613U,  // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
-  4026698642U,  // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
-  2632033896U,  // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
-  3691841190U,  // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
-  2632034061U,  // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
-  3691841352U,  // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
-  3691841466U,  // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
-  3088354614U,  // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
-  3088354615U,  // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
-  2557829222U,  // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
-  2557830059U,  // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
-  2575746766U,  // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
-  3691841948U,  // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
-  2619427330U,  // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
-  2581720847U,  // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
-  2953628162U,  // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
-  2953626624U,  // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
-  2953626625U,  // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
-  2569781350U,  // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
-  3631580076U,  // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
-  2569782990U,  // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
-  2569783646U,  // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
-  2569784630U,  // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
-  2618101046U,  // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
-  3893905922U,  // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
-  3094564150U,  // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
-  2618101289U,  // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
-  2551873638U,  // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
-  3637560320U,  // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
-  3637560966U,  // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
-  3723030343U,  // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
-  2551876918U,  // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
-  2712834052U,  // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
-  4028713474U,  // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
-  2712834072U,  // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
-  2712834081U,  // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
-  2575769702U,  // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
-  3631596462U,  // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
-  2655924730U,  // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
-  3643541856U,  // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
-  2655924849U,  // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
-  3787755607U,  // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
-  4029385218U,  // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
-  3088682294U,  // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
-  3088682295U,  // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
-  2563833958U,  // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
-  2551890678U,  // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
-  2563835528U,  // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
-  3637577878U,  // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
-  2563837238U,  // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
-  2712834216U,  // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
-  2712834220U,  // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
-  4174449974U,  // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
-  2563839790U,  // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
-  2563842150U,  // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
-  2618103598U,  // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
-  2563843721U,  // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
-  2569816418U,  // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
-  2622748735U,  // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
-  2618103962U,  // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
-  2953669122U,  // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
-  2953667584U,  // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
-  2618104165U,  // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
-  2620096512U,  // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
-  1546354790U,  // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
-  2620096676U,  // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
-  3693838588U,  // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
-  1546355036U,  // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
-  3694502317U,  // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
-  2551911246U,  // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
-  2720723287U,  // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
-  1546355357U,  // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
-  2620097270U,  // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
-  2620097332U,  // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
-  2620097430U,  // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
-  2820243558U,  // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
-  2620097598U,  // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
-  2620097680U,  // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
-  3693839585U,  // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
-  2721386920U,  // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
-  2820243563U,  // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
-  2714014137U,  // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
-  2712834500U,  // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
-  2620098152U,  // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
-  2620098214U,  // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
-  2632042254U,  // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
-  2712834540U,  // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
-  2820243660U,  // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
-  2958265654U,  // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
-  2620098619U,  // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
-  2620098710U,  // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
-  3893986982U,  // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
-  2569848762U,  // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
-  2620098972U,  // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
-  2620099074U,  // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
-  3893987022U,  // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
-  3001404644U,  // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
-  1879887158U,  // <2,6,3,7>: Cost 2 vzipr LHS, RHS
-  1879887159U,  // <2,6,3,u>: Cost 2 vzipr LHS, RHS
-  2620099484U,  // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
-  2620099566U,  // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
-  2620099644U,  // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
-  3643599207U,  // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
-  2575830080U,  // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
-  1546358070U,  // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
-  2667875700U,  // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
-  4028042550U,  // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
-  1546358313U,  // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
-  3693841992U,  // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
-  2667876048U,  // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
-  2712834756U,  // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
-  3643607400U,  // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
-  2252091873U,  // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
-  2667876356U,  // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
-  2667876450U,  // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
-  2820246838U,  // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
-  2820246839U,  // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
-  2563899494U,  // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
-  3893988683U,  // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
-  2563901072U,  // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
-  3893987236U,  // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
-  2563902774U,  // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
-  3893988723U,  // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
-  2712834872U,  // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
-  2955644214U,  // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
-  2955644215U,  // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
-  2712834894U,  // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
-  2724926296U,  // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
-  2725000033U,  // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
-  2702365544U,  // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
-  2712834934U,  // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
-  3776107393U,  // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
-  2725294981U,  // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
-  2726253452U,  // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
-  2712834966U,  // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
-  2620102355U,  // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
-  1546360622U,  // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
-  2620102536U,  // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
-  2820244125U,  // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
-  1594136612U,  // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
-  1546360986U,  // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
-  2620102864U,  // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
-  1879928118U,  // <2,6,u,7>: Cost 2 vzipr LHS, RHS
-  1879928119U,  // <2,6,u,u>: Cost 2 vzipr LHS, RHS
-  2726179825U,  // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
-  1652511738U,  // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
-  2621431972U,  // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
-  2257949868U,  // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
-  2726474773U,  // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
-  2620768686U,  // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
-  2621432319U,  // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
-  2599760953U,  // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
-  1653027897U,  // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
-  2639348470U,  // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
-  3695174452U,  // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
-  3695174550U,  // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
-  3694511104U,  // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
-  3713090594U,  // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
-  3693184144U,  // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
-  2627405016U,  // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
-  3799995519U,  // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
-  2639348470U,  // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
-  3695175101U,  // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
-  3643655168U,  // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
-  2257892517U,  // <2,7,2,2>: Cost 3 vrev <7,2,2,2>
-  3695175334U,  // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
-  3695175465U,  // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
-  2632714080U,  // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
-  2633377713U,  // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
-  3695175658U,  // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
-  2634704979U,  // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
-  1514094694U,  // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
-  2569921680U,  // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
-  2587838056U,  // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
-  2569922927U,  // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
-  1514097974U,  // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
-  2581868321U,  // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
-  1514099194U,  // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
-  2587841530U,  // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
-  1514100526U,  // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
-  2708706617U,  // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
-  3649643418U,  // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
-  3649644330U,  // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
-  2257982640U,  // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
-  3649645641U,  // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
-  2621435190U,  // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
-  2712835441U,  // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
-  3799995762U,  // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
-  2621435433U,  // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
-  2729497990U,  // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
-  3643679744U,  // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
-  3637708424U,  // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
-  3643681137U,  // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
-  2599800118U,  // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
-  3786577334U,  // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
-  3786577345U,  // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
-  2599802214U,  // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
-  2599802670U,  // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
-  2581889126U,  // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
-  3643687936U,  // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
-  2663240186U,  // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
-  3643689330U,  // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
-  2581892406U,  // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
-  2581892900U,  // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
-  2587865597U,  // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
-  3786577428U,  // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
-  2581894958U,  // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
-  2726254119U,  // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
-  3804640817U,  // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
-  3637724826U,  // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
-  3734992123U,  // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
-  2552040758U,  // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
-  3799995992U,  // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
-  2663241198U,  // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
-  2712835692U,  // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
-  2731562607U,  // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
-  1514135654U,  // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
-  1657820802U,  // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
-  2587879016U,  // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
-  2569963892U,  // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
-  1514138934U,  // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
-  2621438106U,  // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
-  1514140159U,  // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
-  2587882490U,  // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
-  1514141486U,  // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
-  1544380416U,  // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
-   470638699U,  // <2,u,0,1>: Cost 1 vext2 LHS, LHS
-  1544380580U,  // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
-  1658631909U,  // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
-  1544380754U,  // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
-  2665898414U,  // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
-  1658853120U,  // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
-  3094531625U,  // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
-   470639261U,  // <2,u,0,u>: Cost 1 vext2 LHS, LHS
-  1544381174U,  // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
-  1544381236U,  // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
-  1544381334U,  // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
-  1544381400U,  // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
-  2618123325U,  // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
-  1544381584U,  // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
-  2618123489U,  // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
-  2726254427U,  // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
-  1544381823U,  // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
-  1478328422U,  // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
-  2618123807U,  // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
-   269271142U,  // <2,u,2,2>: Cost 1 vdup2 LHS
-  1544382118U,  // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
-  1478331702U,  // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
-  2618124136U,  // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
-  1544382394U,  // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
-  3088354857U,  // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
-   269271142U,  // <2,u,2,u>: Cost 1 vdup2 LHS
-  1544382614U,  // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
-  2953627374U,  // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
-  1490282143U,  // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
-  1879883932U,  // <2,u,3,3>: Cost 2 vzipr LHS, LHS
-  1544382978U,  // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
-  2953627378U,  // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
-  1514172931U,  // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
-  1879887176U,  // <2,u,3,7>: Cost 2 vzipr LHS, RHS
-  1879883937U,  // <2,u,3,u>: Cost 2 vzipr LHS, LHS
-  1484316774U,  // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
-  1484317639U,  // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
-  2552088270U,  // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
-  1190213513U,  // <2,u,4,3>: Cost 2 vrev <u,2,3,4>
-  1484320054U,  // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
-   470641974U,  // <2,u,4,5>: Cost 1 vext2 LHS, RHS
-  1592159604U,  // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
-  3094564393U,  // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
-   470642217U,  // <2,u,4,u>: Cost 1 vext2 LHS, RHS
-  2552094959U,  // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
-  1592159952U,  // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
-  2564040353U,  // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
-  2690275455U,  // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7>
-  1592160198U,  // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
-  1592160260U,  // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
-  1611962522U,  // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
-  1592160424U,  // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
-  1611962540U,  // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
-  1478361190U,  // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
-  2552103670U,  // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
-  1592160762U,  // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
-  2685704400U,  // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7>
-  1478364470U,  // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
-  2901891226U,  // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
-  1592161080U,  // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
-  1592161102U,  // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
-  1478367022U,  // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
-  1592161274U,  // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
-  2659931226U,  // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
-  2564056739U,  // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
-  2665903331U,  // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
-  1592161638U,  // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
-  2665903494U,  // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
-  2587947527U,  // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
-  1592161900U,  // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
-  1592161922U,  // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
-  1478377574U,  // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
-   470644526U,  // <2,u,u,1>: Cost 1 vext2 LHS, LHS
-   269271142U,  // <2,u,u,2>: Cost 1 vdup2 LHS
-  1879924892U,  // <2,u,u,3>: Cost 2 vzipr LHS, LHS
-  1478380854U,  // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
-   470644890U,  // <2,u,u,5>: Cost 1 vext2 LHS, RHS
-  1611962765U,  // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
-  1879928136U,  // <2,u,u,7>: Cost 2 vzipr LHS, RHS
-   470645093U,  // <2,u,u,u>: Cost 1 vext2 LHS, LHS
-  1611448320U,  // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
-  1611890698U,  // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
-  1611890708U,  // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
-  3763576860U,  // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
-  2689835045U,  // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
-  3698508206U,  // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
-  3763576887U,  // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
-  3667678434U,  // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
-  1616093258U,  // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
-  1490337894U,  // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
-  2685632602U,  // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
-   537706598U,  // <3,0,1,2>: Cost 1 vext3 LHS, LHS
-  2624766936U,  // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
-  1490341174U,  // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
-  2624767120U,  // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
-  2732966030U,  // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
-  2593944803U,  // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
-   537706652U,  // <3,0,1,u>: Cost 1 vext3 LHS, LHS
-  1611890852U,  // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
-  2685632684U,  // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
-  2685632692U,  // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
-  2685632702U,  // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
-  1611890892U,  // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
-  2732966102U,  // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
-  2624767930U,  // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
-  2685632744U,  // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
-  1611890924U,  // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
-  2624768150U,  // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
-  2685632764U,  // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
-  2685632774U,  // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
-  2624768412U,  // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
-  2624768514U,  // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
-  3702491714U,  // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
-  2624768632U,  // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
-  3702491843U,  // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
-  2686959934U,  // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
-  2689835336U,  // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
-  1611891026U,  // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
-  1611891036U,  // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
-  3763577184U,  // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
-  2689835374U,  // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
-  1551027510U,  // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
-  2666573172U,  // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6>
-  3667711206U,  // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
-  1616093586U,  // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
-  2685190556U,  // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
-  2666573520U,  // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3>
-  3040886886U,  // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
-  3625912834U,  // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
-  2666573766U,  // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6>
-  2666573828U,  // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
-  2732966354U,  // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
-  2666573992U,  // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7>
-  3040886940U,  // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
-  2685190637U,  // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
-  2732966390U,  // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
-  2689835519U,  // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
-  3667724438U,  // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
-  3763577355U,  // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
-  3806708243U,  // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
-  2666574648U,  // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
-  2657948520U,  // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
-  2689835573U,  // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
-  2666574842U,  // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2>
-  2685633095U,  // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
-  2660603052U,  // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
-  3643844997U,  // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
-  2666575206U,  // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
-  3655790391U,  // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
-  3731690968U,  // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
-  2666575468U,  // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
-  2664584850U,  // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
-  1616093834U,  // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
-  1611891346U,  // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
-   537707165U,  // <3,0,u,2>: Cost 1 vext3 LHS, LHS
-  2689835684U,  // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
-  1616093874U,  // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
-  1551030426U,  // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
-  2624772304U,  // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7>
-  2594002154U,  // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
-   537707219U,  // <3,0,u,u>: Cost 1 vext3 LHS, LHS
-  2552201318U,  // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
-  2618802278U,  // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS
-  2618802366U,  // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1>
-  1611449078U,  // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
-  2552204598U,  // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
-  2732966663U,  // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1>
-  3906258396U,  // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6>
-  3667752171U,  // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0>
-  1611891491U,  // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
-  2689835819U,  // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
-  1611449140U,  // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
-  2624775063U,  // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1>
-  1611891528U,  // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
-  2689835859U,  // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
-  2689835868U,  // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
-  3763577701U,  // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5>
-  3765273452U,  // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3>
-  1611891573U,  // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
-  2629420494U,  // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1>
-  2689835911U,  // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
-  2564163248U,  // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2>
-  1611449238U,  // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0>
-  2564164918U,  // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS
-  2689835947U,  // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
-  3692545978U,  // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7>
-  2732966842U,  // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0>
-  1611891651U,  // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0>
-  1484456038U,  // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
-  1611891672U,  // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
-  2685633502U,  // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
-  2685633512U,  // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1>
-  1484459318U,  // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
-  1611891712U,  // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
-  2689836041U,  // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
-  2733409294U,  // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3>
-  1611891735U,  // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
-  2552234086U,  // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS
-  2732966955U,  // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5>
-  2732966964U,  // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5>
-  2685633597U,  // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5>
-  2552237366U,  // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
-  2618805558U,  // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS
-  2769472822U,  // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS
-  3667784943U,  // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4>
-  2685633642U,  // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5>
-  2689836143U,  // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
-  2564187280U,  // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
-  2564187827U,  // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
-  1611891856U,  // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
-  2689836183U,  // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
-  3759375522U,  // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7>
-  3720417378U,  // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0>
-  2832518454U,  // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS
-  1611891901U,  // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
-  3763578048U,  // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1>
-  2689836239U,  // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
-  2732967128U,  // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7>
-  2685633761U,  // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
-  3763578088U,  // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5>
-  2689836275U,  // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
-  3763578108U,  // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7>
-  2732967166U,  // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0>
-  2685633806U,  // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7>
-  3631972454U,  // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS
-  2659947612U,  // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1>
-  4036102294U,  // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2>
-  3095396454U,  // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
-  3631975734U,  // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS
-  2222982144U,  // <3,1,7,5>: Cost 3 vrev <1,3,5,7>
-  3296797705U,  // <3,1,7,6>: Cost 4 vrev <1,3,6,7>
-  3720418924U,  // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7>
-  3095396459U,  // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS
-  1484496998U,  // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
-  1611892077U,  // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
-  2685633907U,  // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
-  1611892092U,  // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0>
-  1484500278U,  // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
-  1611892117U,  // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
-  2685633950U,  // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
-  2832518697U,  // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS
-  1611892140U,  // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3>
-  2623455232U,  // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
-  1549713510U,  // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
-  2689836484U,  // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0>
-  2685633997U,  // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
-  2623455570U,  // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
-  2732967398U,  // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7>
-  2689836524U,  // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
-  2229044964U,  // <3,2,0,7>: Cost 3 vrev <2,3,7,0>
-  1549714077U,  // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
-  1549714166U,  // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
-  2623456052U,  // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
-  2623456150U,  // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0>
-  2685634079U,  // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
-  2552286518U,  // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
-  2623456400U,  // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
-  2689836604U,  // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
-  3667834101U,  // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1>
-  1155385070U,  // <3,2,1,u>: Cost 2 vrev <2,3,u,1>
-  2689836629U,  // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
-  2689836640U,  // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
-  1611449960U,  // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
-  1611892338U,  // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
-  2689836669U,  // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
-  2689836680U,  // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
-  2689836688U,  // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6>
-  3763578518U,  // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3>
-  1611892383U,  // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
-  1611450022U,  // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
-  2685191854U,  // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
-  2685191865U,  // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2>
-  2685191875U,  // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3>
-  1611450062U,  // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
-  2732967635U,  // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
-  2732967645U,  // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2>
-  2732967652U,  // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
-  1611450094U,  // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
-  2558279782U,  // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
-  2558280602U,  // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4>
-  2732967692U,  // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4>
-  2685634326U,  // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
-  2558283062U,  // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS
-  1549716790U,  // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
-  2689836844U,  // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0>
-  2229077736U,  // <3,2,4,7>: Cost 3 vrev <2,3,7,4>
-  1549717033U,  // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
-  2552316006U,  // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS
-  2228643507U,  // <3,2,5,1>: Cost 3 vrev <2,3,1,5>
-  2689836896U,  // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
-  2685634408U,  // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
-  1155122894U,  // <3,2,5,4>: Cost 2 vrev <2,3,4,5>
-  2665263108U,  // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
-  2689836932U,  // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
-  2665263272U,  // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7>
-  1155417842U,  // <3,2,5,u>: Cost 2 vrev <2,3,u,5>
-  2689836953U,  // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
-  2689836964U,  // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
-  2689836976U,  // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
-  1611892666U,  // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
-  2689836993U,  // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
-  2689837004U,  // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
-  2689837013U,  // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
-  2665263950U,  // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1>
-  1611892711U,  // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
-  2665264122U,  // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2>
-  2623460419U,  // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3>
-  4169138340U,  // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2>
-  2962358374U,  // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS
-  2665264486U,  // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6>
-  2228954841U,  // <3,2,7,5>: Cost 3 vrev <2,3,5,7>
-  2229028578U,  // <3,2,7,6>: Cost 3 vrev <2,3,6,7>
-  2665264748U,  // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
-  2962358379U,  // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS
-  1611892795U,  // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
-  1549719342U,  // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
-  1611449960U,  // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2>
-  1611892824U,  // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
-  1611892835U,  // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
-  1549719706U,  // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
-  2689837168U,  // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0>
-  2665265408U,  // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1>
-  1611892867U,  // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
-  2685192331U,  // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0>
-  1611450518U,  // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
-  2685634717U,  // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0>
-  2564294806U,  // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2>
-  2685634736U,  // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
-  2732968122U,  // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2>
-  3763579075U,  // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2>
-  4034053264U,  // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7>
-  1611450581U,  // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
-  2685192415U,  // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
-  1550385992U,  // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
-  2685192433U,  // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3>
-  2685634808U,  // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1>
-  2558332214U,  // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS
-  2685634828U,  // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
-  3759376661U,  // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3>
-  2703477022U,  // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3>
-  1555031423U,  // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3>
-  2564309094U,  // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS
-  2630100513U,  // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3>
-  1557022322U,  // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3>
-  2685192520U,  // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0>
-  2564312374U,  // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS
-  2732968286U,  // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
-  2685634918U,  // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
-  2704140655U,  // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3>
-  1561004120U,  // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
-  1496547430U,  // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
-  2624129256U,  // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3>
-  2630764866U,  // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3>
-   336380006U,  // <3,3,3,3>: Cost 1 vdup3 LHS
-  1496550710U,  // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
-  2732968368U,  // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5>
-  2624129683U,  // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7>
-  2594182400U,  // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3>
-   336380006U,  // <3,3,3,u>: Cost 1 vdup3 LHS
-  2558353510U,  // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS
-  2558354411U,  // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
-  2564327108U,  // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4>
-  2564327938U,  // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6>
-  2960343962U,  // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4>
-  1611893250U,  // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
-  2771619126U,  // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS
-  4034086032U,  // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7>
-  1611893277U,  // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
-  2558361702U,  // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
-  2558362604U,  // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
-  2558363342U,  // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
-  2732968512U,  // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5>
-  2558364982U,  // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
-  3101279950U,  // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5>
-  2665934946U,  // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0>
-  2826636598U,  // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS
-  2826636599U,  // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS
-  2732968568U,  // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
-  3763579521U,  // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7>
-  2732968586U,  // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
-  2732968595U,  // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7>
-  2732968604U,  // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7>
-  3763579557U,  // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7>
-  2732968621U,  // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6>
-  2657973099U,  // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3>
-  2658636732U,  // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3>
-  2558378086U,  // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
-  2558378990U,  // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
-  2564351687U,  // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7>
-  2661291264U,  // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3>
-  2558381366U,  // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
-  2732968694U,  // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
-  3781126907U,  // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
-  3095397376U,  // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
-  2558383918U,  // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
-  1496547430U,  // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
-  1611893534U,  // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
-  1592858504U,  // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
-   336380006U,  // <3,3,u,3>: Cost 1 vdup3 LHS
-  1496550710U,  // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
-  1611893574U,  // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
-  2690280268U,  // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
-  2826636841U,  // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
-   336380006U,  // <3,3,u,u>: Cost 1 vdup3 LHS
-  2624798720U,  // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
-  1551056998U,  // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
-  2624798884U,  // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
-  3693232384U,  // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
-  2624799058U,  // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
-  1659227026U,  // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
-  1659227036U,  // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
-  3667973382U,  // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
-  1551057565U,  // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
-  2624799478U,  // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
-  2624799540U,  // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
-  1551057818U,  // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
-  2624799704U,  // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
-  2564377910U,  // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
-  2689838050U,  // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
-  2689838062U,  // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
-  2628117807U,  // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
-  1555039616U,  // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
-  3626180710U,  // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
-  2624800298U,  // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
-  2624800360U,  // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
-  2624800422U,  // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
-  2624800514U,  // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
-  2709965878U,  // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
-  2689838140U,  // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
-  2634090504U,  // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
-  2689838158U,  // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
-  2624800918U,  // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
-  2636081403U,  // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
-  2636745036U,  // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
-  2624801180U,  // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
-  2624801232U,  // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
-  2905836854U,  // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
-  3040054582U,  // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
-  3702524611U,  // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
-  2624801566U,  // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
-  2564399206U,  // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
-  2564400026U,  // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
-  2564400845U,  // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
-  2570373542U,  // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
-  1659227344U,  // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
-  1551060278U,  // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
-  1659227364U,  // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
-  3668006154U,  // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
-  1551060521U,  // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
-  1490665574U,  // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
-  2689838341U,  // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
-  1490667214U,  // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
-  2564409494U,  // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
-  1490668854U,  // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
-  2689838381U,  // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
-   537709878U,  // <3,4,5,6>: Cost 1 vext3 LHS, RHS
-  2594272523U,  // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
-   537709896U,  // <3,4,5,u>: Cost 1 vext3 LHS, RHS
-  2689838411U,  // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
-  2558444534U,  // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
-  2666607098U,  // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
-  2558446082U,  // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
-  1659227508U,  // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
-  2689838462U,  // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
-  2689838471U,  // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
-  2657981292U,  // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
-  1659227540U,  // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
-  2666607610U,  // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
-  3702527072U,  // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
-  2660635824U,  // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
-  3644139945U,  // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
-  2666607974U,  // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
-  2732969416U,  // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
-  2732969425U,  // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
-  2666608236U,  // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
-  2664617622U,  // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
-  1490690150U,  // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
-  1551062830U,  // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
-  1490691793U,  // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
-  2624804796U,  // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
-  1490693430U,  // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
-  1551063194U,  // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
-   537710121U,  // <3,4,u,6>: Cost 1 vext3 LHS, RHS
-  2594297102U,  // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
-   537710139U,  // <3,4,u,u>: Cost 1 vext3 LHS, RHS
-  3692576768U,  // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
-  2618835046U,  // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
-  2618835138U,  // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
-  3692577024U,  // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
-  2689838690U,  // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
-  2732969579U,  // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
-  2732969588U,  // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
-  2246963055U,  // <3,5,0,7>: Cost 3 vrev <5,3,7,0>
-  2618835613U,  // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
-  2594308198U,  // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
-  3692577588U,  // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
-  2624807835U,  // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
-  2625471468U,  // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
-  2626135101U,  // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
-  2594311888U,  // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
-  3699877107U,  // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
-  1641680592U,  // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
-  1641754329U,  // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
-  3692578274U,  // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
-  2630116899U,  // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
-  3692578408U,  // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
-  2625472206U,  // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
-  2632107798U,  // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
-  2715938575U,  // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
-  3692578746U,  // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
-  2716086049U,  // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
-  2634762330U,  // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
-  3692578966U,  // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
-  2636089596U,  // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
-  3699214668U,  // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
-  2638080412U,  // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
-  2618837506U,  // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
-  2832844494U,  // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
-  4033415682U,  // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
-  3095072054U,  // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
-  3095072055U,  // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
-  2600304742U,  // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
-  3763580815U,  // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
-  2564474582U,  // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
-  3699879044U,  // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
-  2600308022U,  // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
-  2618838326U,  // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
-  2772454710U,  // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
-  1659228102U,  // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
-  1659228111U,  // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
-  2570453094U,  // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
-  2624810704U,  // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
-  2570454734U,  // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
-  2570455472U,  // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
-  2570456374U,  // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
-  1659228164U,  // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
-  2732969998U,  // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
-  1659228184U,  // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
-  1659228193U,  // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
-  2732970020U,  // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
-  2732970035U,  // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
-  2564490968U,  // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
-  2732970050U,  // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
-  2732970060U,  // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
-  2732970071U,  // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
-  2732970080U,  // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
-  1659228258U,  // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
-  1659228267U,  // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
-  1484783718U,  // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
-  1484784640U,  // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
-  2558527080U,  // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
-  2558527638U,  // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
-  1484786998U,  // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
-  1659228328U,  // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
-  2732970154U,  // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
-  2558531180U,  // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
-  1484789550U,  // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
-  1484791910U,  // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
-  1484792833U,  // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
-  2558535272U,  // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
-  2558535830U,  // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
-  1484795190U,  // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
-  1659228409U,  // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
-  2772457626U,  // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
-  1646326023U,  // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
-  1484797742U,  // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
-  2558541926U,  // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
-  2689839393U,  // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
-  2689839404U,  // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
-  3706519808U,  // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
-  2689839420U,  // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
-  2732970314U,  // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
-  2732970316U,  // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
-  2960313654U,  // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
-  2689839456U,  // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
-  3763581290U,  // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
-  3763581297U,  // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
-  2624816028U,  // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
-  3763581315U,  // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
-  2626143294U,  // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
-  3763581335U,  // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
-  2721321376U,  // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
-  2721395113U,  // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
-  2628797826U,  // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
-  2594390118U,  // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
-  2721616324U,  // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
-  2630788725U,  // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
-  3763581395U,  // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
-  2632115991U,  // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
-  2632779624U,  // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
-  2594394618U,  // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
-  1648316922U,  // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
-  1648390659U,  // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
-  3693914262U,  // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
-  3638281176U,  // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
-  3696568678U,  // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
-  2638088604U,  // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
-  2632780290U,  // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
-  3712494145U,  // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
-  3698559612U,  // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
-  2959674678U,  // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
-  2959674679U,  // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
-  3763581536U,  // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
-  2722943590U,  // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
-  2732970609U,  // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
-  3698560147U,  // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
-  2732970628U,  // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
-  2689839757U,  // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
-  2732970640U,  // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
-  2960346422U,  // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
-  2689839784U,  // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
-  2576498790U,  // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
-  3650241270U,  // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
-  2732970692U,  // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
-  2576501250U,  // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
-  2576501906U,  // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
-  3650244622U,  // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
-  4114633528U,  // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
-  2732970735U,  // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
-  2576504622U,  // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
-  2732970749U,  // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
-  2724270856U,  // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
-  2624819706U,  // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
-  3656223234U,  // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
-  2732970788U,  // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
-  2732970800U,  // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
-  1659228984U,  // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
-  1659228994U,  // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
-  1659229003U,  // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
-  1659229006U,  // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
-  2558600201U,  // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
-  2558601146U,  // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
-  2725081963U,  // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
-  1659229046U,  // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
-  2715423611U,  // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
-  2722059141U,  // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
-  2962361654U,  // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
-  1659229078U,  // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
-  1659229087U,  // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
-  2689840041U,  // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
-  2558609339U,  // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
-  2576525853U,  // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
-  1659229127U,  // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
-  2689840081U,  // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
-  1659228984U,  // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
-  1652298720U,  // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
-  1659229159U,  // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
-  2626813952U,  // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
-  1553072230U,  // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
-  2626814116U,  // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
-  3700556028U,  // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
-  2626814290U,  // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
-  2582507375U,  // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
-  2588480072U,  // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
-  2732971055U,  // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
-  1553072797U,  // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
-  2626814710U,  // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
-  2626814772U,  // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
-  2626814870U,  // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
-  2625487854U,  // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
-  2582514998U,  // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
-  1553073296U,  // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
-  2627478753U,  // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
-  2727367810U,  // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
-  1555064195U,  // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
-  2588491878U,  // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
-  3700557318U,  // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
-  2626815592U,  // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
-  2626815654U,  // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
-  2588495158U,  // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
-  2632787817U,  // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
-  1559709626U,  // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
-  2728031443U,  // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
-  1561036892U,  // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
-  2626816150U,  // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
-  2626816268U,  // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
-  2633451878U,  // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
-  2626816412U,  // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
-  2626816514U,  // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
-  2638760514U,  // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
-  2639424147U,  // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
-  2826961920U,  // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
-  2626816798U,  // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
-  2582536294U,  // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
-  2582537360U,  // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
-  2588510138U,  // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
-  3700558996U,  // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
-  2582539574U,  // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
-  1553075510U,  // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
-  2588512844U,  // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
-  2564625766U,  // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
-  1553075753U,  // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
-  2732971398U,  // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
-  2626817744U,  // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3>
-  3700559649U,  // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3>
-  2626817903U,  // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
-  2258728203U,  // <3,7,5,4>: Cost 3 vrev <7,3,4,5>
-  2732971446U,  // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5>
-  2732971457U,  // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7>
-  2826964278U,  // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS
-  2826964279U,  // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS
-  2732971478U,  // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
-  2732971486U,  // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
-  2633454074U,  // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
-  2633454152U,  // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
-  2732971518U,  // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
-  2732971526U,  // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
-  2732971537U,  // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
-  2732971540U,  // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0>
-  2726041124U,  // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7>
-  2570616934U,  // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS
-  2570617856U,  // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7>
-  2564646635U,  // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7>
-  2570619332U,  // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7>
-  2570620214U,  // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS
-  2582564726U,  // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7>
-  2588537423U,  // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
-  1659229804U,  // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
-  1659229804U,  // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
-  2626819795U,  // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2>
-  1553078062U,  // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
-  2626819973U,  // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0>
-  2826961565U,  // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS
-  2626820159U,  // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6>
-  1553078426U,  // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
-  1595545808U,  // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
-  1659229804U,  // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7>
-  1553078629U,  // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS
-  1611448320U,  // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
-  1611896531U,  // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
-  1659672284U,  // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2>
-  1616099045U,  // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
-  2685638381U,  // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
-  1663874806U,  // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1>
-  1663874816U,  // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
-  2960313672U,  // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
-  1611896594U,  // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
-  1549763324U,  // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
-  1550426957U,  // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
-   537712430U,  // <3,u,1,2>: Cost 1 vext3 LHS, LHS
-  1616541495U,  // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
-  1490930998U,  // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
-  1553081489U,  // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
-  2627486946U,  // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u>
-  1659230043U,  // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3>
-   537712484U,  // <3,u,1,u>: Cost 1 vext3 LHS, LHS
-  1611890852U,  // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
-  2624833102U,  // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3>
-  1557063287U,  // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
-  1616099205U,  // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0>
-  1611890892U,  // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
-  2689841054U,  // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7>
-  1559717819U,  // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
-  1659230124U,  // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3>
-  1616541618U,  // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0>
-  1611896764U,  // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
-  1484973079U,  // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3>
-  2685638607U,  // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2>
-   336380006U,  // <3,u,3,3>: Cost 1 vdup3 LHS
-  1611896804U,  // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
-  1616541679U,  // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
-  2690283512U,  // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7>
-  2959674696U,  // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
-   336380006U,  // <3,u,3,u>: Cost 1 vdup3 LHS
-  2558722150U,  // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS
-  1659672602U,  // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5>
-  1659672612U,  // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
-  2689841196U,  // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5>
-  1659227344U,  // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
-  1611896895U,  // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
-  1663875144U,  // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6>
-  1659230289U,  // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6>
-  1611896922U,  // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
-  1490960486U,  // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
-  2689841261U,  // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7>
-  1490962162U,  // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
-  1616541823U,  // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
-  1490963766U,  // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
-  1659228164U,  // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
-   537712794U,  // <3,u,5,6>: Cost 1 vext3 LHS, RHS
-  1659230371U,  // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7>
-   537712812U,  // <3,u,5,u>: Cost 1 vext3 LHS, RHS
-  2689841327U,  // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1>
-  2558739482U,  // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6>
-  2689841351U,  // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7>
-  1616099536U,  // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
-  1659227508U,  // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
-  2690283746U,  // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7>
-  1659228984U,  // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
-  1659230445U,  // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0>
-  1616099581U,  // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7>
-  1485004902U,  // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
-  1485005851U,  // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
-  2558748264U,  // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2>
-  3095397021U,  // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
-  1485008182U,  // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
-  1659228328U,  // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
-  2722060599U,  // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2>
-  1659229804U,  // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
-  1485010734U,  // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS
-  1616099665U,  // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
-  1611897179U,  // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
-   537712997U,  // <3,u,u,2>: Cost 1 vext3 LHS, LHS
-   336380006U,  // <3,u,u,3>: Cost 1 vdup3 LHS
-  1616099705U,  // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
-  1611897219U,  // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
-   537713037U,  // <3,u,u,6>: Cost 1 vext3 LHS, RHS
-  1659230607U,  // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0>
-   537713051U,  // <3,u,u,u>: Cost 1 vext3 LHS, LHS
-  2691907584U,  // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0>
-  2691907594U,  // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1>
-  2691907604U,  // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2>
-  3709862144U,  // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4>
-  2684682280U,  // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4>
-  3694600633U,  // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0>
-  3291431290U,  // <4,0,0,6>: Cost 4 vrev <0,4,6,0>
-  3668342067U,  // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0>
-  2691907657U,  // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1>
-  2570715238U,  // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
-  2570716058U,  // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4>
-  1618165862U,  // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
-  2570717648U,  // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1>
-  2570718518U,  // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
-  2594607206U,  // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4>
-  3662377563U,  // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1>
-  2594608436U,  // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1>
-  1618165916U,  // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
-  2685714598U,  // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
-  3759530159U,  // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4>
-  2685862072U,  // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4>
-  2631476937U,  // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0>
-  2685714636U,  // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
-  3765649622U,  // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7>
-  2686157020U,  // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
-  3668358453U,  // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2>
-  2686304494U,  // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4>
-  3632529510U,  // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS
-  2686451968U,  // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
-  2686525705U,  // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4>
-  3760341266U,  // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4>
-  3632532790U,  // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS
-  3913254606U,  // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5>
-  3705219740U,  // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7>
-  3713845990U,  // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0>
-  2686451968U,  // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
-  2552823910U,  // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS
-  2691907922U,  // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5>
-  2691907932U,  // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6>
-  3626567830U,  // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2>
-  2552827190U,  // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS
-  2631478582U,  // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS
-  3626570017U,  // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2>
-  3668374839U,  // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4>
-  2552829742U,  // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS
-  2558804070U,  // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS
-  1839644774U,  // <4,0,5,1>: Cost 2 vzipl RHS, LHS
-  2913386660U,  // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2>
-  2570750420U,  // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5>
-  2558807350U,  // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS
-  3987128750U,  // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7>
-  3987128822U,  // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7>
-  2594641208U,  // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5>
-  1839645341U,  // <4,0,5,u>: Cost 2 vzipl RHS, LHS
-  2552840294U,  // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS
-  3047604234U,  // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1>
-  1973862502U,  // <4,0,6,2>: Cost 2 vtrnl RHS, LHS
-  2570758613U,  // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6>
-  2552843574U,  // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS
-  2217664887U,  // <4,0,6,5>: Cost 3 vrev <0,4,5,6>
-  3662418528U,  // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6>
-  2658022257U,  // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0>
-  1973862556U,  // <4,0,6,u>: Cost 2 vtrnl RHS, LHS
-  3731764218U,  // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2>
-  3988324454U,  // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS
-  4122034278U,  // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS
-  3735082246U,  // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0>
-  3731764536U,  // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5>
-  3937145718U,  // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5>
-  3737073145U,  // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0>
-  3731764844U,  // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7>
-  4122034332U,  // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS
-  2552856678U,  // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS
-  1841635430U,  // <4,0,u,1>: Cost 2 vzipl RHS, LHS
-  1618166429U,  // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
-  2570774999U,  // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u>
-  2552859958U,  // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS
-  2631481498U,  // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS
-  2686157020U,  // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
-  2594665787U,  // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u>
-  1618166483U,  // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS
-  2617548837U,  // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1>
-  2622857318U,  // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS
-  3693281484U,  // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6>
-  2691908342U,  // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2>
-  2622857554U,  // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
-  3764470538U,  // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4>
-  3695272459U,  // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1>
-  3733094980U,  // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4>
-  2622857885U,  // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS
-  3696599798U,  // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2>
-  2691097399U,  // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4>
-  2631484314U,  // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
-  2691908424U,  // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3>
-  3696600125U,  // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5>
-  3696600175U,  // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1>
-  3696600307U,  // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7>
-  3668423997U,  // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1>
-  2691908469U,  // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3>
-  2570797158U,  // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS
-  2570797978U,  // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4>
-  3696600680U,  // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2>
-  1618166682U,  // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4>
-  2570800438U,  // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS
-  3765650347U,  // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3>
-  3696601018U,  // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7>
-  3668432190U,  // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2>
-  1618535367U,  // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4>
-  2564833382U,  // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
-  2691908568U,  // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
-  2691908578U,  // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
-  2692572139U,  // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4>
-  2564836662U,  // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
-  2691908608U,  // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
-  2588725862U,  // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
-  3662468090U,  // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2>
-  2691908631U,  // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3>
-  3760194590U,  // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1>
-  3693947874U,  // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0>
-  3765650484U,  // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5>
-  3113877606U,  // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS
-  3760194630U,  // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5>
-  2622860598U,  // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS
-  3297436759U,  // <4,1,4,6>: Cost 4 vrev <1,4,6,4>
-  3800007772U,  // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0>
-  2622860841U,  // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS
-  1479164006U,  // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
-  2552906486U,  // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2>
-  2552907299U,  // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5>
-  2552907926U,  // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2>
-  1479167286U,  // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
-  2913387664U,  // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7>
-  2600686074U,  // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3>
-  2600686586U,  // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
-  1479169838U,  // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
-  2552914022U,  // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS
-  2558886708U,  // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1>
-  4028205206U,  // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2>
-  3089858662U,  // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS
-  2552917302U,  // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS
-  2223637584U,  // <4,1,6,5>: Cost 3 vrev <1,4,5,6>
-  4121347081U,  // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7>
-  3721155406U,  // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1>
-  2552919854U,  // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS
-  2659357716U,  // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
-  3733763173U,  // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1>
-  3734426806U,  // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1>
-  2695226671U,  // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4>
-  3721155942U,  // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6>
-  3721155976U,  // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4>
-  3662500458U,  // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7>
-  3721156204U,  // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7>
-  2659357716U,  // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
-  1479188582U,  // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS
-  2552931062U,  // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2>
-  2552931944U,  // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2>
-  1622148480U,  // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4>
-  1479191862U,  // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
-  2622863514U,  // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS
-  2588725862U,  // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
-  2600686586U,  // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
-  1479194414U,  // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS
-  2617557030U,  // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2>
-  2622865510U,  // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS
-  2622865612U,  // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6>
-  3693289753U,  // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2>
-  2635473244U,  // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
-  3765650918U,  // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7>
-  2696775148U,  // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
-  3695944285U,  // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2>
-  2622866077U,  // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS
-  3696607990U,  // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2>
-  3696608052U,  // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1>
-  3696608150U,  // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0>
-  3895574630U,  // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS
-  2691909162U,  // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
-  3696608400U,  // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7>
-  3760784956U,  // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3>
-  3773908549U,  // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3>
-  2691909162U,  // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
-  3696608748U,  // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4>
-  3696608828U,  // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3>
-  2691909224U,  // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2>
-  2691909234U,  // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3>
-  3759605368U,  // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0>
-  3696609156U,  // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7>
-  3760785040U,  // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6>
-  3668505927U,  // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2>
-  2691909279U,  // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3>
-  2691909286U,  // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1>
-  3764840111U,  // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1>
-  3765651129U,  // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2>
-  2698544836U,  // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4>
-  2685863630U,  // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5>
-  2698692310U,  // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4>
-  3772507871U,  // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4>
-  2698839784U,  // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4>
-  2691909358U,  // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1>
-  2564915302U,  // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
-  2564916122U,  // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
-  2564917004U,  // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4>
-  2699208469U,  // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4>
-  2564918582U,  // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS
-  2622868790U,  // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS
-  2229667632U,  // <4,2,4,6>: Cost 3 vrev <2,4,6,4>
-  3800082229U,  // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0>
-  2622869033U,  // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS
-  2552979558U,  // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS
-  2558952342U,  // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0>
-  2564925032U,  // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2>
-  2967060582U,  // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS
-  2552982838U,  // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS
-  3987130190U,  // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7>
-  2913388474U,  // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7>
-  3895577910U,  // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS
-  2552985390U,  // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS
-  1479245926U,  // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS
-  2552988406U,  // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2>
-  2552989288U,  // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2>
-  2954461286U,  // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS
-  1479249206U,  // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
-  2229610281U,  // <4,2,6,5>: Cost 3 vrev <2,4,5,6>
-  2600767994U,  // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3>
-  2600768506U,  // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2>
-  1479251758U,  // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS
-  2659365909U,  // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
-  3733771366U,  // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2>
-  3734434999U,  // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2>
-  2701199368U,  // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4>
-  4175774618U,  // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4>
-  3303360298U,  // <4,2,7,5>: Cost 4 vrev <2,4,5,7>
-  3727136217U,  // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4>
-  3727136364U,  // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7>
-  2659365909U,  // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
-  1479262310U,  // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS
-  2553004790U,  // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2>
-  2553005672U,  // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2>
-  2954477670U,  // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS
-  1479265590U,  // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
-  2622871706U,  // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS
-  2229700404U,  // <4,2,u,6>: Cost 3 vrev <2,4,6,u>
-  2600784890U,  // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2>
-  1479268142U,  // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS
-  3765651595U,  // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0>
-  2691909782U,  // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2>
-  2702452897U,  // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4>
-  3693297946U,  // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3>
-  3760711856U,  // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1>
-  2235533820U,  // <4,3,0,5>: Cost 3 vrev <3,4,5,0>
-  3309349381U,  // <4,3,0,6>: Cost 4 vrev <3,4,6,0>
-  3668563278U,  // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0>
-  2691909845U,  // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2>
-  2235173328U,  // <4,3,1,0>: Cost 3 vrev <3,4,0,1>
-  3764840678U,  // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1>
-  2630173594U,  // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
-  2703190267U,  // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4>
-  3760195840U,  // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0>
-  3765651724U,  // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3>
-  3309357574U,  // <4,3,1,6>: Cost 4 vrev <3,4,6,1>
-  3769633054U,  // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3>
-  2703558952U,  // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4>
-  3626770534U,  // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS
-  2630174250U,  // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
-  3765651777U,  // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2>
-  2703853900U,  // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4>
-  3626773814U,  // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS
-  2704001374U,  // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4>
-  3765651814U,  // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3>
-  3769633135U,  // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3>
-  2634819681U,  // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3>
-  3765651839U,  // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1>
-  3765651848U,  // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1>
-  3710552404U,  // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3>
-  2691910044U,  // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
-  2704591270U,  // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4>
-  3769633202U,  // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7>
-  3703917212U,  // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7>
-  3769633220U,  // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7>
-  2691910044U,  // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
-  2691910096U,  // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
-  2691910106U,  // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
-  2564990741U,  // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4>
-  3765651946U,  // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0>
-  2691910136U,  // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5>
-  2686454274U,  // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6>
-  2235640329U,  // <4,3,4,6>: Cost 3 vrev <3,4,6,4>
-  3801483792U,  // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2>
-  2691910168U,  // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1>
-  2559025254U,  // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS
-  2559026237U,  // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5>
-  2564998862U,  // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5>
-  2570971548U,  // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3>
-  2559028534U,  // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS
-  4163519477U,  // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5>
-  3309390346U,  // <4,3,5,6>: Cost 4 vrev <3,4,6,5>
-  2706139747U,  // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4>
-  2559031086U,  // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS
-  2559033446U,  // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS
-  2559034430U,  // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6>
-  2565007127U,  // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6>
-  2570979740U,  // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3>
-  2559036726U,  // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS
-  1161841154U,  // <4,3,6,5>: Cost 2 vrev <3,4,5,6>
-  4028203932U,  // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6>
-  2706803380U,  // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4>
-  1162062365U,  // <4,3,6,u>: Cost 2 vrev <3,4,u,6>
-  3769633475U,  // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1>
-  3769633488U,  // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5>
-  3638757144U,  // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7>
-  3769633508U,  // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7>
-  3769633515U,  // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5>
-  3769633526U,  // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7>
-  3662647932U,  // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7>
-  3781208837U,  // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4>
-  3769633547U,  // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1>
-  2559049830U,  // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS
-  2691910430U,  // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2>
-  2565023513U,  // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u>
-  2707835698U,  // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4>
-  2559053110U,  // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS
-  1161857540U,  // <4,3,u,5>: Cost 2 vrev <3,4,5,u>
-  2235673101U,  // <4,3,u,6>: Cost 3 vrev <3,4,6,u>
-  2708130646U,  // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4>
-  1162078751U,  // <4,3,u,u>: Cost 2 vrev <3,4,u,u>
-  2617573416U,  // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4>
-  1570373734U,  // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS
-  2779676774U,  // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS
-  3760196480U,  // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1>
-  2576977100U,  // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0>
-  2718747538U,  // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1>
-  2718747548U,  // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2>
-  3668637015U,  // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0>
-  1570374301U,  // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS
-  2644116214U,  // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2>
-  2644116276U,  // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1>
-  2691910602U,  // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
-  2644116440U,  // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3>
-  2711227356U,  // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3>
-  2709310438U,  // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4>
-  3765652462U,  // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3>
-  3768970231U,  // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3>
-  2695891968U,  // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3>
-  3703260634U,  // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4>
-  3765652499U,  // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4>
-  2644117096U,  // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2>
-  2631509709U,  // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4>
-  2644117269U,  // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
-  3705251698U,  // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
-  2710047808U,  // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
-  3783863369U,  // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
-  2634827874U,  // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
-  2644117654U,  // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2>
-  3638797210U,  // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4>
-  3638798082U,  // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3>
-  2637482406U,  // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
-  2638146039U,  // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4>
-  3913287374U,  // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5>
-  3765652625U,  // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4>
-  3713878762U,  // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4>
-  2637482406U,  // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
-  1503264870U,  // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
-  2577007514U,  // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4>
-  2577008232U,  // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2>
-  2571037175U,  // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4>
-   161926454U,  // <4,4,4,4>: Cost 1 vdup0 RHS
-  1570377014U,  // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS
-  2779680054U,  // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS
-  2594927963U,  // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
-   161926454U,  // <4,4,4,u>: Cost 1 vdup0 RHS
-  2571042918U,  // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS
-  2571043738U,  // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4>
-  3638814495U,  // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5>
-  2571045368U,  // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5>
-  2571046198U,  // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS
-  1839648054U,  // <4,4,5,5>: Cost 2 vzipl RHS, RHS
-  1618169142U,  // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
-  2594936156U,  // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5>
-  1618169160U,  // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
-  2553135206U,  // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS
-  3626877686U,  // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2>
-  2565080782U,  // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5>
-  2571053561U,  // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6>
-  2553138486U,  // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS
-  2241555675U,  // <4,4,6,5>: Cost 3 vrev <4,4,5,6>
-  1973865782U,  // <4,4,6,6>: Cost 2 vtrnl RHS, RHS
-  2658055029U,  // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4>
-  1973865800U,  // <4,4,6,u>: Cost 2 vtrnl RHS, RHS
-  2644120570U,  // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2>
-  3638829978U,  // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4>
-  3638830881U,  // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7>
-  3735115018U,  // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4>
-  2662036827U,  // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
-  2713292236U,  // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4>
-  2713365973U,  // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4>
-  2644121196U,  // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7>
-  2662036827U,  // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
-  1503297638U,  // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS
-  1570379566U,  // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS
-  2779682606U,  // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS
-  2571069947U,  // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u>
-   161926454U,  // <4,4,u,4>: Cost 1 vdup0 RHS
-  1841638710U,  // <4,4,u,5>: Cost 2 vzipl RHS, RHS
-  1618169385U,  // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
-  2594960735U,  // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u>
-   161926454U,  // <4,4,u,u>: Cost 1 vdup0 RHS
-  2631516160U,  // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0>
-  1557774438U,  // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS
-  2618908875U,  // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5>
-  2571078140U,  // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0>
-  2626871634U,  // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5>
-  3705258414U,  // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7>
-  2594968438U,  // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5>
-  2594968928U,  // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0>
-  1557775005U,  // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS
-  2631516918U,  // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2>
-  2624217939U,  // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5>
-  2631517078U,  // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0>
-  2821341286U,  // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS
-  3895086054U,  // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4>
-  2626872471U,  // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5>
-  3895083131U,  // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6>
-  2718748368U,  // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3>
-  2821341291U,  // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS
-  2571092070U,  // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS
-  3699287585U,  // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3>
-  2630854269U,  // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5>
-  1557776078U,  // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5>
-  2631517974U,  // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5>
-  3692652384U,  // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7>
-  2631518138U,  // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7>
-  4164013366U,  // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS
-  1561094243U,  // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5>
-  2631518358U,  // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2>
-  3895084710U,  // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1>
-  2631518540U,  // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4>
-  2631518620U,  // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3>
-  2631518716U,  // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0>
-  2631518784U,  // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5>
-  2658060980U,  // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4>
-  2640145131U,  // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5>
-  2631519006U,  // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2>
-  2571108454U,  // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS
-  3632907342U,  // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4>
-  2571110094U,  // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5>
-  2571110912U,  // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4>
-  2571111734U,  // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS
-  1557777718U,  // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS
-  2645454195U,  // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5>
-  2718748614U,  // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6>
-  1557777961U,  // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS
-  1503346790U,  // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
-  2913398480U,  // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3>
-  2631519998U,  // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4>
-  2577090710U,  // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2>
-  1503349978U,  // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5>
-  2631520260U,  // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5>
-  2913390690U,  // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0>
-  2821344566U,  // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS
-  1503352622U,  // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS
-  1497383014U,  // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS
-  2559181904U,  // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6>
-  2565154601U,  // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6>
-  1497385474U,  // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6>
-  1497386294U,  // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS
-  3047608324U,  // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5>
-  2571129656U,  // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6>
-    27705344U,  // <4,5,6,7>: Cost 0 copy RHS
-    27705344U,  // <4,5,6,u>: Cost 0 copy RHS
-  2565161062U,  // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS
-  2565161882U,  // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4>
-  2565162794U,  // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7>
-  2661381387U,  // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5>
-  2565164342U,  // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS
-  2718748840U,  // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7>
-  2718748846U,  // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4>
-  2719412407U,  // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4>
-  2565166894U,  // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS
-  1497399398U,  // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS
-  1557780270U,  // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS
-  2631522181U,  // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0>
-  1497401860U,  // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u>
-  1497402678U,  // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS
-  1557780634U,  // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS
-  2631522512U,  // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7>
-    27705344U,  // <4,5,u,7>: Cost 0 copy RHS
-    27705344U,  // <4,5,u,u>: Cost 0 copy RHS
-  2618916864U,  // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0>
-  1545175142U,  // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
-  1545175244U,  // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6>
-  3692658940U,  // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0>
-  2618917202U,  // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5>
-  3852910806U,  // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7>
-  2253525648U,  // <4,6,0,6>: Cost 3 vrev <6,4,6,0>
-  4040764726U,  // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS
-  1545175709U,  // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS
-  2618917622U,  // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
-  2618917684U,  // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1>
-  2618917782U,  // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0>
-  2618917848U,  // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3>
-  3692659773U,  // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5>
-  2618918032U,  // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7>
-  3692659937U,  // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7>
-  4032146742U,  // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS
-  2618918253U,  // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3>
-  2618918380U,  // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4>
-  2618918460U,  // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3>
-  2618918504U,  // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2>
-  2618918566U,  // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1>
-  2618918679U,  // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6>
-  2618918788U,  // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7>
-  2618918842U,  // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7>
-  2718749178U,  // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3>
-  2618918971U,  // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1>
-  2618919062U,  // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
-  2636171526U,  // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6>
-  3692661057U,  // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2>
-  2618919324U,  // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3>
-  2618919426U,  // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
-  2638826058U,  // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6>
-  3913303030U,  // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6>
-  2722730572U,  // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4>
-  2618919710U,  // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2>
-  2565210214U,  // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS
-  2718749286U,  // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3>
-  2565211952U,  // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4>
-  2571184649U,  // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4>
-  2565213494U,  // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS
-  1545178422U,  // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
-  1705430326U,  // <4,6,4,6>: Cost 2 vuzpl RHS, RHS
-  2595075437U,  // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4>
-  1545178665U,  // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS
-  2565218406U,  // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS
-  2645462736U,  // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3>
-  2913399290U,  // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3>
-  3913305394U,  // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3>
-  2645462982U,  // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6>
-  2779172868U,  // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5>
-  2913391416U,  // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6>
-  2821426486U,  // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS
-  2821426487U,  // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS
-  1503428710U,  // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS
-  2577171190U,  // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2>
-  2645463546U,  // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3>
-  2577172630U,  // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2>
-  1503431908U,  // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6>
-  2253501069U,  // <4,6,6,5>: Cost 3 vrev <6,4,5,6>
-  2618921784U,  // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6>
-  2954464566U,  // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS
-  1503434542U,  // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS
-  2645464058U,  // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2>
-  2779173882U,  // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2>
-  3638978355U,  // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7>
-  2725090156U,  // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4>
-  2645464422U,  // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6>
-  2779174246U,  // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
-  3852915914U,  // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3>
-  2779174508U,  // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
-  2779173945U,  // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2>
-  1503445094U,  // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS
-  1545180974U,  // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
-  1705432878U,  // <4,6,u,2>: Cost 2 vuzpl RHS, LHS
-  2618922940U,  // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1>
-  1503448294U,  // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u>
-  1545181338U,  // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
-  1705433242U,  // <4,6,u,6>: Cost 2 vuzpl RHS, RHS
-  2954480950U,  // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS
-  1545181541U,  // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS
-  3706601472U,  // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0>
-  2632859750U,  // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS
-  2726343685U,  // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4>
-  3701293312U,  // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4>
-  3706601810U,  // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5>
-  2259424608U,  // <4,7,0,5>: Cost 3 vrev <7,4,5,0>
-  3695321617U,  // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7>
-  3800454194U,  // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4>
-  2632860317U,  // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS
-  2259064116U,  // <4,7,1,0>: Cost 3 vrev <7,4,0,1>
-  3700630324U,  // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1>
-  2632860570U,  // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
-  3769635936U,  // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5>
-  3656920374U,  // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS
-  3700630681U,  // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7>
-  3701294314U,  // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7>
-  3793818754U,  // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3>
-  2259654012U,  // <4,7,1,u>: Cost 3 vrev <7,4,u,1>
-  3656925286U,  // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS
-  3706603050U,  // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3>
-  3706603112U,  // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2>
-  2727744688U,  // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4>
-  3705939745U,  // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7>
-  2632861554U,  // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7>
-  3706603450U,  // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7>
-  3792491731U,  // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3>
-  2634852453U,  // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7>
-  3706603670U,  // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2>
-  3662906266U,  // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4>
-  3725183326U,  // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4>
-  3706603932U,  // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3>
-  3701295618U,  // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6>
-  2638834251U,  // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7>
-  2639497884U,  // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7>
-  3802445093U,  // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4>
-  2640825150U,  // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7>
-  2718750004U,  // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1>
-  3706604490U,  // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3>
-  3656943474U,  // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7>
-  3779884371U,  // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5>
-  2259383643U,  // <4,7,4,4>: Cost 3 vrev <7,4,4,4>
-  2632863030U,  // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS
-  2259531117U,  // <4,7,4,6>: Cost 3 vrev <7,4,6,4>
-  3907340074U,  // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7>
-  2632863273U,  // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS
-  2913391610U,  // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2>
-  3645006848U,  // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7>
-  2589181646U,  // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5>
-  3645008403U,  // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5>
-  2913391974U,  // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6>
-  2583211973U,  // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5>
-  2589184670U,  // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5>
-  2913392236U,  // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7>
-  2913392258U,  // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2>
-  1509474406U,  // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS
-  3047609338U,  // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2>
-  2583217768U,  // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2>
-  2583218326U,  // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2>
-  1509477686U,  // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS
-  1509478342U,  // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6>
-  2583220730U,  // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3>
-  3047609964U,  // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7>
-  1509480238U,  // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS
-  3650994278U,  // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS
-  3650995098U,  // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4>
-  3650996010U,  // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7>
-  3804804677U,  // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4>
-  3650997486U,  // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7>
-  2662725039U,  // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7>
-  3662942880U,  // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7>
-  2718750316U,  // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7>
-  2664715938U,  // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7>
-  1509490790U,  // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS
-  2632865582U,  // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS
-  2583234152U,  // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2>
-  2583234710U,  // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2>
-  1509494070U,  // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS
-  1509494728U,  // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u>
-  2583237114U,  // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3>
-  3047757420U,  // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7>
-  1509496622U,  // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS
-  2618933248U,  // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0>
-  1545191526U,  // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS
-  1545191630U,  // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u>
-  2691913445U,  // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2>
-  2618933586U,  // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5>
-  2265397305U,  // <4,u,0,5>: Cost 3 vrev <u,4,5,0>
-  2595189625U,  // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u>
-  2595190139U,  // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0>
-  1545192093U,  // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS
-  2618934006U,  // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2>
-  2618934068U,  // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1>
-  1618171694U,  // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
-  2618934232U,  // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3>
-  2695894848U,  // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3>
-  2618934416U,  // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7>
-  3692676321U,  // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7>
-  2718750555U,  // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3>
-  1618171748U,  // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
-  2553397350U,  // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS
-  2630215215U,  // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u>
-  2618934888U,  // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2>
-  1557800657U,  // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u>
-  2618935065U,  // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u>
-  2733864859U,  // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4>
-  2618935226U,  // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7>
-  2718750636U,  // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3>
-  1561118822U,  // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u>
-  2618935446U,  // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2>
-  2779318422U,  // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2>
-  2636851545U,  // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u>
-  2618935708U,  // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3>
-  2618935810U,  // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6>
-  2691913711U,  // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7>
-  2588725862U,  // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
-  2640169710U,  // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u>
-  2618936094U,  // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2>
-  1503559782U,  // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS
-  2692282391U,  // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2>
-  2565359426U,  // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4>
-  2571332123U,  // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4>
-   161926454U,  // <4,u,4,4>: Cost 1 vdup0 RHS
-  1545194806U,  // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS
-  1705577782U,  // <4,u,4,6>: Cost 2 vuzpl RHS, RHS
-  2718750801U,  // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6>
-   161926454U,  // <4,u,4,u>: Cost 1 vdup0 RHS
-  1479164006U,  // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
-  1839650606U,  // <4,u,5,1>: Cost 2 vzipl RHS, LHS
-  2565367502U,  // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5>
-  3089777309U,  // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS
-  1479167286U,  // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
-  1839650970U,  // <4,u,5,5>: Cost 2 vzipl RHS, RHS
-  1618172058U,  // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
-  3089780265U,  // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS
-  1618172076U,  // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
-  1479688294U,  // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS
-  2553430774U,  // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2>
-  1973868334U,  // <4,u,6,2>: Cost 2 vtrnl RHS, LHS
-  1497606685U,  // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6>
-  1479691574U,  // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS
-  1509552079U,  // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6>
-  1973868698U,  // <4,u,6,6>: Cost 2 vtrnl RHS, RHS
-    27705344U,  // <4,u,6,7>: Cost 0 copy RHS
-    27705344U,  // <4,u,6,u>: Cost 0 copy RHS
-  2565382246U,  // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS
-  2565383066U,  // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4>
-  2565384005U,  // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7>
-  2661405966U,  // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u>
-  2565385526U,  // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS
-  2779321702U,  // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
-  2589274793U,  // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7>
-  2779321964U,  // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
-  2565388078U,  // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS
-  1479704678U,  // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS
-  1545197358U,  // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS
-  1618172261U,  // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
-  1497623071U,  // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u>
-   161926454U,  // <4,u,u,4>: Cost 1 vdup0 RHS
-  1545197722U,  // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS
-  1618172301U,  // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
-    27705344U,  // <4,u,u,7>: Cost 0 copy RHS
-    27705344U,  // <4,u,u,u>: Cost 0 copy RHS
-  2687123456U,  // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0>
-  2687123466U,  // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1>
-  2687123476U,  // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2>
-  3710599434U,  // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5>
-  2642166098U,  // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5>
-  3657060306U,  // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0>
-  3292094923U,  // <5,0,0,6>: Cost 4 vrev <0,5,6,0>
-  3669005700U,  // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0>
-  2687123530U,  // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2>
-  2559434854U,  // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS
-  2559435887U,  // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1>
-  1613381734U,  // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
-  3698656256U,  // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7>
-  2559438134U,  // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS
-  2583326675U,  // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1>
-  3715908851U,  // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7>
-  3657069562U,  // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2>
-  1613381788U,  // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
-  2686017700U,  // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2>
-  2685796528U,  // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
-  2698625208U,  // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4>
-  2685944002U,  // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5>
-  2686017739U,  // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5>
-  2686091476U,  // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5>
-  2725167324U,  // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4>
-  2595280230U,  // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
-  2686312687U,  // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5>
-  3760128248U,  // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5>
-  3759685888U,  // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4>
-  2686533898U,  // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5>
-  3760349459U,  // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5>
-  2638187004U,  // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0>
-  3776348452U,  // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4>
-  3713256094U,  // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0>
-  3914064896U,  // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7>
-  2686976320U,  // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5>
-  2559459430U,  // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS
-  1613381970U,  // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
-  2687123804U,  // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6>
-  3761013092U,  // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5>
-  2559462710U,  // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS
-  2638187830U,  // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS
-  3761234303U,  // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5>
-  2646150600U,  // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
-  1613381970U,  // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
-  3766763926U,  // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1>
-  2919268454U,  // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS
-  3053486182U,  // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS
-  3723210589U,  // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0>
-  3766763966U,  // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5>
-  2650796031U,  // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0>
-  3719893090U,  // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0>
-  3914067254U,  // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS
-  2919269021U,  // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS
-  4047519744U,  // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0>
-  2920038502U,  // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
-  3759759871U,  // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7>
-  3645164070U,  // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6>
-  3762414095U,  // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5>
-  3993780690U,  // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7>
-  3719893816U,  // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6>
-  2662077302U,  // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5>
-  2920039069U,  // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS
-  2565455974U,  // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS
-  2565456790U,  // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0>
-  2565457742U,  // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7>
-  3639199894U,  // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2>
-  2565459254U,  // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS
-  2589347938U,  // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0>
-  2589348530U,  // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7>
-  4188456422U,  // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7>
-  2565461806U,  // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS
-  2687124106U,  // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2>
-  1616036502U,  // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5>
-  1613382301U,  // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
-  2689925800U,  // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5>
-  2687124146U,  // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6>
-  2638190746U,  // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS
-  2589356723U,  // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u>
-  2595280230U,  // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
-  1613382355U,  // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS
-  2646818816U,  // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0>
-  1573077094U,  // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS
-  2646818980U,  // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2>
-  2687124214U,  // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2>
-  2641510738U,  // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5>
-  2641510814U,  // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0>
-  3720561142U,  // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7>
-  3298141357U,  // <5,1,0,7>: Cost 4 vrev <1,5,7,0>
-  1573077661U,  // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS
-  2223891567U,  // <5,1,1,0>: Cost 3 vrev <1,5,0,1>
-  2687124276U,  // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1>
-  2646819734U,  // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0>
-  2687124296U,  // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3>
-  2691326803U,  // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5>
-  2691400540U,  // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5>
-  3765216101U,  // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5>
-  3765289838U,  // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5>
-  2687124341U,  // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3>
-  3297641584U,  // <5,1,2,0>: Cost 4 vrev <1,5,0,2>
-  3763520391U,  // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3>
-  2646820456U,  // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2>
-  2687124374U,  // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0>
-  2691990436U,  // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5>
-  2687124395U,  // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3>
-  2646820794U,  // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7>
-  3808199610U,  // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0>
-  2687124419U,  // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0>
-  2577440870U,  // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS
-  2687124440U,  // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3>
-  3759686627U,  // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5>
-  2692580332U,  // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5>
-  2687124469U,  // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5>
-  2685207552U,  // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7>
-  3760866313U,  // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7>
-  2692875280U,  // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5>
-  2687124503U,  // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3>
-  1567771538U,  // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1>
-  2693096491U,  // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5>
-  2693170228U,  // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5>
-  2687124541U,  // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5>
-  2646822096U,  // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4>
-  1573080374U,  // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS
-  2646822260U,  // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6>
-  3298174129U,  // <5,1,4,7>: Cost 4 vrev <1,5,7,4>
-  1573080602U,  // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1>
-  2687124591U,  // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1>
-  2646822543U,  // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1>
-  3760866433U,  // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1>
-  2687124624U,  // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7>
-  2687124631U,  // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5>
-  2646822916U,  // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5>
-  2646823010U,  // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0>
-  2646823080U,  // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7>
-  2687124663U,  // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1>
-  2553577574U,  // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS
-  3763520719U,  // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7>
-  2646823418U,  // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3>
-  3760866529U,  // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7>
-  2553580854U,  // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS
-  2687124723U,  // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7>
-  2646823736U,  // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6>
-  2646823758U,  // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1>
-  2646823839U,  // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1>
-  2559557734U,  // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS
-  2559558452U,  // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1>
-  2571503270U,  // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1>
-  2040971366U,  // <5,1,7,3>: Cost 2 vtrnr RHS, LHS
-  2559561014U,  // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS
-  2595393232U,  // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3>
-  4188455035U,  // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6>
-  2646824556U,  // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7>
-  2040971371U,  // <5,1,7,u>: Cost 2 vtrnr RHS, LHS
-  1591662326U,  // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1>
-  1573082926U,  // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS
-  2695824760U,  // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5>
-  2040979558U,  // <5,1,u,3>: Cost 2 vtrnr RHS, LHS
-  2687124874U,  // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5>
-  1573083290U,  // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS
-  2646825168U,  // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7>
-  2646825216U,  // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1>
-  2040979563U,  // <5,1,u,u>: Cost 2 vtrnr RHS, LHS
-  3702652928U,  // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0>
-  2628911206U,  // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS
-  2641518756U,  // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2>
-  3759760847U,  // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2>
-  3760866775U,  // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1>
-  3759539680U,  // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1>
-  3760866796U,  // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4>
-  3304114054U,  // <5,2,0,7>: Cost 4 vrev <2,5,7,0>
-  2628911773U,  // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS
-  2623603464U,  // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2>
-  3698008921U,  // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2>
-  3633325603U,  // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5>
-  2687125027U,  // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5>
-  3633327414U,  // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS
-  3759539760U,  // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0>
-  3760866876U,  // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3>
-  3304122247U,  // <5,2,1,7>: Cost 4 vrev <2,5,7,1>
-  2687125072U,  // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5>
-  3633332326U,  // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS
-  3759760992U,  // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3>
-  2687125096U,  // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2>
-  2687125106U,  // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3>
-  2697963133U,  // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5>
-  3759466120U,  // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7>
-  3760866960U,  // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6>
-  3771926168U,  // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5>
-  2687125151U,  // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3>
-  2687125158U,  // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1>
-  2698405555U,  // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5>
-  2577516238U,  // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5>
-  3759687365U,  // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5>
-  1624884942U,  // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5>
-  2698700503U,  // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5>
-  3772368608U,  // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5>
-  3702655716U,  // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7>
-  1625179890U,  // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5>
-  2641521555U,  // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2>
-  3772368642U,  // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3>
-  2699142925U,  // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5>
-  2698626838U,  // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5>
-  2698626848U,  // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6>
-  2628914486U,  // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS
-  2645503353U,  // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2>
-  3304146826U,  // <5,2,4,7>: Cost 4 vrev <2,5,7,4>
-  2628914729U,  // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS
-  2553643110U,  // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS
-  3758950227U,  // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3>
-  3759761248U,  // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7>
-  2982396006U,  // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS
-  2553646390U,  // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS
-  2553647108U,  // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5>
-  3760867204U,  // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7>
-  3702657141U,  // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1>
-  2982396011U,  // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS
-  3627393126U,  // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS
-  3760867236U,  // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3>
-  2645504506U,  // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3>
-  2687125434U,  // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7>
-  2700617665U,  // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5>
-  3760867276U,  // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7>
-  3763521493U,  // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7>
-  3719246670U,  // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1>
-  2687125479U,  // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7>
-  2565603430U,  // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS
-  2553660150U,  // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2>
-  2565605216U,  // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7>
-  2961178726U,  // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS
-  2565606710U,  // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS
-  4034920552U,  // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5>
-  3114713292U,  // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6>
-  3702658668U,  // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7>
-  2961178731U,  // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS
-  2687125563U,  // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1>
-  2628917038U,  // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS
-  2565613409U,  // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u>
-  2687125592U,  // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3>
-  1628203107U,  // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5>
-  2628917402U,  // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS
-  2702092405U,  // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5>
-  3304179598U,  // <5,2,u,7>: Cost 4 vrev <2,5,7,u>
-  1628498055U,  // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5>
-  3760867467U,  // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0>
-  2687125654U,  // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2>
-  3759761565U,  // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0>
-  3633391766U,  // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2>
-  2687125680U,  // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1>
-  3760277690U,  // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2>
-  3310013014U,  // <5,3,0,6>: Cost 4 vrev <3,5,6,0>
-  2236344927U,  // <5,3,0,7>: Cost 3 vrev <3,5,7,0>
-  2687125717U,  // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2>
-  3760867551U,  // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3>
-  3760867558U,  // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1>
-  2624938923U,  // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3>
-  2703198460U,  // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5>
-  3760867587U,  // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3>
-  2636219536U,  // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7>
-  3698681075U,  // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7>
-  2703493408U,  // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5>
-  2628920721U,  // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3>
-  3766765870U,  // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1>
-  3698681379U,  // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5>
-  3760867649U,  // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2>
-  2698627404U,  // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4>
-  2703935830U,  // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5>
-  2698627422U,  // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4>
-  3760867686U,  // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3>
-  3769788783U,  // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3>
-  2701945209U,  // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4>
-  3760867711U,  // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1>
-  2636220684U,  // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3>
-  3772369298U,  // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2>
-  2687125916U,  // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3>
-  2704599463U,  // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5>
-  2704673200U,  // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5>
-  3709962935U,  // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7>
-  3772369346U,  // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5>
-  2704894411U,  // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5>
-  2704968148U,  // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5>
-  3698682850U,  // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0>
-  2642857014U,  // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3>
-  2705189359U,  // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5>
-  2705263096U,  // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5>
-  2685946370U,  // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6>
-  3779152394U,  // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5>
-  2236377699U,  // <5,3,4,7>: Cost 3 vrev <3,5,7,4>
-  2687126045U,  // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6>
-  2571632742U,  // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS
-  2559689870U,  // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5>
-  2571634382U,  // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5>
-  2571635264U,  // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5>
-  2571636022U,  // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS
-  2559692804U,  // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5>
-  3720581218U,  // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0>
-  2236385892U,  // <5,3,5,7>: Cost 3 vrev <3,5,7,5>
-  2571638574U,  // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS
-  2565668966U,  // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS
-  3633439887U,  // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6>
-  2565670760U,  // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6>
-  2565671426U,  // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6>
-  2565672246U,  // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS
-  3639414630U,  // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0>
-  4047521640U,  // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6>
-  2725169844U,  // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4>
-  2565674798U,  // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS
-  1485963366U,  // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS
-  1485964432U,  // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7>
-  2559706728U,  // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2>
-  2559707286U,  // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2>
-  1485966646U,  // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS
-  2559708880U,  // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3>
-  2601513466U,  // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3>
-  3114714112U,  // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7>
-  1485969198U,  // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS
-  1485971558U,  // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS
-  1485972625U,  // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u>
-  2559714920U,  // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2>
-  2559715478U,  // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2>
-  1485974838U,  // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS
-  2687126342U,  // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6>
-  2601521658U,  // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3>
-  2236410471U,  // <5,3,u,7>: Cost 3 vrev <3,5,7,u>
-  1485977390U,  // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS
-  3627491430U,  // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS
-  2636890214U,  // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS
-  3703333028U,  // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2>
-  3782249348U,  // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5>
-  2642198866U,  // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5>
-  2687126418U,  // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1>
-  2242243887U,  // <5,4,0,6>: Cost 3 vrev <4,5,6,0>
-  3316059448U,  // <5,4,0,7>: Cost 4 vrev <4,5,7,0>
-  2636890781U,  // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS
-  2241809658U,  // <5,4,1,0>: Cost 3 vrev <4,5,0,1>
-  3698025307U,  // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4>
-  3698688940U,  // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4>
-  3698689024U,  // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7>
-  3700016206U,  // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4>
-  2687126498U,  // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0>
-  3760868336U,  // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5>
-  3316067641U,  // <5,4,1,7>: Cost 4 vrev <4,5,7,1>
-  2242399554U,  // <5,4,1,u>: Cost 3 vrev <4,5,u,1>
-  3703334371U,  // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4>
-  3703998004U,  // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4>
-  3704661637U,  // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4>
-  2636891854U,  // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5>
-  3705988903U,  // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4>
-  2698628150U,  // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3>
-  3760868415U,  // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3>
-  3783871562U,  // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5>
-  2666752099U,  // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5>
-  3639459942U,  // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS
-  3709970701U,  // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4>
-  2636892510U,  // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4>
-  3710634396U,  // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3>
-  2638219776U,  // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4>
-  3766987908U,  // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0>
-  2710719634U,  // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5>
-  3914097664U,  // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7>
-  2640874308U,  // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4>
-  2583642214U,  // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS
-  2642201574U,  // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4>
-  3710635062U,  // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3>
-  3717270664U,  // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4>
-  2713963728U,  // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4>
-  1637567706U,  // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5>
-  2242276659U,  // <5,4,4,6>: Cost 3 vrev <4,5,6,4>
-  2646183372U,  // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4>
-  1637788917U,  // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5>
-  2559762534U,  // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS
-  2559763607U,  // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5>
-  2698628366U,  // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3>
-  3633506454U,  // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2>
-  2559765814U,  // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS
-  2583654395U,  // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5>
-  1613385014U,  // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
-  3901639990U,  // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS
-  1613385032U,  // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS
-  2559770726U,  // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS
-  2559771648U,  // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7>
-  3633514088U,  // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2>
-  2571717122U,  // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6>
-  2559774006U,  // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS
-  2712636796U,  // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5>
-  3760868743U,  // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7>
-  2712784270U,  // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5>
-  2559776558U,  // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS
-  2565750886U,  // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS
-  2565751706U,  // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4>
-  2565752690U,  // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7>
-  2571725387U,  // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7>
-  2565754166U,  // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS
-  3114713426U,  // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5>
-    94817590U,  // <5,4,7,6>: Cost 1 vrev RHS
-  2595616175U,  // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7>
-    94965064U,  // <5,4,7,u>: Cost 1 vrev RHS
-  2559787110U,  // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS
-  2559788186U,  // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u>
-  2242014483U,  // <5,4,u,2>: Cost 3 vrev <4,5,2,u>
-  2667419628U,  // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4>
-  2559790390U,  // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS
-  1640222238U,  // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5>
-    94825783U,  // <5,4,u,6>: Cost 1 vrev RHS
-  2714111536U,  // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5>
-    94973257U,  // <5,4,u,u>: Cost 1 vrev RHS
-  2646851584U,  // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0>
-  1573109862U,  // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS
-  2646851748U,  // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2>
-  3760279130U,  // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2>
-  2687127138U,  // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1>
-  2248142847U,  // <5,5,0,5>: Cost 3 vrev <5,5,5,0>
-  3720593910U,  // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7>
-  4182502710U,  // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS
-  1573110429U,  // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS
-  2646852342U,  // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2>
-  2624291676U,  // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5>
-  2646852502U,  // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0>
-  2646852568U,  // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3>
-  2715217591U,  // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5>
-  2628936848U,  // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7>
-  3698033907U,  // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7>
-  2713964240U,  // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3>
-  2628937107U,  // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5>
-  3645497446U,  // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS
-  3760869099U,  // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3>
-  2646853224U,  // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2>
-  2698628862U,  // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4>
-  3772370694U,  // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3>
-  2713964303U,  // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3>
-  2646853562U,  // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7>
-  4038198272U,  // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7>
-  2701946667U,  // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4>
-  2646853782U,  // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2>
-  3698034922U,  // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5>
-  3702679919U,  // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3>
-  2637564336U,  // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5>
-  2646854146U,  // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6>
-  2638891602U,  // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5>
-  3702680247U,  // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7>
-  3702680259U,  // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1>
-  2646854430U,  // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2>
-  2646854546U,  // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1>
-  2642209767U,  // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5>
-  3711306806U,  // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3>
-  3645516369U,  // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4>
-  1570458842U,  // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
-  1573113142U,  // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS
-  2645527932U,  // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5>
-  2713964486U,  // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6>
-  1573113374U,  // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5>
-  1509982310U,  // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
-  2646855376U,  // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3>
-  2583725672U,  // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2>
-  2583726230U,  // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2>
-  1509985590U,  // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
-   229035318U,  // <5,5,5,5>: Cost 1 vdup1 RHS
-  2646855778U,  // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0>
-  2646855848U,  // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7>
-   229035318U,  // <5,5,5,u>: Cost 1 vdup1 RHS
-  2577760358U,  // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS
-  3633587361U,  // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6>
-  2646856186U,  // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3>
-  3633588738U,  // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6>
-  2718535756U,  // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5>
-  2644202223U,  // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5>
-  2973780482U,  // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6>
-  2646856526U,  // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1>
-  2646856607U,  // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1>
-  2571796582U,  // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS
-  3633595392U,  // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7>
-  2571798222U,  // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5>
-  2571799124U,  // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7>
-  2571799862U,  // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS
-  3114717188U,  // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5>
-  4034923010U,  // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6>
-  2040974646U,  // <5,5,7,7>: Cost 2 vtrnr RHS, RHS
-  2040974647U,  // <5,5,7,u>: Cost 2 vtrnr RHS, RHS
-  1509982310U,  // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS
-  1573115694U,  // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS
-  2571806414U,  // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5>
-  2571807317U,  // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u>
-  1509985590U,  // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS
-   229035318U,  // <5,5,u,5>: Cost 1 vdup1 RHS
-  2646857936U,  // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7>
-  2040982838U,  // <5,5,u,7>: Cost 2 vtrnr RHS, RHS
-   229035318U,  // <5,5,u,u>: Cost 1 vdup1 RHS
-  2638233600U,  // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0>
-  1564491878U,  // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS
-  2632261796U,  // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2>
-  2638233856U,  // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4>
-  2638233938U,  // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5>
-  3706003885U,  // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6>
-  3706003967U,  // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7>
-  4047473974U,  // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS
-  1564492445U,  // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS
-  2638234358U,  // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2>
-  2638234420U,  // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1>
-  2638234518U,  // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0>
-  2638234584U,  // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3>
-  2626290768U,  // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6>
-  2638234768U,  // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7>
-  3700032719U,  // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7>
-  2982366518U,  // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
-  2628945300U,  // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6>
-  3706004925U,  // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2>
-  3711976966U,  // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3>
-  2638235240U,  // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2>
-  2638235302U,  // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1>
-  2632263465U,  // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6>
-  2638235496U,  // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6>
-  2638235578U,  // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7>
-  2713965050U,  // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3>
-  2634917997U,  // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6>
-  2638235798U,  // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2>
-  3711977695U,  // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3>
-  3710650720U,  // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6>
-  2638236060U,  // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3>
-  1564494338U,  // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6>
-  2638236234U,  // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6>
-  3711978104U,  // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7>
-  4034227510U,  // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS
-  1567148870U,  // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6>
-  2577817702U,  // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS
-  3700034544U,  // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5>
-  2723033713U,  // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5>
-  2638236818U,  // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5>
-  2644208859U,  // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6>
-  1564495158U,  // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS
-  2645536125U,  // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6>
-  2723402398U,  // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5>
-  1564495401U,  // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS
-  2577825894U,  // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS
-  2662125264U,  // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3>
-  3775836867U,  // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6>
-  3711979343U,  // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4>
-  2650181556U,  // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6>
-  2662125572U,  // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5>
-  2638237732U,  // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1>
-  2982399286U,  // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS
-  2982399287U,  // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS
-  2583806054U,  // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS
-  3711979910U,  // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4>
-  2662126074U,  // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3>
-  2583808514U,  // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6>
-  2583809334U,  // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS
-  2583810062U,  // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6>
-  2638238520U,  // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6>
-  2973781302U,  // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
-  2973781303U,  // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS
-   430358630U,  // <5,6,7,0>: Cost 1 vext1 RHS, LHS
-  1504101110U,  // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
-  1504101992U,  // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
-  1504102550U,  // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
-   430361910U,  // <5,6,7,4>: Cost 1 vext1 RHS, RHS
-  1504104390U,  // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6>
-  1504105272U,  // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6>
-  1504106092U,  // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7>
-   430364462U,  // <5,6,7,u>: Cost 1 vext1 RHS, LHS
-   430366822U,  // <5,6,u,0>: Cost 1 vext1 RHS, LHS
-  1564497710U,  // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS
-  1504110184U,  // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
-  1504110742U,  // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
-   430370103U,  // <5,6,u,4>: Cost 1 vext1 RHS, RHS
-  1564498074U,  // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS
-  1504113146U,  // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3>
-  1504113658U,  // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2>
-   430372654U,  // <5,6,u,u>: Cost 1 vext1 RHS, LHS
-  2625634304U,  // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0>
-  1551892582U,  // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS
-  2625634468U,  // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2>
-  2571889247U,  // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0>
-  2625634642U,  // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5>
-  2595778728U,  // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7>
-  3699376639U,  // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7>
-  2260235715U,  // <5,7,0,7>: Cost 3 vrev <7,5,7,0>
-  1551893149U,  // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS
-  2625635062U,  // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2>
-  2624308020U,  // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1>
-  2625635222U,  // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0>
-  1551893504U,  // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7>
-  2571898166U,  // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS
-  2625635472U,  // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7>
-  2627626227U,  // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7>
-  3702031684U,  // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7>
-  1555211669U,  // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7>
-  2629617126U,  // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7>
-  3699377670U,  // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3>
-  2625635944U,  // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2>
-  2625636006U,  // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1>
-  2632271658U,  // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7>
-  2625636201U,  // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7>
-  2625636282U,  // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7>
-  3708004381U,  // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7>
-  2625636411U,  // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1>
-  2625636502U,  // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2>
-  2625636604U,  // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5>
-  3699378478U,  // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1>
-  2625636764U,  // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3>
-  2625636866U,  // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6>
-  2625636959U,  // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0>
-  3699378808U,  // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7>
-  2640235254U,  // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7>
-  2625637150U,  // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2>
-  2571919462U,  // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS
-  2571920384U,  // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7>
-  3699379260U,  // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0>
-  2571922019U,  // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4>
-  2571922742U,  // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS
-  1551895862U,  // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS
-  2846277980U,  // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
-  2646207951U,  // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7>
-  1551896105U,  // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS
-  2583871590U,  // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS
-  2652180176U,  // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3>
-  2625638177U,  // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3>
-  2625638262U,  // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7>
-  2583874870U,  // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS
-  2846281732U,  // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5>
-  2651517015U,  // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7>
-  1772539190U,  // <5,7,5,7>: Cost 2 vuzpr RHS, RHS
-  1772539191U,  // <5,7,5,u>: Cost 2 vuzpr RHS, RHS
-  2846281826U,  // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0>
-  3699380615U,  // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5>
-  2846281108U,  // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2>
-  2589854210U,  // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6>
-  2846281830U,  // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4>
-  2725467658U,  // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u>
-  2846281076U,  // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
-  2846279610U,  // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7>
-  2846279611U,  // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u>
-  1510146150U,  // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS
-  2846282574U,  // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1>
-  2583889512U,  // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2>
-  2846281919U,  // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3>
-  1510149430U,  // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS
-  1510150168U,  // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7>
-  2583892474U,  // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3>
-  2625640044U,  // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7>
-  1510151982U,  // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS
-  1510154342U,  // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS
-  1551898414U,  // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS
-  2625640325U,  // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0>
-  1772536477U,  // <5,7,u,3>: Cost 2 vuzpr RHS, LHS
-  1510157622U,  // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS
-  1551898778U,  // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS
-  2625640656U,  // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7>
-  1772539433U,  // <5,7,u,7>: Cost 2 vuzpr RHS, RHS
-  1551898981U,  // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS
-  2625642496U,  // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0>
-  1551900774U,  // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS
-  2625642660U,  // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2>
-  2698630885U,  // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2>
-  2687129325U,  // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1>
-  2689783542U,  // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1>
-  2266134675U,  // <5,u,0,6>: Cost 3 vrev <u,5,6,0>
-  2595853772U,  // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0>
-  1551901341U,  // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS
-  2625643254U,  // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2>
-  2625643316U,  // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1>
-  1613387566U,  // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
-  1551901697U,  // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u>
-  2626307154U,  // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u>
-  2689783622U,  // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0>
-  2627634420U,  // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u>
-  2982366536U,  // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
-  1613387620U,  // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
-  2846286742U,  // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0>
-  2685796528U,  // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
-  2625644136U,  // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2>
-  2687129480U,  // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3>
-  2632279851U,  // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u>
-  2625644394U,  // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u>
-  2625644474U,  // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7>
-  2713966508U,  // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3>
-  2625644603U,  // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1>
-  2687129532U,  // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1>
-  2636261649U,  // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u>
-  2636925282U,  // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u>
-  2625644956U,  // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3>
-  1564510724U,  // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u>
-  2625645160U,  // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0>
-  2734610422U,  // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5>
-  2640243447U,  // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u>
-  1567165256U,  // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u>
-  1567828889U,  // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u>
-  1661163546U,  // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5>
-  2734463012U,  // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6>
-  2698631212U,  // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5>
-  1570458842U,  // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
-  1551904054U,  // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS
-  2846286172U,  // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
-  2646216144U,  // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u>
-  1551904297U,  // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS
-  1509982310U,  // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
-  2560058555U,  // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5>
-  2698926194U,  // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3>
-  2698631295U,  // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7>
-  1509985590U,  // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
-   229035318U,  // <5,u,5,5>: Cost 1 vdup1 RHS
-  1613387930U,  // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
-  1772547382U,  // <5,u,5,7>: Cost 2 vuzpr RHS, RHS
-   229035318U,  // <5,u,5,u>: Cost 1 vdup1 RHS
-  2566037606U,  // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS
-  2920044334U,  // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
-  2566039445U,  // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6>
-  2687129808U,  // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7>
-  2566040886U,  // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS
-  2920044698U,  // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS
-  2846289268U,  // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
-  2973781320U,  // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
-  2687129853U,  // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7>
-   430506086U,  // <5,u,7,0>: Cost 1 vext1 RHS, LHS
-  1486333117U,  // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7>
-  1504249448U,  // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
-  2040971933U,  // <5,u,7,3>: Cost 2 vtrnr RHS, LHS
-   430509384U,  // <5,u,7,4>: Cost 1 vext1 RHS, RHS
-  1504251600U,  // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
-   118708378U,  // <5,u,7,6>: Cost 1 vrev RHS
-  2040974889U,  // <5,u,7,7>: Cost 2 vtrnr RHS, RHS
-   430511918U,  // <5,u,7,u>: Cost 1 vext1 RHS, LHS
-   430514278U,  // <5,u,u,0>: Cost 1 vext1 RHS, LHS
-  1551906606U,  // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS
-  1613388133U,  // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
-  1772544669U,  // <5,u,u,3>: Cost 2 vuzpr RHS, LHS
-   430517577U,  // <5,u,u,4>: Cost 1 vext1 RHS, RHS
-   229035318U,  // <5,u,u,5>: Cost 1 vdup1 RHS
-   118716571U,  // <5,u,u,6>: Cost 1 vrev RHS
-  1772547625U,  // <5,u,u,7>: Cost 2 vuzpr RHS, RHS
-   430520110U,  // <5,u,u,u>: Cost 1 vext1 RHS, LHS
-  2686025728U,  // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0>
-  2686025738U,  // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1>
-  2686025748U,  // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2>
-  3779084320U,  // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5>
-  2642903388U,  // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6>
-  3657723939U,  // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0>
-  3926676514U,  // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6>
-  3926675786U,  // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7>
-  2686025802U,  // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2>
-  2566070374U,  // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS
-  3759767642U,  // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0>
-  1612284006U,  // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
-  2583988738U,  // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6>
-  2566073654U,  // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS
-  2583990308U,  // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1>
-  2589963005U,  // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1>
-  2595935702U,  // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1>
-  1612284060U,  // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
-  2686025892U,  // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2>
-  2685804721U,  // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6>
-  3759620282U,  // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6>
-  2705342658U,  // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5>
-  1612284108U,  // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6>
-  3706029956U,  // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7>
-  2686173406U,  // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6>
-  3651769338U,  // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2>
-  1612579056U,  // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6>
-  3706030230U,  // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2>
-  2705342720U,  // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4>
-  2705342730U,  // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5>
-  3706030492U,  // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3>
-  2644896258U,  // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6>
-  3718638154U,  // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6>
-  3729918619U,  // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6>
-  3926672384U,  // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7>
-  2705342784U,  // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5>
-  2687058250U,  // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6>
-  2686026066U,  // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5>
-  1613463900U,  // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6>
-  3761021285U,  // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6>
-  2687353198U,  // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6>
-  2632289590U,  // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS
-  2645560704U,  // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0>
-  2646224337U,  // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0>
-  1613906322U,  // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6>
-  3651788902U,  // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS
-  2687795620U,  // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6>
-  3761611181U,  // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6>
-  3723284326U,  // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0>
-  2646224838U,  // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6>
-  3718639630U,  // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6>
-  2652196962U,  // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0>
-  2852932918U,  // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS
-  2852932919U,  // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS
-  2852933730U,  // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0>
-  2925985894U,  // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS
-  3060203622U,  // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS
-  3718640178U,  // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5>
-  2656178832U,  // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0>
-  3725939378U,  // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7>
-  2657506098U,  // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0>
-  2619020110U,  // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1>
-  2925986461U,  // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS
-  2572091494U,  // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS
-  2572092310U,  // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0>
-  2980495524U,  // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2>
-  2572094072U,  // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7>
-  2572094774U,  // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS
-  4054238242U,  // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5>
-  3645837653U,  // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0>
-  4054239054U,  // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7>
-  2572097326U,  // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS
-  2686026378U,  // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2>
-  2686026386U,  // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1>
-  1612284573U,  // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
-  2705343144U,  // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5>
-  1616265906U,  // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6>
-  2632292506U,  // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS
-  2590020356U,  // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u>
-  2852933161U,  // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS
-  1612284627U,  // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS
-  2595995750U,  // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS
-  2646229094U,  // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS
-  3694092492U,  // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6>
-  2686026486U,  // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2>
-  2595999030U,  // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS
-  3767730952U,  // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2>
-  2596000590U,  // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1>
-  2596001246U,  // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0>
-  2686026531U,  // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2>
-  3763602219U,  // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1>
-  2686026548U,  // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1>
-  3764929346U,  // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6>
-  2686026568U,  // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3>
-  2691334996U,  // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6>
-  3760874332U,  // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5>
-  3765224294U,  // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6>
-  3669751263U,  // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1>
-  2686026613U,  // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3>
-  2554208358U,  // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS
-  3763602311U,  // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3>
-  3639895971U,  // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2>
-  2686026646U,  // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0>
-  2554211638U,  // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS
-  3760874411U,  // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3>
-  2554212858U,  // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3>
-  3802973114U,  // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0>
-  2686026691U,  // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0>
-  2566160486U,  // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS
-  2686026712U,  // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3>
-  2686026724U,  // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6>
-  3759768552U,  // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1>
-  2692662262U,  // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6>
-  2686026752U,  // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7>
-  2590053128U,  // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3>
-  3663795194U,  // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2>
-  2686026775U,  // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3>
-  2641587099U,  // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1>
-  2693104684U,  // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6>
-  3639912357U,  // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4>
-  2687206462U,  // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6>
-  3633941814U,  // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS
-  2693399632U,  // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6>
-  3765077075U,  // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0>
-  2646232530U,  // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1>
-  2687206507U,  // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6>
-  2647559796U,  // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1>
-  3765077118U,  // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7>
-  3767583878U,  // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6>
-  2686026896U,  // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7>
-  2693989528U,  // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6>
-  3767805089U,  // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6>
-  2652868706U,  // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0>
-  3908250934U,  // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS
-  2686026941U,  // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7>
-  2554241126U,  // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS
-  3763602639U,  // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7>
-  3759547607U,  // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6>
-  3115221094U,  // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS
-  2554244406U,  // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS
-  3760874739U,  // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7>
-  2554245944U,  // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6>
-  3719975758U,  // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1>
-  3115221099U,  // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS
-  2560221286U,  // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS
-  2560222415U,  // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7>
-  2980497558U,  // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2>
-  3103211622U,  // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS
-  2560224566U,  // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS
-  2980495698U,  // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5>
-  3633967526U,  // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0>
-  4054237686U,  // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7>
-  2560227118U,  // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS
-  2560229478U,  // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS
-  2686027117U,  // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3>
-  2686027129U,  // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6>
-  2686027132U,  // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0>
-  2687206795U,  // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6>
-  2686027157U,  // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7>
-  2590094093U,  // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u>
-  2596066790U,  // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u>
-  2686027177U,  // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0>
-  2646900736U,  // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0>
-  1573159014U,  // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS
-  2646900900U,  // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2>
-  3759769037U,  // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0>
-  2641592668U,  // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6>
-  3779085794U,  // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3>
-  2686027244U,  // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4>
-  3669816807U,  // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0>
-  1573159581U,  // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS
-  2230527897U,  // <6,2,1,0>: Cost 3 vrev <2,6,0,1>
-  2646901556U,  // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1>
-  2646901654U,  // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0>
-  2847047782U,  // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS
-  3771049517U,  // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6>
-  2646901904U,  // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7>
-  2686027324U,  // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3>
-  3669825000U,  // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1>
-  2231117793U,  // <6,2,1,u>: Cost 3 vrev <2,6,u,1>
-  3763603029U,  // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1>
-  3759769184U,  // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3>
-  2686027368U,  // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2>
-  2686027378U,  // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3>
-  2697971326U,  // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6>
-  3759769224U,  // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7>
-  2698118800U,  // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6>
-  3920794092U,  // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7>
-  2686027423U,  // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3>
-  2686027430U,  // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1>
-  3759769262U,  // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0>
-  2698487485U,  // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6>
-  2705344196U,  // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4>
-  2686027470U,  // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5>
-  2698708696U,  // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6>
-  2724660961U,  // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6>
-  2729232104U,  // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4>
-  2686027502U,  // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1>
-  1567853468U,  // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
-  3759769351U,  // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u>
-  2699151118U,  // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6>
-  2686027543U,  // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6>
-  2699298592U,  // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6>
-  1573162294U,  // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS
-  2686027564U,  // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0>
-  3719982547U,  // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2>
-  1573162532U,  // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2>
-  3779086154U,  // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3>
-  2646904528U,  // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3>
-  3759769440U,  // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7>
-  2699888488U,  // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6>
-  2230855617U,  // <6,2,5,4>: Cost 3 vrev <2,6,4,5>
-  2646904836U,  // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5>
-  2646904930U,  // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0>
-  2847051062U,  // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS
-  2700257173U,  // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6>
-  2687207321U,  // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1>
-  2686027684U,  // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3>
-  2566260656U,  // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6>
-  2685806522U,  // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7>
-  2687207361U,  // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5>
-  2686027724U,  // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7>
-  2646905656U,  // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6>
-  2646905678U,  // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1>
-  2686027751U,  // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7>
-  2554323046U,  // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS
-  2572239606U,  // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2>
-  2566268849U,  // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7>
-  1906753638U,  // <6,2,7,3>: Cost 2 vzipr RHS, LHS
-  2554326326U,  // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS
-  3304687564U,  // <6,2,7,5>: Cost 4 vrev <2,6,5,7>
-  2980495708U,  // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
-  2646906476U,  // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7>
-  1906753643U,  // <6,2,7,u>: Cost 2 vzipr RHS, LHS
-  1591744256U,  // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2>
-  1573164846U,  // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS
-  2701805650U,  // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6>
-  1906761830U,  // <6,2,u,3>: Cost 2 vzipr RHS, LHS
-  2686027875U,  // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5>
-  1573165210U,  // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS
-  2686322800U,  // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0>
-  2847051305U,  // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS
-  1906761835U,  // <6,2,u,u>: Cost 2 vzipr RHS, LHS
-  3759769739U,  // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0>
-  2686027926U,  // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2>
-  2686027937U,  // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4>
-  3640027286U,  // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2>
-  2687207601U,  // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2>
-  2705344698U,  // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2>
-  3663917847U,  // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0>
-  2237008560U,  // <6,3,0,7>: Cost 3 vrev <3,6,7,0>
-  2686027989U,  // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2>
-  3759769823U,  // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3>
-  3759769830U,  // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1>
-  3759769841U,  // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3>
-  3759769848U,  // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1>
-  2703280390U,  // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
-  3759769868U,  // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3>
-  3704063194U,  // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0>
-  3767732510U,  // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3>
-  2703280390U,  // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
-  3704063468U,  // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4>
-  2630321724U,  // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
-  3759769921U,  // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2>
-  3759769928U,  // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0>
-  3704063767U,  // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6>
-  3704063876U,  // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7>
-  2636957626U,  // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7>
-  3777907058U,  // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6>
-  2630321724U,  // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
-  3759769983U,  // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1>
-  3710036245U,  // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3>
-  2636958054U,  // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
-  2686028188U,  // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3>
-  2704607656U,  // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
-  3773041072U,  // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5>
-  3711363731U,  // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7>
-  3767732676U,  // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7>
-  2707999179U,  // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5>
-  2584232038U,  // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
-  2642267118U,  // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3>
-  2642930751U,  // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
-  2705197552U,  // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
-  2584235318U,  // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
-  1631603202U,  // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
-  2654211444U,  // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6>
-  2237041332U,  // <6,3,4,7>: Cost 3 vrev <3,6,7,4>
-  1631824413U,  // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
-  3640066150U,  // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS
-  3772746288U,  // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7>
-  3640067790U,  // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5>
-  3773041216U,  // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5>
-  2705934922U,  // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
-  3773041236U,  // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7>
-  3779086940U,  // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6>
-  3767732831U,  // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0>
-  2706229870U,  // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6>
-  2602164326U,  // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS
-  2654212512U,  // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3>
-  2566334393U,  // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6>
-  3704066588U,  // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1>
-  2602167524U,  // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6>
-  3710702321U,  // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7>
-  2724661933U,  // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6>
-  3710702465U,  // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7>
-  2602170158U,  // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS
-  1492598886U,  // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
-  2560369889U,  // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
-  1492600762U,  // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
-  2566342806U,  // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2>
-  1492602166U,  // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
-  2602176208U,  // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3>
-  2566345210U,  // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
-  2980496528U,  // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7>
-  1492604718U,  // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
-  1492607078U,  // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
-  2686028574U,  // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2>
-  1492608955U,  // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
-  2566350998U,  // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2>
-  1492610358U,  // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
-  1634257734U,  // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
-  2566353489U,  // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0>
-  2980504720U,  // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7>
-  1492612910U,  // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
-  3703406592U,  // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0>
-  2629664870U,  // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS
-  2629664972U,  // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6>
-  3779087232U,  // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1>
-  2642936156U,  // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
-  2712570770U,  // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
-  2687208348U,  // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2>
-  3316723081U,  // <6,4,0,7>: Cost 4 vrev <4,6,7,0>
-  2629665437U,  // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS
-  2242473291U,  // <6,4,1,0>: Cost 3 vrev <4,6,0,1>
-  3700089652U,  // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1>
-  3703407510U,  // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0>
-  2852962406U,  // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS
-  3628166454U,  // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS
-  3760876514U,  // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0>
-  2687208430U,  // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3>
-  3316731274U,  // <6,4,1,7>: Cost 4 vrev <4,6,7,1>
-  2243063187U,  // <6,4,1,u>: Cost 3 vrev <4,6,u,1>
-  2629666284U,  // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4>
-  3703408188U,  // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3>
-  3703408232U,  // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2>
-  3703408294U,  // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1>
-  2632320816U,  // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
-  2923384118U,  // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS
-  2687208508U,  // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0>
-  3760950341U,  // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0>
-  2634975348U,  // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4>
-  3703408790U,  // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2>
-  3316305238U,  // <6,4,3,1>: Cost 4 vrev <4,6,1,3>
-  3703408947U,  // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6>
-  3703409052U,  // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3>
-  2644929026U,  // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
-  3718670922U,  // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6>
-  2705345682U,  // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5>
-  3926705152U,  // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7>
-  2668817222U,  // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6>
-  2590277734U,  // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS
-  3716017135U,  // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4>
-  2642938944U,  // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
-  3717344401U,  // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4>
-  2712571088U,  // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4>
-  2629668150U,  // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS
-  1637649636U,  // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
-  2646257109U,  // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4>
-  1637649636U,  // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
-  2566398054U,  // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
-  3760876805U,  // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3>
-  2566399937U,  // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
-  2584316418U,  // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6>
-  2566401334U,  // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
-  2584318028U,  // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5>
-  1612287286U,  // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
-  2852965686U,  // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS
-  1612287304U,  // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
-  1504608358U,  // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
-  2578350838U,  // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
-  2578351720U,  // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
-  2578352278U,  // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
-  1504611638U,  // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS
-  2578353872U,  // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3>
-  2578354682U,  // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3>
-  2578355194U,  // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2>
-  1504614190U,  // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
-  2572386406U,  // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
-  2572387226U,  // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
-  3640157902U,  // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5>
-  2572389020U,  // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7>
-  2572389686U,  // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS
-  2980497102U,  // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5>
-  2980495564U,  // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6>
-  4054239090U,  // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7>
-  2572392238U,  // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS
-  1504608358U,  // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS
-  2629670702U,  // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS
-  2566424516U,  // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u>
-  2584340994U,  // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6>
-  1640156694U,  // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6>
-  2629671066U,  // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS
-  1612287529U,  // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
-  2852965929U,  // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS
-  1612287547U,  // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
-  3708723200U,  // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0>
-  2634981478U,  // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS
-  3694125260U,  // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6>
-  3779087962U,  // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2>
-  3760877154U,  // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1>
-  4195110916U,  // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5>
-  3696779775U,  // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7>
-  1175212130U,  // <6,5,0,7>: Cost 2 vrev <5,6,7,0>
-  1175285867U,  // <6,5,0,u>: Cost 2 vrev <5,6,u,0>
-  2248445988U,  // <6,5,1,0>: Cost 3 vrev <5,6,0,1>
-  3698107237U,  // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5>
-  3708724118U,  // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0>
-  3908575334U,  // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS
-  3716023376U,  // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6>
-  3708724368U,  // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7>
-  3767733960U,  // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4>
-  2712571600U,  // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3>
-  2712571609U,  // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3>
-  2578391142U,  // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
-  3704079934U,  // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5>
-  3708724840U,  // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2>
-  3705407182U,  // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5>
-  2578394422U,  // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS
-  3717351272U,  // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6>
-  2634983354U,  // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
-  3115486518U,  // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS
-  2634983541U,  // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5>
-  3708725398U,  // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2>
-  3710052631U,  // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5>
-  3708725606U,  // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3>
-  3708725660U,  // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3>
-  2643610114U,  // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
-  3717352010U,  // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6>
-  3773632358U,  // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0>
-  2248978533U,  // <6,5,3,7>: Cost 3 vrev <5,6,7,3>
-  2249052270U,  // <6,5,3,u>: Cost 3 vrev <5,6,u,3>
-  2596323430U,  // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS
-  3716025328U,  // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5>
-  3716688961U,  // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5>
-  2643610770U,  // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
-  2596326710U,  // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS
-  2634984758U,  // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS
-  3767734199U,  // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0>
-  1643696070U,  // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
-  1643769807U,  // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
-  2578415718U,  // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
-  3652158198U,  // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2>
-  3652159080U,  // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2>
-  3652159638U,  // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2>
-  2578418998U,  // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS
-  2712571908U,  // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5>
-  2718027790U,  // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6>
-  2712571928U,  // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7>
-  2712571937U,  // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7>
-  2705346596U,  // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1>
-  3767144496U,  // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4>
-  3773116473U,  // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4>
-  2705346626U,  // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4>
-  2705346636U,  // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5>
-  3908577217U,  // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5>
-  2578428728U,  // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6>
-  2712572002U,  // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0>
-  2705346668U,  // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1>
-  2560516198U,  // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
-  2560517363U,  // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7>
-  2566490060U,  // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
-  3634260118U,  // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2>
-  2560519478U,  // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
-  2980498650U,  // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5>
-  2980497922U,  // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6>
-  3103214902U,  // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS
-  2560522030U,  // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS
-  2560524390U,  // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS
-  2560525556U,  // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u>
-  2566498253U,  // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u>
-  2646931439U,  // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7>
-  2560527670U,  // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS
-  2634987674U,  // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS
-  2980506114U,  // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6>
-  1175277674U,  // <6,5,u,7>: Cost 2 vrev <5,6,7,u>
-  1175351411U,  // <6,5,u,u>: Cost 2 vrev <5,6,u,u>
-  2578448486U,  // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS
-  1573191782U,  // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
-  2686030124U,  // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4>
-  3779088690U,  // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1>
-  2687209788U,  // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
-  3652194000U,  // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3>
-  2254852914U,  // <6,6,0,6>: Cost 3 vrev <6,6,6,0>
-  4041575734U,  // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS
-  1573192349U,  // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
-  2646934262U,  // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
-  2646934324U,  // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1>
-  2646934422U,  // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
-  2846785638U,  // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS
-  3760951694U,  // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3>
-  2646934672U,  // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
-  2712572320U,  // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3>
-  3775549865U,  // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3>
-  2846785643U,  // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS
-  3759772094U,  // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6>
-  3704751676U,  // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3>
-  2631009936U,  // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6>
-  2646935206U,  // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1>
-  3759772127U,  // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3>
-  3704752004U,  // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7>
-  2646935482U,  // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7>
-  2712572410U,  // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3>
-  2712572419U,  // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3>
-  2646935702U,  // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
-  3777024534U,  // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4>
-  3704752453U,  // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6>
-  2646935964U,  // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3>
-  2705347122U,  // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
-  3779678778U,  // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4>
-  2657553069U,  // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6>
-  4039609654U,  // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS
-  2708001366U,  // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5>
-  2578481254U,  // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS
-  3652223734U,  // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2>
-  3760951922U,  // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6>
-  3779089019U,  // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6>
-  1570540772U,  // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
-  1573195062U,  // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
-  2712572560U,  // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0>
-  2723410591U,  // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6>
-  1573195304U,  // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
-  3640287334U,  // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS
-  2646937296U,  // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
-  3640289235U,  // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5>
-  3720679279U,  // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0>
-  2646937542U,  // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
-  2646937604U,  // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5>
-  2646937698U,  // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0>
-  2846788918U,  // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS
-  2846788919U,  // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS
-  1516699750U,  // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
-  2590442230U,  // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2>
-  2646938106U,  // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3>
-  2590443670U,  // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2>
-  1516703030U,  // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
-  2590445264U,  // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3>
-   296144182U,  // <6,6,6,6>: Cost 1 vdup2 RHS
-  2712572738U,  // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7>
-   296144182U,  // <6,6,6,u>: Cost 1 vdup2 RHS
-  2566561894U,  // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS
-  3634332924U,  // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7>
-  2566563797U,  // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7>
-  2584480258U,  // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6>
-  2566565174U,  // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS
-  2717438846U,  // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4>
-  2980500280U,  // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6>
-  1906756918U,  // <6,6,7,7>: Cost 2 vzipr RHS, RHS
-  1906756919U,  // <6,6,7,u>: Cost 2 vzipr RHS, RHS
-  1516699750U,  // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS
-  1573197614U,  // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
-  2566571990U,  // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u>
-  2846786205U,  // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS
-  1516703030U,  // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS
-  1573197978U,  // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
-   296144182U,  // <6,6,u,6>: Cost 1 vdup2 RHS
-  1906765110U,  // <6,6,u,7>: Cost 2 vzipr RHS, RHS
-   296144182U,  // <6,6,u,u>: Cost 1 vdup2 RHS
-  1571209216U,  // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
-   497467494U,  // <6,7,0,1>: Cost 1 vext2 RHS, LHS
-  1571209380U,  // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
-  2644951292U,  // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0>
-  1571209554U,  // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
-  1510756450U,  // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
-  2644951542U,  // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
-  2584499194U,  // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2>
-   497468061U,  // <6,7,0,u>: Cost 1 vext2 RHS, LHS
-  1571209974U,  // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
-  1571210036U,  // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
-  1571210134U,  // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
-  1571210200U,  // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
-  2644952098U,  // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
-  1571210384U,  // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
-  2644952271U,  // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
-  2578535418U,  // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2>
-  1571210605U,  // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3>
-  2644952509U,  // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2>
-  2644952582U,  // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
-  1571210856U,  // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
-  1571210918U,  // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
-  2644952828U,  // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6>
-  2633009028U,  // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7>
-  1571211194U,  // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
-  2668840938U,  // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1>
-  1571211323U,  // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
-  1571211414U,  // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
-  2644953311U,  // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
-  2644953390U,  // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1>
-  1571211676U,  // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
-  1571211778U,  // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
-  2644953648U,  // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7>
-  2644953720U,  // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7>
-  2644953795U,  // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
-  1571212062U,  // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
-  1573202834U,  // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
-  2644954058U,  // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
-  2644954166U,  // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
-  2644954258U,  // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5>
-  1571212496U,  // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
-   497470774U,  // <6,7,4,5>: Cost 1 vext2 RHS, RHS
-  1573203316U,  // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
-  2646281688U,  // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7>
-   497471017U,  // <6,7,4,u>: Cost 1 vext2 RHS, RHS
-  2644954696U,  // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
-  1573203664U,  // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
-  2644954878U,  // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
-  2644954991U,  // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
-  1571213254U,  // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
-  1571213316U,  // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
-  1571213410U,  // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
-  1573204136U,  // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
-  1573204217U,  // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
-  2644955425U,  // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2>
-  2644955561U,  // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3>
-  1573204474U,  // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
-  2644955698U,  // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
-  2644955789U,  // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6>
-  2644955889U,  // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7>
-  1571214136U,  // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
-  1571214158U,  // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
-  1573204895U,  // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
-  1573204986U,  // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
-  2572608656U,  // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7>
-  2644956362U,  // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
-  2572610231U,  // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7>
-  1573205350U,  // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
-  2646947220U,  // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7>
-  1516786498U,  // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7>
-  1571214956U,  // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
-  1573205634U,  // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2>
-  1571215059U,  // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
-   497473326U,  // <6,7,u,1>: Cost 1 vext2 RHS, LHS
-  1571215237U,  // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
-  1571215292U,  // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
-  1571215423U,  // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
-   497473690U,  // <6,7,u,5>: Cost 1 vext2 RHS, RHS
-  1571215568U,  // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
-  1573206272U,  // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1>
-   497473893U,  // <6,7,u,u>: Cost 1 vext2 RHS, LHS
-  1571217408U,  // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
-   497475686U,  // <6,u,0,1>: Cost 1 vext2 RHS, LHS
-  1571217572U,  // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
-  2689865445U,  // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2>
-  1571217746U,  // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
-  1510830187U,  // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
-  2644959734U,  // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
-  1193130221U,  // <6,u,0,7>: Cost 2 vrev <u,6,7,0>
-   497476253U,  // <6,u,0,u>: Cost 1 vext2 RHS, LHS
-  1571218166U,  // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
-  1571218228U,  // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
-  1612289838U,  // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
-  1571218392U,  // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
-  2566663478U,  // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
-  1571218576U,  // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
-  2644960463U,  // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
-  2717439835U,  // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3>
-  1612289892U,  // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
-  1504870502U,  // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
-  2644960774U,  // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
-  1571219048U,  // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
-  1571219110U,  // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
-  1504873782U,  // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS
-  2633017221U,  // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u>
-  1571219386U,  // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
-  2712573868U,  // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3>
-  1571219515U,  // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
-  1571219606U,  // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
-  2644961503U,  // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
-  2566678499U,  // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3>
-  1571219868U,  // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
-  1571219970U,  // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
-  2689865711U,  // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7>
-  2708002806U,  // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5>
-  2644961987U,  // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
-  1571220254U,  // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
-  1571220370U,  // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
-  2644962250U,  // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
-  1661245476U,  // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6>
-  2686031917U,  // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6>
-  1571220688U,  // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
-   497478967U,  // <6,u,4,5>: Cost 1 vext2 RHS, RHS
-  1571220852U,  // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
-  1661614161U,  // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6>
-   497479209U,  // <6,u,4,u>: Cost 1 vext2 RHS, RHS
-  2566692966U,  // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
-  1571221200U,  // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
-  2566694885U,  // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
-  2689865855U,  // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7>
-  1571221446U,  // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
-  1571221508U,  // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
-  1612290202U,  // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
-  1571221672U,  // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
-  1612290220U,  // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
-  1504903270U,  // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
-  2644963752U,  // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
-  1571222010U,  // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
-  2686032080U,  // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7>
-  1504906550U,  // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS
-  2644964079U,  // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5>
-   296144182U,  // <6,u,6,6>: Cost 1 vdup2 RHS
-  1571222350U,  // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
-   296144182U,  // <6,u,6,u>: Cost 1 vdup2 RHS
-  1492967526U,  // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
-  2560738574U,  // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7>
-  1492969447U,  // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
-  1906753692U,  // <6,u,7,3>: Cost 2 vzipr RHS, LHS
-  1492970806U,  // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
-  2980495761U,  // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5>
-  1516860235U,  // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7>
-  1906756936U,  // <6,u,7,7>: Cost 2 vzipr RHS, RHS
-  1492973358U,  // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS
-  1492975718U,  // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
-   497481518U,  // <6,u,u,1>: Cost 1 vext2 RHS, LHS
-  1612290405U,  // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
-  1571223484U,  // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
-  1492978998U,  // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
-   497481882U,  // <6,u,u,5>: Cost 1 vext2 RHS, RHS
-   296144182U,  // <6,u,u,6>: Cost 1 vdup2 RHS
-  1906765128U,  // <6,u,u,7>: Cost 2 vzipr RHS, RHS
-   497482085U,  // <6,u,u,u>: Cost 1 vext2 RHS, LHS
-  1638318080U,  // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
-  1638318090U,  // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
-  1638318100U,  // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
-  3646442178U,  // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0>
-  2712059941U,  // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
-  2651603364U,  // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6>
-  2590618445U,  // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
-  3785801798U,  // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7>
-  1638318153U,  // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
-  1516879974U,  // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
-  2693922911U,  // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5>
-   564576358U,  // <7,0,1,2>: Cost 1 vext3 RHS, LHS
-  2638996480U,  // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7>
-  1516883254U,  // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
-  2649613456U,  // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7>
-  1516884814U,  // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
-  2590626808U,  // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0>
-   564576412U,  // <7,0,1,u>: Cost 1 vext3 RHS, LHS
-  1638318244U,  // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
-  2692743344U,  // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
-  2712060084U,  // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0>
-  2712060094U,  // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1>
-  1638318284U,  // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
-  2712060118U,  // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
-  2651604922U,  // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7>
-  2686255336U,  // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7>
-  1638318316U,  // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
-  2651605142U,  // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2>
-  2712060156U,  // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
-  2712060165U,  // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0>
-  2651605404U,  // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
-  2651605506U,  // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
-  2638998111U,  // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
-  2639661744U,  // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0>
-  3712740068U,  // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7>
-  2640989010U,  // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0>
-  2712060232U,  // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4>
-  1638318418U,  // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
-  1638318428U,  // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
-  3646474950U,  // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4>
-  2712060270U,  // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6>
-  1577864502U,  // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
-  2651606388U,  // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6>
-  3787792776U,  // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5>
-  1638318481U,  // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
-  2590654566U,  // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS
-  2651606736U,  // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3>
-  2712060334U,  // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
-  2649616239U,  // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
-  2651606982U,  // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
-  2651607044U,  // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
-  1577865314U,  // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
-  2651607208U,  // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7>
-  1579192580U,  // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
-  2688393709U,  // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7>
-  2712060406U,  // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
-  2688541183U,  // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7>
-  2655588936U,  // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
-  3762430481U,  // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7>
-  2651607730U,  // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
-  2651607864U,  // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
-  2651607886U,  // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1>
-  2688983605U,  // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7>
-  2651608058U,  // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2>
-  2932703334U,  // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS
-  3066921062U,  // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS
-  3712742678U,  // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7>
-  2651608422U,  // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
-  2651608513U,  // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
-  2663552532U,  // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
-  2651608684U,  // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7>
-  2651608706U,  // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2>
-  1638318730U,  // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
-  1638318738U,  // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
-   564576925U,  // <7,0,u,2>: Cost 1 vext3 RHS, LHS
-  2572765898U,  // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u>
-  1638318770U,  // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
-  1577867418U,  // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
-  1516942165U,  // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
-  2651609344U,  // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1>
-   564576979U,  // <7,0,u,u>: Cost 1 vext3 RHS, LHS
-  2590687334U,  // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS
-  2639003750U,  // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS
-  2793357414U,  // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS
-  1638318838U,  // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
-  2590690614U,  // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS
-  2712060679U,  // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
-  2590692182U,  // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0>
-  3785802521U,  // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1>
-  1638318883U,  // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
-  2712060715U,  // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1>
-  1638318900U,  // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
-  3774300994U,  // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6>
-  1638318920U,  // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
-  2712060755U,  // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
-  2691416926U,  // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
-  2590700375U,  // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
-  3765158766U,  // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5>
-  1638318965U,  // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
-  2712060796U,  // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1>
-  2712060807U,  // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3>
-  3712747112U,  // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2>
-  1638318998U,  // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0>
-  2712060836U,  // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5>
-  2712060843U,  // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3>
-  2590708568U,  // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2>
-  2735948730U,  // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0>
-  1638319043U,  // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0>
-  2712060876U,  // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0>
-  1638319064U,  // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
-  2712060894U,  // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
-  2692596718U,  // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7>
-  2712060917U,  // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5>
-  1619002368U,  // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
-  2692817929U,  // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
-  2735948814U,  // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3>
-  1619223579U,  // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
-  2712060962U,  // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
-  2712060971U,  // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
-  2712060980U,  // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
-  2712060989U,  // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5>
-  3785802822U,  // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5>
-  2639007030U,  // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS
-  2645642634U,  // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1>
-  3719384520U,  // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0>
-  2639007273U,  // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS
-  2572812390U,  // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
-  2693776510U,  // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
-  3774301318U,  // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6>
-  1620182160U,  // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
-  2572815670U,  // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
-  3766486178U,  // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7>
-  2651615331U,  // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1>
-  2652278964U,  // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1>
-  1620550845U,  // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
-  3768108230U,  // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7>
-  2694440143U,  // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
-  2712061144U,  // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
-  2694587617U,  // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7>
-  3768403178U,  // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7>
-  2694735091U,  // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
-  3768550652U,  // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7>
-  2652279630U,  // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1>
-  2694956302U,  // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7>
-  2645644282U,  // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
-  2859062094U,  // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1>
-  3779462437U,  // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3>
-  3121938534U,  // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS
-  2554916150U,  // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS
-  3769140548U,  // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7>
-  3726022164U,  // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0>
-  2554918508U,  // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7>
-  3121938539U,  // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS
-  2572836966U,  // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS
-  1638319469U,  // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
-  2712061299U,  // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0>
-  1622173059U,  // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7>
-  2572840246U,  // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS
-  1622320533U,  // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
-  2696136094U,  // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7>
-  2859060777U,  // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS
-  1622541744U,  // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7>
-  2712061364U,  // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
-  2712061373U,  // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
-  2712061380U,  // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0>
-  2712061389U,  // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0>
-  2712061404U,  // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6>
-  2696725990U,  // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
-  2712061417U,  // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
-  3785803251U,  // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2>
-  2696947201U,  // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7>
-  2712061446U,  // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
-  3785803276U,  // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>
-  3785803285U,  // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0>
-  2712061471U,  // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1>
-  2712061482U,  // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
-  3766486576U,  // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0>
-  2712061500U,  // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
-  2602718850U,  // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
-  2712061516U,  // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1>
-  2712061525U,  // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1>
-  2712061536U,  // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
-  1638319720U,  // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
-  1638319730U,  // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
-  2712061565U,  // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5>
-  2698053256U,  // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
-  2712061584U,  // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6>
-  3771795096U,  // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5>
-  1638319775U,  // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
-  1638319782U,  // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
-  2693924531U,  // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
-  2700560061U,  // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6>
-  2693924551U,  // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7>
-  1638319822U,  // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
-  2698716889U,  // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
-  2712061665U,  // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6>
-  2735949540U,  // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0>
-  1638319854U,  // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
-  2712061692U,  // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6>
-  2712061698U,  // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
-  2712061708U,  // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
-  2712061718U,  // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5>
-  2712061728U,  // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
-  2699380522U,  // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
-  2712061740U,  // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0>
-  3809691445U,  // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0>
-  2699601733U,  // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7>
-  2699675470U,  // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
-  3766486867U,  // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3>
-  2699822944U,  // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
-  2692745065U,  // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
-  2699970418U,  // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
-  3766486907U,  // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7>
-  2700117892U,  // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7>
-  3771795334U,  // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0>
-  2692745110U,  // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7>
-  2572894310U,  // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
-  2712061860U,  // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
-  2700486577U,  // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
-  1626818490U,  // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
-  2572897590U,  // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
-  2700707788U,  // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
-  2700781525U,  // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
-  3774597086U,  // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7>
-  1627187175U,  // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
-  2735949802U,  // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1>
-  3780200434U,  // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0>
-  3773564928U,  // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5>
-  2986541158U,  // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS
-  2554989878U,  // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS
-  3775113245U,  // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7>
-  4060283228U,  // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6>
-  2554992236U,  // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7>
-  2986541163U,  // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS
-  1638320187U,  // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
-  2693924936U,  // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
-  1638319720U,  // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2>
-  1628145756U,  // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
-  1638320227U,  // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
-  2702035054U,  // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
-  2702108791U,  // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7>
-  2735949945U,  // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0>
-  1628514441U,  // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
-  2712062091U,  // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
-  1638320278U,  // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
-  2712062109U,  // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
-  2590836886U,  // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2>
-  2712062128U,  // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
-  2712062138U,  // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
-  2590839656U,  // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
-  3311414017U,  // <7,3,0,7>: Cost 4 vrev <3,7,7,0>
-  1638320341U,  // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
-  2237164227U,  // <7,3,1,0>: Cost 3 vrev <3,7,0,1>
-  2712062182U,  // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
-  2712062193U,  // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3>
-  2692745468U,  // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5>
-  2712062214U,  // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
-  2693925132U,  // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
-  3768183059U,  // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1>
-  2692745504U,  // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
-  2696063273U,  // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5>
-  2712062254U,  // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
-  2712062262U,  // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
-  2712062273U,  // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
-  2712062280U,  // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0>
-  2712062294U,  // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5>
-  2712062302U,  // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
-  2700560742U,  // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
-  2712062319U,  // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
-  2712062325U,  // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0>
-  2712062335U,  // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1>
-  2636368158U,  // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
-  2637031791U,  // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
-  1638320540U,  // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
-  2712062374U,  // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
-  2704689586U,  // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
-  2590864235U,  // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3>
-  2704837060U,  // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7>
-  1638320540U,  // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
-  2712062416U,  // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
-  2712062426U,  // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
-  2566981640U,  // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4>
-  2712062447U,  // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
-  2712062456U,  // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5>
-  1638320642U,  // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
-  2648313204U,  // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6>
-  3311446789U,  // <7,3,4,7>: Cost 4 vrev <3,7,7,4>
-  1638320669U,  // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
-  2602819686U,  // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS
-  1574571728U,  // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
-  2648977185U,  // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
-  2705869378U,  // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
-  2237491947U,  // <7,3,5,4>: Cost 3 vrev <3,7,4,5>
-  2706016852U,  // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
-  2648313954U,  // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0>
-  2692745823U,  // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0>
-  1579217159U,  // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3>
-  2706311800U,  // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
-  2654286249U,  // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3>
-  1581208058U,  // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
-  2706533011U,  // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
-  2706606748U,  // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
-  3780422309U,  // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7>
-  2712062637U,  // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
-  2706827959U,  // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7>
-  1585189856U,  // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
-  2693925571U,  // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
-  2693925584U,  // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
-  2700561114U,  // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
-  2572978916U,  // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7>
-  2693925611U,  // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
-  2707344118U,  // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7>
-  2654950894U,  // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
-  2648315500U,  // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7>
-  2693925643U,  // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1>
-  2237221578U,  // <7,3,u,0>: Cost 3 vrev <3,7,0,u>
-  1638320926U,  // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
-  1593153452U,  // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
-  1638320540U,  // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3>
-  2237516526U,  // <7,3,u,4>: Cost 3 vrev <3,7,4,u>
-  1638320966U,  // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
-  2712062796U,  // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3>
-  2692967250U,  // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0>
-  1638320989U,  // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
-  2651635712U,  // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0>
-  1577893990U,  // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
-  2651635876U,  // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
-  3785804672U,  // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1>
-  2651636050U,  // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5>
-  1638468498U,  // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
-  1638468508U,  // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
-  3787795364U,  // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
-  1640459181U,  // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1>
-  2651636470U,  // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
-  2651636532U,  // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
-  2712062922U,  // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3>
-  2639029248U,  // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
-  2712062940U,  // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3>
-  2712062946U,  // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0>
-  2712062958U,  // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3>
-  3785804791U,  // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3>
-  2712062973U,  // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0>
-  3785804807U,  // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1>
-  3785804818U,  // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3>
-  2651637352U,  // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
-  2651637414U,  // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1>
-  3716753194U,  // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7>
-  2712063030U,  // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
-  2712063036U,  // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0>
-  3773123658U,  // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5>
-  2712063054U,  // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0>
-  2651637910U,  // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
-  3712772348U,  // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5>
-  3785804906U,  // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1>
-  2651638172U,  // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
-  2651638274U,  // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6>
-  2639030883U,  // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
-  2712063122U,  // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5>
-  3712772836U,  // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7>
-  2641021782U,  // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4>
-  2714053802U,  // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2>
-  3785804978U,  // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1>
-  3716754505U,  // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4>
-  3785804998U,  // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3>
-  1638321360U,  // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
-  1638468826U,  // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
-  1638468836U,  // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
-  3785215214U,  // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7>
-  1640459509U,  // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5>
-  1517207654U,  // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
-  2573034640U,  // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7>
-  2712063246U,  // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3>
-  2573036267U,  // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5>
-  1517210934U,  // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
-  2711989549U,  // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7>
-   564579638U,  // <7,4,5,6>: Cost 1 vext3 RHS, RHS
-  2651639976U,  // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7>
-   564579656U,  // <7,4,5,u>: Cost 1 vext3 RHS, RHS
-  2712063307U,  // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1>
-  3767668056U,  // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5>
-  2651640314U,  // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3>
-  2655621708U,  // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4>
-  1638468980U,  // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
-  2712063358U,  // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
-  2712063367U,  // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7>
-  2712210826U,  // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
-  1638469012U,  // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
-  2651640826U,  // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
-  3773713830U,  // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2>
-  3773713842U,  // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5>
-  3780349372U,  // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6>
-  2651641140U,  // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1>
-  2712210888U,  // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
-  2712210898U,  // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1>
-  2651641452U,  // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7>
-  2713538026U,  // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7>
-  1517232230U,  // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS
-  1577899822U,  // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
-  2712063489U,  // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3>
-  2573060846U,  // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u>
-  1640312342U,  // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
-  1638469146U,  // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
-   564579881U,  // <7,4,u,6>: Cost 1 vext3 RHS, RHS
-  2714054192U,  // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5>
-   564579899U,  // <7,4,u,u>: Cost 1 vext3 RHS, RHS
-  2579038310U,  // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
-  2636382310U,  // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS
-  2796339302U,  // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS
-  3646810719U,  // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0>
-  2712063586U,  // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
-  2735951467U,  // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1>
-  2735951476U,  // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1>
-  2579043322U,  // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2>
-  2636382877U,  // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS
-  2712211087U,  // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
-  3698180916U,  // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1>
-  3710124950U,  // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0>
-  2636383232U,  // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7>
-  2712211127U,  // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
-  2590994128U,  // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3>
-  2590995323U,  // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
-  1638469328U,  // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
-  1638469337U,  // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
-  3785805536U,  // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1>
-  3785805544U,  // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0>
-  3704817288U,  // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7>
-  2712063742U,  // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4>
-  3716761386U,  // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7>
-  2714054415U,  // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
-  3774304024U,  // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3>
-  2712063777U,  // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3>
-  2712063787U,  // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4>
-  3634888806U,  // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS
-  2636384544U,  // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5>
-  3710790001U,  // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5>
-  3710126492U,  // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3>
-  3634892086U,  // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS
-  2639039076U,  // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5>
-  3713444533U,  // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5>
-  2693926767U,  // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0>
-  2712063864U,  // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0>
-  2579071078U,  // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
-  3646841856U,  // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7>
-  3716762698U,  // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5>
-  3646843491U,  // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4>
-  2579074358U,  // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS
-  2636385590U,  // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS
-  2645675406U,  // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5>
-  1638322118U,  // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
-  1638469583U,  // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
-  2714054611U,  // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
-  2652974800U,  // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3>
-  3710127905U,  // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3>
-  3785805808U,  // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3>
-  2712211450U,  // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4>
-  1638322180U,  // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
-  2712064014U,  // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
-  1638469656U,  // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
-  1638469665U,  // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
-  2712064036U,  // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1>
-  2714054707U,  // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7>
-  3785805879U,  // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2>
-  2712064066U,  // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4>
-  2712064076U,  // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5>
-  2714054743U,  // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
-  2712064096U,  // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
-  1638322274U,  // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0>
-  1638469739U,  // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0>
-  1511325798U,  // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
-  2692747392U,  // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3>
-  2585069160U,  // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2>
-  2573126390U,  // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
-  1511329078U,  // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
-  1638469800U,  // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
-  2712211626U,  // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
-  2712211636U,  // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1>
-  1638469823U,  // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
-  1511333990U,  // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
-  2636388142U,  // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS
-  2712211671U,  // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0>
-  2573134583U,  // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u>
-  1511337270U,  // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
-  1638469881U,  // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
-  2712064258U,  // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7>
-  1638469892U,  // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0>
-  1638469904U,  // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3>
-  2650324992U,  // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
-  1576583270U,  // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
-  2712064300U,  // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4>
-  2255295336U,  // <7,6,0,3>: Cost 3 vrev <6,7,3,0>
-  2712064316U,  // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
-  2585088098U,  // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0>
-  2735952204U,  // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0>
-  2712211799U,  // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2>
-  1576583837U,  // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
-  1181340494U,  // <7,6,1,0>: Cost 2 vrev <6,7,0,1>
-  2650325812U,  // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
-  2650325910U,  // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0>
-  2650325976U,  // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3>
-  2579123510U,  // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS
-  2650326160U,  // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
-  2714055072U,  // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
-  2712064425U,  // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3>
-  1181930390U,  // <7,6,1,u>: Cost 2 vrev <6,7,u,1>
-  2712211897U,  // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
-  2714055108U,  // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
-  2650326632U,  // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2>
-  2650326694U,  // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1>
-  2714055137U,  // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
-  2714055148U,  // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
-  2650326970U,  // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7>
-  1638470138U,  // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
-  1638470147U,  // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
-  2650327190U,  // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
-  2255172441U,  // <7,6,3,1>: Cost 3 vrev <6,7,1,3>
-  2255246178U,  // <7,6,3,2>: Cost 3 vrev <6,7,2,3>
-  2650327452U,  // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
-  2712064562U,  // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
-  2650327627U,  // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7>
-  3713452726U,  // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6>
-  2700563016U,  // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0>
-  2712064593U,  // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0>
-  2650327954U,  // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
-  2735952486U,  // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
-  2735952497U,  // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5>
-  2255328108U,  // <7,6,4,3>: Cost 3 vrev <6,7,3,4>
-  2712212100U,  // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6>
-  1576586550U,  // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
-  2714055312U,  // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0>
-  2712212126U,  // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5>
-  1576586793U,  // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
-  2579152998U,  // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
-  2650328784U,  // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
-  2714055364U,  // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
-  3785806538U,  // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4>
-  1576587206U,  // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
-  2650329092U,  // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
-  2650329186U,  // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0>
-  2712064753U,  // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7>
-  1181963162U,  // <7,6,5,u>: Cost 2 vrev <6,7,u,5>
-  2714055421U,  // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
-  2714055432U,  // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
-  2650329594U,  // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3>
-  3785806619U,  // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4>
-  2712212260U,  // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4>
-  2714055472U,  // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
-  1638323000U,  // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
-  1638470466U,  // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
-  1638470475U,  // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
-  1638323022U,  // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
-  2712064854U,  // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
-  2712064865U,  // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2>
-  2712064872U,  // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
-  1638323062U,  // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
-  2712064894U,  // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
-  2712064905U,  // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6>
-  2712064915U,  // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7>
-  1638323094U,  // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
-  1638470559U,  // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
-  1576589102U,  // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
-  2712212402U,  // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2>
-  2712212409U,  // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0>
-  1638470599U,  // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
-  1576589466U,  // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
-  1638323000U,  // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6>
-  1638470624U,  // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
-  1638470631U,  // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
-  2712065007U,  // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0>
-  1638323194U,  // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
-  2712065025U,  // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0>
-  3646958337U,  // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0>
-  2712065044U,  // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
-  2585161907U,  // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
-  2591134604U,  // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0>
-  2591134714U,  // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2>
-  1638323257U,  // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
-  2712065091U,  // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
-  2712065098U,  // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1>
-  2712065109U,  // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3>
-  2692748384U,  // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5>
-  2585169206U,  // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
-  2693928048U,  // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
-  2585170766U,  // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
-  2735953024U,  // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1>
-  2695918731U,  // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3>
-  3770471574U,  // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5>
-  3785807002U,  // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0>
-  2712065189U,  // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2>
-  2712065196U,  // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0>
-  3773125818U,  // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5>
-  3766490305U,  // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3>
-  2700563658U,  // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
-  2735953107U,  // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3>
-  2701890780U,  // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3>
-  2712065251U,  // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
-  3766490350U,  // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3>
-  3774305530U,  // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6>
-  2637728196U,  // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7>
-  2712065291U,  // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
-  2585186486U,  // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3>
-  2639719095U,  // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
-  2640382728U,  // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7>
-  2641046361U,  // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7>
-  2712212792U,  // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
-  3646989312U,  // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7>
-  3785807176U,  // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3>
-  3646991109U,  // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4>
-  2712065371U,  // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4>
-  1638323558U,  // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
-  2712212845U,  // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4>
-  2591167846U,  // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6>
-  1638323585U,  // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
-  2585198694U,  // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS
-  2712212884U,  // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
-  3711471393U,  // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3>
-  2649673590U,  // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
-  2712065455U,  // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
-  1577259032U,  // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
-  2712065473U,  // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7>
-  2712212936U,  // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5>
-  1579249931U,  // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7>
-  2591178854U,  // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS
-  2735953374U,  // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
-  2712212974U,  // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
-  2655646287U,  // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7>
-  2591182134U,  // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS
-  2656973553U,  // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7>
-  1583895362U,  // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7>
-  2712065556U,  // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0>
-  1585222628U,  // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
-  1523417190U,  // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
-  2597159670U,  // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2>
-  2597160552U,  // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2>
-  2597161110U,  // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2>
-  1523420470U,  // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
-  2651002296U,  // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7>
-  2657637906U,  // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7>
-   363253046U,  // <7,7,7,7>: Cost 1 vdup3 RHS
-   363253046U,  // <7,7,7,u>: Cost 1 vdup3 RHS
-  1523417190U,  // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS
-  1638471298U,  // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
-  2712213132U,  // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3>
-  2712213138U,  // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0>
-  1523420470U,  // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS
-  1638471338U,  // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
-  1595840756U,  // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7>
-   363253046U,  // <7,7,u,7>: Cost 1 vdup3 RHS
-   363253046U,  // <7,7,u,u>: Cost 1 vdup3 RHS
-  1638318080U,  // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
-  1638323923U,  // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
-  1662211804U,  // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2>
-  1638323941U,  // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
-  2712065773U,  // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
-  1662359286U,  // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1>
-  1662359296U,  // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
-  2987150664U,  // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS
-  1638323986U,  // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
-  1517469798U,  // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
-  1638318900U,  // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
-   564582190U,  // <7,u,1,2>: Cost 1 vext3 RHS, LHS
-  1638324023U,  // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
-  1517473078U,  // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS
-  2693928777U,  // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3>
-  1517474710U,  // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1>
-  1640462171U,  // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
-   564582244U,  // <7,u,1,u>: Cost 1 vext3 RHS, LHS
-  1638318244U,  // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
-  2712065907U,  // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0>
-  1638319720U,  // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
-  1638324101U,  // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0>
-  1638318284U,  // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
-  2712065947U,  // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4>
-  2700564387U,  // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3>
-  1640314796U,  // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
-  1638324146U,  // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0>
-  1638324156U,  // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1>
-  1638319064U,  // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
-  2700564435U,  // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6>
-  1638320540U,  // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
-  1638324196U,  // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5>
-  1638324207U,  // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7>
-  2700564472U,  // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7>
-  2695919610U,  // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0>
-  1638324228U,  // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1>
-  2712066061U,  // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1>
-  1662212122U,  // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5>
-  1662212132U,  // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6>
-  2712066092U,  // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5>
-  1638321360U,  // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
-  1638324287U,  // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6>
-  1662359624U,  // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6>
-  1640314961U,  // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
-  1638324314U,  // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6>
-  1517502566U,  // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS
-  1574612693U,  // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u>
-  2712066162U,  // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3>
-  1638324351U,  // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7>
-  1576603592U,  // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u>
-  1577267225U,  // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u>
-   564582554U,  // <7,u,5,6>: Cost 1 vext3 RHS, RHS
-  1640462499U,  // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7>
-   564582572U,  // <7,u,5,u>: Cost 1 vext3 RHS, RHS
-  2712066223U,  // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1>
-  2712066238U,  // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7>
-  1581249023U,  // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u>
-  1638324432U,  // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7>
-  1638468980U,  // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
-  2712066274U,  // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7>
-  1583903555U,  // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u>
-  1640315117U,  // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0>
-  1638324477U,  // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7>
-  1638471936U,  // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1>
-  2692970763U,  // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3>
-  2700933399U,  // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6>
-  2573347601U,  // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7>
-  1638471976U,  // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5>
-  1511551171U,  // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7>
-  2712213815U,  // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2>
-   363253046U,  // <7,u,7,7>: Cost 1 vdup3 RHS
-   363253046U,  // <7,u,7,u>: Cost 1 vdup3 RHS
-  1638324561U,  // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1>
-  1638324571U,  // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2>
-   564582757U,  // <7,u,u,2>: Cost 1 vext3 RHS, LHS
-  1638324587U,  // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0>
-  1638324601U,  // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5>
-  1638324611U,  // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6>
-   564582797U,  // <7,u,u,6>: Cost 1 vext3 RHS, RHS
-   363253046U,  // <7,u,u,7>: Cost 1 vdup3 RHS
-   564582811U,  // <7,u,u,u>: Cost 1 vext3 RHS, LHS
-   135053414U,  // <u,0,0,0>: Cost 1 vdup0 LHS
-  1611489290U,  // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
-  1611489300U,  // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
-  2568054923U,  // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
-  1481706806U,  // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS
-  2555449040U,  // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3>
-  2591282078U,  // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0>
-  2591945711U,  // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
-   135053414U,  // <u,0,0,u>: Cost 1 vdup0 LHS
-  1493655654U,  // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS
-  1860550758U,  // <u,0,1,1>: Cost 2 vzipl LHS, LHS
-   537747563U,  // <u,0,1,2>: Cost 1 vext3 LHS, LHS
-  2625135576U,  // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3>
-  1493658934U,  // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS
-  2625135760U,  // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7>
-  1517548447U,  // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1>
-  2591290362U,  // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2>
-   537747612U,  // <u,0,1,u>: Cost 1 vext3 LHS, LHS
-  1611489444U,  // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
-  2685231276U,  // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
-  1994768486U,  // <u,0,2,2>: Cost 2 vtrnl LHS, LHS
-  2685231294U,  // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
-  1611489484U,  // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
-  2712068310U,  // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
-  2625136570U,  // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7>
-  2591962097U,  // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
-  1611489516U,  // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
-  2954067968U,  // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
-  2685231356U,  // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
-    72589981U,  // <u,0,3,2>: Cost 1 vrev LHS
-  2625137052U,  // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3>
-  2625137154U,  // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6>
-  2639071848U,  // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0>
-  2639735481U,  // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0>
-  2597279354U,  // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3>
-    73032403U,  // <u,0,3,u>: Cost 1 vrev LHS
-  2687074636U,  // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u>
-  1611489618U,  // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
-  1611489628U,  // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
-  3629222038U,  // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2>
-  2555481398U,  // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS
-  1551396150U,  // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS
-  2651680116U,  // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6>
-  2646150600U,  // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
-  1611932050U,  // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
-  2561458278U,  // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS
-  1863532646U,  // <u,0,5,1>: Cost 2 vzipl RHS, LHS
-  2712068526U,  // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
-  2649689976U,  // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0>
-  2220237489U,  // <u,0,5,4>: Cost 3 vrev <0,u,4,5>
-  2651680772U,  // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5>
-  1577939051U,  // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0>
-  2830077238U,  // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS
-  1579266317U,  // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0>
-  2555494502U,  // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS
-  2712068598U,  // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
-  1997750374U,  // <u,0,6,2>: Cost 2 vtrnl RHS, LHS
-  2655662673U,  // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0>
-  2555497782U,  // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS
-  2651681459U,  // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u>
-  2651681592U,  // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6>
-  2651681614U,  // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1>
-  1997750428U,  // <u,0,6,u>: Cost 2 vtrnl RHS, LHS
-  2567446630U,  // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS
-  2567447446U,  // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0>
-  2567448641U,  // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7>
-  2573421338U,  // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7>
-  2567449910U,  // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS
-  2651682242U,  // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u>
-  2591339429U,  // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7>
-  2651682412U,  // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7>
-  2567452462U,  // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS
-   135053414U,  // <u,0,u,0>: Cost 1 vdup0 LHS
-  1611489938U,  // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
-   537748125U,  // <u,0,u,2>: Cost 1 vext3 LHS, LHS
-  2685674148U,  // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
-  1611932338U,  // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
-  1551399066U,  // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS
-  1517605798U,  // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u>
-  2830077481U,  // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS
-   537748179U,  // <u,0,u,u>: Cost 1 vext3 LHS, LHS
-  1544101961U,  // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1>
-  1558036582U,  // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS
-  2619171051U,  // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1>
-  1611490038U,  // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
-  2555522358U,  // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS
-  2712068871U,  // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
-  2591355815U,  // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0>
-  2597328512U,  // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0>
-  1611490083U,  // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
-  1481785446U,  // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS
-   202162278U,  // <u,1,1,1>: Cost 1 vdup1 LHS
-  2555528808U,  // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2>
-  1611490120U,  // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
-  1481788726U,  // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS
-  2689876828U,  // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
-  2591364008U,  // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1>
-  2592691274U,  // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
-   202162278U,  // <u,1,1,u>: Cost 1 vdup1 LHS
-  1499709542U,  // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS
-  2689876871U,  // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
-  2631116445U,  // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1>
-      835584U,  // <u,1,2,3>: Cost 0 copy LHS
-  1499712822U,  // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS
-  2689876907U,  // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
-  2631780282U,  // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7>
-  1523603074U,  // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2>
-      835584U,  // <u,1,2,u>: Cost 0 copy LHS
-  1487773798U,  // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS
-  1611490264U,  // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
-  2685232094U,  // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
-  2018746470U,  // <u,1,3,3>: Cost 2 vtrnr LHS, LHS
-  1487777078U,  // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS
-  1611490304U,  // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
-  2685674505U,  // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
-  2640407307U,  // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1>
-  1611490327U,  // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
-  1567992749U,  // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1>
-  2693121070U,  // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u>
-  2693194807U,  // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u>
-  1152386432U,  // <u,1,4,3>: Cost 2 vrev <1,u,3,4>
-  2555555126U,  // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS
-  1558039862U,  // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS
-  2645716371U,  // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1>
-  2597361284U,  // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4>
-  1152755117U,  // <u,1,4,u>: Cost 2 vrev <1,u,u,4>
-  1481818214U,  // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS
-  2555560694U,  // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2>
-  2555561576U,  // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2>
-  1611490448U,  // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
-  1481821494U,  // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS
-  2651025435U,  // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1>
-  2651689068U,  // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1>
-  2823966006U,  // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS
-  1611932861U,  // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
-  2555568230U,  // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS
-  2689877199U,  // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
-  2712069336U,  // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
-  2685232353U,  // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
-  2555571510U,  // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS
-  2689877235U,  // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
-  2657661765U,  // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1>
-  1584583574U,  // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1>
-  1585247207U,  // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1>
-  2561548390U,  // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS
-  2561549681U,  // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7>
-  2573493926U,  // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1>
-  2042962022U,  // <u,1,7,3>: Cost 2 vtrnr RHS, LHS
-  2561551670U,  // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS
-  2226300309U,  // <u,1,7,5>: Cost 3 vrev <1,u,5,7>
-  2658325990U,  // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u>
-  2658326124U,  // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7>
-  2042962027U,  // <u,1,7,u>: Cost 2 vtrnr RHS, LHS
-  1481842790U,  // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS
-   202162278U,  // <u,1,u,1>: Cost 1 vdup1 LHS
-  2685674867U,  // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
-      835584U,  // <u,1,u,3>: Cost 0 copy LHS
-  1481846070U,  // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS
-  1611933077U,  // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
-  2685674910U,  // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
-  1523652232U,  // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u>
-      835584U,  // <u,1,u,u>: Cost 0 copy LHS
-  1544110154U,  // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2>
-  1545437286U,  // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS
-  1545437420U,  // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2>
-  2685232589U,  // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
-  2619179346U,  // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5>
-  2712069606U,  // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7>
-  2689877484U,  // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
-  2659656273U,  // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u>
-  1545437853U,  // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS
-  1550082851U,  // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2>
-  2619179828U,  // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1>
-  2619179926U,  // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0>
-  2685232671U,  // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
-  2555604278U,  // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS
-  2619180176U,  // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7>
-  2689877564U,  // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
-  2602718850U,  // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
-  1158703235U,  // <u,2,1,u>: Cost 2 vrev <2,u,u,1>
-  1481867366U,  // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS
-  2555609846U,  // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2>
-   269271142U,  // <u,2,2,2>: Cost 1 vdup2 LHS
-  1611490930U,  // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
-  1481870646U,  // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS
-  2689877640U,  // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
-  2619180986U,  // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7>
-  2593436837U,  // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
-   269271142U,  // <u,2,2,u>: Cost 1 vdup2 LHS
-   408134301U,  // <u,2,3,0>: Cost 1 vext1 LHS, LHS
-  1481876214U,  // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
-  1481877096U,  // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
-  1880326246U,  // <u,2,3,3>: Cost 2 vzipr LHS, LHS
-   408137014U,  // <u,2,3,4>: Cost 1 vext1 LHS, RHS
-  1529654992U,  // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
-  1529655802U,  // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
-  1529656314U,  // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
-   408139566U,  // <u,2,3,u>: Cost 1 vext1 LHS, LHS
-  1567853468U,  // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
-  2561598362U,  // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4>
-  2555627214U,  // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5>
-  2685232918U,  // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
-  2555628854U,  // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS
-  1545440566U,  // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS
-  1571982740U,  // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2>
-  2592125957U,  // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
-  1545440809U,  // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS
-  2555633766U,  // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS
-  2561606550U,  // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0>
-  2689877856U,  // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
-  2685233000U,  // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
-  1158441059U,  // <u,2,5,4>: Cost 2 vrev <2,u,4,5>
-  2645725188U,  // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5>
-  2689877892U,  // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
-  2823900470U,  // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS
-  1158736007U,  // <u,2,5,u>: Cost 2 vrev <2,u,u,5>
-  1481900134U,  // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS
-  2555642614U,  // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2>
-  2555643496U,  // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2>
-  1611491258U,  // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
-  1481903414U,  // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS
-  2689877964U,  // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
-  2689877973U,  // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
-  2645726030U,  // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1>
-  1611933671U,  // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
-  1585919033U,  // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2>
-  2573566710U,  // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2>
-  2567596115U,  // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7>
-  1906901094U,  // <u,2,7,3>: Cost 2 vzipr RHS, LHS
-  2555653430U,  // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS
-  2800080230U,  // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6>
-  2980643164U,  // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
-  2645726828U,  // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7>
-  1906901099U,  // <u,2,7,u>: Cost 2 vzipr RHS, LHS
-   408175266U,  // <u,2,u,0>: Cost 1 vext1 LHS, LHS
-  1545443118U,  // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS
-   269271142U,  // <u,2,u,2>: Cost 1 vdup2 LHS
-  1611491416U,  // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
-   408177974U,  // <u,2,u,4>: Cost 1 vext1 LHS, RHS
-  1545443482U,  // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS
-  1726339226U,  // <u,2,u,6>: Cost 2 vuzpl LHS, RHS
-  1529697274U,  // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
-   408180526U,  // <u,2,u,u>: Cost 1 vext1 LHS, LHS
-  1544781824U,  // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
-   471040156U,  // <u,3,0,1>: Cost 1 vext2 LHS, LHS
-  1544781988U,  // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
-  2618523900U,  // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0>
-  1544782162U,  // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
-  2238188352U,  // <u,3,0,5>: Cost 3 vrev <3,u,5,0>
-  2623169023U,  // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
-  2238335826U,  // <u,3,0,7>: Cost 3 vrev <3,u,7,0>
-   471040669U,  // <u,3,0,u>: Cost 1 vext2 LHS, LHS
-  1544782582U,  // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
-  1544782644U,  // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
-  1544782742U,  // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
-  1544782808U,  // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
-  2618524733U,  // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
-  1544782992U,  // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
-  2618524897U,  // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
-  2703517987U,  // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u>
-  1544783213U,  // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
-  1529716838U,  // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS
-  1164167966U,  // <u,3,2,1>: Cost 2 vrev <3,u,1,2>
-  1544783464U,  // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
-  1544783526U,  // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
-  1529720118U,  // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS
-  2618525544U,  // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
-  1544783802U,  // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
-  2704181620U,  // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u>
-  1544783931U,  // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
-  1544784022U,  // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
-  1487922559U,  // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3>
-  1493895256U,  // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3>
-   336380006U,  // <u,3,3,3>: Cost 1 vdup3 LHS
-  1544784386U,  // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
-  2824054478U,  // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
-  2238286668U,  // <u,3,3,6>: Cost 3 vrev <3,u,6,3>
-  2954069136U,  // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
-   336380006U,  // <u,3,3,u>: Cost 1 vdup3 LHS
-  1487929446U,  // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS
-  1487930752U,  // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4>
-  2623171644U,  // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
-  2561673366U,  // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2>
-  1487932726U,  // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS
-   471043382U,  // <u,3,4,5>: Cost 1 vext2 LHS, RHS
-  1592561012U,  // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
-  2238368598U,  // <u,3,4,7>: Cost 3 vrev <3,u,7,4>
-   471043625U,  // <u,3,4,u>: Cost 1 vext2 LHS, RHS
-  2555707494U,  // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS
-  1574645465U,  // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3>
-  2567653106U,  // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5>
-  2555709954U,  // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6>
-  1592561606U,  // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
-  1592561668U,  // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
-  1592561762U,  // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
-  1750314294U,  // <u,3,5,7>: Cost 2 vuzpr LHS, RHS
-  1750314295U,  // <u,3,5,u>: Cost 2 vuzpr LHS, RHS
-  2623172897U,  // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
-  2561688962U,  // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6>
-  1581281795U,  // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3>
-  2706541204U,  // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u>
-  2623173261U,  // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
-  1164495686U,  // <u,3,6,5>: Cost 2 vrev <3,u,5,6>
-  1592562488U,  // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
-  1592562510U,  // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
-  1164716897U,  // <u,3,6,u>: Cost 2 vrev <3,u,u,6>
-  1487954022U,  // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS
-  1487955331U,  // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7>
-  1493928028U,  // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7>
-  2561697942U,  // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2>
-  1487957302U,  // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS
-  2707352311U,  // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u>
-  2655024623U,  // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u>
-  1592563308U,  // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
-  1487959854U,  // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS
-  1544787667U,  // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
-   471045934U,  // <u,3,u,1>: Cost 1 vext2 LHS, LHS
-  1549432709U,  // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
-   336380006U,  // <u,3,u,3>: Cost 1 vdup3 LHS
-  1544788031U,  // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
-   471046298U,  // <u,3,u,5>: Cost 1 vext2 LHS, RHS
-  1549433040U,  // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
-  1750314537U,  // <u,3,u,7>: Cost 2 vuzpr LHS, RHS
-   471046501U,  // <u,3,u,u>: Cost 1 vext2 LHS, LHS
-  2625167360U,  // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0>
-  1551425638U,  // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS
-  2619195630U,  // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4>
-  2619343104U,  // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
-  2625167698U,  // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5>
-  1638329234U,  // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
-  1638329244U,  // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
-  3787803556U,  // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
-  1551426205U,  // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS
-  2555748454U,  // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS
-  2625168180U,  // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1>
-  1551426503U,  // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4>
-  2625168344U,  // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3>
-  2555751734U,  // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS
-  1860554038U,  // <u,4,1,5>: Cost 2 vzipl LHS, RHS
-  2689879022U,  // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
-  2592248852U,  // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
-  1555408301U,  // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4>
-  2555756646U,  // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS
-  2625168943U,  // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u>
-  2625169000U,  // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2>
-  2619197134U,  // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5>
-  2555759926U,  // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS
-  2712071222U,  // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
-  1994771766U,  // <u,4,2,6>: Cost 2 vtrnl LHS, RHS
-  2592257045U,  // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
-  1994771784U,  // <u,4,2,u>: Cost 2 vtrnl LHS, RHS
-  2625169558U,  // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2>
-  2567709594U,  // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4>
-  2567710817U,  // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3>
-  2625169820U,  // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3>
-  2625169922U,  // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6>
-  2954069710U,  // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
-  2954068172U,  // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
-  3903849472U,  // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7>
-  2954068174U,  // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
-  1505919078U,  // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS
-  2567717831U,  // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4>
-  2567719010U,  // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4>
-  2570373542U,  // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
-   161926454U,  // <u,4,4,4>: Cost 1 vdup0 RHS
-  1551428918U,  // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS
-  1638329572U,  // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
-  2594927963U,  // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
-   161926454U,  // <u,4,4,u>: Cost 1 vdup0 RHS
-  1493983334U,  // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS
-  2689879301U,  // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
-  1493985379U,  // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5>
-  2567727254U,  // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2>
-  1493986614U,  // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS
-  1863535926U,  // <u,4,5,5>: Cost 2 vzipl RHS, RHS
-   537750838U,  // <u,4,5,6>: Cost 1 vext3 LHS, RHS
-  2830110006U,  // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS
-   537750856U,  // <u,4,5,u>: Cost 1 vext3 LHS, RHS
-  1482047590U,  // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS
-  2555790070U,  // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2>
-  2555790952U,  // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2>
-  2555791510U,  // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2>
-  1482050870U,  // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS
-  2689879422U,  // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
-  1997753654U,  // <u,4,6,6>: Cost 2 vtrnl RHS, RHS
-  2712071562U,  // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
-  1482053422U,  // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS
-  2567741542U,  // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS
-  2567742362U,  // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4>
-  2567743589U,  // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7>
-  2573716286U,  // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7>
-  2567744822U,  // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS
-  2712071624U,  // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
-    96808489U,  // <u,4,7,6>: Cost 1 vrev RHS
-  2651715180U,  // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7>
-    96955963U,  // <u,4,7,u>: Cost 1 vrev RHS
-  1482063974U,  // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS
-  1551431470U,  // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS
-  1494009958U,  // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u>
-  2555807894U,  // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2>
-   161926454U,  // <u,4,u,4>: Cost 1 vdup0 RHS
-  1551431834U,  // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS
-   537751081U,  // <u,4,u,6>: Cost 1 vext3 LHS, RHS
-  2830110249U,  // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS
-   537751099U,  // <u,4,u,u>: Cost 1 vext3 LHS, RHS
-  2631811072U,  // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0>
-  1558069350U,  // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS
-  2619203823U,  // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5>
-  2619867456U,  // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5>
-  1546273106U,  // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
-  2733010539U,  // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
-  2597622682U,  // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5>
-  1176539396U,  // <u,5,0,7>: Cost 2 vrev <5,u,7,0>
-  1558069917U,  // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS
-  1505968230U,  // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS
-  2624512887U,  // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5>
-  2631811990U,  // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0>
-  2618541056U,  // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7>
-  1505971510U,  // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS
-  2627167419U,  // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5>
-  2579714554U,  // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3>
-  1638330064U,  // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
-  1638477529U,  // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
-  2561802342U,  // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS
-  2561803264U,  // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7>
-  2631149217U,  // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5>
-  1558071026U,  // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5>
-  2561805622U,  // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS
-  2714062607U,  // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
-  2631813050U,  // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7>
-  3092335926U,  // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS
-  1561389191U,  // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5>
-  2561810534U,  // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS
-  2561811857U,  // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3>
-  2631813474U,  // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u>
-  2631813532U,  // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3>
-  2619869698U,  // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6>
-  3001847002U,  // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5>
-  2954070530U,  // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
-  2018749750U,  // <u,5,3,7>: Cost 2 vtrnr LHS, RHS
-  2018749751U,  // <u,5,3,u>: Cost 2 vtrnr LHS, RHS
-  2573762662U,  // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS
-  2620017634U,  // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
-  2573764338U,  // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5>
-  2573765444U,  // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4>
-  1570680053U,  // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5>
-  1558072630U,  // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS
-  2645749143U,  // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5>
-  1638330310U,  // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
-  1558072873U,  // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS
-  1506000998U,  // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS
-  2561827984U,  // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7>
-  2579744360U,  // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2>
-  2579744918U,  // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2>
-  1506004278U,  // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS
-   229035318U,  // <u,5,5,5>: Cost 1 vdup1 RHS
-  2712072206U,  // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
-  1638330392U,  // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
-   229035318U,  // <u,5,5,u>: Cost 1 vdup1 RHS
-  1500037222U,  // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS
-  2561836436U,  // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6>
-  2567809133U,  // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6>
-  1500040006U,  // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6>
-  1500040502U,  // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS
-  2714062935U,  // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
-  2712072288U,  // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
-    27705344U,  // <u,5,6,7>: Cost 0 copy RHS
-    27705344U,  // <u,5,6,u>: Cost 0 copy RHS
-  1488101478U,  // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS
-  1488102805U,  // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7>
-  2561844840U,  // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2>
-  2561845398U,  // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2>
-  1488104758U,  // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS
-  1638330536U,  // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
-  2712072362U,  // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
-  2042965302U,  // <u,5,7,7>: Cost 2 vtrnr RHS, RHS
-  1488107310U,  // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS
-  1488109670U,  // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS
-  1488110998U,  // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u>
-  2561853032U,  // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2>
-  1500056392U,  // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u>
-  1488112950U,  // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS
-   229035318U,  // <u,5,u,5>: Cost 1 vdup1 RHS
-  2954111490U,  // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
-    27705344U,  // <u,5,u,7>: Cost 0 copy RHS
-    27705344U,  // <u,5,u,u>: Cost 0 copy RHS
-  2619211776U,  // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0>
-  1545470054U,  // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS
-  1545470192U,  // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6>
-  2255958969U,  // <u,6,0,3>: Cost 3 vrev <6,u,3,0>
-  1546797458U,  // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6>
-  2720624971U,  // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u>
-  2256180180U,  // <u,6,0,6>: Cost 3 vrev <6,u,6,0>
-  2960682294U,  // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS
-  1545470621U,  // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS
-  1182004127U,  // <u,6,1,0>: Cost 2 vrev <6,u,0,1>
-  2619212596U,  // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1>
-  2619212694U,  // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0>
-  2619212760U,  // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3>
-  2626511979U,  // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6>
-  2619212944U,  // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7>
-  2714063264U,  // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
-  2967326006U,  // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS
-  1182594023U,  // <u,6,1,u>: Cost 2 vrev <6,u,u,1>
-  1506050150U,  // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS
-  2579792630U,  // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2>
-  2619213416U,  // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2>
-  2619213478U,  // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1>
-  1506053430U,  // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS
-  2633148309U,  // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6>
-  2619213754U,  // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7>
-  1638330874U,  // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
-  1638478339U,  // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
-  2619213974U,  // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2>
-  2255836074U,  // <u,6,3,1>: Cost 3 vrev <6,u,1,3>
-  2255909811U,  // <u,6,3,2>: Cost 3 vrev <6,u,2,3>
-  2619214236U,  // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3>
-  1564715549U,  // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6>
-  2639121006U,  // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6>
-  3001847012U,  // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
-  1880329526U,  // <u,6,3,7>: Cost 2 vzipr LHS, RHS
-  1880329527U,  // <u,6,3,u>: Cost 2 vzipr LHS, RHS
-  2567864422U,  // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS
-  2733011558U,  // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3>
-  2567866484U,  // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4>
-  2638458005U,  // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u>
-  1570540772U,  // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
-  1545473334U,  // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS
-  1572015512U,  // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6>
-  2960715062U,  // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS
-  1545473577U,  // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS
-  2567872614U,  // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS
-  2645757648U,  // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3>
-  2567874490U,  // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7>
-  2576501250U,  // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
-  1576660943U,  // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6>
-  2645757956U,  // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5>
-  2645758050U,  // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0>
-  2824080694U,  // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS
-  1182626795U,  // <u,6,5,u>: Cost 2 vrev <6,u,u,5>
-  1506082918U,  // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS
-  2579825398U,  // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2>
-  2645758458U,  // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3>
-  2579826838U,  // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2>
-  1506086198U,  // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS
-  2579828432U,  // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3>
-   296144182U,  // <u,6,6,6>: Cost 1 vdup2 RHS
-  1638331202U,  // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
-   296144182U,  // <u,6,6,u>: Cost 1 vdup2 RHS
-   432349286U,  // <u,6,7,0>: Cost 1 vext1 RHS, LHS
-  1506091766U,  // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
-  1506092648U,  // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
-  1506093206U,  // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
-   432352809U,  // <u,6,7,4>: Cost 1 vext1 RHS, RHS
-  1506094800U,  // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
-  1506095610U,  // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3>
-  1906904374U,  // <u,6,7,7>: Cost 2 vzipr RHS, RHS
-   432355118U,  // <u,6,7,u>: Cost 1 vext1 RHS, LHS
-   432357478U,  // <u,6,u,0>: Cost 1 vext1 RHS, LHS
-  1545475886U,  // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS
-  1506100840U,  // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
-  1506101398U,  // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
-   432361002U,  // <u,6,u,4>: Cost 1 vext1 RHS, RHS
-  1545476250U,  // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS
-   296144182U,  // <u,6,u,6>: Cost 1 vdup2 RHS
-  1880370486U,  // <u,6,u,7>: Cost 2 vzipr LHS, RHS
-   432363310U,  // <u,6,u,u>: Cost 1 vext1 RHS, LHS
-  1571356672U,  // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
-   497614950U,  // <u,7,0,1>: Cost 1 vext2 RHS, LHS
-  1571356836U,  // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
-  2573880146U,  // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0>
-  1571357010U,  // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
-  1512083716U,  // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0>
-  2621874741U,  // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7>
-  2585826298U,  // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2>
-   497615517U,  // <u,7,0,u>: Cost 1 vext2 RHS, LHS
-  1571357430U,  // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
-  1571357492U,  // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
-  1571357590U,  // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
-  1552114715U,  // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7>
-  2573888822U,  // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS
-  1553441981U,  // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7>
-  2627847438U,  // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7>
-  2727408775U,  // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u>
-  1555432880U,  // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7>
-  2629838337U,  // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7>
-  1188058754U,  // <u,7,2,1>: Cost 2 vrev <7,u,1,2>
-  1571358312U,  // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
-  1571358374U,  // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
-  2632492869U,  // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7>
-  2633156502U,  // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7>
-  1560078311U,  // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7>
-  2728072408U,  // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u>
-  1561405577U,  // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7>
-  1571358870U,  // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
-  2627184913U,  // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u>
-  2633820523U,  // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u>
-  1571359132U,  // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
-  1571359234U,  // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
-  1512108295U,  // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3>
-  1518080992U,  // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3>
-  2640456465U,  // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7>
-  1571359518U,  // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
-  1571359634U,  // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
-  2573911067U,  // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7>
-  2645101622U,  // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
-  2573912918U,  // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4>
-  1571359952U,  // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
-   497618248U,  // <u,7,4,5>: Cost 1 vext2 RHS, RHS
-  1571360116U,  // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
-  2645102024U,  // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0>
-   497618473U,  // <u,7,4,u>: Cost 1 vext2 RHS, RHS
-  2645102152U,  // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
-  1571360464U,  // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
-  2645102334U,  // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
-  2645102447U,  // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
-  1571360710U,  // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
-  1571360772U,  // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
-  1571360866U,  // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
-  1571360936U,  // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
-  1571361017U,  // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
-  1530044518U,  // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS
-  2645103016U,  // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
-  1571361274U,  // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
-  2645103154U,  // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
-  1530047798U,  // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS
-  1188386474U,  // <u,7,6,5>: Cost 2 vrev <7,u,5,6>
-  1571361592U,  // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
-  1571361614U,  // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
-  1571361695U,  // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
-  1571361786U,  // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
-  2573935616U,  // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7>
-  2645103781U,  // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2>
-  2573937497U,  // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7>
-  1571362150U,  // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
-  1512141067U,  // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7>
-  1518113764U,  // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7>
-   363253046U,  // <u,7,7,7>: Cost 1 vdup3 RHS
-   363253046U,  // <u,7,7,u>: Cost 1 vdup3 RHS
-  1571362515U,  // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
-   497620782U,  // <u,7,u,1>: Cost 1 vext2 RHS, LHS
-  1571362693U,  // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
-  1571362748U,  // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
-  1571362879U,  // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
-   497621146U,  // <u,7,u,5>: Cost 1 vext2 RHS, RHS
-  1571363024U,  // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
-   363253046U,  // <u,7,u,7>: Cost 1 vdup3 RHS
-   497621349U,  // <u,7,u,u>: Cost 1 vext2 RHS, LHS
-   135053414U,  // <u,u,0,0>: Cost 1 vdup0 LHS
-   471081121U,  // <u,u,0,1>: Cost 1 vext2 LHS, LHS
-  1544822948U,  // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
-  1616140005U,  // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
-  1544823122U,  // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
-  1512157453U,  // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0>
-  1662220032U,  // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
-  1194457487U,  // <u,u,0,7>: Cost 2 vrev <u,u,7,0>
-   471081629U,  // <u,u,0,u>: Cost 1 vext2 LHS, LHS
-  1544823542U,  // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
-   202162278U,  // <u,u,1,1>: Cost 1 vdup1 LHS
-   537753390U,  // <u,u,1,2>: Cost 1 vext3 LHS, LHS
-  1544823768U,  // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
-  1494248758U,  // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS
-  1544823952U,  // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
-  1518138343U,  // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1>
-  1640322907U,  // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
-   537753444U,  // <u,u,1,u>: Cost 1 vext3 LHS, LHS
-  1482309734U,  // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS
-  1194031451U,  // <u,u,2,1>: Cost 2 vrev <u,u,1,2>
-   269271142U,  // <u,u,2,2>: Cost 1 vdup2 LHS
-      835584U,  // <u,u,2,3>: Cost 0 copy LHS
-  1482313014U,  // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS
-  2618566504U,  // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
-  1544824762U,  // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
-  1638479788U,  // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
-      835584U,  // <u,u,2,u>: Cost 0 copy LHS
-   408576723U,  // <u,u,3,0>: Cost 1 vext1 LHS, LHS
-  1482318582U,  // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
-   120371557U,  // <u,u,3,2>: Cost 1 vrev LHS
-   336380006U,  // <u,u,3,3>: Cost 1 vdup3 LHS
-   408579382U,  // <u,u,3,4>: Cost 1 vext1 LHS, RHS
-  1616140271U,  // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
-  1530098170U,  // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
-  1880329544U,  // <u,u,3,7>: Cost 2 vzipr LHS, RHS
-   408581934U,  // <u,u,3,u>: Cost 1 vext1 LHS, LHS
-  1488298086U,  // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS
-  1488299437U,  // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4>
-  1659271204U,  // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
-  1194195311U,  // <u,u,4,3>: Cost 2 vrev <u,u,3,4>
-   161926454U,  // <u,u,4,4>: Cost 1 vdup0 RHS
-   471084342U,  // <u,u,4,5>: Cost 1 vext2 LHS, RHS
-  1571368308U,  // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
-  1640323153U,  // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
-   471084585U,  // <u,u,4,u>: Cost 1 vext2 LHS, RHS
-  1494278246U,  // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS
-  1571368656U,  // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
-  1494280327U,  // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5>
-  1616140415U,  // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
-  1494281526U,  // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS
-   229035318U,  // <u,u,5,5>: Cost 1 vdup1 RHS
-   537753754U,  // <u,u,5,6>: Cost 1 vext3 LHS, RHS
-  1750355254U,  // <u,u,5,7>: Cost 2 vuzpr LHS, RHS
-   537753772U,  // <u,u,5,u>: Cost 1 vext3 LHS, RHS
-  1482342502U,  // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS
-  2556084982U,  // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2>
-  1571369466U,  // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
-  1611938000U,  // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
-  1482345782U,  // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS
-  1194359171U,  // <u,u,6,5>: Cost 2 vrev <u,u,5,6>
-   296144182U,  // <u,u,6,6>: Cost 1 vdup2 RHS
-    27705344U,  // <u,u,6,7>: Cost 0 copy RHS
-    27705344U,  // <u,u,6,u>: Cost 0 copy RHS
-   432496742U,  // <u,u,7,0>: Cost 1 vext1 RHS, LHS
-  1488324016U,  // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7>
-  1494296713U,  // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7>
-  1906901148U,  // <u,u,7,3>: Cost 2 vzipr RHS, LHS
-   432500283U,  // <u,u,7,4>: Cost 1 vext1 RHS, RHS
-  1506242256U,  // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
-   120699277U,  // <u,u,7,6>: Cost 1 vrev RHS
-   363253046U,  // <u,u,7,7>: Cost 1 vdup3 RHS
-   432502574U,  // <u,u,7,u>: Cost 1 vext1 RHS, LHS
-   408617688U,  // <u,u,u,0>: Cost 1 vext1 LHS, LHS
-   471086894U,  // <u,u,u,1>: Cost 1 vext2 LHS, LHS
-   537753957U,  // <u,u,u,2>: Cost 1 vext3 LHS, LHS
-      835584U,  // <u,u,u,3>: Cost 0 copy LHS
-   408620342U,  // <u,u,u,4>: Cost 1 vext1 LHS, RHS
-   471087258U,  // <u,u,u,5>: Cost 1 vext2 LHS, RHS
-   537753997U,  // <u,u,u,6>: Cost 1 vext3 LHS, RHS
-    27705344U,  // <u,u,u,7>: Cost 0 copy RHS
-      835584U,  // <u,u,u,u>: Cost 0 copy LHS
+  135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS
+  1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS
+  2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0>
+  2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+  1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
+  2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3>
+  2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3>
+  2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS
+  2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0>
+  1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS
+  1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+  2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0>
+  2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5>
+  2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7>
+  3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1>
+  2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1>
+  1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS
+  3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0>
+  3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1>
+  1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS
+  2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0>
+  3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6>
+  3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6>
+  2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7>
+  2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+  1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS
+  2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+  2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0>
+  2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0>
+  2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3>
+  2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6>
+  3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6>
+  3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7>
+  3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0>
+  2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+  2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1>
+  2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+  3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+  3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4>
+  3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6>
+  1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS
+  2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+  3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5>
+  1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS
+  2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7>
+  2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3>
+  3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7>
+  3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5>
+  2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6>
+  2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5>
+  2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
+  2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7>
+  2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7>
+  2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7>
+  3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS
+  2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3>
+  3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7>
+  3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS
+  3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>
+  2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6>
+  2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0>
+  2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0>
+  2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+  3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0>
+  3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7>
+  3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0>
+  2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6>
+  3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0>
+  3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7>
+  2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7>
+  2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS
+  1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS
+  1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS
+  2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u>
+  1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS
+  1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS
+  2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+  2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u>
+  135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS
+  2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1>
+  1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS
+  2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1>
+  2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0>
+  2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5>
+  4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7>
+  2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1>
+  2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0>
+  1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS
+  1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1>
+  2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1>
+  2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0>
+  2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3>
+  1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
+  2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7>
+  2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3>
+  2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1>
+  1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS
+  1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS
+  3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1>
+  2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2>
+  835584U, // <0,1,2,3>: Cost 0 copy LHS
+  1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS
+  3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7>
+  2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7>
+  1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2>
+  835584U, // <0,1,2,u>: Cost 0 copy LHS
+  2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0>
+  2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3>
+  2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0>
+  2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0>
+  2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS
+  2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7>
+  2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0>
+  2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1>
+  2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3>
+  2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS
+  4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1>
+  2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1>
+  2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4>
+  2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS
+  1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS
+  2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+  2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4>
+  1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS
+  3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1>
+  2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1>
+  3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0>
+  2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7>
+  2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6>
+  2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1>
+  2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1>
+  2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1>
+  2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7>
+  2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS
+  3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7>
+  2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1>
+  3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7>
+  2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS
+  3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7>
+  2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1>
+  1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1>
+  1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1>
+  2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0>
+  2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1>
+  2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0>
+  2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1>
+  2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6>
+  3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0>
+  2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0>
+  2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7>
+  2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2>
+  1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS
+  1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS
+  2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS
+  835584U, // <0,1,u,3>: Cost 0 copy LHS
+  1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS
+  1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS
+  2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+  1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u>
+  835584U, // <0,1,u,u>: Cost 0 copy LHS
+  2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0>
+  1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS
+  1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS
+  2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0>
+  2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6>
+  2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7>
+  2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7>
+  2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0>
+  1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS
+  2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2>
+  2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1>
+  2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2>
+  2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+  2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS
+  2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7>
+  2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7>
+  3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7>
+  2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2>
+  1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2>
+  2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2>
+  2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2>
+  3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+  1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS
+  2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3>
+  2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7>
+  2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2>
+  1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS
+  2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2>
+  2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+  3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3>
+  2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+  2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6>
+  2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+  3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3>
+  2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0>
+  2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+  2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS
+  4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3>
+  2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4>
+  2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4>
+  2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS
+  1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
+  1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS
+  2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+  1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS
+  2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7>
+  2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3>
+  3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7>
+  3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6>
+  2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6>
+  2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5>
+  2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0>
+  2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+  2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS
+  2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1>
+  3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2>
+  2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3>
+  2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7>
+  2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5>
+  3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6>
+  2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6>
+  2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2>
+  2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7>
+  2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+  2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2>
+  3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2>
+  2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0>
+  2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6>
+  2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2>
+  3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2>
+  2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7>
+  2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+  1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u>
+  1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS
+  1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS
+  3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+  1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS
+  1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS
+  1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS
+  2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+  1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS
+  2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0>
+  2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2>
+  2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0>
+  4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3>
+  2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
+  4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6>
+  3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7>
+  3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0>
+  2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS
+  2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2>
+  2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1>
+  2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3>
+  2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3>
+  2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6>
+  4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6>
+  3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1>
+  3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3>
+  2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2>
+  1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
+  1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2>
+  2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2>
+  3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+  1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
+  3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+  2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3>
+  2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2>
+  1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS
+  3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2>
+  2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3>
+  3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3>
+  2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3>
+  3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6>
+  3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6>
+  3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7>
+  3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7>
+  2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3>
+  3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2>
+  2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4>
+  2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4>
+  4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3>
+  3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6>
+  2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6>
+  3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS
+  3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4>
+  2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6>
+  3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS
+  3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2>
+  4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2>
+  3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7>
+  2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5>
+  3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7>
+  2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7>
+  2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0>
+  2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5>
+  2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7>
+  3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3>
+  3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7>
+  3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7>
+  3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7>
+  3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6>
+  3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6>
+  2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0>
+  2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0>
+  3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1>
+  2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
+  3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7>
+  3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7>
+  3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5>
+  3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7>
+  3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7>
+  3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0>
+  2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3>
+  1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS
+  1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u>
+  2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2>
+  3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+  1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS
+  3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+  2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3>
+  2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0>
+  1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS
+  2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4>
+  2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS
+  2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4>
+  3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0>
+  2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6>
+  2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1>
+  3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+  3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0>
+  3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS
+  2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS
+  3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1>
+  3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0>
+  2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1>
+  2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS
+  1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS
+  3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS
+  2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+  1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS
+  2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS
+  2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2>
+  3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2>
+  2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4>
+  2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS
+  2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS
+  1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS
+  2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+  1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS
+  3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2>
+  3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2>
+  3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4>
+  3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3>
+  3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6>
+  3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6>
+  4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS
+  3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4>
+  3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2>
+  2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4>
+  3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0>
+  3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3>
+  3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4>
+  2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4>
+  2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS
+  2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS
+  3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4>
+  2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS
+  2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
+  2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
+  3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5>
+  2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5>
+  2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
+  2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6>
+  1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+  2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5>
+  1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+  2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6>
+  3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2>
+  2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6>
+  3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0>
+  2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6>
+  3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6>
+  2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0>
+  2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4>
+  2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2>
+  3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS
+  3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1>
+  3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4>
+  3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4>
+  3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS
+  2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+  2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0>
+  3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2>
+  2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+  2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS
+  2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS
+  2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS
+  2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u>
+  2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS
+  1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS
+  1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS
+  2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u>
+  1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS
+  3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0>
+  2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS
+  3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2>
+  3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5>
+  3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS
+  3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1>
+  2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1>
+  3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS
+  2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS
+  1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
+  2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3>
+  2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2>
+  2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2>
+  1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1>
+  2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5>
+  2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0>
+  2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3>
+  1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS
+  2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS
+  2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2>
+  3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7>
+  2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2>
+  2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS
+  3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5>
+  4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6>
+  3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS
+  3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS
+  3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2>
+  3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3>
+  3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1>
+  3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3>
+  2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
+  3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0>
+  3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7>
+  2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0>
+  2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0>
+  2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1>
+  3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4>
+  3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4>
+  3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5>
+  3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6>
+  2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS
+  3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5>
+  2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6>
+  2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS
+  3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0>
+  3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0>
+  3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5>
+  3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0>
+  3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5>
+  2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5>
+  2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
+  2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7>
+  2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7>
+  2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS
+  3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0>
+  3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3>
+  3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4>
+  2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS
+  2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0>
+  3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7>
+  1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0>
+  1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0>
+  2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
+  2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
+  2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
+  3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2>
+  2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
+  2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
+  2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
+  2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0>
+  2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS
+  1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
+  2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS
+  2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0>
+  2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u>
+  1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u>
+  2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS
+  2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u>
+  1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0>
+  1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0>
+  2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS
+  2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS
+  2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2>
+  3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4>
+  2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
+  2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
+  2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0>
+  2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS
+  2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS
+  2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS
+  3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1>
+  2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3>
+  3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3>
+  2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS
+  2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1>
+  2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6>
+  2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+  2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS
+  1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS
+  2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2>
+  2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2>
+  2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1>
+  1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2>
+  2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3>
+  3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6>
+  2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS
+  1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS
+  3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2>
+  3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7>
+  3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0>
+  3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3>
+  2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6>
+  3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7>
+  3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0>
+  2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0>
+  2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0>
+  2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS
+  4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2>
+  2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4>
+  3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2>
+  2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
+  2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS
+  2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0>
+  4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS
+  2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS
+  2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+  3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0>
+  3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7>
+  3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0>
+  3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5>
+  3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0>
+  3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7>
+  4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS
+  2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+  3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0>
+  3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3>
+  3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6>
+  3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0>
+  3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4>
+  3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6>
+  2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6>
+  2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7>
+  2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7>
+  2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1>
+  2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0>
+  3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7>
+  2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0>
+  2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5>
+  2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6>
+  3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2>
+  2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0>
+  2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1>
+  1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS
+  2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS
+  2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u>
+  2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0>
+  1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u>
+  2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS
+  2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0>
+  2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS
+  1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS
+  2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0>
+  2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS
+  2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0>
+  3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0>
+  2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5>
+  2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6>
+  2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7>
+  3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7>
+  2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS
+  2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1>
+  3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1>
+  2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0>
+  3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5>
+  2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1>
+  3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3>
+  3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7>
+  2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7>
+  2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1>
+  2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS
+  1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2>
+  3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2>
+  2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0>
+  2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS
+  2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7>
+  2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2>
+  3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7>
+  1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2>
+  3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2>
+  3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3>
+  3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3>
+  3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3>
+  3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6>
+  3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7>
+  2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+  3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0>
+  2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7>
+  3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS
+  2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4>
+  2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4>
+  3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7>
+  3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6>
+  2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS
+  3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7>
+  3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5>
+  2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS
+  3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0>
+  3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7>
+  3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5>
+  3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7>
+  2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5>
+  3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7>
+  2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7>
+  2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0>
+  2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7>
+  3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0>
+  2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6>
+  3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7>
+  3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7>
+  3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS
+  2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7>
+  3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6>
+  2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0>
+  2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7>
+  3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1>
+  3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0>
+  3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7>
+  3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0>
+  3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS
+  3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7>
+  3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7>
+  2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7>
+  2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7>
+  2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u>
+  1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u>
+  2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u>
+  2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0>
+  2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u>
+  2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS
+  2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7>
+  2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0>
+  1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u>
+  135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS
+  1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS
+  1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS
+  2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2>
+  1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS
+  2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6>
+  3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+  2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0>
+  135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS
+  1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1>
+  1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS
+  1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+  2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+  1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS
+  1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS
+  2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7>
+  2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+  1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS
+  1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS
+  1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2>
+  1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS
+  835584U, // <0,u,2,3>: Cost 0 copy LHS
+  1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS
+  3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6>
+  1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS
+  1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2>
+  835584U, // <0,u,2,u>: Cost 0 copy LHS
+  2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2>
+  2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+  2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u>
+  2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+  2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6>
+  2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+  2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+  2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u>
+  2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+  2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS
+  2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+  3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+  2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4>
+  2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS
+  1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS
+  1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS
+  2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6>
+  1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS
+  2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS
+  2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0>
+  3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7>
+  2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7>
+  2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS
+  2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u>
+  1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+  2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS
+  1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+  2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS
+  2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6>
+  2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u>
+  2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7>
+  2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS
+  2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u>
+  2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u>
+  1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u>
+  1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u>
+  2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS
+  2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0>
+  2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7>
+  2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u>
+  2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS
+  2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6>
+  2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7>
+  2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7>
+  2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS
+  135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS
+  1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS
+  1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS
+  835584U, // <0,u,u,3>: Cost 0 copy LHS
+  1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS
+  1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS
+  1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS
+  1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u>
+  835584U, // <0,u,u,u>: Cost 0 copy LHS
+  2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0>
+  1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1>
+  2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2>
+  2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0>
+  2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1>
+  2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0>
+  3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7>
+  3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0>
+  1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1>
+  2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS
+  2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1>
+  1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+  3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3>
+  2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS
+  2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1>
+  2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7>
+  3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2>
+  1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS
+  2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1>
+  2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1>
+  2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0>
+  2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1>
+  2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6>
+  3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7>
+  2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0>
+  2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2>
+  2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1>
+  3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+  3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1>
+  67944550U, // <1,0,3,2>: Cost 1 vrev LHS
+  2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3>
+  2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS
+  4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7>
+  3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7>
+  2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3>
+  68386972U, // <1,0,3,u>: Cost 1 vrev LHS
+  2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1>
+  2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5>
+  2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6>
+  3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1>
+  3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1>
+  2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS
+  3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1>
+  3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4>
+  2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS
+  4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0>
+  2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS
+  3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+  3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5>
+  2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5>
+  3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0>
+  3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0>
+  3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS
+  3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS
+  3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1>
+  2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7>
+  3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7>
+  3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6>
+  3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1>
+  3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0>
+  3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0>
+  2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0>
+  2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0>
+  2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0>
+  4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1>
+  2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7>
+  3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0>
+  3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6>
+  3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0>
+  2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0>
+  3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7>
+  2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0>
+  3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+  1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1>
+  67985515U, // <1,0,u,2>: Cost 1 vrev LHS
+  2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1>
+  2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6>
+  2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS
+  2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0>
+  2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u>
+  68427937U, // <1,0,u,u>: Cost 1 vrev LHS
+  1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1>
+  1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS
+  2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1>
+  2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2>
+  2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5>
+  2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1>
+  3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7>
+  3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0>
+  1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1>
+  1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS
+  2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0>
+  2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3>
+  1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7>
+  2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7>
+  2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+  202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS
+  2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2>
+  2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1>
+  2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2>
+  2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1>
+  2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS
+  3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7>
+  2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7>
+  3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0>
+  2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1>
+  2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2>
+  3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1>
+  4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2>
+  2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS
+  2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6>
+  2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7>
+  3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7>
+  2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3>
+  2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS
+  2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS
+  2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4>
+  3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0>
+  3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5>
+  2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS
+  1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS
+  2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS
+  3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4>
+  1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS
+  2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1>
+  2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3>
+  4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2>
+  2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7>
+  2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5>
+  2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5>
+  2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0>
+  2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+  2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7>
+  3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2>
+  2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7>
+  2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3>
+  3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7>
+  3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6>
+  4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5>
+  2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6>
+  2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0>
+  2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0>
+  2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1>
+  2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+  3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3>
+  4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS
+  2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6>
+  3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7>
+  3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0>
+  2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7>
+  2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+  1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS
+  2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3>
+  2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS
+  1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
+  2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7>
+  2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+  202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS
+  2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0>
+  1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
+  2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2>
+  2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1>
+  2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5>
+  3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7>
+  2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2>
+  2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
+  1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
+  2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2>
+  2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1>
+  2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0>
+  2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS
+  2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS
+  2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
+  2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7>
+  3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0>
+  2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS
+  2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2>
+  2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2>
+  2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2>
+  2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3>
+  3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5>
+  3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7>
+  2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7>
+  2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1>
+  2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3>
+  403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS
+  1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
+  403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS
+  1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+  1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS
+  2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2>
+  3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6>
+  3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4>
+  2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2>
+  2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4>
+  1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
+  2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6>
+  2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
+  1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
+  2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS
+  2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
+  3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2>
+  2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+  2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS
+  2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
+  2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0>
+  2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7>
+  2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS
+  2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1>
+  2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2>
+  2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
+  2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7>
+  3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5>
+  3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7>
+  2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6>
+  2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2>
+  2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7>
+  1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
+  2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2>
+  3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3>
+  2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1>
+  2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6>
+  3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0>
+  3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1>
+  2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
+  1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
+  403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS
+  1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
+  403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS
+  1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
+  1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS
+  2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0>
+  1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS
+  2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2>
+  2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3>
+  2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5>
+  3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6>
+  3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7>
+  3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1>
+  1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS
+  2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2>
+  2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1>
+  2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3>
+  1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS
+  2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS
+  2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7>
+  3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7>
+  2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1>
+  1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS
+  2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+  2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3>
+  2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+  2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+  2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+  3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5>
+  2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+  3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3>
+  2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+  1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
+  1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3>
+  2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2>
+  2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3>
+  1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
+  2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+  2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3>
+  3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7>
+  1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS
+  2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
+  2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3>
+  3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3>
+  2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4>
+  2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS
+  1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS
+  2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6>
+  3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4>
+  1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS
+  2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS
+  2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7>
+  2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5>
+  2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5>
+  2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS
+  2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5>
+  2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4>
+  1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS
+  1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS
+  2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0>
+  3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1>
+  2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3>
+  3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7>
+  2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3>
+  3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5>
+  2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3>
+  2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+  2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+  2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
+  2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3>
+  3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7>
+  2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3>
+  2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6>
+  2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5>
+  3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1>
+  2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7>
+  2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1>
+  1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS
+  1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u>
+  2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2>
+  1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS
+  1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS
+  1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS
+  2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6>
+  1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS
+  1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS
+  2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4>
+  2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
+  3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4>
+  2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+  2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5>
+  1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1>
+  2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2>
+  3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1>
+  1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1>
+  3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2>
+  2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4>
+  3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0>
+  3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6>
+  3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1>
+  2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS
+  3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+  3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0>
+  2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
+  3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
+  3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
+  3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
+  2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
+  3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
+  2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+  3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
+  3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
+  2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
+  2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
+  2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
+  2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
+  3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
+  2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
+  3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
+  3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
+  3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
+  3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
+  2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
+  3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
+  3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4>
+  3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
+  2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
+  2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
+  2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
+  3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
+  2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
+  2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
+  2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
+  2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1>
+  3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
+  2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
+  2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+  1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
+  1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
+  2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
+  2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
+  3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
+  2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
+  2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
+  3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
+  3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
+  2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+  2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
+  2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
+  3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
+  3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
+  3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
+  3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
+  3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
+  2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
+  3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
+  2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
+  2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
+  2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
+  2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
+  2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
+  2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6>
+  1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
+  1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+  1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
+  2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
+  1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
+  2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
+  3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
+  1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+  2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
+  2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
+  3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
+  1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
+  2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
+  2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
+  2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
+  2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
+  2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
+  2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
+  2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
+  3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
+  2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
+  3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
+  2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
+  2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
+  2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
+  3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
+  3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
+  2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
+  4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
+  2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
+  2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
+  2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
+  3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
+  2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
+  2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
+  3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
+  4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
+  2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS
+  2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS
+  2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
+  2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+  3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
+  3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
+  2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
+  1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
+  2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
+  2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
+  1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
+  2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
+  2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
+  3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
+  4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
+  2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
+  2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
+  2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0>
+  2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
+  2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
+  2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
+  2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
+  2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3>
+  3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
+  3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
+  3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6>
+  2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6>
+  2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
+  2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
+  2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
+  2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
+  2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
+  2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
+  2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
+  2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
+  3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
+  2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
+  2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
+  2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2>
+  1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
+  2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3>
+  2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1>
+  1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
+  1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
+  2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7>
+  2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS
+  2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS
+  3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
+  2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
+  2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
+  3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
+  2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
+  2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
+  2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
+  4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
+  2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
+  3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
+  2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
+  3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
+  3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
+  2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
+  3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
+  3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
+  2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+  2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
+  2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
+  3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
+  3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
+  3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
+  3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
+  3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
+  2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
+  2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+  2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
+  2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
+  3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
+  2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
+  3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
+  2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
+  4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5>
+  2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
+  3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+  3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
+  3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
+  3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
+  3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
+  3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4>
+  3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
+  2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
+  3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
+  4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
+  2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
+  3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
+  3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
+  3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
+  3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
+  3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
+  3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
+  2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
+  2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
+  2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
+  2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
+  3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
+  3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
+  3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
+  3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
+  3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
+  2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
+  2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
+  2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
+  1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
+  2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
+  2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
+  3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
+  2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
+  2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
+  2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
+  4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
+  1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
+  1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
+  2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
+  3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
+  3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1>
+  2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
+  2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
+  2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7>
+  3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+  1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
+  2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
+  2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
+  3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
+  3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
+  2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
+  3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
+  2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
+  2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
+  2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
+  2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
+  2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
+  3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
+  2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1>
+  2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
+  2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
+  2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
+  2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
+  2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
+  2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
+  3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
+  3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
+  2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
+  2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
+  3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
+  2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
+  3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
+  2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
+  1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
+  2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
+  2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
+  2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
+  1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
+  1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
+  2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
+  2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
+  1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
+  2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
+  3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
+  3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4>
+  3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4>
+  3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
+  2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
+  3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
+  3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
+  2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
+  2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
+  2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
+  3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
+  2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
+  2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
+  4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
+  2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
+  3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
+  2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
+  2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
+  2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
+  3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
+  3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
+  3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
+  3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
+  3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
+  2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
+  2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
+  2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
+  3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
+  3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
+  3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
+  3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
+  2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
+  3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
+  2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
+  2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
+  1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
+  2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
+  2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
+  2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
+  1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
+  1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
+  2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
+  2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
+  1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
+  1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
+  1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
+  2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
+  2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
+  1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
+  1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1>
+  2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2>
+  2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
+  1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
+  1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS
+  1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+  1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS
+  1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
+  3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+  2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+  202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS
+  2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+  2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
+  2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+  2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+  2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+  2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+  2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+  2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+  2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+  403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS
+  1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  115726126U, // <1,u,3,2>: Cost 1 vrev LHS
+  2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS
+  403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS
+  1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
+  1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS
+  403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS
+  2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
+  2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
+  2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4>
+  2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
+  2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
+  1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
+  2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
+  2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6>
+  1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
+  2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
+  2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
+  3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+  2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+  2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
+  2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+  1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS
+  1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS
+  2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
+  2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0>
+  2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
+  2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7>
+  2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
+  3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7>
+  2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
+  2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+  2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+  1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
+  2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
+  2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
+  2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
+  2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
+  2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
+  2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7>
+  2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
+  1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
+  403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS
+  202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS
+  115767091U, // <1,u,u,2>: Cost 1 vrev LHS
+  1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS
+  403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS
+  1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
+  1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS
+  403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS
+  2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
+  2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
+  1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
+  2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
+  2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
+  3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
+  2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
+  4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
+  1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
+  2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
+  2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
+  1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
+  2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
+  2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
+  3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
+  2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
+  3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
+  1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
+  1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
+  2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
+  2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
+  2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
+  1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
+  2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
+  2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
+  2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
+  1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
+  2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+  2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
+  2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
+  4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
+  3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
+  3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
+  4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
+  3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
+  2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
+  2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
+  2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
+  2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
+  3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
+  2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
+  2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
+  2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+  3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
+  2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
+  3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
+  3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5>
+  2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
+  3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5>
+  2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5>
+  3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
+  3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
+  2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+  2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5>
+  4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
+  2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
+  2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
+  3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
+  2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6>
+  3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
+  3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
+  3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
+  2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6>
+  2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+  3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
+  2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
+  3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
+  3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
+  3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
+  3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
+  3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
+  2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+  1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
+  2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
+  1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
+  2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
+  1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
+  2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
+  2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+  2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+  1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
+  2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
+  2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
+  3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
+  1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
+  2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
+  2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
+  3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
+  2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
+  1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
+  2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
+  2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
+  2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
+  2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
+  2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
+  3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
+  3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
+  3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
+  2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
+  2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
+  3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
+  2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
+  2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
+  2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
+  3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
+  3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
+  2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
+  2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
+  2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
+  2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
+  2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
+  2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
+  2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
+  2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
+  2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+  4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
+  2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
+  2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
+  3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
+  3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
+  1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
+  3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
+  2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
+  3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
+  3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
+  1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4>
+  2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
+  2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
+  3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
+  2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
+  2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
+  3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
+  3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
+  3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
+  2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
+  2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
+  3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
+  2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
+  3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
+  2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
+  4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
+  3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
+  3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
+  2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
+  2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
+  3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
+  3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
+  2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7>
+  3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
+  3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
+  3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
+  3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
+  2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
+  2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
+  2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
+  2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
+  1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u>
+  2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
+  2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
+  2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+  2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
+  1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u>
+  1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
+  1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
+  2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
+  2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
+  2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
+  3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
+  3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
+  3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
+  1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
+  2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
+  2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
+  2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
+  2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
+  3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
+  2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
+  3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
+  3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
+  2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
+  1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
+  2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
+  269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS
+  2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
+  1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
+  2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
+  2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
+  2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+  269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS
+  2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
+  2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
+  2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
+  1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS
+  2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
+  3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
+  2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
+  2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
+  1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS
+  2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
+  3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
+  2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
+  3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
+  2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
+  1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
+  2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
+  3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
+  1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
+  3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
+  2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
+  2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
+  4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
+  2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
+  2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
+  2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
+  2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+  2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
+  3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
+  3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
+  2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
+  2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
+  3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
+  3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
+  3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
+  2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
+  2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
+  2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
+  3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
+  2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
+  4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
+  2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
+  3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7>
+  3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
+  2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
+  2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
+  1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
+  1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
+  269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS
+  1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS
+  1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
+  1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
+  2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
+  2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+  269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS
+  1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS
+  1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
+  1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+  2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+  2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
+  470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS
+  1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
+  1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
+  2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
+  1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+  2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
+  2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+  1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+  1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
+  2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
+  1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
+  1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+  1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
+  1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
+  1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
+  1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
+  2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3>
+  2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+  1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
+  1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
+  1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
+  2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+  2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
+  1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
+  470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS
+  1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
+  470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS
+  2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
+  1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+  2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
+  2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
+  1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+  1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+  1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
+  2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+  2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
+  1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+  2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
+  2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+  2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
+  1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
+  1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+  2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
+  2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
+  2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+  1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+  2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+  2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
+  1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+  1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+  470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS
+  1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+  1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
+  1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+  470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS
+  1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+  1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
+  470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS
+  2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
+  2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
+  2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
+  2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
+  2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
+  2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
+  1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+  3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
+  1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+  2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
+  3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
+  2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
+  2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
+  3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
+  3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
+  3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
+  3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
+  2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
+  2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
+  3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
+  2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
+  2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
+  2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
+  2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
+  3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
+  3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
+  2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
+  2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
+  4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
+  3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
+  3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
+  3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
+  2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+  3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
+  2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+  2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
+  3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
+  3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
+  3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
+  2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
+  2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
+  2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
+  4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
+  2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
+  2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
+  2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
+  2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
+  2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
+  2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
+  3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
+  1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+  1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
+  2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
+  2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
+  2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
+  1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
+  2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+  2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
+  2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
+  1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
+  2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+  3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
+  3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
+  3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
+  3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
+  2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
+  2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
+  3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
+  2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+  1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
+  2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
+  2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
+  2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
+  1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
+  2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+  1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
+  2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
+  2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
+  2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
+  2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
+  2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
+  3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
+  3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+  2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
+  3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
+  3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
+  2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
+  2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
+  2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
+  3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
+  3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
+  2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
+  2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
+  3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
+  4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
+  2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
+  3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
+  2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
+  3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
+  3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
+  3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+  3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
+  2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
+  2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
+  2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
+  3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
+  2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
+  2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
+  2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
+  2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
+  2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
+  3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
+  2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
+  2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
+  2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
+  2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
+  3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
+  3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+  2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
+  2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
+  3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
+  3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
+  3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
+  2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
+  2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
+  4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
+  2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
+  2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
+  2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
+  3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
+  2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
+  3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
+  2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
+  3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
+  4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
+  3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
+  3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
+  2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
+  2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
+  2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
+  3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
+  2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
+  2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
+  2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
+  4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
+  2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
+  2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
+  2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
+  2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
+  2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
+  2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
+  2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
+  2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
+  2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
+  2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
+  1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
+  3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
+  1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
+  3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
+  2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
+  2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
+  1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
+  2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
+  2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
+  2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+  2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
+  2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
+  3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
+  2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
+  2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
+  2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
+  2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
+  2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
+  2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
+  2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
+  2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
+  2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
+  2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
+  2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
+  2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
+  3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
+  2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
+  2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
+  2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
+  3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
+  3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+  1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS
+  1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS
+  2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
+  2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
+  2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
+  3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
+  2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
+  1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
+  2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
+  4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
+  1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
+  3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
+  2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
+  2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
+  3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
+  2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
+  2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
+  2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
+  2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
+  2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
+  2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
+  3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
+  2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
+  3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
+  2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
+  3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
+  2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
+  2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
+  2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
+  2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
+  2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
+  2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
+  2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
+  2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
+  3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
+  2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
+  2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
+  2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
+  2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
+  1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
+  2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+  1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
+  1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
+  2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
+  1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS
+  1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS
+  2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
+  1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
+  2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
+  2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
+  2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
+  2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
+  2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
+  2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
+  1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
+  2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+  3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
+  3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
+  3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
+  3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
+  3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
+  2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
+  3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
+  2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+  3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
+  3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
+  2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2>
+  3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
+  3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
+  2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
+  2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
+  3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
+  2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
+  1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
+  2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
+  2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
+  2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
+  1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
+  2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
+  1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
+  2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
+  1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
+  2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
+  3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
+  3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
+  2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
+  3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
+  2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
+  2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
+  3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
+  2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
+  2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
+  3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
+  3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
+  3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
+  2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
+  3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
+  3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
+  2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
+  2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
+  2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
+  3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
+  2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
+  3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
+  2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
+  2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
+  2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
+  3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
+  2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
+  2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
+  3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
+  3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
+  3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
+  2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
+  3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
+  2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
+  2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
+  2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
+  1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
+  1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
+  2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
+  2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
+  1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
+  2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
+  1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
+  2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
+  1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
+  1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS
+  1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
+  1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+  1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
+  3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+  470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS
+  1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+  1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+  2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
+  1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
+  1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
+  2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+  269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS
+  1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
+  2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+  269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS
+  1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
+  1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
+  1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS
+  1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
+  1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
+  1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS
+  1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS
+  1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
+  1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
+  2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
+  1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4>
+  1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
+  470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS
+  1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+  470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS
+  2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
+  1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+  2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
+  2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7>
+  1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+  1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
+  2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
+  1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+  2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7>
+  1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
+  2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+  1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
+  1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+  2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
+  2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
+  2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+  1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+  2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+  2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
+  1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+  1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
+  470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS
+  269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS
+  1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS
+  1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
+  470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS
+  1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS
+  470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS
+  1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+  1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+  1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+  3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
+  2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
+  3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
+  3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
+  3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
+  1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
+  1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
+  2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
+  537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS
+  2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
+  1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
+  2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
+  2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
+  2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
+  537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS
+  1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+  2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
+  2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+  1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
+  2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
+  2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
+  1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+  2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
+  2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+  2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
+  2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
+  2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
+  3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
+  2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
+  3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
+  2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
+  2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
+  1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+  1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+  3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
+  2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
+  1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
+  2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6>
+  3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
+  1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+  2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
+  2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3>
+  3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
+  3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
+  2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6>
+  2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
+  2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
+  2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7>
+  3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
+  2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
+  2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
+  2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
+  3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
+  3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
+  3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
+  2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
+  2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
+  2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
+  2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2>
+  2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
+  2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
+  3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
+  2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
+  3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
+  3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
+  2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
+  2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
+  1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
+  1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+  537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS
+  2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+  1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+  1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
+  2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7>
+  2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
+  537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS
+  2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
+  2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS
+  2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1>
+  1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+  2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
+  2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1>
+  3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6>
+  3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0>
+  1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+  2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
+  1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
+  2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1>
+  1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+  2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
+  2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+  3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5>
+  3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3>
+  1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
+  2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1>
+  2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+  2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2>
+  1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0>
+  2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS
+  2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+  3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7>
+  2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0>
+  1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0>
+  1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
+  1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+  2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+  2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1>
+  1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
+  1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+  2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+  2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3>
+  1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+  2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS
+  2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5>
+  2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5>
+  2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5>
+  2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
+  2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS
+  2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS
+  3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4>
+  2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5>
+  2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
+  2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
+  2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
+  1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+  2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
+  3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7>
+  3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0>
+  2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+  1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+  3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1>
+  2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+  2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7>
+  2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+  3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5>
+  2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+  3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7>
+  2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0>
+  2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7>
+  3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS
+  2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1>
+  4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2>
+  3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+  3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS
+  2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7>
+  3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7>
+  3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7>
+  3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS
+  1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
+  1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
+  2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+  1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0>
+  1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
+  1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+  2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+  2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+  1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3>
+  2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
+  1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
+  2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0>
+  2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+  2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
+  2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7>
+  2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+  2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0>
+  1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
+  1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
+  2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
+  2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0>
+  2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+  2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
+  2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
+  2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+  3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1>
+  1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1>
+  2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
+  2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
+  1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
+  1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+  2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
+  2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+  2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6>
+  3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3>
+  1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
+  1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
+  2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
+  2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2>
+  2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3>
+  1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
+  2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
+  2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2>
+  2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
+  1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
+  2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
+  2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4>
+  2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4>
+  2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+  2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS
+  1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
+  2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0>
+  2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4>
+  1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
+  2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS
+  2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5>
+  2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+  2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+  1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5>
+  2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
+  2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+  2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7>
+  1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5>
+  2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
+  2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
+  2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
+  1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+  2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
+  2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+  2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+  2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1>
+  1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+  2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2>
+  2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3>
+  4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2>
+  2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS
+  2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6>
+  2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7>
+  2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7>
+  2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
+  2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS
+  1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
+  1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
+  1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2>
+  1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+  1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
+  1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
+  2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0>
+  2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1>
+  1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
+  2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0>
+  1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
+  2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0>
+  2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2>
+  2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
+  2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2>
+  3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2>
+  4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7>
+  1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
+  2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
+  1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
+  2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3>
+  2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1>
+  2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS
+  2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
+  3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3>
+  2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3>
+  1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3>
+  2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS
+  2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3>
+  1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3>
+  2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0>
+  2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS
+  2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
+  2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
+  2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3>
+  1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
+  1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
+  2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3>
+  2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3>
+  336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS
+  1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
+  2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5>
+  2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7>
+  2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3>
+  336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS
+  2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS
+  2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
+  2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4>
+  2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6>
+  2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4>
+  1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
+  2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS
+  4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7>
+  1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
+  2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
+  2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
+  2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
+  2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5>
+  2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
+  3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5>
+  2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0>
+  2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+  2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS
+  2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
+  3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7>
+  2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
+  2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7>
+  2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7>
+  3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7>
+  2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6>
+  2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3>
+  2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3>
+  2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
+  2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
+  2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7>
+  2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3>
+  2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
+  2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
+  3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
+  3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
+  2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
+  1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
+  1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
+  1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
+  336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS
+  1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
+  1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
+  2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
+  2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+  336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS
+  2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
+  1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
+  2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
+  3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
+  2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
+  1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
+  1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
+  3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
+  1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
+  2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
+  2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
+  1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
+  2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
+  2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
+  2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
+  2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+  2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
+  1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
+  3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
+  2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
+  2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
+  2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
+  2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
+  2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
+  2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
+  2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
+  2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
+  2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
+  2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
+  2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
+  2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
+  2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
+  2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
+  3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
+  3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
+  2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
+  2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
+  2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
+  2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
+  2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+  1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+  1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
+  1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
+  3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
+  1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
+  1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
+  2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+  1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
+  2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
+  1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
+  2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
+  537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS
+  2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
+  537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS
+  2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
+  2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
+  2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
+  2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
+  1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+  2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+  2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
+  2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
+  1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
+  2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
+  3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
+  2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
+  3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
+  2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
+  2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
+  2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
+  2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
+  2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
+  1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
+  1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
+  1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
+  2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
+  1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
+  1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
+  537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS
+  2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
+  537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS
+  3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
+  2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
+  2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
+  3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
+  2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
+  2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+  2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
+  2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0>
+  2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
+  2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
+  3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
+  2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
+  2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
+  2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
+  2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
+  3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
+  1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
+  1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
+  3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
+  2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
+  3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
+  2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
+  2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
+  2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
+  3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
+  2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
+  2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
+  3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
+  2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
+  3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
+  2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
+  2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
+  2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
+  4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
+  3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
+  3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
+  2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
+  3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
+  2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
+  3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
+  2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
+  2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
+  2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+  1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
+  1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
+  2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
+  2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
+  2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
+  2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
+  2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
+  1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+  2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
+  1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
+  1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
+  2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
+  2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
+  2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
+  2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
+  2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
+  2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
+  2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
+  1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
+  1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
+  1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
+  1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
+  2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
+  2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
+  1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
+  1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+  2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
+  2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
+  1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
+  1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
+  1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
+  2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
+  2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
+  1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
+  1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
+  2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+  1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
+  1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
+  2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
+  2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
+  2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
+  3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
+  2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
+  2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
+  2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
+  2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+  2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
+  3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
+  3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
+  2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
+  3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
+  2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
+  3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
+  2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
+  2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
+  2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
+  2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
+  2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
+  2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
+  3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
+  2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
+  2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
+  2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
+  1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
+  1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
+  3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
+  3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
+  3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
+  2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
+  2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
+  3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
+  3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
+  2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+  2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
+  3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
+  2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
+  2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
+  3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
+  2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
+  2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
+  2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
+  2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
+  2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
+  2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
+  3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
+  2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
+  2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+  2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
+  3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
+  4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
+  2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
+  2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
+  2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
+  2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
+  2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
+  3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
+  2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
+  2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
+  1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
+  1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
+  1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
+  2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
+  2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
+  2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
+  1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
+  2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
+  2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
+  2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
+  1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
+  1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
+  2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
+  2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
+  2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
+  1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
+  2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
+  1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
+  1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
+  2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
+  1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
+  3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
+  2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
+  2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
+  2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
+  2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
+  1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
+  2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
+  2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
+  2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
+  2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
+  1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
+  2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
+  2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
+  1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
+  2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
+  3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
+  2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
+  2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
+  2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
+  2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
+  1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
+  2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
+  1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
+  2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
+  2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
+  2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
+  2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
+  2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
+  2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
+  2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
+  2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
+  2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
+  2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
+  2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
+  2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
+  3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
+  2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
+  1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
+  2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
+  2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
+  1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
+  2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
+  2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3>
+  3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3>
+  2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
+  2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5>
+  2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5>
+  2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7>
+  2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS
+  2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS
+  2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
+  2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
+  2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
+  2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
+  2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
+  2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
+  2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
+  2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0>
+  2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7>
+  2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS
+  2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7>
+  2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7>
+  2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7>
+  2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS
+  2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7>
+  2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
+  1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
+  2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2>
+  1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0>
+  2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS
+  2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6>
+  1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
+  1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
+  1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS
+  1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+  1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
+  1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2>
+  1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+  2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
+  1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1>
+  1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
+  2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+  1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
+  1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
+  1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
+  537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS
+  1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
+  1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
+  1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
+  2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u>
+  1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3>
+  537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS
+  1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3>
+  1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
+  1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0>
+  1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7>
+  1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
+  1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3>
+  1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0>
+  1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
+  1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3>
+  2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2>
+  336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS
+  1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
+  1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+  2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7>
+  2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+  336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS
+  2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS
+  1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5>
+  1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+  2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5>
+  1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+  1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
+  1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6>
+  1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6>
+  1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
+  1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
+  2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7>
+  1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
+  1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+  1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
+  1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+  537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS
+  1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7>
+  537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS
+  2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1>
+  2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6>
+  2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7>
+  1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+  1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+  2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7>
+  1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0>
+  1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7>
+  1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
+  1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
+  2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2>
+  3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+  1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
+  1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+  2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2>
+  1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS
+  1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
+  1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
+  537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS
+  336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS
+  1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
+  1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
+  537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS
+  1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0>
+  537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS
+  2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0>
+  2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1>
+  2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2>
+  3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4>
+  2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4>
+  3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0>
+  3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0>
+  3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0>
+  2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1>
+  2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
+  2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4>
+  1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1>
+  2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
+  2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4>
+  3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1>
+  2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1>
+  1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
+  3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4>
+  2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4>
+  2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0>
+  2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
+  3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7>
+  2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+  3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2>
+  2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4>
+  3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS
+  2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+  2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4>
+  3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4>
+  3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS
+  3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5>
+  3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7>
+  3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0>
+  2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+  2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS
+  2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5>
+  2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6>
+  3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2>
+  2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS
+  2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS
+  3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2>
+  3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4>
+  2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS
+  2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS
+  1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS
+  2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2>
+  2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5>
+  2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS
+  3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7>
+  3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7>
+  2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5>
+  1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS
+  2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS
+  3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1>
+  1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS
+  2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6>
+  2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS
+  2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6>
+  3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6>
+  2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0>
+  1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS
+  3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2>
+  3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS
+  4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS
+  3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0>
+  3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5>
+  3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5>
+  3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0>
+  3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7>
+  4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS
+  2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS
+  1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS
+  1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u>
+  2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS
+  2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS
+  2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+  2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u>
+  1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1>
+  2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS
+  3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6>
+  2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2>
+  2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
+  3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4>
+  3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1>
+  3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4>
+  2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS
+  3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2>
+  2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4>
+  2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
+  2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3>
+  3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5>
+  3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1>
+  3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7>
+  3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1>
+  2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3>
+  2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS
+  2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4>
+  3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2>
+  1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4>
+  2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS
+  3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3>
+  3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7>
+  3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2>
+  1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4>
+  2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
+  2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
+  2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
+  2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4>
+  2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
+  2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
+  2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2>
+  2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3>
+  3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1>
+  3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0>
+  3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5>
+  3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS
+  3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5>
+  2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS
+  3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4>
+  3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0>
+  2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS
+  1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+  2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2>
+  2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5>
+  2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2>
+  1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+  2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7>
+  2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3>
+  2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+  1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
+  2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS
+  2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1>
+  4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2>
+  3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS
+  2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS
+  2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6>
+  4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7>
+  3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1>
+  2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS
+  2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+  3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1>
+  3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1>
+  2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4>
+  3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6>
+  3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4>
+  3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7>
+  3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7>
+  2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+  1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS
+  2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2>
+  2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2>
+  1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4>
+  1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
+  2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS
+  2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+  1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS
+  2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2>
+  2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS
+  2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6>
+  3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2>
+  2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
+  3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7>
+  2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
+  3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2>
+  2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS
+  3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2>
+  3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1>
+  3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0>
+  3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS
+  2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+  3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7>
+  3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3>
+  3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3>
+  2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+  3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4>
+  3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3>
+  2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2>
+  2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3>
+  3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0>
+  3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7>
+  3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6>
+  3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2>
+  2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3>
+  2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1>
+  3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1>
+  3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2>
+  2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4>
+  2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5>
+  2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4>
+  3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4>
+  2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4>
+  2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1>
+  2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
+  2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
+  2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4>
+  2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4>
+  2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS
+  2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS
+  2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4>
+  3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0>
+  2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS
+  2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS
+  2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0>
+  2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2>
+  2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS
+  2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS
+  3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7>
+  2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7>
+  3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS
+  2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS
+  1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS
+  2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2>
+  2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2>
+  2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS
+  1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
+  2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6>
+  2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3>
+  2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2>
+  1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS
+  2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+  3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2>
+  3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2>
+  2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4>
+  4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4>
+  3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7>
+  3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4>
+  3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7>
+  2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+  1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS
+  2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2>
+  2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2>
+  2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS
+  1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
+  2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS
+  2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u>
+  2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2>
+  1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS
+  3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0>
+  2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2>
+  2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4>
+  3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3>
+  3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1>
+  2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0>
+  3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0>
+  3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0>
+  2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2>
+  2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1>
+  3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1>
+  2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
+  2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4>
+  3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0>
+  3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3>
+  3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1>
+  3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3>
+  2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4>
+  3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS
+  2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
+  3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2>
+  2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4>
+  3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS
+  2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4>
+  3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3>
+  3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3>
+  2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3>
+  3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1>
+  3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1>
+  3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3>
+  2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+  2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4>
+  3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7>
+  3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7>
+  3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7>
+  2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+  2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
+  2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
+  2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4>
+  3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0>
+  2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5>
+  2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6>
+  2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4>
+  3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2>
+  2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1>
+  2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS
+  2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5>
+  2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5>
+  2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3>
+  2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS
+  4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5>
+  3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5>
+  2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4>
+  2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS
+  2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS
+  2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6>
+  2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6>
+  2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3>
+  2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS
+  1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6>
+  4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6>
+  2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4>
+  1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6>
+  3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1>
+  3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5>
+  3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7>
+  3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7>
+  3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5>
+  3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7>
+  3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7>
+  3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4>
+  3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1>
+  2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS
+  2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2>
+  2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u>
+  2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4>
+  2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS
+  1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u>
+  2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u>
+  2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4>
+  1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u>
+  2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4>
+  1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS
+  2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+  3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1>
+  2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0>
+  2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1>
+  2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2>
+  3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0>
+  1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS
+  2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2>
+  2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1>
+  2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
+  2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3>
+  2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3>
+  2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4>
+  3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3>
+  3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3>
+  2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3>
+  3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4>
+  3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4>
+  2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2>
+  2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4>
+  2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
+  3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
+  2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
+  3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
+  2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
+  2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2>
+  3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4>
+  3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3>
+  2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+  2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4>
+  3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5>
+  3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4>
+  3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4>
+  2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+  1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
+  2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4>
+  2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2>
+  2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4>
+  161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS
+  1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS
+  2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS
+  2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+  161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS
+  2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS
+  2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4>
+  3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5>
+  2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5>
+  2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS
+  1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS
+  1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5>
+  1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+  2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS
+  3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2>
+  2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5>
+  2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6>
+  2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS
+  2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6>
+  1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS
+  2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4>
+  1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS
+  2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2>
+  3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4>
+  3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7>
+  3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4>
+  2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+  2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4>
+  2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4>
+  2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7>
+  2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+  1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS
+  1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS
+  2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+  2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u>
+  161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS
+  1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS
+  1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u>
+  161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS
+  2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0>
+  1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS
+  2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5>
+  2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0>
+  2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5>
+  3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7>
+  2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5>
+  2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0>
+  1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS
+  2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2>
+  2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5>
+  2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0>
+  2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS
+  3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4>
+  2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5>
+  3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6>
+  2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3>
+  2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS
+  2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS
+  3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3>
+  2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5>
+  1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5>
+  2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5>
+  3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7>
+  2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7>
+  4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS
+  1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5>
+  2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2>
+  3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1>
+  2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4>
+  2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3>
+  2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0>
+  2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5>
+  2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4>
+  2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5>
+  2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2>
+  2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS
+  3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4>
+  2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5>
+  2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4>
+  2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS
+  1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS
+  2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5>
+  2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6>
+  1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS
+  1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
+  2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3>
+  2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4>
+  2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2>
+  1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5>
+  2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5>
+  2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0>
+  2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS
+  1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS
+  1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS
+  2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6>
+  2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6>
+  1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6>
+  1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS
+  3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5>
+  2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6>
+  27705344U, // <4,5,6,7>: Cost 0 copy RHS
+  27705344U, // <4,5,6,u>: Cost 0 copy RHS
+  2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS
+  2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4>
+  2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7>
+  2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5>
+  2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS
+  2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7>
+  2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4>
+  2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4>
+  2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS
+  1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS
+  1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS
+  2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0>
+  1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u>
+  1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS
+  1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS
+  2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7>
+  27705344U, // <4,5,u,7>: Cost 0 copy RHS
+  27705344U, // <4,5,u,u>: Cost 0 copy RHS
+  2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0>
+  1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
+  1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6>
+  3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0>
+  2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5>
+  3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7>
+  2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0>
+  4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS
+  1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS
+  2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
+  2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1>
+  2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0>
+  2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3>
+  3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5>
+  2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7>
+  3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7>
+  4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS
+  2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3>
+  2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4>
+  2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3>
+  2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2>
+  2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1>
+  2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6>
+  2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7>
+  2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7>
+  2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3>
+  2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1>
+  2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
+  2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6>
+  3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2>
+  2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3>
+  2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
+  2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6>
+  3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6>
+  2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4>
+  2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2>
+  2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS
+  2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3>
+  2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4>
+  2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4>
+  2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS
+  1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
+  1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS
+  2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4>
+  1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS
+  2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS
+  2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3>
+  2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3>
+  3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3>
+  2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6>
+  2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5>
+  2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6>
+  2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS
+  2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS
+  1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS
+  2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2>
+  2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3>
+  2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2>
+  1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6>
+  2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6>
+  2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6>
+  2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS
+  1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS
+  2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2>
+  2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2>
+  3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7>
+  2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4>
+  2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6>
+  2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+  3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3>
+  2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+  2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2>
+  1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS
+  1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
+  1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS
+  2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1>
+  1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u>
+  1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
+  1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS
+  2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS
+  1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS
+  3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0>
+  2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS
+  2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4>
+  3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4>
+  3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5>
+  2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0>
+  3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7>
+  3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4>
+  2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS
+  2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1>
+  3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1>
+  2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
+  3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5>
+  3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS
+  3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7>
+  3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7>
+  3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3>
+  2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1>
+  3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS
+  3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3>
+  3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2>
+  2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4>
+  3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7>
+  2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7>
+  3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7>
+  3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3>
+  2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7>
+  3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2>
+  3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4>
+  3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4>
+  3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3>
+  3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6>
+  2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7>
+  2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7>
+  3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4>
+  2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7>
+  2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1>
+  3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3>
+  3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7>
+  3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5>
+  2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4>
+  2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS
+  2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4>
+  3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7>
+  2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS
+  2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2>
+  3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7>
+  2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5>
+  3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5>
+  2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6>
+  2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5>
+  2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5>
+  2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7>
+  2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2>
+  1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS
+  3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2>
+  2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2>
+  2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2>
+  1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS
+  1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6>
+  2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3>
+  3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+  1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS
+  3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS
+  3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4>
+  3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7>
+  3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4>
+  3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7>
+  2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7>
+  3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7>
+  2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7>
+  2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7>
+  1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS
+  2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS
+  2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2>
+  2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2>
+  1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS
+  1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u>
+  2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3>
+  3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+  1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS
+  2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0>
+  1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS
+  1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u>
+  2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2>
+  2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5>
+  2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0>
+  2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u>
+  2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0>
+  1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS
+  2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2>
+  2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1>
+  1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3>
+  2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3>
+  2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7>
+  3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7>
+  2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3>
+  1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS
+  2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u>
+  2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2>
+  1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u>
+  2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u>
+  2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4>
+  2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7>
+  2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3>
+  1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u>
+  2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2>
+  2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2>
+  2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u>
+  2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3>
+  2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6>
+  2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7>
+  2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u>
+  2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2>
+  1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS
+  2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2>
+  2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4>
+  2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4>
+  161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS
+  1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS
+  1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS
+  2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6>
+  161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS
+  1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+  1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS
+  2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5>
+  3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS
+  1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+  1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS
+  1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS
+  1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+  1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS
+  2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2>
+  1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS
+  1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6>
+  1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS
+  1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6>
+  1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS
+  27705344U, // <4,u,6,7>: Cost 0 copy RHS
+  27705344U, // <4,u,6,u>: Cost 0 copy RHS
+  2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS
+  2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4>
+  2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7>
+  2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u>
+  2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS
+  2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+  2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7>
+  2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+  2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS
+  1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS
+  1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS
+  1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u>
+  161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS
+  1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS
+  1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  27705344U, // <4,u,u,7>: Cost 0 copy RHS
+  27705344U, // <4,u,u,u>: Cost 0 copy RHS
+  2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0>
+  2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1>
+  2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2>
+  3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5>
+  2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5>
+  3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0>
+  3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0>
+  3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0>
+  2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2>
+  2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS
+  2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1>
+  1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7>
+  2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS
+  2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1>
+  3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7>
+  3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2>
+  1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2>
+  2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+  2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4>
+  2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5>
+  2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5>
+  2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5>
+  2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4>
+  2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+  2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5>
+  3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5>
+  3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4>
+  2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5>
+  3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5>
+  2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0>
+  3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4>
+  3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0>
+  3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7>
+  2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5>
+  2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS
+  1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+  2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6>
+  3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5>
+  2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS
+  2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS
+  3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5>
+  2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+  1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+  3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1>
+  2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS
+  3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS
+  3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0>
+  3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5>
+  2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0>
+  3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0>
+  3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS
+  2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS
+  4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0>
+  2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+  3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7>
+  3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6>
+  3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5>
+  3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7>
+  3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6>
+  2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5>
+  2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS
+  2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS
+  2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0>
+  2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7>
+  3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2>
+  2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS
+  2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0>
+  2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7>
+  4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7>
+  2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS
+  2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2>
+  1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5>
+  1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5>
+  2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6>
+  2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS
+  2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u>
+  2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+  1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0>
+  1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS
+  2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2>
+  2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2>
+  2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5>
+  2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0>
+  3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7>
+  3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0>
+  1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS
+  2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1>
+  2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1>
+  2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0>
+  2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3>
+  2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5>
+  2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5>
+  3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5>
+  3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5>
+  2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3>
+  3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2>
+  3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3>
+  2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2>
+  2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0>
+  2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5>
+  2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3>
+  2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7>
+  3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0>
+  2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0>
+  2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS
+  2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3>
+  3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5>
+  2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5>
+  2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5>
+  2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7>
+  3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7>
+  2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5>
+  2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3>
+  1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1>
+  2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5>
+  2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5>
+  2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5>
+  2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4>
+  1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS
+  2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6>
+  3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4>
+  1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1>
+  2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1>
+  2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1>
+  3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1>
+  2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7>
+  2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5>
+  2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5>
+  2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0>
+  2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7>
+  2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1>
+  2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS
+  3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7>
+  2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3>
+  3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7>
+  2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS
+  2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7>
+  2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6>
+  2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1>
+  2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1>
+  2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS
+  2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1>
+  2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1>
+  2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS
+  2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS
+  2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3>
+  4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6>
+  2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7>
+  2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS
+  1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1>
+  1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS
+  2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5>
+  2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS
+  2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5>
+  1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS
+  2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7>
+  2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1>
+  2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS
+  3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0>
+  2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS
+  2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2>
+  3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2>
+  3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1>
+  3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1>
+  3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4>
+  3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0>
+  2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS
+  2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2>
+  3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2>
+  3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5>
+  2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5>
+  3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS
+  3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0>
+  3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3>
+  3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1>
+  2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5>
+  3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS
+  3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3>
+  2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2>
+  2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3>
+  2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5>
+  3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7>
+  3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6>
+  3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5>
+  2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3>
+  2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1>
+  2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5>
+  2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5>
+  3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5>
+  1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5>
+  2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5>
+  3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5>
+  3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7>
+  1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5>
+  2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2>
+  3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3>
+  2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5>
+  2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5>
+  2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6>
+  2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS
+  2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2>
+  3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4>
+  2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS
+  2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS
+  3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3>
+  3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7>
+  2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS
+  2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS
+  2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5>
+  3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7>
+  3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1>
+  2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS
+  3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS
+  3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3>
+  2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3>
+  2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7>
+  2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5>
+  3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7>
+  3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7>
+  3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1>
+  2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7>
+  2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS
+  2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2>
+  2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7>
+  2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS
+  2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS
+  4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5>
+  3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6>
+  3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7>
+  2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS
+  2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1>
+  2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS
+  2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u>
+  2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3>
+  1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5>
+  2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS
+  2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5>
+  3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u>
+  1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5>
+  3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0>
+  2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2>
+  3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0>
+  3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2>
+  2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1>
+  3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2>
+  3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0>
+  2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0>
+  2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2>
+  3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3>
+  3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1>
+  2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3>
+  2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5>
+  3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3>
+  2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7>
+  3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7>
+  2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5>
+  2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3>
+  3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1>
+  3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5>
+  3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2>
+  2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4>
+  2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5>
+  2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4>
+  3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3>
+  3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3>
+  2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4>
+  3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1>
+  2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3>
+  3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2>
+  2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3>
+  2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5>
+  2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5>
+  3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7>
+  3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5>
+  2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5>
+  2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5>
+  3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0>
+  2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3>
+  2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5>
+  2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5>
+  2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6>
+  3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5>
+  2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4>
+  2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6>
+  2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS
+  2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5>
+  2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5>
+  2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5>
+  2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS
+  2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5>
+  3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0>
+  2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5>
+  2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS
+  2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS
+  3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6>
+  2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6>
+  2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6>
+  2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS
+  3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0>
+  4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6>
+  2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4>
+  2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS
+  1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS
+  1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7>
+  2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2>
+  2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2>
+  1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS
+  2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3>
+  2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3>
+  3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7>
+  1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS
+  1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS
+  1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u>
+  2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2>
+  2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2>
+  1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS
+  2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6>
+  2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3>
+  2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u>
+  1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS
+  3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS
+  2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS
+  3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2>
+  3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5>
+  2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5>
+  2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1>
+  2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0>
+  3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0>
+  2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS
+  2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1>
+  3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4>
+  3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4>
+  3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7>
+  3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4>
+  2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0>
+  3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5>
+  3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1>
+  2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1>
+  3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4>
+  3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4>
+  3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4>
+  2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5>
+  3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4>
+  2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3>
+  3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3>
+  3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5>
+  2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5>
+  3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS
+  3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4>
+  2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4>
+  3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3>
+  2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4>
+  3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0>
+  2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5>
+  3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7>
+  2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4>
+  2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS
+  2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4>
+  3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3>
+  3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4>
+  2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4>
+  1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5>
+  2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4>
+  2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4>
+  1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5>
+  2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS
+  2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5>
+  2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3>
+  3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2>
+  2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS
+  2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5>
+  1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+  3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS
+  1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS
+  2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS
+  2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7>
+  3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2>
+  2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6>
+  2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS
+  2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5>
+  3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7>
+  2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5>
+  2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS
+  2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS
+  2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4>
+  2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7>
+  2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7>
+  2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS
+  3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5>
+  94817590U, // <5,4,7,6>: Cost 1 vrev RHS
+  2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7>
+  94965064U, // <5,4,7,u>: Cost 1 vrev RHS
+  2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS
+  2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u>
+  2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u>
+  2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4>
+  2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS
+  1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5>
+  94825783U, // <5,4,u,6>: Cost 1 vrev RHS
+  2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5>
+  94973257U, // <5,4,u,u>: Cost 1 vrev RHS
+  2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0>
+  1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS
+  2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2>
+  3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2>
+  2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1>
+  2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0>
+  3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7>
+  4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS
+  1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS
+  2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2>
+  2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5>
+  2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0>
+  2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3>
+  2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5>
+  2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7>
+  3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7>
+  2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3>
+  2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5>
+  3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS
+  3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3>
+  2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2>
+  2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4>
+  3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3>
+  2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3>
+  2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7>
+  4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7>
+  2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4>
+  2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2>
+  3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5>
+  3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3>
+  2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5>
+  2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6>
+  2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5>
+  3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7>
+  3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1>
+  2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2>
+  2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1>
+  2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5>
+  3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3>
+  3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4>
+  1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+  1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS
+  2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5>
+  2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6>
+  1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5>
+  1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3>
+  2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2>
+  2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2>
+  1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS
+  2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0>
+  2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7>
+  229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS
+  2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS
+  3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6>
+  2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3>
+  3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6>
+  2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5>
+  2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5>
+  2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6>
+  2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1>
+  2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1>
+  2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS
+  3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7>
+  2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5>
+  2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7>
+  2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS
+  3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5>
+  4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6>
+  2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS
+  2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS
+  1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS
+  2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5>
+  2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u>
+  1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS
+  2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7>
+  2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS
+  229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS
+  2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0>
+  1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS
+  2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2>
+  2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4>
+  2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5>
+  3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6>
+  3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7>
+  4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS
+  1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS
+  2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2>
+  2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1>
+  2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0>
+  2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3>
+  2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6>
+  2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7>
+  3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7>
+  2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+  2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6>
+  3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2>
+  3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3>
+  2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2>
+  2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1>
+  2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6>
+  2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6>
+  2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7>
+  2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3>
+  2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6>
+  2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2>
+  3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3>
+  3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6>
+  2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3>
+  1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6>
+  2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6>
+  3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7>
+  4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS
+  1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6>
+  2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS
+  3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5>
+  2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5>
+  2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5>
+  2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6>
+  1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS
+  2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6>
+  2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5>
+  1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS
+  2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS
+  2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3>
+  3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6>
+  3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4>
+  2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6>
+  2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5>
+  2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1>
+  2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS
+  2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS
+  2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS
+  3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4>
+  2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3>
+  2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6>
+  2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS
+  2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6>
+  2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6>
+  2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+  2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS
+  430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS
+  1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+  1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS
+  1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6>
+  1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6>
+  1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7>
+  430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS
+  430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS
+  1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS
+  1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS
+  1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS
+  1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3>
+  1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2>
+  430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS
+  2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0>
+  1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2>
+  2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0>
+  2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5>
+  2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7>
+  3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7>
+  2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0>
+  1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2>
+  2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1>
+  2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0>
+  1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7>
+  2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS
+  2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7>
+  2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7>
+  3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7>
+  1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7>
+  2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7>
+  3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3>
+  2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2>
+  2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1>
+  2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7>
+  2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7>
+  2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7>
+  3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7>
+  2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1>
+  2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2>
+  2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5>
+  3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1>
+  2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3>
+  2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6>
+  2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0>
+  3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7>
+  2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7>
+  2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2>
+  2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS
+  2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7>
+  3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0>
+  2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4>
+  2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS
+  1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS
+  2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+  2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7>
+  1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS
+  2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS
+  2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3>
+  2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3>
+  2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7>
+  2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS
+  2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5>
+  2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7>
+  1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS
+  1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS
+  2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0>
+  3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5>
+  2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2>
+  2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6>
+  2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4>
+  2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u>
+  2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+  2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7>
+  2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u>
+  1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS
+  2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1>
+  2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2>
+  2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3>
+  1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS
+  1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7>
+  2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3>
+  2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7>
+  1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS
+  1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS
+  1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0>
+  1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS
+  1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS
+  1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS
+  2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7>
+  1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS
+  1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0>
+  1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS
+  2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2>
+  2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2>
+  2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1>
+  2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1>
+  2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0>
+  2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0>
+  1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS
+  2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2>
+  2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1>
+  1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u>
+  2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u>
+  2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0>
+  2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u>
+  2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+  1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0>
+  2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+  2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2>
+  2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3>
+  2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u>
+  2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u>
+  2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7>
+  2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3>
+  2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1>
+  2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1>
+  2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u>
+  2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u>
+  2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3>
+  1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u>
+  2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0>
+  2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5>
+  2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u>
+  1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u>
+  1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u>
+  1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5>
+  2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6>
+  2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5>
+  1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+  1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS
+  2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+  2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u>
+  1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS
+  1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5>
+  2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3>
+  2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7>
+  1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS
+  1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+  1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS
+  229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS
+  2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS
+  2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+  2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6>
+  2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7>
+  2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS
+  2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS
+  2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+  2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+  2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7>
+  430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS
+  1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7>
+  1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS
+  430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS
+  1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  118708378U, // <5,u,7,6>: Cost 1 vrev RHS
+  2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS
+  430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS
+  430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS
+  1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS
+  1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS
+  430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS
+  229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS
+  118716571U, // <5,u,u,6>: Cost 1 vrev RHS
+  1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS
+  430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS
+  2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0>
+  2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1>
+  2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2>
+  3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5>
+  2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6>
+  3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0>
+  3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6>
+  3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7>
+  2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2>
+  2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS
+  3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0>
+  1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6>
+  2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS
+  2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1>
+  2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1>
+  2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1>
+  1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2>
+  2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6>
+  3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6>
+  2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5>
+  1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6>
+  3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7>
+  2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6>
+  3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2>
+  1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6>
+  3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2>
+  2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4>
+  2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5>
+  3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3>
+  2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6>
+  3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6>
+  3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6>
+  3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7>
+  2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5>
+  2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6>
+  2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5>
+  1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6>
+  3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6>
+  2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6>
+  2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS
+  2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0>
+  2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0>
+  1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6>
+  3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS
+  2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6>
+  3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6>
+  3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0>
+  2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6>
+  3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6>
+  2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0>
+  2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+  2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS
+  2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0>
+  2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS
+  3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS
+  3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5>
+  2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0>
+  3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7>
+  2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0>
+  2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1>
+  2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS
+  2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS
+  2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0>
+  2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2>
+  2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7>
+  2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS
+  4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5>
+  3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0>
+  4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7>
+  2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS
+  2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2>
+  2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1>
+  1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5>
+  1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6>
+  2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS
+  2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u>
+  2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+  1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS
+  2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS
+  3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6>
+  2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2>
+  2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS
+  3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2>
+  2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1>
+  2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0>
+  2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2>
+  3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1>
+  2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1>
+  3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6>
+  2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3>
+  2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6>
+  3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5>
+  3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6>
+  3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1>
+  2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3>
+  2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS
+  3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3>
+  3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2>
+  2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0>
+  2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS
+  3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3>
+  2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3>
+  3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0>
+  2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0>
+  2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS
+  2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3>
+  2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6>
+  3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1>
+  2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6>
+  2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7>
+  2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3>
+  3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2>
+  2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3>
+  2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1>
+  2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6>
+  3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4>
+  2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6>
+  3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS
+  2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6>
+  3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0>
+  2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1>
+  2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6>
+  2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1>
+  3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7>
+  3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6>
+  2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7>
+  2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6>
+  3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6>
+  2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0>
+  3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS
+  2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7>
+  2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS
+  3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7>
+  3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6>
+  3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS
+  2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS
+  3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7>
+  2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6>
+  3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1>
+  3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS
+  2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS
+  2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7>
+  2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2>
+  3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS
+  2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS
+  2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5>
+  3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0>
+  4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7>
+  2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS
+  2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS
+  2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3>
+  2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6>
+  2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0>
+  2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6>
+  2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7>
+  2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u>
+  2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u>
+  2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0>
+  2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0>
+  1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS
+  2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2>
+  3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0>
+  2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6>
+  3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3>
+  2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4>
+  3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0>
+  1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS
+  2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1>
+  2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1>
+  2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0>
+  2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS
+  3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6>
+  2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7>
+  2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3>
+  3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1>
+  2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1>
+  3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1>
+  3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3>
+  2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2>
+  2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3>
+  2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6>
+  3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7>
+  2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6>
+  3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7>
+  2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3>
+  2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1>
+  3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0>
+  2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6>
+  2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4>
+  2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5>
+  2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6>
+  2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6>
+  2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4>
+  2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1>
+  1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+  3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u>
+  2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6>
+  2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6>
+  2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6>
+  1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS
+  2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0>
+  3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2>
+  1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2>
+  3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3>
+  2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3>
+  3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7>
+  2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6>
+  2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5>
+  2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5>
+  2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0>
+  2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+  2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6>
+  2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1>
+  2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3>
+  2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6>
+  2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7>
+  2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5>
+  2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7>
+  2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6>
+  2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1>
+  2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7>
+  2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS
+  2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2>
+  2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7>
+  1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS
+  2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS
+  3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7>
+  2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+  2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7>
+  1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS
+  1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2>
+  1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS
+  2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6>
+  1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS
+  2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5>
+  1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS
+  2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0>
+  2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+  1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS
+  3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0>
+  2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2>
+  2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4>
+  3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2>
+  2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2>
+  2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2>
+  3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0>
+  2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0>
+  2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2>
+  3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3>
+  3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1>
+  3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3>
+  3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1>
+  2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+  3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3>
+  3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0>
+  3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3>
+  2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+  3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4>
+  2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+  3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2>
+  3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0>
+  3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6>
+  3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7>
+  2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7>
+  3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6>
+  2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+  3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1>
+  3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3>
+  2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
+  2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3>
+  2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
+  3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5>
+  3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7>
+  3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7>
+  2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5>
+  2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
+  2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3>
+  2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
+  2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
+  2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
+  1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
+  2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6>
+  2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4>
+  1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
+  3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS
+  3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7>
+  3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5>
+  3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5>
+  2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
+  3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7>
+  3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6>
+  3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0>
+  2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6>
+  2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS
+  2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3>
+  2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6>
+  3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1>
+  2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6>
+  3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7>
+  2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6>
+  3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7>
+  2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS
+  1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
+  2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
+  1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
+  2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2>
+  1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
+  2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3>
+  2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
+  2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7>
+  1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
+  1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
+  2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2>
+  1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
+  2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2>
+  1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
+  1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
+  2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0>
+  2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7>
+  1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
+  3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0>
+  2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS
+  2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6>
+  3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1>
+  2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
+  2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
+  2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2>
+  3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0>
+  2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS
+  2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1>
+  3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1>
+  3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0>
+  2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS
+  3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS
+  3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0>
+  2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3>
+  3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1>
+  2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1>
+  2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4>
+  3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3>
+  3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2>
+  3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1>
+  2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
+  2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS
+  2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0>
+  3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0>
+  2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4>
+  3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2>
+  3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3>
+  3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6>
+  3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3>
+  2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
+  3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6>
+  2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5>
+  3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7>
+  2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6>
+  2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS
+  3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4>
+  2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
+  3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4>
+  2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4>
+  2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS
+  1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+  2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4>
+  1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+  2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
+  3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3>
+  2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
+  2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6>
+  2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
+  2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5>
+  1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+  1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
+  2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
+  2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
+  2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
+  1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS
+  2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3>
+  2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3>
+  2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2>
+  1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
+  2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
+  2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
+  3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5>
+  2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7>
+  2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS
+  2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5>
+  2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6>
+  4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7>
+  2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS
+  1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS
+  2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS
+  2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u>
+  2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6>
+  1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6>
+  2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS
+  1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+  1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0>
+  2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS
+  3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6>
+  3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2>
+  3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1>
+  4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5>
+  3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7>
+  1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0>
+  1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0>
+  2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1>
+  3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5>
+  3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0>
+  3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS
+  3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6>
+  3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7>
+  3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4>
+  2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3>
+  2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3>
+  2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
+  3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5>
+  3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2>
+  3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5>
+  2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS
+  3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6>
+  2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
+  3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS
+  2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5>
+  3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2>
+  3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5>
+  3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3>
+  3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3>
+  2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
+  3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6>
+  3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0>
+  2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3>
+  2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3>
+  2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS
+  3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5>
+  3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5>
+  2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
+  2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS
+  2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS
+  3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0>
+  1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
+  1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
+  2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
+  3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2>
+  3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2>
+  3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2>
+  2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS
+  2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5>
+  2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6>
+  2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7>
+  2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7>
+  2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1>
+  3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4>
+  3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4>
+  2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4>
+  2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5>
+  3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5>
+  2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6>
+  2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0>
+  2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1>
+  2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
+  2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7>
+  2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
+  3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2>
+  2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
+  2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5>
+  2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6>
+  3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS
+  2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS
+  2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS
+  2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u>
+  2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u>
+  2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7>
+  2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS
+  2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS
+  2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6>
+  1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u>
+  1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u>
+  2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS
+  1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
+  2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4>
+  3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1>
+  2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
+  3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3>
+  2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0>
+  4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS
+  1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
+  2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
+  2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1>
+  2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
+  2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+  3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3>
+  2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
+  2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3>
+  3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3>
+  2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS
+  3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6>
+  3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3>
+  2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6>
+  2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1>
+  3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3>
+  3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7>
+  2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7>
+  2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3>
+  2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3>
+  2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
+  3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4>
+  3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6>
+  2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3>
+  2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
+  3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4>
+  2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6>
+  4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS
+  2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5>
+  2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS
+  3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2>
+  3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6>
+  3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6>
+  1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+  1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
+  2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0>
+  2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6>
+  1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
+  3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS
+  2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
+  3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5>
+  3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0>
+  2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
+  2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5>
+  2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0>
+  2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS
+  2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS
+  1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
+  2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2>
+  2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3>
+  2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2>
+  1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
+  2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3>
+  296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS
+  2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7>
+  296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS
+  2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS
+  3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7>
+  2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7>
+  2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6>
+  2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS
+  2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4>
+  2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6>
+  1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS
+  1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS
+  1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS
+  1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
+  2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u>
+  2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+  1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS
+  1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
+  296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS
+  1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS
+  296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS
+  1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS
+  1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0>
+  1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
+  2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+  2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2>
+  497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS
+  1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+  1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+  2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
+  1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+  2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+  2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2>
+  1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3>
+  2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2>
+  2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+  1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6>
+  2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7>
+  1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+  2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1>
+  1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+  1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+  2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1>
+  1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7>
+  2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7>
+  2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+  1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+  2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+  2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5>
+  1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS
+  1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7>
+  497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS
+  2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+  1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+  2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+  1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+  1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+  2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2>
+  2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3>
+  1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+  2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6>
+  2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7>
+  1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+  1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+  1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+  2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7>
+  2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
+  2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7>
+  1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+  2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7>
+  1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7>
+  1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
+  1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2>
+  1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+  497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS
+  1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+  1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+  497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS
+  1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+  1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1>
+  497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS
+  1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS
+  1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2>
+  1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
+  2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+  1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0>
+  497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS
+  1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+  2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
+  1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+  2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+  2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3>
+  1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
+  2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+  1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS
+  2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u>
+  1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+  2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3>
+  1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+  1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+  2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3>
+  1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7>
+  2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5>
+  2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+  1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+  1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6>
+  2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6>
+  1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS
+  1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6>
+  497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS
+  2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
+  1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
+  2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7>
+  1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
+  2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+  1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7>
+  1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS
+  2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5>
+  296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS
+  1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS
+  1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
+  2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7>
+  1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
+  1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS
+  1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
+  2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5>
+  1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7>
+  1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS
+  1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS
+  1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
+  497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS
+  1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
+  497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS
+  296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS
+  1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS
+  497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS
+  1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+  1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
+  1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
+  3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0>
+  2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
+  2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6>
+  2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
+  3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7>
+  1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
+  1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
+  2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5>
+  564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS
+  2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7>
+  1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
+  2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7>
+  1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
+  2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0>
+  564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS
+  1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+  2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
+  2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0>
+  2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1>
+  1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+  2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+  2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7>
+  2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7>
+  1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
+  2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2>
+  2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
+  2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0>
+  2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
+  2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
+  2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
+  2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0>
+  3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7>
+  2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0>
+  2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4>
+  1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
+  1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
+  3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4>
+  2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6>
+  1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
+  2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6>
+  3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5>
+  1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
+  2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS
+  2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3>
+  2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+  2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
+  2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
+  2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
+  1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
+  2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7>
+  1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
+  2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7>
+  2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+  2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7>
+  2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
+  3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7>
+  2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
+  2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
+  2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1>
+  2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7>
+  2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2>
+  2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS
+  3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS
+  3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7>
+  2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
+  2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
+  2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
+  2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7>
+  2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2>
+  1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
+  1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
+  564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS
+  2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u>
+  1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
+  1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
+  1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
+  2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1>
+  564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS
+  2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS
+  2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS
+  2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS
+  1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
+  2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS
+  2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+  2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0>
+  3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1>
+  1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
+  2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1>
+  1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+  3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6>
+  1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
+  2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
+  2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
+  2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
+  3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5>
+  1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
+  2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1>
+  2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3>
+  3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2>
+  1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0>
+  2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5>
+  2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3>
+  2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2>
+  2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0>
+  1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0>
+  2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0>
+  1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+  2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
+  2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7>
+  2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5>
+  1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
+  2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
+  2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3>
+  1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
+  2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
+  2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
+  2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
+  2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5>
+  3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5>
+  2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS
+  2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1>
+  3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0>
+  2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS
+  2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
+  2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
+  3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6>
+  1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
+  2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
+  3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7>
+  2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1>
+  2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1>
+  1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
+  3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7>
+  2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
+  2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+  2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7>
+  3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7>
+  2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
+  3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7>
+  2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1>
+  2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7>
+  2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
+  2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1>
+  3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3>
+  3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS
+  2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS
+  3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7>
+  3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0>
+  2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7>
+  3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS
+  2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS
+  1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
+  2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0>
+  1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7>
+  2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS
+  1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
+  2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7>
+  2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS
+  1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7>
+  2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
+  2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
+  2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0>
+  2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0>
+  2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6>
+  2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
+  2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
+  3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2>
+  2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7>
+  2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
+  3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>
+  3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0>
+  2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1>
+  2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
+  3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0>
+  2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
+  2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+  2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1>
+  2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1>
+  2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
+  1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
+  2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5>
+  2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
+  2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6>
+  3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5>
+  1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
+  1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
+  2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
+  2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6>
+  2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7>
+  1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
+  2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
+  2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6>
+  2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0>
+  1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
+  2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6>
+  2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
+  2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
+  2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5>
+  2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
+  2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
+  2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0>
+  3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0>
+  2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7>
+  2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
+  3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3>
+  2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
+  2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
+  2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
+  3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7>
+  2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7>
+  3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0>
+  2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7>
+  2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
+  2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
+  2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
+  1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
+  2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
+  2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
+  2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
+  3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7>
+  1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
+  2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1>
+  3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0>
+  3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5>
+  2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS
+  2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS
+  3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7>
+  4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6>
+  2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7>
+  2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS
+  1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
+  2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
+  1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
+  1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
+  2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
+  2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7>
+  2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0>
+  1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
+  2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
+  1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
+  2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
+  2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2>
+  2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
+  2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
+  2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
+  3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0>
+  1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
+  2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1>
+  2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
+  2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3>
+  2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5>
+  2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
+  2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
+  3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1>
+  2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
+  2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5>
+  2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
+  2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
+  2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
+  2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0>
+  2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5>
+  2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
+  2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
+  2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
+  2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0>
+  2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1>
+  2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
+  2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
+  1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
+  2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
+  2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3>
+  2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7>
+  1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
+  2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
+  2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
+  2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4>
+  2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
+  2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5>
+  1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
+  2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6>
+  3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4>
+  1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
+  2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS
+  1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
+  2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
+  2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
+  2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5>
+  2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
+  2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0>
+  2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0>
+  1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3>
+  2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
+  2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3>
+  1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
+  2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
+  2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
+  3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7>
+  2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
+  2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7>
+  1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
+  2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
+  2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
+  2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
+  2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7>
+  2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
+  2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7>
+  2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
+  2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7>
+  2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1>
+  2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u>
+  1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
+  1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
+  1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u>
+  1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
+  2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3>
+  2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0>
+  1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
+  2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0>
+  1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
+  2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
+  3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1>
+  2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5>
+  1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+  1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+  3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+  1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1>
+  2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
+  2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
+  2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3>
+  2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
+  2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3>
+  2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0>
+  2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3>
+  3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3>
+  2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0>
+  3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1>
+  3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3>
+  2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
+  2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1>
+  3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7>
+  2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+  2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0>
+  3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5>
+  2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0>
+  2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
+  3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5>
+  3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1>
+  2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
+  2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6>
+  2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
+  2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5>
+  3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7>
+  2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4>
+  2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2>
+  3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1>
+  3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4>
+  3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3>
+  1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+  1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
+  1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+  3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7>
+  1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5>
+  1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
+  2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7>
+  2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3>
+  2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5>
+  1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
+  2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7>
+  564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS
+  2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7>
+  564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS
+  2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1>
+  3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5>
+  2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3>
+  2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4>
+  1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+  2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
+  2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7>
+  2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+  1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
+  2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
+  3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2>
+  3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5>
+  3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6>
+  2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1>
+  2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+  2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1>
+  2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7>
+  2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7>
+  1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS
+  1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
+  2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3>
+  2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u>
+  1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
+  1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
+  564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS
+  2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5>
+  564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS
+  2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
+  2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS
+  2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS
+  3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0>
+  2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
+  2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1>
+  2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1>
+  2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2>
+  2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS
+  2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
+  3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1>
+  3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0>
+  2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7>
+  2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
+  2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3>
+  2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
+  1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+  1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+  3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1>
+  3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0>
+  3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7>
+  2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4>
+  3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7>
+  2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+  3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3>
+  2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3>
+  2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4>
+  3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS
+  2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5>
+  3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5>
+  3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3>
+  3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS
+  2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5>
+  3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5>
+  2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0>
+  2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0>
+  2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
+  3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7>
+  3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5>
+  3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4>
+  2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS
+  2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS
+  2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5>
+  1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+  1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
+  2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
+  2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3>
+  3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3>
+  3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3>
+  2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4>
+  1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
+  2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+  1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+  1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
+  2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1>
+  2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7>
+  3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2>
+  2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4>
+  2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5>
+  2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+  2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+  1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0>
+  1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0>
+  1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
+  2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3>
+  2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2>
+  2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
+  1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
+  1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+  2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+  2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1>
+  1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
+  1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
+  2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS
+  2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0>
+  2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u>
+  1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
+  1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
+  2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7>
+  1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0>
+  1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3>
+  2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
+  1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
+  2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4>
+  2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0>
+  2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
+  2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0>
+  2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0>
+  2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2>
+  1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
+  1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1>
+  2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
+  2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0>
+  2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3>
+  2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS
+  2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
+  2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+  2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3>
+  1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1>
+  2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
+  2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
+  2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2>
+  2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1>
+  2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
+  2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
+  2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7>
+  1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+  1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+  2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
+  2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3>
+  2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3>
+  2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
+  2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
+  2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7>
+  3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6>
+  2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0>
+  2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0>
+  2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
+  2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
+  2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5>
+  2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4>
+  2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6>
+  1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
+  2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0>
+  2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5>
+  1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
+  2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
+  2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
+  2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
+  3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4>
+  1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
+  2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
+  2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0>
+  2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7>
+  1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5>
+  2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
+  2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
+  2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3>
+  3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4>
+  2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4>
+  2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
+  1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
+  1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+  1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
+  1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
+  2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
+  2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2>
+  2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
+  1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
+  2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
+  2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6>
+  2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7>
+  1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
+  1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
+  1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
+  2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2>
+  2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0>
+  1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
+  1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
+  1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6>
+  1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
+  1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
+  2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0>
+  1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
+  2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0>
+  3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0>
+  2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
+  2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
+  2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0>
+  2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2>
+  1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
+  2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
+  2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1>
+  2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3>
+  2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5>
+  2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
+  2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
+  2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
+  2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1>
+  2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3>
+  3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5>
+  3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0>
+  2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2>
+  2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0>
+  3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5>
+  3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3>
+  2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
+  2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3>
+  2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3>
+  2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
+  3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3>
+  3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6>
+  2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7>
+  2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
+  2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3>
+  2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
+  2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7>
+  2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7>
+  2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
+  3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7>
+  3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3>
+  3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4>
+  2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4>
+  1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
+  2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4>
+  2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6>
+  1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
+  2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS
+  2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
+  3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3>
+  2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
+  2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
+  1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
+  2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7>
+  2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5>
+  1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7>
+  2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS
+  2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
+  2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
+  2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7>
+  2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS
+  2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7>
+  1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7>
+  2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0>
+  1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
+  1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
+  2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2>
+  2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2>
+  2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2>
+  1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
+  2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7>
+  2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7>
+  363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS
+  1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS
+  1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
+  2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3>
+  2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0>
+  1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS
+  1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
+  1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7>
+  363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS
+  1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+  1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
+  1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2>
+  1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
+  2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
+  1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1>
+  1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+  2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS
+  1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
+  1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
+  1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+  564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS
+  1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
+  1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS
+  2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3>
+  1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1>
+  1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+  564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS
+  1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+  2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0>
+  1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0>
+  1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+  2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4>
+  2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3>
+  1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+  1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0>
+  1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1>
+  1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+  2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6>
+  1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5>
+  1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7>
+  2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7>
+  2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0>
+  1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1>
+  2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1>
+  1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5>
+  1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6>
+  2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5>
+  1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+  1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6>
+  1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6>
+  1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+  1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6>
+  1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS
+  1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u>
+  2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3>
+  1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7>
+  1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u>
+  1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u>
+  564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS
+  1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7>
+  564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS
+  2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1>
+  2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7>
+  1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u>
+  1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7>
+  1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+  2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7>
+  1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u>
+  1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0>
+  1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7>
+  1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1>
+  2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3>
+  2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6>
+  2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7>
+  1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5>
+  1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7>
+  2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2>
+  363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS
+  1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1>
+  1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2>
+  564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS
+  1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0>
+  1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5>
+  1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6>
+  564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS
+  363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS
+  564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS
+  135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS
+  1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+  1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+  2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+  1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS
+  2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3>
+  2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0>
+  2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS
+  1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS
+  1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS
+  537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS
+  2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3>
+  1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS
+  2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7>
+  1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1>
+  2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2>
+  537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS
+  1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+  1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS
+  2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+  1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+  2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7>
+  2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+  1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+  2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+  2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+  72589981U, // <u,0,3,2>: Cost 1 vrev LHS
+  2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3>
+  2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6>
+  2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0>
+  2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0>
+  2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3>
+  73032403U, // <u,0,3,u>: Cost 1 vrev LHS
+  2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u>
+  1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+  1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+  3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2>
+  2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS
+  1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS
+  2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6>
+  2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+  1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+  2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS
+  1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS
+  2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+  2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0>
+  2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5>
+  2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5>
+  1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0>
+  2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+  1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0>
+  2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS
+  2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+  1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS
+  2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0>
+  2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS
+  2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u>
+  2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6>
+  2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1>
+  1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS
+  2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS
+  2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0>
+  2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7>
+  2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7>
+  2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS
+  2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u>
+  2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7>
+  2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7>
+  2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS
+  135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS
+  1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+  537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS
+  2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+  1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+  1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS
+  1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u>
+  2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+  537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS
+  1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1>
+  1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS
+  2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1>
+  1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+  2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS
+  2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+  2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0>
+  2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0>
+  1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+  1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS
+  202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS
+  2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2>
+  1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+  1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS
+  2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+  2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1>
+  2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+  202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS
+  1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS
+  2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+  2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1>
+  835584U, // <u,1,2,3>: Cost 0 copy LHS
+  1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS
+  2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+  2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7>
+  1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2>
+  835584U, // <u,1,2,u>: Cost 0 copy LHS
+  1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS
+  1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+  2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+  2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS
+  1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS
+  1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+  2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+  2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1>
+  1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+  1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1>
+  2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u>
+  2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u>
+  1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4>
+  2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS
+  1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS
+  2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1>
+  2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4>
+  1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4>
+  1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS
+  2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2>
+  2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2>
+  1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+  1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS
+  2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1>
+  2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1>
+  2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS
+  1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+  2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS
+  2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+  2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+  2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+  2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS
+  2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+  2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1>
+  1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1>
+  1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1>
+  2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS
+  2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7>
+  2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1>
+  2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS
+  2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS
+  2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7>
+  2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u>
+  2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7>
+  2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS
+  1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS
+  202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS
+  2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+  835584U, // <u,1,u,3>: Cost 0 copy LHS
+  1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS
+  1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+  2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+  1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u>
+  835584U, // <u,1,u,u>: Cost 0 copy LHS
+  1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2>
+  1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS
+  1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2>
+  2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+  2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5>
+  2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7>
+  2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+  2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u>
+  1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS
+  1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2>
+  2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1>
+  2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0>
+  2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+  2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS
+  2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7>
+  2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+  2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+  1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1>
+  1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS
+  2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2>
+  269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS
+  1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+  1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS
+  2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+  2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7>
+  2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+  269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS
+  408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS
+  1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS
+  408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS
+  1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+  1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS
+  1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+  2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4>
+  2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5>
+  2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+  2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS
+  1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS
+  1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2>
+  2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+  1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS
+  2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS
+  2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0>
+  2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+  2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+  1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5>
+  2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5>
+  2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+  2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS
+  1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5>
+  1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS
+  2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2>
+  2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2>
+  1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+  1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS
+  2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+  2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+  2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1>
+  1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+  1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2>
+  2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2>
+  2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7>
+  1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS
+  2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS
+  2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6>
+  2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+  2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7>
+  1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS
+  408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS
+  1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS
+  269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS
+  1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+  408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS
+  1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS
+  1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS
+  1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS
+  1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS
+  1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0>
+  1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0>
+  2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+  2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0>
+  471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS
+  1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+  1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+  2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u>
+  1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+  1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS
+  1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2>
+  1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+  1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS
+  2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u>
+  1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+  1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3>
+  1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3>
+  336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS
+  1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+  2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3>
+  2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+  336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS
+  1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS
+  1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4>
+  2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+  2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2>
+  1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS
+  471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS
+  1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4>
+  471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS
+  2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS
+  1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3>
+  2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5>
+  2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6>
+  1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+  1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS
+  1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS
+  2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+  2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6>
+  1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3>
+  2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u>
+  2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+  1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6>
+  1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6>
+  1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS
+  1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7>
+  1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7>
+  2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2>
+  1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS
+  2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u>
+  2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u>
+  1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS
+  1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+  471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS
+  1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+  336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS
+  1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+  471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS
+  1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+  1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS
+  471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS
+  2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0>
+  1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS
+  2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4>
+  2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+  2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5>
+  1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+  1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+  3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+  1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS
+  2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS
+  2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1>
+  1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4>
+  2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3>
+  2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS
+  1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS
+  2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+  2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+  1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4>
+  2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS
+  2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u>
+  2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2>
+  2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5>
+  2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS
+  2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+  1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS
+  2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+  1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS
+  2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2>
+  2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4>
+  2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3>
+  2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3>
+  2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6>
+  2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+  3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7>
+  2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+  1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS
+  2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4>
+  2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4>
+  2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+  161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS
+  1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS
+  1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+  2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+  161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS
+  1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS
+  2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+  1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5>
+  2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2>
+  1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS
+  1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS
+  537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS
+  2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+  537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS
+  1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS
+  2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2>
+  2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2>
+  2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2>
+  1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS
+  2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+  1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS
+  2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+  1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS
+  2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS
+  2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4>
+  2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7>
+  2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7>
+  2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS
+  2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+  96808489U, // <u,4,7,6>: Cost 1 vrev RHS
+  2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7>
+  96955963U, // <u,4,7,u>: Cost 1 vrev RHS
+  1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS
+  1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS
+  1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u>
+  2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2>
+  161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS
+  1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS
+  537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS
+  2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+  537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS
+  2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0>
+  1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS
+  2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5>
+  2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5>
+  1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+  2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+  2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5>
+  1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0>
+  1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS
+  1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS
+  2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5>
+  2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0>
+  2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7>
+  1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS
+  2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5>
+  2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3>
+  1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+  1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+  2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS
+  2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7>
+  2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5>
+  1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5>
+  2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS
+  2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+  2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7>
+  3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS
+  1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5>
+  2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS
+  2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3>
+  2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u>
+  2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3>
+  2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6>
+  3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5>
+  2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS
+  2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS
+  2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS
+  2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+  2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5>
+  2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4>
+  1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5>
+  1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS
+  2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5>
+  1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+  1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS
+  1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS
+  2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7>
+  2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2>
+  2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2>
+  1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS
+  229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS
+  2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+  1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+  229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS
+  1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS
+  2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6>
+  2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6>
+  1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6>
+  1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS
+  2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+  2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+  27705344U, // <u,5,6,7>: Cost 0 copy RHS
+  27705344U, // <u,5,6,u>: Cost 0 copy RHS
+  1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS
+  1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7>
+  2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2>
+  2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2>
+  1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS
+  1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+  2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+  2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS
+  1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS
+  1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS
+  1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u>
+  2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2>
+  1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u>
+  1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS
+  229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS
+  2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  27705344U, // <u,5,u,7>: Cost 0 copy RHS
+  27705344U, // <u,5,u,u>: Cost 0 copy RHS
+  2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0>
+  1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS
+  1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6>
+  2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0>
+  1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6>
+  2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u>
+  2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0>
+  2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS
+  1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS
+  1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1>
+  2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1>
+  2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0>
+  2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3>
+  2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6>
+  2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7>
+  2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+  2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS
+  1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1>
+  1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS
+  2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2>
+  2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2>
+  2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1>
+  1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS
+  2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6>
+  2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7>
+  1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+  1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+  2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2>
+  2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3>
+  2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3>
+  2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3>
+  1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6>
+  2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6>
+  3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+  1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS
+  1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS
+  2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS
+  2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3>
+  2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4>
+  2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u>
+  1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+  1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS
+  1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6>
+  2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS
+  1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS
+  2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS
+  2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3>
+  2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7>
+  2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+  1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6>
+  2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5>
+  2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0>
+  2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS
+  1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5>
+  1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS
+  2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2>
+  2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3>
+  2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2>
+  1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS
+  2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3>
+  296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS
+  1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+  296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS
+  432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS
+  1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+  1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS
+  1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3>
+  1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS
+  432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS
+  432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS
+  1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS
+  1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS
+  1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS
+  296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS
+  1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS
+  432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS
+  1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS
+  1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0>
+  1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0>
+  2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7>
+  2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2>
+  497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS
+  1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+  1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7>
+  2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS
+  1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7>
+  2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7>
+  2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u>
+  1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7>
+  2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7>
+  1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2>
+  1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7>
+  2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7>
+  1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7>
+  2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u>
+  1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7>
+  1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u>
+  2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u>
+  1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3>
+  1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3>
+  2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7>
+  1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7>
+  2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+  2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4>
+  1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS
+  1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0>
+  497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS
+  2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+  1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+  2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+  1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+  1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+  1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS
+  2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+  1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+  1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS
+  1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6>
+  1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+  1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+  1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+  2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7>
+  2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2>
+  2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7>
+  1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+  1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7>
+  1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7>
+  363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS
+  1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+  497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS
+  1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+  1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+  497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS
+  1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+  363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS
+  497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS
+  135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS
+  471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS
+  1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+  1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0>
+  1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+  1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0>
+  471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS
+  1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS
+  537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS
+  1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS
+  1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1>
+  1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+  537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS
+  1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS
+  1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2>
+  269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS
+  835584U, // <u,u,2,3>: Cost 0 copy LHS
+  1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS
+  2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+  835584U, // <u,u,2,u>: Cost 0 copy LHS
+  408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS
+  1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  120371557U, // <u,u,3,2>: Cost 1 vrev LHS
+  336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS
+  408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS
+  1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+  1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS
+  408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS
+  1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS
+  1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4>
+  1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+  1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4>
+  161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS
+  471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS
+  1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+  471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS
+  1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS
+  1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5>
+  1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+  1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS
+  229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS
+  537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS
+  1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS
+  537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS
+  1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS
+  2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2>
+  1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+  1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS
+  1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6>
+  296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS
+  27705344U, // <u,u,6,7>: Cost 0 copy RHS
+  27705344U, // <u,u,6,u>: Cost 0 copy RHS
+  432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS
+  1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7>
+  1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7>
+  1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS
+  432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS
+  1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  120699277U, // <u,u,7,6>: Cost 1 vrev RHS
+  363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS
+  432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS
+  408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS
+  471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS
+  537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS
+  835584U, // <u,u,u,3>: Cost 0 copy LHS
+  408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS
+  471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS
+  537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS
+  27705344U, // <u,u,u,7>: Cost 0 copy RHS
+  835584U, // <u,u,u,u>: Cost 0 copy LHS
   0
 };
diff --git a/lib/Target/ARM/ARMRegisterInfo.cpp b/lib/Target/ARM/ARMRegisterInfo.cpp
index ad51bc1..1cba1ba 100644
--- a/lib/Target/ARM/ARMRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterInfo.cpp
@@ -12,26 +12,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARM.h"
-#include "ARMAddressingModes.h"
 #include "ARMBaseInstrInfo.h"
-#include "ARMInstrInfo.h"
-#include "ARMMachineFunctionInfo.h"
 #include "ARMRegisterInfo.h"
-#include "ARMSubtarget.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/CodeGen/MachineConstantPool.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLocation.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/SmallVector.h"
 using namespace llvm;
 
 ARMRegisterInfo::ARMRegisterInfo(const ARMBaseInstrInfo &tii,
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index 22d15b5..f4fbae3 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -30,18 +30,6 @@ def ssub_0  : SubRegIndex;
 def ssub_1  : SubRegIndex;
 def ssub_2  : SubRegIndex; // In a Q reg.
 def ssub_3  : SubRegIndex;
-def ssub_4  : SubRegIndex; // In a QQ reg.
-def ssub_5  : SubRegIndex;
-def ssub_6  : SubRegIndex;
-def ssub_7  : SubRegIndex;
-def ssub_8  : SubRegIndex; // In a QQQQ reg.
-def ssub_9  : SubRegIndex;
-def ssub_10 : SubRegIndex;
-def ssub_11 : SubRegIndex;
-def ssub_12 : SubRegIndex;
-def ssub_13 : SubRegIndex;
-def ssub_14 : SubRegIndex;
-def ssub_15 : SubRegIndex;
 
 def dsub_0 : SubRegIndex;
 def dsub_1 : SubRegIndex;
@@ -70,6 +58,8 @@ def R4  : ARMReg< 4, "r4">,  DwarfRegNum<[4]>;
 def R5  : ARMReg< 5, "r5">,  DwarfRegNum<[5]>;
 def R6  : ARMReg< 6, "r6">,  DwarfRegNum<[6]>;
 def R7  : ARMReg< 7, "r7">,  DwarfRegNum<[7]>;
+// These require 32-bit instructions.
+let CostPerUse = 1 in {
 def R8  : ARMReg< 8, "r8">,  DwarfRegNum<[8]>;
 def R9  : ARMReg< 9, "r9">,  DwarfRegNum<[9]>;
 def R10 : ARMReg<10, "r10">, DwarfRegNum<[10]>;
@@ -78,6 +68,7 @@ def R12 : ARMReg<12, "r12">, DwarfRegNum<[12]>;
 def SP  : ARMReg<13, "sp">,  DwarfRegNum<[13]>;
 def LR  : ARMReg<14, "lr">,  DwarfRegNum<[14]>;
 def PC  : ARMReg<15, "pc">,  DwarfRegNum<[15]>;
+}
 
 // Float registers
 def S0  : ARMFReg< 0, "s0">;  def S1  : ARMFReg< 1, "s1">;
@@ -99,33 +90,41 @@ def S30 : ARMFReg<30, "s30">; def S31 : ARMFReg<31, "s31">;
 
 // Aliases of the F* registers used to hold 64-bit fp values (doubles)
 let SubRegIndices = [ssub_0, ssub_1] in {
-def D0  : ARMReg< 0,  "d0", [S0,   S1]>;
-def D1  : ARMReg< 1,  "d1", [S2,   S3]>;
-def D2  : ARMReg< 2,  "d2", [S4,   S5]>;
-def D3  : ARMReg< 3,  "d3", [S6,   S7]>;
-def D4  : ARMReg< 4,  "d4", [S8,   S9]>;
-def D5  : ARMReg< 5,  "d5", [S10, S11]>;
-def D6  : ARMReg< 6,  "d6", [S12, S13]>;
-def D7  : ARMReg< 7,  "d7", [S14, S15]>;
-def D8  : ARMReg< 8,  "d8", [S16, S17]>;
-def D9  : ARMReg< 9,  "d9", [S18, S19]>;
-def D10 : ARMReg<10, "d10", [S20, S21]>;
-def D11 : ARMReg<11, "d11", [S22, S23]>;
-def D12 : ARMReg<12, "d12", [S24, S25]>;
-def D13 : ARMReg<13, "d13", [S26, S27]>;
-def D14 : ARMReg<14, "d14", [S28, S29]>;
-def D15 : ARMReg<15, "d15", [S30, S31]>;
+def D0  : ARMReg< 0,  "d0", [S0,   S1]>, DwarfRegNum<[256]>;
+def D1  : ARMReg< 1,  "d1", [S2,   S3]>, DwarfRegNum<[257]>;
+def D2  : ARMReg< 2,  "d2", [S4,   S5]>, DwarfRegNum<[258]>;
+def D3  : ARMReg< 3,  "d3", [S6,   S7]>, DwarfRegNum<[259]>;
+def D4  : ARMReg< 4,  "d4", [S8,   S9]>, DwarfRegNum<[260]>;
+def D5  : ARMReg< 5,  "d5", [S10, S11]>, DwarfRegNum<[261]>;
+def D6  : ARMReg< 6,  "d6", [S12, S13]>, DwarfRegNum<[262]>;
+def D7  : ARMReg< 7,  "d7", [S14, S15]>, DwarfRegNum<[263]>;
+def D8  : ARMReg< 8,  "d8", [S16, S17]>, DwarfRegNum<[264]>;
+def D9  : ARMReg< 9,  "d9", [S18, S19]>, DwarfRegNum<[265]>;
+def D10 : ARMReg<10, "d10", [S20, S21]>, DwarfRegNum<[266]>;
+def D11 : ARMReg<11, "d11", [S22, S23]>, DwarfRegNum<[267]>;
+def D12 : ARMReg<12, "d12", [S24, S25]>, DwarfRegNum<[268]>;
+def D13 : ARMReg<13, "d13", [S26, S27]>, DwarfRegNum<[269]>;
+def D14 : ARMReg<14, "d14", [S28, S29]>, DwarfRegNum<[270]>;
+def D15 : ARMReg<15, "d15", [S30, S31]>, DwarfRegNum<[271]>;
 }
 
 // VFP3 defines 16 additional double registers
-def D16 : ARMFReg<16, "d16">; def D17 : ARMFReg<17, "d17">;
-def D18 : ARMFReg<18, "d18">; def D19 : ARMFReg<19, "d19">;
-def D20 : ARMFReg<20, "d20">; def D21 : ARMFReg<21, "d21">;
-def D22 : ARMFReg<22, "d22">; def D23 : ARMFReg<23, "d23">;
-def D24 : ARMFReg<24, "d24">; def D25 : ARMFReg<25, "d25">;
-def D26 : ARMFReg<26, "d26">; def D27 : ARMFReg<27, "d27">;
-def D28 : ARMFReg<28, "d28">; def D29 : ARMFReg<29, "d29">;
-def D30 : ARMFReg<30, "d30">; def D31 : ARMFReg<31, "d31">;
+def D16 : ARMFReg<16, "d16">, DwarfRegNum<[272]>; 
+def D17 : ARMFReg<17, "d17">, DwarfRegNum<[273]>;
+def D18 : ARMFReg<18, "d18">, DwarfRegNum<[274]>;
+def D19 : ARMFReg<19, "d19">, DwarfRegNum<[275]>;
+def D20 : ARMFReg<20, "d20">, DwarfRegNum<[276]>;
+def D21 : ARMFReg<21, "d21">, DwarfRegNum<[277]>;
+def D22 : ARMFReg<22, "d22">, DwarfRegNum<[278]>; 
+def D23 : ARMFReg<23, "d23">, DwarfRegNum<[279]>;
+def D24 : ARMFReg<24, "d24">, DwarfRegNum<[280]>;
+def D25 : ARMFReg<25, "d25">, DwarfRegNum<[281]>;
+def D26 : ARMFReg<26, "d26">, DwarfRegNum<[282]>;
+def D27 : ARMFReg<27, "d27">, DwarfRegNum<[283]>;
+def D28 : ARMFReg<28, "d28">, DwarfRegNum<[284]>;
+def D29 : ARMFReg<29, "d29">, DwarfRegNum<[285]>;
+def D30 : ARMFReg<30, "d30">, DwarfRegNum<[286]>;
+def D31 : ARMFReg<31, "d31">, DwarfRegNum<[287]>;
 
 // Advanced SIMD (NEON) defines 16 quad-word aliases
 let SubRegIndices = [dsub_0, dsub_1],
@@ -158,43 +157,28 @@ def Q15 : ARMReg<15, "q15", [D30, D31]>;
 // starting D register number doesn't have to be multiple of 4, e.g.,
 // D1, D2, D3, D4 would be a legal quad, but that would make the subregister
 // stuff very messy.
-let SubRegIndices = [qsub_0, qsub_1] in {
-let CompositeIndices = [(dsub_2 qsub_1, dsub_0), (dsub_3 qsub_1, dsub_1),
-                        (ssub_4 qsub_1, ssub_0), (ssub_5 qsub_1, ssub_1),
-                        (ssub_6 qsub_1, ssub_2), (ssub_7 qsub_1, ssub_3)] in {
+let SubRegIndices = [qsub_0, qsub_1],
+ CompositeIndices = [(dsub_2 qsub_1, dsub_0), (dsub_3 qsub_1, dsub_1)] in {
 def QQ0 : ARMReg<0, "qq0", [Q0,  Q1]>;
 def QQ1 : ARMReg<1, "qq1", [Q2,  Q3]>;
 def QQ2 : ARMReg<2, "qq2", [Q4,  Q5]>;
 def QQ3 : ARMReg<3, "qq3", [Q6,  Q7]>;
-}
-let CompositeIndices = [(dsub_2 qsub_1, dsub_0), (dsub_3 qsub_1, dsub_1)] in {
 def QQ4 : ARMReg<4, "qq4", [Q8,  Q9]>;
 def QQ5 : ARMReg<5, "qq5", [Q10, Q11]>;
 def QQ6 : ARMReg<6, "qq6", [Q12, Q13]>;
 def QQ7 : ARMReg<7, "qq7", [Q14, Q15]>;
 }
-}
 
 // Pseudo 512-bit registers to represent four consecutive Q registers.
-let SubRegIndices = [qqsub_0, qqsub_1] in {
-let CompositeIndices = [(qsub_2  qqsub_1, qsub_0), (qsub_3  qqsub_1, qsub_1),
-                        (dsub_4  qqsub_1, dsub_0), (dsub_5  qqsub_1, dsub_1),
-                        (dsub_6  qqsub_1, dsub_2), (dsub_7  qqsub_1, dsub_3),
-                        (ssub_8  qqsub_1, ssub_0), (ssub_9  qqsub_1, ssub_1),
-                        (ssub_10 qqsub_1, ssub_2), (ssub_11 qqsub_1, ssub_3),
-                        (ssub_12 qqsub_1, ssub_4), (ssub_13 qqsub_1, ssub_5),
-                        (ssub_14 qqsub_1, ssub_6), (ssub_15 qqsub_1, ssub_7)] in
-{
+let SubRegIndices = [qqsub_0, qqsub_1],
+ CompositeIndices = [(qsub_2  qqsub_1, qsub_0), (qsub_3  qqsub_1, qsub_1),
+                     (dsub_4  qqsub_1, dsub_0), (dsub_5  qqsub_1, dsub_1),
+                     (dsub_6  qqsub_1, dsub_2), (dsub_7  qqsub_1, dsub_3)] in {
 def QQQQ0 : ARMReg<0, "qqqq0", [QQ0, QQ1]>;
 def QQQQ1 : ARMReg<1, "qqqq1", [QQ2, QQ3]>;
-}
-let CompositeIndices = [(qsub_2 qqsub_1, qsub_0), (qsub_3 qqsub_1, qsub_1),
-                        (dsub_4 qqsub_1, dsub_0), (dsub_5 qqsub_1, dsub_1),
-                        (dsub_6 qqsub_1, dsub_2), (dsub_7 qqsub_1, dsub_3)] in {
 def QQQQ2 : ARMReg<2, "qqqq2", [QQ4, QQ5]>;
 def QQQQ3 : ARMReg<3, "qqqq3", [QQ6, QQ7]>;
 }
-}
 
 // Current Program Status Register.
 def CPSR    : ARMReg<0, "cpsr">;
@@ -216,9 +200,8 @@ def FPEXC   : ARMReg<8, "fpexc">;
 // r11 == Frame Pointer (arm-style backtraces)
 // r10 == Stack Limit
 //
-def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
-                                           R7, R8, R9, R10, R11, R12,
-                                           SP, LR, PC]> {
+def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12),
+                                               SP, LR, PC)> {
   let MethodProtos = [{
     iterator allocation_order_begin(const MachineFunction &MF) const;
     iterator allocation_order_end(const MachineFunction &MF) const;
@@ -262,8 +245,7 @@ def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
 // register range for operands, but have undefined behaviours when PC
 // or SP (R13 or R15) are used. The ARM ISA refers to these operands
 // via the BadReg() pseudo-code description.
-def rGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
-                                            R7, R8, R9, R10, R11, R12, LR]> {
+def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> {
   let MethodProtos = [{
     iterator allocation_order_begin(const MachineFunction &MF) const;
     iterator allocation_order_end(const MachineFunction &MF) const;
@@ -307,13 +289,13 @@ def rGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
 
 // Thumb registers are R0-R7 normally. Some instructions can still use
 // the general GPR register class above (MOV, e.g.)
-def tGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> {}
+def tGPR : RegisterClass<"ARM", [i32], 32, (trunc GPR, 8)>;
 
 // For tail calls, we can't use callee-saved registers, as they are restored
 // to the saved value before the tail call, which would clobber a call address.
 // Note, getMinimalPhysRegClass(R0) returns tGPR because of the names of
 // this class and the preceding one(!)  This is what we want.
-def tcGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R9, R12]> {
+def tcGPR : RegisterClass<"ARM", [i32], 32, (add R0, R1, R2, R3, R9, R12)> {
   let MethodProtos = [{
     iterator allocation_order_begin(const MachineFunction &MF) const;
     iterator allocation_order_end(const MachineFunction &MF) const;
@@ -361,25 +343,18 @@ def tcGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R9, R12]> {
 
 
 // Scalar single precision floating point register class..
-def SPR : RegisterClass<"ARM", [f32], 32, [S0, S1, S2, S3, S4, S5, S6, S7, S8,
-  S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22,
-  S23, S24, S25, S26, S27, S28, S29, S30, S31]>;
+def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)>;
 
 // Subset of SPR which can be used as a source of NEON scalars for 16-bit
 // operations
-def SPR_8 : RegisterClass<"ARM", [f32], 32,
-                          [S0, S1,  S2,  S3,  S4,  S5,  S6,  S7,
-                           S8, S9, S10, S11, S12, S13, S14, S15]>;
+def SPR_8 : RegisterClass<"ARM", [f32], 32, (trunc SPR, 16)>;
 
 // Scalar double precision floating point / generic 64-bit vector register
 // class.
 // ARM requires only word alignment for double. It's more performant if it
 // is double-word alignment though.
 def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
-                        [D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
-                         D8,  D9,  D10, D11, D12, D13, D14, D15,
-                         D16, D17, D18, D19, D20, D21, D22, D23,
-                         D24, D25, D26, D27, D28, D29, D30, D31]> {
+                        (sequence "D%u", 0, 31)> {
   let MethodProtos = [{
     iterator allocation_order_begin(const MachineFunction &MF) const;
     iterator allocation_order_end(const MachineFunction &MF) const;
@@ -427,22 +402,20 @@ def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
 // Subset of DPR that are accessible with VFP2 (and so that also have
 // 32-bit SPR subregs).
 def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
-                             [D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
-                              D8,  D9,  D10, D11, D12, D13, D14, D15]> {
+                             (trunc DPR, 16)> {
   let SubRegClasses = [(SPR ssub_0, ssub_1)];
 }
 
 // Subset of DPR which can be used as a source of NEON scalars for 16-bit
 // operations
 def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
-                          [D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7]> {
+                          (trunc DPR, 8)> {
   let SubRegClasses = [(SPR_8 ssub_0, ssub_1)];
 }
 
 // Generic 128-bit vector register class.
 def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
-                        [Q0,  Q1,  Q2,  Q3,  Q4,  Q5,  Q6,  Q7,
-                         Q8,  Q9,  Q10, Q11, Q12, Q13, Q14, Q15]> {
+                        (sequence "Q%u", 0, 15)> {
   let SubRegClasses = [(DPR dsub_0, dsub_1)];
   let MethodProtos = [{
     iterator allocation_order_begin(const MachineFunction &MF) const;
@@ -471,25 +444,21 @@ def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
 
 // Subset of QPR that have 32-bit SPR subregs.
 def QPR_VFP2 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-                             128,
-                             [Q0,  Q1,  Q2,  Q3,  Q4,  Q5,  Q6,  Q7]> {
+                             128, (trunc QPR, 8)> {
   let SubRegClasses = [(SPR      ssub_0, ssub_1, ssub_2, ssub_3),
                        (DPR_VFP2 dsub_0, dsub_1)];
 }
 
 // Subset of QPR that have DPR_8 and SPR_8 subregs.
 def QPR_8 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-                           128,
-                           [Q0,  Q1,  Q2,  Q3]> {
+                           128, (trunc QPR, 4)> {
   let SubRegClasses = [(SPR_8 ssub_0, ssub_1, ssub_2, ssub_3),
                        (DPR_8 dsub_0, dsub_1)];
 }
 
 // Pseudo 256-bit vector register class to model pairs of Q registers
 // (4 consecutive D registers).
-def QQPR : RegisterClass<"ARM", [v4i64],
-                         256,
-                         [QQ0, QQ1, QQ2, QQ3, QQ4, QQ5, QQ6, QQ7]> {
+def QQPR : RegisterClass<"ARM", [v4i64], 256, (sequence "QQ%u", 0, 7)> {
   let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3),
                        (QPR qsub_0, qsub_1)];
   let MethodProtos = [{
@@ -516,9 +485,7 @@ def QQPR : RegisterClass<"ARM", [v4i64],
 }
 
 // Subset of QQPR that have 32-bit SPR subregs.
-def QQPR_VFP2 : RegisterClass<"ARM", [v4i64],
-                              256,
-                              [QQ0, QQ1, QQ2, QQ3]> {
+def QQPR_VFP2 : RegisterClass<"ARM", [v4i64], 256, (trunc QQPR, 4)> {
   let SubRegClasses = [(SPR      ssub_0, ssub_1, ssub_2, ssub_3),
                        (DPR_VFP2 dsub_0, dsub_1, dsub_2, dsub_3),
                        (QPR_VFP2 qsub_0, qsub_1)];
@@ -527,9 +494,7 @@ def QQPR_VFP2 : RegisterClass<"ARM", [v4i64],
 
 // Pseudo 512-bit vector register class to model 4 consecutive Q registers
 // (8 consecutive D registers).
-def QQQQPR : RegisterClass<"ARM", [v8i64],
-                         256,
-                         [QQQQ0, QQQQ1, QQQQ2, QQQQ3]> {
+def QQQQPR : RegisterClass<"ARM", [v8i64], 256, (sequence "QQQQ%u", 0, 3)> {
   let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3,
                             dsub_4, dsub_5, dsub_6, dsub_7),
                        (QPR qsub_0, qsub_1, qsub_2, qsub_3)];
@@ -556,4 +521,6 @@ def QQQQPR : RegisterClass<"ARM", [v8i64],
 }
 
 // Condition code registers.
-def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>;
+def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> {
+  let isAllocatable = 0;
+}
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 82c6735..49fedf6 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -656,19 +656,19 @@ def CortexA9Itineraries : ProcessorItineraries<
                               [1, 1, 1]>,
   //
   // Single-precision to Integer Move
+  //
+  // On A9 move-from-VFP is free to issue with no stall if other VFP
+  // operations are in flight. I assume it still can't dual-issue though.
   InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_NPipe]>],
+                               InstrStage<1, [A9_MUX0], 0>],
                               [2, 1]>,
   //
   // Double-precision to Integer Move
+  //
+  // On A9 move-from-VFP is free to issue with no stall if other VFP
+  // operations are in flight. I assume it still can't dual-issue though.
   InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
-                               InstrStage<1, [A9_MUX0], 0>,
-                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
-                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
-                               InstrStage<1, [A9_NPipe]>],
+                               InstrStage<1, [A9_MUX0], 0>],
                               [2, 1, 1]>,
   //
   // Single-precision FP Load
@@ -691,20 +691,22 @@ def CortexA9Itineraries : ProcessorItineraries<
                               [2, 1]>,
   //
   // FP Load Multiple
+  // FIXME: assumes 2 doubles which requires 2 LS cycles.
   InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
                                InstrStage<1, [A9_NPipe], 0>,
-                               InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>,
+                               InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1]>,
   //
   // FP Load Multiple + update
+  // FIXME: assumes 2 doubles which requires 2 LS cycles.
   InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
                                InstrStage<1, [A9_NPipe], 0>,
-                               InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1]>,
+                               InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1]>,
   //
   // Single-precision FP Store
   InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
@@ -725,205 +727,206 @@ def CortexA9Itineraries : ProcessorItineraries<
                               [1, 1]>,
   //
   // FP Store Multiple
+  // FIXME: assumes 2 doubles which requires 2 LS cycles.
   InstrItinData<IIC_fpStore_m,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
                                InstrStage<1, [A9_NPipe], 0>,
-                               InstrStage<1, [A9_LSUnit]>], [1, 1, 1, 1]>,
+                               InstrStage<2, [A9_LSUnit]>], [1, 1, 1, 1]>,
   //
   // FP Store Multiple + update
+  // FIXME: assumes 2 doubles which requires 2 LS cycles.
   InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                 InstrStage<1, [A9_MUX0], 0>,
                                 InstrStage<1, [A9_DRegsVFP], 0, Required>,
                                 InstrStage<2, [A9_DRegsN],   0, Reserved>,
                                 InstrStage<1, [A9_NPipe], 0>,
-                                InstrStage<1, [A9_LSUnit]>], [2, 1, 1, 1]>,
+                                InstrStage<2, [A9_LSUnit]>], [2, 1, 1, 1]>,
   // NEON
   // VLD1
-  // FIXME: Conservatively assume insufficent alignment.
   InstrItinData<IIC_VLD1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<2, [A9_NPipe], 0>,
-                               InstrStage<2, [A9_LSUnit]>],
-                              [2, 1]>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1]>,
   // VLD1x2
   InstrItinData<IIC_VLD1x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<2, [A9_NPipe], 0>,
-                               InstrStage<2, [A9_LSUnit]>],
-                              [2, 2, 1]>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1]>,
   // VLD1x3
   InstrItinData<IIC_VLD1x3,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
-                              [2, 2, 3, 1]>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 2, 1]>,
   // VLD1x4
   InstrItinData<IIC_VLD1x4,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
-                              [2, 2, 3, 3, 1]>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 2, 2, 1]>,
   // VLD1u
   InstrItinData<IIC_VLD1u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<2, [A9_NPipe], 0>,
-                               InstrStage<2, [A9_LSUnit]>],
-                              [2, 2, 1]>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 2, 1]>,
   // VLD1x2u
   InstrItinData<IIC_VLD1x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<2, [A9_NPipe], 0>,
-                               InstrStage<2, [A9_LSUnit]>],
-                              [2, 2, 2, 1]>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 2, 1]>,
   // VLD1x3u
   InstrItinData<IIC_VLD1x3u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
-                              [2, 2, 3, 2, 1]>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 2, 2, 1]>,
   // VLD1x4u
   InstrItinData<IIC_VLD1x4u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
-                              [2, 2, 3, 3, 2, 1]>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 2, 2, 2, 1]>,
   //
   // VLD1ln
   InstrItinData<IIC_VLD1ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
-                              [4, 1, 1, 1]>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 1, 1, 1]>,
   //
   // VLD1lnu
   InstrItinData<IIC_VLD1lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
-                              [4, 2, 1, 1, 1, 1]>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 2, 1, 1, 1, 1]>,
   //
   // VLD1dup
   InstrItinData<IIC_VLD1dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<2, [A9_NPipe], 0>,
-                               InstrStage<2, [A9_LSUnit]>],
-                              [3, 1]>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1]>,
   //
   // VLD1dupu
   InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<2, [A9_NPipe], 0>,
-                               InstrStage<2, [A9_LSUnit]>],
-                              [3, 2, 1, 1]>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 1, 1]>,
   //
   // VLD2
   InstrItinData<IIC_VLD2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
-                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<2, [A9_NPipe], 0>,
-                               InstrStage<2, [A9_LSUnit]>],
-                              [3, 3, 1]>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 1]>,
   //
   // VLD2x2
   InstrItinData<IIC_VLD2x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
-                              [3, 4, 3, 4, 1]>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 3, 2, 3, 1]>,
   //
   // VLD2ln
   InstrItinData<IIC_VLD2ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
-                              [4, 4, 1, 1, 1, 1]>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 3, 1, 1, 1, 1]>,
   //
   // VLD2u
   InstrItinData<IIC_VLD2u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
                                // Extra latency cycles since wbck is 7 cycles
-                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<2, [A9_NPipe], 0>,
-                               InstrStage<2, [A9_LSUnit]>],
-                              [3, 3, 2, 1, 1, 1]>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 2, 1, 1, 1]>,
   //
   // VLD2x2u
   InstrItinData<IIC_VLD2x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
-                              [3, 4, 3, 4, 2, 1]>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 3, 2, 3, 2, 1]>,
   //
   // VLD2lnu
   InstrItinData<IIC_VLD2lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
-                              [4, 4, 2, 1, 1, 1, 1, 1]>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 3, 2, 1, 1, 1, 1, 1]>,
   //
   // VLD2dup
   InstrItinData<IIC_VLD2dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<2, [A9_NPipe], 0>,
-                               InstrStage<2, [A9_LSUnit]>],
-                              [3, 3, 1]>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 1]>,
   //
   // VLD2dupu
   InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<2, [A9_NPipe], 0>,
-                               InstrStage<2, [A9_LSUnit]>],
-                              [3, 3, 2, 1, 1]>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 2, 1, 1]>,
   //
   // VLD3
   InstrItinData<IIC_VLD3,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<4, [A9_NPipe], 0>,
-                               InstrStage<4, [A9_LSUnit]>],
-                              [4, 4, 5, 1]>,
+                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 1]>,
   //
   // VLD3ln
   InstrItinData<IIC_VLD3ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
@@ -938,10 +941,10 @@ def CortexA9Itineraries : ProcessorItineraries<
   InstrItinData<IIC_VLD3u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<4, [A9_NPipe], 0>,
-                               InstrStage<4, [A9_LSUnit]>],
-                              [4, 4, 5, 2, 1]>,
+                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 2, 1]>,
   //
   // VLD3lnu
   InstrItinData<IIC_VLD3lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
@@ -974,108 +977,108 @@ def CortexA9Itineraries : ProcessorItineraries<
   InstrItinData<IIC_VLD4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<4, [A9_NPipe], 0>,
-                               InstrStage<4, [A9_LSUnit]>],
-                              [4, 4, 5, 5, 1]>,
+                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 4, 1]>,
   //
   // VLD4ln
   InstrItinData<IIC_VLD4ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<11,[A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<5, [A9_NPipe], 0>,
-                               InstrStage<5, [A9_LSUnit]>],
-                              [5, 5, 6, 6, 1, 1, 1, 1, 2, 2]>,
+                               InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe], 0>,
+                               InstrStage<4, [A9_LSUnit]>],
+                              [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>,
   //
   // VLD4u
   InstrItinData<IIC_VLD4u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<4, [A9_NPipe], 0>,
-                               InstrStage<4, [A9_LSUnit]>],
-                              [4, 4, 5, 5, 2, 1]>,
+                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 4, 2, 1]>,
   //
   // VLD4lnu
   InstrItinData<IIC_VLD4lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<11,[A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<5, [A9_NPipe], 0>,
-                               InstrStage<5, [A9_LSUnit]>],
-                              [5, 5, 6, 6, 2, 1, 1, 1, 1, 1, 2, 2]>,
+                               InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe], 0>,
+                               InstrStage<4, [A9_LSUnit]>],
+                              [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>,
   //
   // VLD4dup
   InstrItinData<IIC_VLD4dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
-                              [3, 3, 4, 4, 1]>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 2, 3, 3, 1]>,
   //
   // VLD4dupu
   InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
-                              [3, 3, 4, 4, 2, 1, 1]>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 2, 3, 3, 2, 1, 1]>,
   //
   // VST1
   InstrItinData<IIC_VST1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<2, [A9_NPipe], 0>,
-                               InstrStage<2, [A9_LSUnit]>],
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
                               [1, 1, 1]>,
   //
   // VST1x2
   InstrItinData<IIC_VST1x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<2, [A9_NPipe], 0>,
-                               InstrStage<2, [A9_LSUnit]>],
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
                               [1, 1, 1, 1]>,
   //
   // VST1x3
   InstrItinData<IIC_VST1x3,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
                               [1, 1, 1, 1, 2]>,
   //
   // VST1x4
   InstrItinData<IIC_VST1x4,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
                               [1, 1, 1, 1, 2, 2]>,
   //
   // VST1u
   InstrItinData<IIC_VST1u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<2, [A9_NPipe], 0>,
-                               InstrStage<2, [A9_LSUnit]>],
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
                               [2, 1, 1, 1, 1]>,
   //
   // VST1x2u
   InstrItinData<IIC_VST1x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<2, [A9_NPipe], 0>,
-                               InstrStage<2, [A9_LSUnit]>],
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
                               [2, 1, 1, 1, 1, 1]>,
   //
   // VST1x3u
@@ -1083,44 +1086,44 @@ def CortexA9Itineraries : ProcessorItineraries<
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
                                InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
                               [2, 1, 1, 1, 1, 1, 2]>,
   //
   // VST1x4u
   InstrItinData<IIC_VST1x4u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
                               [2, 1, 1, 1, 1, 1, 2, 2]>,
   //
   // VST1ln
   InstrItinData<IIC_VST1ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<2, [A9_NPipe], 0>,
-                               InstrStage<2, [A9_LSUnit]>],
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
                               [1, 1, 1]>,
   //
   // VST1lnu
   InstrItinData<IIC_VST1lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
                               [2, 1, 1, 1, 1]>,
   //
   // VST2
   InstrItinData<IIC_VST2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<2, [A9_NPipe], 0>,
-                               InstrStage<2, [A9_LSUnit]>],
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
                               [1, 1, 1, 1]>,
   //
   // VST2x2
@@ -1136,9 +1139,9 @@ def CortexA9Itineraries : ProcessorItineraries<
   InstrItinData<IIC_VST2u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<2, [A9_NPipe], 0>,
-                               InstrStage<2, [A9_LSUnit]>],
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
                               [2, 1, 1, 1, 1, 1]>,
   //
   // VST2x2u
@@ -1154,36 +1157,36 @@ def CortexA9Itineraries : ProcessorItineraries<
   InstrItinData<IIC_VST2ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<2, [A9_NPipe], 0>,
-                               InstrStage<2, [A9_LSUnit]>],
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
                               [1, 1, 1, 1]>,
   //
   // VST2lnu
   InstrItinData<IIC_VST2lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
                               [2, 1, 1, 1, 1, 1]>,
   //
   // VST3
   InstrItinData<IIC_VST3,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
                               [1, 1, 1, 1, 2]>,
   //
   // VST3u
   InstrItinData<IIC_VST3u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
                               [2, 1, 1, 1, 1, 1, 2]>,
   //
   // VST3ln
@@ -1208,36 +1211,36 @@ def CortexA9Itineraries : ProcessorItineraries<
   InstrItinData<IIC_VST4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
                               [1, 1, 1, 1, 2, 2]>,
   //
   // VST4u
   InstrItinData<IIC_VST4u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
                               [2, 1, 1, 1, 1, 1, 2, 2]>,
   //
   // VST4ln
   InstrItinData<IIC_VST4ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
                               [1, 1, 1, 1, 2, 2]>,
   //
   // VST4lnu
   InstrItinData<IIC_VST4lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
                                InstrStage<1, [A9_DRegsN],   0, Required>,
-                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
-                               InstrStage<3, [A9_NPipe], 0>,
-                               InstrStage<3, [A9_LSUnit]>],
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
                               [2, 1, 1, 1, 1, 1, 2, 2]>,
 
   //
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index 2b9202b..ef0aaf2 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -13,6 +13,8 @@
 
 #define DEBUG_TYPE "arm-selectiondag-info"
 #include "ARMTargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/CodeGen/SelectionDAG.h"
 using namespace llvm;
 
 ARMSelectionDAGInfo::ARMSelectionDAGInfo(const TargetMachine &TM)
@@ -35,7 +37,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
   // This requires 4-byte alignment.
   if ((Align & 3) != 0)
     return SDValue();
-  // This requires the copy size to be a constant, preferrably
+  // This requires the copy size to be a constant, preferably
   // within a subtarget-specific limit.
   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
   if (!ConstantSize)
@@ -132,3 +134,65 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
   }
   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
 }
+
+// Adjust parameters for memset, EABI uses format (ptr, size, value),
+// GNU library uses (ptr, value, size)
+// See RTABI section 4.3.4
+SDValue
+ARMSelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+                                             SDValue Chain, SDValue Dst,
+                                             SDValue Src, SDValue Size,
+                                             unsigned Align, bool isVolatile,
+                                             MachinePointerInfo DstPtrInfo) const
+{
+  // Use default for non AAPCS subtargets
+  if (!Subtarget->isAAPCS_ABI())
+    return SDValue();
+
+  const ARMTargetLowering &TLI =
+    *static_cast<const ARMTargetLowering*>(DAG.getTarget().getTargetLowering());
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+
+  // First argument: data pointer
+  const Type *IntPtrTy = TLI.getTargetData()->getIntPtrType(*DAG.getContext());
+  Entry.Node = Dst;
+  Entry.Ty = IntPtrTy;
+  Args.push_back(Entry);
+
+  // Second argument: buffer size
+  Entry.Node = Size;
+  Entry.Ty = IntPtrTy;
+  Entry.isSExt = false;
+  Args.push_back(Entry);
+
+  // Extend or truncate the argument to be an i32 value for the call.
+  if (Src.getValueType().bitsGT(MVT::i32))
+    Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
+  else
+    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
+
+  // Third argument: value to fill
+  Entry.Node = Src;
+  Entry.Ty = Type::getInt32Ty(*DAG.getContext());
+  Entry.isSExt = true;
+  Args.push_back(Entry);
+
+  // Emit __eabi_memset call
+  std::pair<SDValue,SDValue> CallResult =
+    TLI.LowerCallTo(Chain,
+                    Type::getVoidTy(*DAG.getContext()), // return type
+                    false, // return sign ext
+                    false, // return zero ext
+                    false, // is var arg
+                    false, // is in regs
+                    0,     // number of fixed arguments
+                    TLI.getLibcallCallingConv(RTLIB::MEMSET), // call conv
+                    false, // is tail call
+                    false, // is return val used
+                    DAG.getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET),
+                                          TLI.getPointerTy()), // callee
+                    Args, DAG, dl); // arg list, DAG and debug
+
+  return CallResult.second;
+}
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.h b/lib/Target/ARM/ARMSelectionDAGInfo.h
index 7533690..ec1bf5c 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -35,6 +35,15 @@ public:
                                   bool isVolatile, bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo,
                                   MachinePointerInfo SrcPtrInfo) const;
+
+  // Adjust parameters for memset, see RTABI section 4.3.4
+  virtual
+  SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+                                  SDValue Chain,
+                                  SDValue Op1, SDValue Op2,
+                                  SDValue Op3, unsigned Align,
+                                  bool isVolatile,
+                                  MachinePointerInfo DstPtrInfo) const;
 };
 
 }
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 2a9d480..c6f266b 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -52,6 +52,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS,
   , HasT2ExtractPack(false)
   , HasDataBarrier(false)
   , Pref32BitThumb(false)
+  , AvoidCPSRPartialUpdate(false)
   , HasMPExtension(false)
   , FPOnlySP(false)
   , AllowsUnalignedMem(false)
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index e024182..0271c87 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -110,6 +110,11 @@ protected:
   /// over 16-bit ones.
   bool Pref32BitThumb;
 
+  /// AvoidCPSRPartialUpdate - If true, codegen would avoid using instructions
+  /// that partially update CPSR and add false dependency on the previous
+  /// CPSR setting instruction.
+  bool AvoidCPSRPartialUpdate;
+
   /// HasMPExtension - True if the subtarget supports Multiprocessing
   /// extension (ARMv7 only).
   bool HasMPExtension;
@@ -190,12 +195,15 @@ protected:
   bool isFPBrccSlow() const { return SlowFPBrcc; }
   bool isFPOnlySP() const { return FPOnlySP; }
   bool prefers32BitThumb() const { return Pref32BitThumb; }
+  bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; }
   bool hasMPExtension() const { return HasMPExtension; }
 
   bool hasFP16() const { return HasFP16; }
   bool hasD16() const { return HasD16; }
 
-  bool isTargetDarwin() const { return TargetTriple.getOS() == Triple::Darwin; }
+  const Triple &getTargetTriple() const { return TargetTriple; }
+
+  bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
   bool isTargetELF() const { return !isTargetDarwin(); }
 
   bool isAPCS_ABI() const { return TargetABI == ARM_ABI_APCS; }
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 6ec6747..29aa4f7 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -22,16 +22,13 @@
 #include "llvm/Target/TargetRegistry.h"
 using namespace llvm;
 
-static cl::opt<bool>ExpandMLx("expand-fp-mlx", cl::init(false), cl::Hidden);
-
 static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
   Triple TheTriple(TT);
-  switch (TheTriple.getOS()) {
-  case Triple::Darwin:
+
+  if (TheTriple.isOSDarwin())
     return new ARMMCAsmInfoDarwin();
-  default:
-    return new ARMELFMCAsmInfo();
-  }
+
+  return new ARMELFMCAsmInfo();
 }
 
 // This is duplicated code. Refactor this.
@@ -41,17 +38,17 @@ static MCStreamer *createMCStreamer(const Target &T, const std::string &TT,
                                     MCCodeEmitter *Emitter,
                                     bool RelaxAll,
                                     bool NoExecStack) {
-  switch (Triple(TT).getOS()) {
-  case Triple::Darwin:
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin())
     return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll);
-  case Triple::MinGW32:
-  case Triple::Cygwin:
-  case Triple::Win32:
+
+  if (TheTriple.isOSWindows()) {
     llvm_unreachable("ARM does not support Windows COFF format");
     return NULL;
-  default:
-    return createELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack);
   }
+
+  return createELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack);
 }
 
 extern "C" void LLVMInitializeARMTarget() {
@@ -148,8 +145,7 @@ bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM,
   // FIXME: temporarily disabling load / store optimization pass for Thumb1.
   if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only())
     PM.add(createARMLoadStoreOptimizationPass(true));
-  if (ExpandMLx &&
-      OptLevel != CodeGenOpt::None && Subtarget.hasVFP2())
+  if (OptLevel != CodeGenOpt::None && Subtarget.isCortexA9())
     PM.add(createMLxExpansionPass());
 
   return true;
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 05b2b46..4bc12c9 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -15,6 +15,7 @@
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCExpr.h"
@@ -114,6 +115,7 @@ class ARMAsmParser : public TargetAsmParser {
 public:
   ARMAsmParser(const Target &T, MCAsmParser &_Parser, TargetMachine &_TM)
     : TargetAsmParser(T), Parser(_Parser), TM(_TM) {
+      MCAsmParserExtension::Initialize(_Parser);
       // Initialize the set of available features.
       setAvailableFeatures(ComputeAvailableFeatures(
           &TM.getSubtarget<ARMSubtarget>()));
@@ -1239,6 +1241,8 @@ tryParseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
         FlagsVal = 0; // No flag
     }
   } else if (SpecReg == "cpsr" || SpecReg == "spsr") {
+    if (Flags == "all") // cpsr_all is an alias for cpsr_fc
+      Flags = "fc";
     for (int i = 0, e = Flags.size(); i != e; ++i) {
       unsigned Flag = StringSwitch<unsigned>(Flags.substr(i, 1))
       .Case("c", 1)
@@ -1826,10 +1830,11 @@ GetMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
       Mnemonic == "rrx" || Mnemonic == "ror" || Mnemonic == "sub" ||
       Mnemonic == "smull" || Mnemonic == "add" || Mnemonic == "adc" ||
       Mnemonic == "mul" || Mnemonic == "bic" || Mnemonic == "asr" ||
-      Mnemonic == "umlal" || Mnemonic == "orr" || Mnemonic == "mov" ||
+      Mnemonic == "umlal" || Mnemonic == "orr" || Mnemonic == "mvn" ||
       Mnemonic == "rsb" || Mnemonic == "rsc" || Mnemonic == "orn" ||
       Mnemonic == "sbc" || Mnemonic == "mla" || Mnemonic == "umull" ||
-      Mnemonic == "eor" || Mnemonic == "smlal" || Mnemonic == "mvn") {
+      Mnemonic == "eor" || Mnemonic == "smlal" ||
+      (Mnemonic == "mov" && !isThumb)) {
     CanAcceptCarrySet = true;
   } else {
     CanAcceptCarrySet = false;
@@ -1848,7 +1853,8 @@ GetMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
 
   if (isThumb)
     if (Mnemonic == "bkpt" || Mnemonic == "mcr" || Mnemonic == "mcrr" ||
-        Mnemonic == "mrc" || Mnemonic == "mrrc" || Mnemonic == "cdp")
+        Mnemonic == "mrc" || Mnemonic == "mrrc" || Mnemonic == "cdp" ||
+        Mnemonic == "mov")
       CanAcceptPredicationCode = false;
 }
 
@@ -2098,15 +2104,29 @@ bool ARMAsmParser::ParseDirectiveThumb(SMLoc L) {
 /// ParseDirectiveThumbFunc
 ///  ::= .thumbfunc symbol_name
 bool ARMAsmParser::ParseDirectiveThumbFunc(SMLoc L) {
-  const AsmToken &Tok = Parser.getTok();
-  if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String))
-    return Error(L, "unexpected token in .thumb_func directive");
-  StringRef Name = Tok.getString();
-  Parser.Lex(); // Consume the identifier token.
+  const MCAsmInfo &MAI = getParser().getStreamer().getContext().getAsmInfo();
+  bool isMachO = MAI.hasSubsectionsViaSymbols();
+  StringRef Name;
+
+  // Darwin asm has function name after .thumb_func direction
+  // ELF doesn't
+  if (isMachO) {
+    const AsmToken &Tok = Parser.getTok();
+    if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String))
+      return Error(L, "unexpected token in .thumb_func directive");
+    Name = Tok.getString();
+    Parser.Lex(); // Consume the identifier token.
+  }
+
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return Error(L, "unexpected token in directive");
   Parser.Lex();
 
+  // FIXME: assuming function name will be the line following .thumb_func
+  if (!isMachO) {
+    Name = Parser.getTok().getString();
+  }
+
   // Mark symbol as a thumb symbol.
   MCSymbol *Func = getParser().getContext().GetOrCreateSymbol(Name);
   getParser().getStreamer().EmitThumbFunc(Func);
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 74f8b53..bdce2c4 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -422,6 +422,10 @@ bool ARMDisassembler::getInstruction(MCInst &MI,
   if (!Builder)
     return false;
 
+  Builder->setupBuilderForSymbolicDisassembly(getLLVMOpInfoCallback(),
+                                              getDisInfoBlock(), getMCContext(),
+                                              Address);
+
   if (!Builder->Build(MI, insn))
     return false;
 
@@ -433,7 +437,7 @@ bool ThumbDisassembler::getInstruction(MCInst &MI,
                                        const MemoryObject &Region,
                                        uint64_t Address,
                                        raw_ostream &os) const {
-  // The Thumb instruction stream is a sequence of halhwords.
+  // The Thumb instruction stream is a sequence of halfwords.
 
   // This represents the first halfword as well as the machine instruction
   // passed to decodeThumbInstruction().  For 16-bit Thumb instruction, the top
@@ -504,6 +508,10 @@ bool ThumbDisassembler::getInstruction(MCInst &MI,
 
   Builder->SetSession(const_cast<Session *>(&SO));
 
+  Builder->setupBuilderForSymbolicDisassembly(getLLVMOpInfoCallback(),
+                                              getDisInfoBlock(), getMCContext(),
+                                              Address);
+
   if (!Builder->Build(MI, insn))
     return false;
 
diff --git a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp
index ca67e5e..271ca8c 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp
@@ -17,6 +17,7 @@
 
 #include "ARMDisassemblerCore.h"
 #include "ARMAddressingModes.h"
+#include "ARMMCExpr.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -532,17 +533,18 @@ static bool BadRegsMulFrm(unsigned Opcode, uint32_t insn) {
   switch (Opcode) {
   default:
     // Did we miss an opcode?
-    assert(0 && "Unexpected opcode!");
+    DEBUG(errs() << "BadRegsMulFrm: unexpected opcode!");
     return false;
   case ARM::MLA:     case ARM::MLS:     case ARM::SMLABB:  case ARM::SMLABT:
   case ARM::SMLATB:  case ARM::SMLATT:  case ARM::SMLAWB:  case ARM::SMLAWT:
-  case ARM::SMMLA:   case ARM::SMMLS:   case ARM::USADA8:
+  case ARM::SMMLA:   case ARM::SMMLAR:  case ARM::SMMLS:   case ARM::SMMLSR:
+  case ARM::USADA8:
     if (R19_16 == 15 || R15_12 == 15 || R11_8 == 15 || R3_0 == 15)
       return true;
     return false;
-  case ARM::MUL:     case ARM::SMMUL:   case ARM::SMULBB:  case ARM::SMULBT:
-  case ARM::SMULTB:  case ARM::SMULTT:  case ARM::SMULWB:  case ARM::SMULWT:
-  case ARM::SMUAD:   case ARM::SMUADX:
+  case ARM::MUL:     case ARM::SMMUL:   case ARM::SMMULR:
+  case ARM::SMULBB:  case ARM::SMULBT:  case ARM::SMULTB:  case ARM::SMULTT:
+  case ARM::SMULWB:  case ARM::SMULWT:  case ARM::SMUAD:   case ARM::SMUADX:
   // A8.6.167 SMLAD & A8.6.172 SMLSD
   case ARM::SMLAD:   case ARM::SMLADX:  case ARM::SMLSD:   case ARM::SMLSDX:
   case ARM::USAD8:
@@ -562,14 +564,14 @@ static bool BadRegsMulFrm(unsigned Opcode, uint32_t insn) {
 }
 
 // Multiply Instructions.
-// MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB, SMLAWT, SMMLA, SMMLS,
-// SMLAD, SMLADX, SMLSD, SMLSDX, USADA8 (for convenience):
+// MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB, SMLAWT, SMMLA, SMMLAR,
+// SMMLS, SMMLAR, SMLAD, SMLADX, SMLSD, SMLSDX, and USADA8 (for convenience):
 //     Rd{19-16} Rn{3-0} Rm{11-8} Ra{15-12}
 // But note that register checking for {SMLAD, SMLADX, SMLSD, SMLSDX} is
 // only for {d, n, m}.
 //
-// MUL, SMMUL, SMULBB, SMULBT, SMULTB, SMULTT, SMULWB, SMULWT, SMUAD, SMUADX,
-// USAD8 (for convenience):
+// MUL, SMMUL, SMMULR, SMULBB, SMULBT, SMULTB, SMULTT, SMULWB, SMULWT, SMUAD,
+// SMUADX, and USAD8 (for convenience):
 //     Rd{19-16} Rn{3-0} Rm{11-8}
 //
 // SMLAL, SMULL, UMAAL, UMLAL, UMULL, SMLALBB, SMLALBT, SMLALTB, SMLALTT,
@@ -893,8 +895,9 @@ static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
 }
 
 // Misc. Branch Instructions.
-// BLX, BLXi, BX
-// BX, BX_RET
+// BX_RET, MOVPCLR
+// BLX, BLX_pred, BX, BX_pred
+// BLXi
 static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
     unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
 
@@ -911,7 +914,7 @@ static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
 
   // BLX and BX take one GPR reg.
   if (Opcode == ARM::BLX || Opcode == ARM::BLX_pred ||
-      Opcode == ARM::BX) {
+      Opcode == ARM::BX || Opcode == ARM::BX_pred) {
     assert(NumOps >= 1 && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
            "Reg operand expected");
     MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
@@ -955,7 +958,7 @@ static bool BadRegsDPFrm(unsigned Opcode, uint32_t insn) {
   switch (Opcode) {
   default:
     // Did we miss an opcode?
-    if (decodeRd(insn) == 15 | decodeRn(insn) == 15 || decodeRm(insn) == 15) {
+    if (decodeRd(insn) == 15 || decodeRn(insn) == 15 || decodeRm(insn) == 15) {
       DEBUG(errs() << "DPFrm with bad reg specifier(s)\n");
       return true;
     }
@@ -1065,7 +1068,8 @@ static bool DisassembleDPFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
     // We have an imm16 = imm4:imm12 (imm4=Inst{19:16}, imm12 = Inst{11:0}).
     assert(getIBit(insn) == 1 && "I_Bit != '1' reg/imm form");
     unsigned Imm16 = slice(insn, 19, 16) << 12 | slice(insn, 11, 0);
-    MI.addOperand(MCOperand::CreateImm(Imm16));
+    if (!B->tryAddingSymbolicOperand(Imm16, 4, MI))
+      MI.addOperand(MCOperand::CreateImm(Imm16));
     ++OpIdx;
   } else {
     // We have a reg/imm form.
@@ -1172,6 +1176,71 @@ static bool DisassembleDPSoRegFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
   return true;
 }
 
+static bool BadRegsLdStFrm(unsigned Opcode, uint32_t insn, bool Store, bool WBack,
+                           bool Imm) {
+  const StringRef Name = ARMInsts[Opcode].Name;
+  unsigned Rt = decodeRd(insn);
+  unsigned Rn = decodeRn(insn);
+  unsigned Rm = decodeRm(insn);
+  unsigned P  = getPBit(insn);
+  unsigned W  = getWBit(insn);
+
+  if (Store) {
+    // Only STR (immediate, register) allows PC as the source.
+    if (Name.startswith("STRB") && Rt == 15) {
+      DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n");
+      return true;
+    }
+    if (WBack && (Rn == 15 || Rn == Rt)) {
+      DEBUG(errs() << "if wback && (n == 15 || n == t) then UNPREDICTABLE\n");
+      return true;
+    }
+    if (!Imm && Rm == 15) {
+      DEBUG(errs() << "if m == 15 then UNPREDICTABLE\n");
+      return true;
+    }
+  } else {
+    // Only LDR (immediate, register) allows PC as the destination.
+    if (Name.startswith("LDRB") && Rt == 15) {
+      DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n");
+      return true;
+    }
+    if (Imm) {
+      // Immediate
+      if (Rn == 15) {
+        // The literal form must be in offset mode; it's an encoding error
+        // otherwise.
+        if (!(P == 1 && W == 0)) {
+          DEBUG(errs() << "Ld literal form with !(P == 1 && W == 0)\n");
+          return true;
+        }
+        // LDRB (literal) does not allow PC as the destination.
+        if (Opcode != ARM::LDRi12 && Rt == 15) {
+          DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n");
+          return true;
+        }
+      } else {
+        // Write back while Rn == Rt does not make sense.
+        if (WBack && (Rn == Rt)) {
+          DEBUG(errs() << "if wback && n == t then UNPREDICTABLE\n");
+          return true;
+        }
+      }
+    } else {
+      // Register
+      if (Rm == 15) {
+        DEBUG(errs() << "if m == 15 then UNPREDICTABLE\n");
+        return true;
+      }
+      if (WBack && (Rn == 15 || Rn == Rt)) {
+        DEBUG(errs() << "if wback && (n == 15 || n == t) then UNPREDICTABLE\n");
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 static bool DisassembleLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
     unsigned short NumOps, unsigned &NumOpsAdded, bool isStore, BO B) {
 
@@ -1234,6 +1303,9 @@ static bool DisassembleLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
   if (OpIdx + 1 >= NumOps)
     return false;
 
+  if (BadRegsLdStFrm(Opcode, insn, isStore, isPrePost, getIBit(insn)==0))
+    return false;
+
   ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub;
   unsigned IndexMode =
                (TID.TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift;
@@ -2586,6 +2658,39 @@ static bool DisassembleNLdSt(MCInst &MI, unsigned Opcode, uint32_t insn,
     // <size> == 32 && Inst{6} == 1 --> DblSpaced = true
     if (Name.endswith("32") || Name.endswith("32_UPD"))
       DblSpaced = slice(insn, 6, 6) == 1;
+  } else if (Name.find("DUP") != std::string::npos) {
+    // Single element (or structure) to all lanes.
+    // Inst{9-8} encodes the number of element(s) in the structure, with:
+    // 0b00 (VLD1DUP) (for this, a bit makes sense only for data size 16 and 32.
+    // 0b01 (VLD2DUP)
+    // 0b10 (VLD3DUP) (for this, a bit must be encoded as 0)
+    // 0b11 (VLD4DUP)
+    //
+    // Inst{7-6} encodes the data size, with:
+    // 0b00 => 8, 0b01 => 16, 0b10 => 32
+    //
+    // Inst{4} (the a bit) encodes the align action (0: standard alignment)
+    unsigned elem = slice(insn, 9, 8) + 1;
+    unsigned a = slice(insn, 4, 4);
+    if (elem != 3) {
+      // 0b11 is not a valid encoding for Inst{7-6}.
+      if (slice(insn, 7, 6) == 3)
+        return false;
+      unsigned data_size = 8 << slice(insn, 7, 6);
+      // For VLD1DUP, a bit makes sense only for data size of 16 and 32.
+      if (a && data_size == 8)
+        return false;
+
+      // Now we can calculate the alignment!
+      if (a)
+        alignment = elem * data_size;
+    } else {
+      if (a) {
+        // A8.6.315 VLD3 (single 3-element structure to all lanes)
+        // The a bit must be encoded as 0.
+        return false;
+      }
+    }
   } else {
     // Multiple n-element structures with type encoded as Inst{11-8}.
     // See, for example, A8.6.316 VLD4 (multiple 4-element structures).
@@ -3211,17 +3316,6 @@ static bool DisassembleNDupFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
   return true;
 }
 
-// A8.6.41 DMB
-// A8.6.42 DSB
-// A8.6.49 ISB
-static inline bool MemBarrierInstr(uint32_t insn) {
-  unsigned op7_4 = slice(insn, 7, 4);
-  if (slice(insn, 31, 8) == 0xf57ff0 && (op7_4 >= 4 && op7_4 <= 6))
-    return true;
-
-  return false;
-}
-
 static inline bool PreLoadOpcode(unsigned Opcode) {
   switch(Opcode) {
   case ARM::PLDi12:  case ARM::PLDrs:
@@ -3285,14 +3379,20 @@ static bool DisassemblePreLoadFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
 static bool DisassembleMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
     unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
 
-  if (MemBarrierInstr(insn)) {
-    // DMBsy, DSBsy, and ISBsy instructions have zero operand and are taken care
-    // of within the generic ARMBasicMCBuilder::BuildIt() method.
-    //
+  if (Opcode == ARM::DMB || Opcode == ARM::DSB) {
     // Inst{3-0} encodes the memory barrier option for the variants.
-    MI.addOperand(MCOperand::CreateImm(slice(insn, 3, 0)));
-    NumOpsAdded = 1;
-    return true;
+    unsigned opt = slice(insn, 3, 0);
+    switch (opt) {
+    case ARM_MB::SY:  case ARM_MB::ST:
+    case ARM_MB::ISH: case ARM_MB::ISHST:
+    case ARM_MB::NSH: case ARM_MB::NSHST:
+    case ARM_MB::OSH: case ARM_MB::OSHST:
+      MI.addOperand(MCOperand::CreateImm(opt));
+      NumOpsAdded = 1;
+      return true;
+    default:
+      return false;
+    }
   }
 
   switch (Opcode) {
@@ -3559,11 +3659,17 @@ bool ARMBasicMCBuilder::TryPredicateAndSBitModifier(MCInst& MI, unsigned Opcode,
         // like ARM.
         //
         // A8.6.16 B
-        if (Name == "t2Bcc")
-          MI.addOperand(MCOperand::CreateImm(CondCode(slice(insn, 25, 22))));
-        else if (Name == "tBcc")
-          MI.addOperand(MCOperand::CreateImm(CondCode(slice(insn, 11, 8))));
-        else
+        // Check for undefined encodings.
+        unsigned cond;
+        if (Name == "t2Bcc") {
+          if ((cond = slice(insn, 25, 22)) >= 14)
+            return false;
+          MI.addOperand(MCOperand::CreateImm(CondCode(cond)));
+        } else if (Name == "tBcc") {
+          if ((cond = slice(insn, 11, 8)) == 14)
+            return false;
+          MI.addOperand(MCOperand::CreateImm(CondCode(cond)));
+        } else
           MI.addOperand(MCOperand::CreateImm(ARMCC::AL));
       } else {
         // ARM instructions get their condition field from Inst{31-28}.
@@ -3632,3 +3738,84 @@ ARMBasicMCBuilder *llvm::CreateMCBuilder(unsigned Opcode, ARMFormat Format) {
   return new ARMBasicMCBuilder(Opcode, Format,
                                ARMInsts[Opcode].getNumOperands());
 }
+
+/// tryAddingSymbolicOperand - tryAddingSymbolicOperand trys to add a symbolic
+/// operand in place of the immediate Value in the MCInst.  The immediate
+/// Value has had any PC adjustment made by the caller.  If the getOpInfo()
+/// function was set as part of the setupBuilderForSymbolicDisassembly() call
+/// then that function is called to get any symbolic information at the
+/// builder's Address for this instrution.  If that returns non-zero then the
+/// symbolic information it returns is used to create an MCExpr and that is
+/// added as an operand to the MCInst.  This function returns true if it adds
+/// an operand to the MCInst and false otherwise.
+bool ARMBasicMCBuilder::tryAddingSymbolicOperand(uint64_t Value,
+                                                 uint64_t InstSize,
+                                                 MCInst &MI) {
+  if (!GetOpInfo)
+    return false;
+
+  struct LLVMOpInfo1 SymbolicOp;
+  SymbolicOp.Value = Value;
+  if (!GetOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp))
+    return false;
+
+  const MCExpr *Add = NULL;
+  if (SymbolicOp.AddSymbol.Present) {
+    if (SymbolicOp.AddSymbol.Name) {
+      StringRef Name(SymbolicOp.AddSymbol.Name);
+      MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+      Add = MCSymbolRefExpr::Create(Sym, *Ctx);
+    } else {
+      Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, *Ctx);
+    }
+  }
+
+  const MCExpr *Sub = NULL;
+  if (SymbolicOp.SubtractSymbol.Present) {
+    if (SymbolicOp.SubtractSymbol.Name) {
+      StringRef Name(SymbolicOp.SubtractSymbol.Name);
+      MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+      Sub = MCSymbolRefExpr::Create(Sym, *Ctx);
+    } else {
+      Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, *Ctx);
+    }
+  }
+
+  const MCExpr *Off = NULL;
+  if (SymbolicOp.Value != 0)
+    Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx);
+
+  const MCExpr *Expr;
+  if (Sub) {
+    const MCExpr *LHS;
+    if (Add)
+      LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx);
+    else
+      LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx);
+    if (Off != 0)
+      Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx);
+    else
+      Expr = LHS;
+  } else if (Add) {
+    if (Off != 0)
+      Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx);
+    else
+      Expr = Add;
+  } else {
+    if (Off != 0)
+      Expr = Off;
+    else
+      Expr = MCConstantExpr::Create(0, *Ctx);
+  }
+
+  if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_HI16)
+    MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateUpper16(Expr, *Ctx)));
+  else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_LO16)
+    MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateLower16(Expr, *Ctx)));
+  else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_None)
+    MI.addOperand(MCOperand::CreateExpr(Expr));
+  else 
+    assert("bad SymbolicOp.VariantKind");
+
+  return true;
+}
diff --git a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h
index 56dd85a..a7ba141 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h
+++ b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h
@@ -22,12 +22,17 @@
 #define ARMDISASSEMBLERCORE_H
 
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/Target/TargetInstrInfo.h"
+#include "llvm-c/Disassembler.h"
 #include "ARMBaseInstrInfo.h"
 #include "ARMRegisterInfo.h"
 #include "ARMDisassembler.h"
 
 namespace llvm {
+class MCContext;
 
 class ARMUtils {
 public:
@@ -134,6 +139,31 @@ static inline void setSlice(unsigned &Bits, unsigned From, unsigned To,
   Bits |= (Val & Mask) << To;
 }
 
+// Return an integer result equal to the number of bits of x that are ones.
+static inline uint32_t
+BitCount (uint64_t x)
+{
+    // c accumulates the total bits set in x
+    uint32_t c;
+    for (c = 0; x; ++c)
+    {
+        x &= x - 1; // clear the least significant bit set
+    }
+    return c;
+}
+
+static inline bool
+BitIsSet (const uint64_t value, const uint64_t bit)
+{
+    return (value & (1ull << bit)) != 0;
+}
+
+static inline bool
+BitIsClear (const uint64_t value, const uint64_t bit)
+{
+    return (value & (1ull << bit)) == 0;
+}
+
 /// Various utilities for checking the target specific flags.
 
 /// A unary data processing instruction doesn't have an Rn operand.
@@ -202,7 +232,7 @@ private:
 public:
   ARMBasicMCBuilder(ARMBasicMCBuilder &B)
     : Opcode(B.Opcode), Format(B.Format), NumOps(B.NumOps), Disasm(B.Disasm),
-      SP(B.SP) {
+      SP(B.SP), GetOpInfo(0), DisInfo(0), Ctx(0) {
     Err = 0;
   }
 
@@ -261,6 +291,44 @@ private:
     assert(SP);
     return slice(SP->ITState, 7, 4);
   }
+
+private:
+  //
+  // Hooks for symbolic disassembly via the public 'C' interface.
+  //
+  // The function to get the symbolic information for operands.
+  LLVMOpInfoCallback GetOpInfo;
+  // The pointer to the block of symbolic information for above call back.
+  void *DisInfo;
+  // The assembly context for creating symbols and MCExprs in place of
+  // immediate operands when there is symbolic information.
+  MCContext *Ctx;
+  // The address of the instruction being disassembled.
+  uint64_t Address;
+
+public:
+  void setupBuilderForSymbolicDisassembly(LLVMOpInfoCallback getOpInfo,
+                                          void *disInfo, MCContext *ctx,
+                                          uint64_t address) {
+    GetOpInfo = getOpInfo;
+    DisInfo = disInfo;
+    Ctx = ctx;
+    Address = address;
+  }
+
+  uint64_t getBuilderAddress() const { return Address; }
+
+  /// tryAddingSymbolicOperand - tryAddingSymbolicOperand trys to add a symbolic
+  /// operand in place of the immediate Value in the MCInst.  The immediate
+  /// Value has had any PC adjustment made by the caller.  If the getOpInfo()
+  /// function was set as part of the setupBuilderForSymbolicDisassembly() call
+  /// then that function is called to get any symbolic information at the
+  /// builder's Address for this instrution.  If that returns non-zero then the
+  /// symbolic information it returns is used to create an MCExpr and that is
+  /// added as an operand to the MCInst.  This function returns true if it adds
+  /// an operand to the MCInst and false otherwise.
+  bool tryAddingSymbolicOperand(uint64_t Value, uint64_t InstSize, MCInst &MI);
+
 };
 
 } // namespace llvm
diff --git a/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h b/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h
index 066a8e3..9639c8a 100644
--- a/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h
+++ b/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h
@@ -108,6 +108,8 @@ static inline bool IsGPR(unsigned RegClass) {
 
 // Utilities for 32-bit Thumb instructions.
 
+static inline bool BadReg(uint32_t n) { return n == 13 || n == 15; }
+
 // Extract imm4: Inst{19-16}.
 static inline unsigned getImm4(uint32_t insn) {
   return slice(insn, 19, 16);
@@ -398,9 +400,17 @@ static bool DisassembleThumb1General(MCInst &MI, unsigned Opcode, uint32_t insn,
     assert(OpInfo[OpIdx].RegClass < 0 &&
            !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()
            && "Pure imm operand expected");
-    MI.addOperand(MCOperand::CreateImm(UseRt ? getT1Imm8(insn)
-                                             : (Imm3 ? getT1Imm3(insn)
-                                                     : getT1Imm5(insn))));
+    unsigned Imm = 0;
+    if (UseRt)
+      Imm = getT1Imm8(insn);
+    else if (Imm3)
+      Imm = getT1Imm3(insn);
+    else {
+      Imm = getT1Imm5(insn);
+      ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 12, 11));
+      getImmShiftSE(ShOp, Imm);
+    }
+    MI.addOperand(MCOperand::CreateImm(Imm));
   }
   ++OpIdx;
 
@@ -466,9 +476,11 @@ static bool DisassembleThumb1DP(MCInst &MI, unsigned Opcode, uint32_t insn,
 // tADDhirr: Rd Rd(TIED_TO) Rm
 // tCMPhir:  Rd Rm
 // tMOVr, tMOVgpr2gpr, tMOVgpr2tgpr, tMOVtgpr2gpr: Rd|tRd Rm|tRn
+// tBX: Rm
 // tBX_RET: 0 operand
 // tBX_RET_vararg: Rm
 // tBLXr_r9: Rm
+// tBRIND: Rm
 static bool DisassembleThumb1Special(MCInst &MI, unsigned Opcode, uint32_t insn,
     unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
 
@@ -476,11 +488,26 @@ static bool DisassembleThumb1Special(MCInst &MI, unsigned Opcode, uint32_t insn,
   if (NumOps == 0)
     return true;
 
-  // BX/BLX has 1 reg operand: Rm.
-  if (NumOps == 1) {
+  // BX/BLX/tBRIND (indirect branch, i.e, mov pc, Rm) has 1 reg operand: Rm.
+  if (Opcode==ARM::tBLXr_r9 || Opcode==ARM::tBX || Opcode==ARM::tBRIND) {
+    if (Opcode == ARM::tBLXr_r9) {
+      // Handling the two predicate operands before the reg operand.
+      if (!B->DoPredicateOperands(MI, Opcode, insn, NumOps))
+        return false;
+      NumOpsAdded += 2;
+    }
+
     MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
                                                        getT1Rm(insn))));
-    NumOpsAdded = 1;
+    NumOpsAdded += 1;
+
+    if (Opcode == ARM::tBX) {
+      // Handling the two predicate operands after the reg operand.
+      if (!B->DoPredicateOperands(MI, Opcode, insn, NumOps))
+        return false;
+      NumOpsAdded += 2;
+    }
+
     return true;
   }
 
@@ -890,6 +917,10 @@ static bool DisassembleThumb1LdStMul(bool Ld, MCInst &MI, unsigned Opcode,
   }
 
   unsigned RegListBits = slice(insn, 7, 0);
+  if (BitCount(RegListBits) < 1) {
+    DEBUG(errs() << "if BitCount(registers) < 1 then UNPREDICTABLE\n");
+    return false;
+  }
 
   // Fill the variadic part of reglist.
   for (unsigned i = 0; i < 8; ++i)
@@ -936,10 +967,15 @@ static bool DisassembleThumb1CondBr(MCInst &MI, unsigned Opcode, uint32_t insn,
 
   unsigned Imm8 = getT1Imm8(insn);
   MI.addOperand(MCOperand::CreateImm(
-                  Opcode == ARM::tBcc ? SignExtend32<9>(Imm8 << 1) + 4
+                  Opcode == ARM::tBcc ? SignExtend32<9>(Imm8 << 1)
                                       : (int)Imm8));
 
   // Predicate operands by ARMBasicMCBuilder::TryPredicateAndSBitModifier().
+  // But note that for tBcc, if cond = '1110' then UNDEFINED.
+  if (Opcode == ARM::tBcc && slice(insn, 11, 8) == 14) {
+    DEBUG(errs() << "if cond = '1110' then UNDEFINED\n");
+    return false;
+  }
   NumOpsAdded = 1;
 
   return true;
@@ -1120,8 +1156,12 @@ static bool DisassembleThumb2SRS(MCInst &MI, unsigned Opcode, uint32_t insn,
 // t2RFE[IA|DB]W/t2RFE[IA|DB]: Rn
 static bool DisassembleThumb2RFE(MCInst &MI, unsigned Opcode, uint32_t insn,
     unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
-                                                     decodeRn(insn))));
+  unsigned Rn = decodeRn(insn);
+  if (Rn == 15) {
+    DEBUG(errs() << "if n == 15 then UNPREDICTABLE\n");
+    return false;
+  }
+  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B,ARM::GPRRegClassID,Rn)));
   NumOpsAdded = 1;
   return true;
 }
@@ -1202,29 +1242,66 @@ static bool DisassembleThumb2LdStEx(MCInst &MI, unsigned Opcode, uint32_t insn,
   bool isSW = (Opcode == ARM::t2LDREX || Opcode == ARM::t2STREX);
   bool isDW = (Opcode == ARM::t2LDREXD || Opcode == ARM::t2STREXD);
 
+  unsigned Rt  = decodeRd(insn);
+  unsigned Rt2 = decodeRs(insn); // But note that this is Rd for t2STREX.
+  unsigned Rd  = decodeRm(insn);
+  unsigned Rn  = decodeRn(insn);
+
+  // Some sanity checking first.
+  if (isStore) {
+    // if d == n || d == t then UNPREDICTABLE
+    // if d == n || d == t || d == t2 then UNPREDICTABLE
+    if (isDW) {
+      if (Rd == Rn || Rd == Rt || Rd == Rt2) {
+        DEBUG(errs() << "if d == n || d == t || d == t2 then UNPREDICTABLE\n");
+        return false;
+      }
+    } else {
+      if (isSW) {
+        if (Rt2 == Rn || Rt2 == Rt) {
+          DEBUG(errs() << "if d == n || d == t then UNPREDICTABLE\n");
+          return false;
+        }
+      } else {
+        if (Rd == Rn || Rd == Rt) {
+          DEBUG(errs() << "if d == n || d == t then UNPREDICTABLE\n");
+          return false;
+        }
+      }
+    }
+  } else {
+    // Load
+    // A8.6.71 LDREXD
+    // if t == t2 then UNPREDICTABLE
+    if (isDW && Rt == Rt2) {
+      DEBUG(errs() << "if t == t2 then UNPREDICTABLE\n");
+      return false;
+    }
+  }
+
   // Add the destination operand for store.
   if (isStore) {
     MI.addOperand(MCOperand::CreateReg(
                     getRegisterEnum(B, OpInfo[OpIdx].RegClass,
-                                    isSW ? decodeRs(insn) : decodeRm(insn))));
+                                    isSW ? Rt2 : Rd)));
     ++OpIdx;
   }
 
   // Source operand for store and destination operand for load.
   MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
-                                                     decodeRd(insn))));
+                                                     Rt)));
   ++OpIdx;
 
   // Thumb2 doubleword complication: with an extra source/destination operand.
   if (isDW) {
     MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B,OpInfo[OpIdx].RegClass,
-                                                       decodeRs(insn))));
+                                                       Rt2)));
     ++OpIdx;
   }
 
   // Finally add the pointer operand.
   MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
-                                                     decodeRn(insn))));
+                                                     Rn)));
   ++OpIdx;
 
   return true;
@@ -1249,6 +1326,35 @@ static bool DisassembleThumb2LdStDual(MCInst &MI, unsigned Opcode,
          && OpInfo[3].RegClass < 0
          && "Expect >= 4 operands and first 3 as reg operands");
 
+  // Thumnb allows for specifying Rt and Rt2, unlike ARM (which has Rt2==Rt+1).
+  unsigned Rt  = decodeRd(insn);
+  unsigned Rt2 = decodeRs(insn);
+  unsigned Rn  = decodeRn(insn);
+
+  // Some sanity checking first.
+
+  // A8.6.67 LDRD (literal) has its W bit as (0).
+  if (Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2LDRD_PRE || Opcode == ARM::t2LDRD_POST) {
+    if (Rn == 15 && slice(insn, 21, 21) != 0)
+      return false;
+  } else {
+    // For Dual Store, PC cannot be used as the base register.
+    if (Rn == 15) {
+      DEBUG(errs() << "if n == 15 then UNPREDICTABLE\n");
+      return false;
+    }
+  }
+  if (Rt == Rt2) {
+    DEBUG(errs() << "if t == t2 then UNPREDICTABLE\n");
+    return false;
+  }
+  if (Opcode != ARM::t2LDRDi8 && Opcode != ARM::t2STRDi8) {
+    if (Rn == Rt || Rn == Rt2) {
+      DEBUG(errs() << "if wback && (n == t || n == t2) then UNPREDICTABLE\n");
+      return false;
+    }
+  }
+
   // Add the <Rt> <Rt2> operands.
   unsigned RegClassPair = OpInfo[0].RegClass;
   unsigned RegClassBase = OpInfo[2].RegClass;
@@ -1385,9 +1491,12 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn,
   if (OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate()
       && !OpInfo[OpIdx].isOptionalDef()) {
 
-    if (Thumb2ShiftOpcode(Opcode))
-      MI.addOperand(MCOperand::CreateImm(getShiftAmtBits(insn)));
-    else {
+    if (Thumb2ShiftOpcode(Opcode)) {
+      unsigned Imm = getShiftAmtBits(insn);
+      ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 5, 4));
+      getImmShiftSE(ShOp, Imm);
+      MI.addOperand(MCOperand::CreateImm(Imm));
+    } else {
       // Build the constant shift specifier operand.
       unsigned bits2 = getShiftTypeBits(insn);
       unsigned imm5 = getShiftAmtBits(insn);
@@ -1412,7 +1521,8 @@ static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn,
 static bool DisassembleThumb2DPModImm(MCInst &MI, unsigned Opcode,
     uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
 
-  const TargetOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
+  const TargetInstrDesc &TID = ARMInsts[Opcode];
+  const TargetOperandInfo *OpInfo = TID.OpInfo;
   unsigned &OpIdx = NumOpsAdded;
 
   OpIdx = 0;
@@ -1439,8 +1549,15 @@ static bool DisassembleThumb2DPModImm(MCInst &MI, unsigned Opcode,
       DEBUG(errs()<<"Thumb2 encoding error: d==15 for DPModImm 2-reg instr.\n");
       return false;
     }
-    MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RnRegClassID,
-                                                       decodeRn(insn))));
+    int Idx;
+    if ((Idx = TID.getOperandConstraint(OpIdx, TOI::TIED_TO)) != -1) {
+      // The reg operand is tied to the first reg operand.
+      MI.addOperand(MI.getOperand(Idx));
+    } else {
+      // Add second reg operand.
+      MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RnRegClassID,
+                                                         decodeRn(insn))));
+    }
     ++OpIdx;
   }
 
@@ -1509,7 +1626,7 @@ static bool DisassembleThumb2Sat(MCInst &MI, unsigned Opcode, uint32_t insn,
 // o t2ADDri12, t2SUBri12: Rs Rn imm12
 // o t2LEApcrel (ADR): Rs imm12
 // o t2BFC (BFC): Rs Ro(TIED_TO) bf_inv_mask_imm
-// o t2BFI (BFI) (Currently not defined in LLVM as of Jan-07-2010)
+// o t2BFI (BFI): Rs Ro(TIED_TO) Rn bf_inv_mask_imm
 // o t2MOVi16: Rs imm16
 // o t2MOVTi16: Rs imm16
 // o t2SBFX (SBFX): Rs Rn lsb width
@@ -1570,9 +1687,10 @@ static bool DisassembleThumb2DPBinImm(MCInst &MI, unsigned Opcode,
   if (Opcode == ARM::t2ADDri12 || Opcode == ARM::t2SUBri12
       || Opcode == ARM::t2LEApcrel)
     MI.addOperand(MCOperand::CreateImm(getIImm3Imm8(insn)));
-  else if (Opcode == ARM::t2MOVi16 || Opcode == ARM::t2MOVTi16)
-    MI.addOperand(MCOperand::CreateImm(getImm16(insn)));
-  else if (Opcode == ARM::t2BFC || Opcode == ARM::t2BFI) {
+  else if (Opcode == ARM::t2MOVi16 || Opcode == ARM::t2MOVTi16) {
+    if (!B->tryAddingSymbolicOperand(getImm16(insn), 4, MI))
+      MI.addOperand(MCOperand::CreateImm(getImm16(insn)));
+  } else if (Opcode == ARM::t2BFC || Opcode == ARM::t2BFI) {
     uint32_t mask = 0;
     if (getBitfieldInvMask(insn, mask))
       MI.addOperand(MCOperand::CreateImm(mask));
@@ -1616,8 +1734,7 @@ static inline bool t2MiscCtrlInstr(uint32_t insn) {
 // A8.6.26
 // t2BXJ -> Rn
 //
-// Miscellaneous control: t2DMBsy (and its t2DMB variants),
-// t2DSBsy (and its t2DSB varianst), t2ISBsy, t2CLREX
+// Miscellaneous control:
 //   -> no operand (except pred-imm pred-ccr for CLREX, memory barrier variants)
 //
 // Hint: t2NOP, t2YIELD, t2WFE, t2WFI, t2SEV
@@ -1634,6 +1751,22 @@ static bool DisassembleThumb2BrMiscCtrl(MCInst &MI, unsigned Opcode,
   if (NumOps == 0)
     return true;
 
+  if (Opcode == ARM::t2DMB || Opcode == ARM::t2DSB) {
+    // Inst{3-0} encodes the memory barrier option for the variants.
+    unsigned opt = slice(insn, 3, 0);
+    switch (opt) {
+    case ARM_MB::SY:  case ARM_MB::ST:
+    case ARM_MB::ISH: case ARM_MB::ISHST:
+    case ARM_MB::NSH: case ARM_MB::NSHST:
+    case ARM_MB::OSH: case ARM_MB::OSHST:
+      MI.addOperand(MCOperand::CreateImm(opt));
+      NumOpsAdded = 1;
+      return true;
+    default:
+      return false;
+    }
+  }
+
   if (t2MiscCtrlInstr(insn))
     return true;
 
@@ -1741,7 +1874,9 @@ static bool DisassembleThumb2BrMiscCtrl(MCInst &MI, unsigned Opcode,
     Offset = decodeImm32_BLX(insn);
     break;
   }
-  MI.addOperand(MCOperand::CreateImm(Offset));
+
+  if (!B->tryAddingSymbolicOperand(Offset + B->getBuilderAddress() + 4, 4, MI))
+    MI.addOperand(MCOperand::CreateImm(Offset));
 
   // This is an increment as some predicate operands may have been added first.
   NumOpsAdded += 1;
@@ -1819,6 +1954,87 @@ static bool DisassembleThumb2PreLoad(MCInst &MI, unsigned Opcode, uint32_t insn,
   return true;
 }
 
+static bool BadRegsThumb2LdSt(unsigned Opcode, uint32_t insn, bool Load,
+      unsigned R0, unsigned R1, unsigned R2, bool UseRm, bool WB) {
+
+  // Inst{22-21} encodes the data item transferred for load/store.
+  // For single word, it is encoded as ob10.
+  bool Word = (slice(insn, 22, 21) == 2);
+  bool Half = (slice(insn, 22, 21) == 1);
+  bool Byte = (slice(insn, 22, 21) == 0);
+
+  if (UseRm && BadReg(R2)) {
+    DEBUG(errs() << "if BadReg(m) then UNPREDICTABLE\n");
+    return true;
+  }
+
+  if (Load) {
+    if (!Word && R0 == 13) {
+      DEBUG(errs() << "if t == 13 then UNPREDICTABLE\n");
+      return true;
+    }
+    if (Byte) {
+      if (WB && R0 == 15 && slice(insn, 10, 8) == 3)  {
+        // A8.6.78 LDRSB (immediate) Encoding T2 (errata markup 8.0)
+        DEBUG(errs() << "if t == 15 && PUW == '011' then UNPREDICTABLE\n");
+        return true;
+      }
+    }
+    // A6.3.8 Load halfword, memory hints
+    if (Half) {
+      if (WB) {
+        if (R0 == R1)  {
+          // A8.6.82 LDRSH (immediate) Encoding T2
+          DEBUG(errs() << "if WB && n == t then UNPREDICTABLE\n");
+          return true;
+        }
+        if (R0 == 15 && slice(insn, 10, 8) == 3)  {
+          // A8.6.82 LDRSH (immediate) Encoding T2 (errata markup 8.0)
+          DEBUG(errs() << "if t == 15 && PUW == '011' then UNPREDICTABLE\n");
+          return true;
+        }
+      } else {
+        if (Opcode == ARM::t2LDRHi8 || Opcode == ARM::t2LDRSHi8) {
+          if (R0 == 15 && slice(insn, 10, 8) == 4) {
+            // A8.6.82 LDRSH (immediate) Encoding T2
+            DEBUG(errs() << "if Rt == '1111' and PUW == '100' then SEE"
+                         << " \"Unallocated memory hints\"\n");
+            return true;
+          }
+        } else {
+          if (R0 == 15) {
+            // A8.6.82 LDRSH (immediate) Encoding T1
+            DEBUG(errs() << "if Rt == '1111' then SEE"
+                         << " \"Unallocated memory hints\"\n");
+            return true;
+          }
+        }
+      }
+    }
+  } else {
+    if (WB && R0 == R1) {
+      DEBUG(errs() << "if wback && n == t then UNPREDICTABLE\n");
+      return true;
+    }
+    if ((WB && R0 == 15) || (!WB && R1 == 15)) {
+      DEBUG(errs() << "if Rn == '1111' then UNDEFINED\n");
+      return true;
+    }
+    if (Word) {
+      if ((WB && R1 == 15) || (!WB && R0 == 15)) {
+        DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n");
+        return true;
+      }
+    } else {
+      if ((WB && BadReg(R1)) || (!WB && BadReg(R0))) {
+        DEBUG(errs() << "if BadReg(t) then UNPREDICTABLE\n");
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 // A6.3.10 Store single data item
 // A6.3.9 Load byte, memory hints
 // A6.3.8 Load halfword, memory hints
@@ -1864,8 +2080,8 @@ static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode,
   OpIdx = 0;
 
   assert(NumOps >= 3 &&
-         OpInfo[0].RegClass == ARM::GPRRegClassID &&
-         OpInfo[1].RegClass == ARM::GPRRegClassID &&
+         OpInfo[0].RegClass > 0 &&
+         OpInfo[1].RegClass > 0 &&
          "Expect >= 3 operands and first two as reg operands");
 
   bool ThreeReg = (OpInfo[2].RegClass > 0);
@@ -1873,7 +2089,7 @@ static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode,
   bool Imm12 = !ThreeReg && slice(insn, 23, 23) == 1; // ARMInstrThumb2.td
 
   // Build the register operands, followed by the immediate.
-  unsigned R0, R1, R2 = 0;
+  unsigned R0 = 0, R1 = 0, R2 = 0;
   unsigned Rd = decodeRd(insn);
   int Imm = 0;
 
@@ -1904,10 +2120,10 @@ static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode,
       Imm = decodeImm8(insn);
   }
 
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
+  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
                                                      R0)));
   ++OpIdx;
-  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
+  MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
                                                      R1)));
   ++OpIdx;
 
@@ -1918,6 +2134,10 @@ static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode,
     ++OpIdx;
   }
 
+  if (BadRegsThumb2LdSt(Opcode, insn, Load, R0, R1, R2, ThreeReg & !TIED_TO,
+                        TIED_TO))
+    return false;
+
   assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate()
          && !OpInfo[OpIdx].isOptionalDef()
          && "Pure imm operand expected");
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index fc2aa75..8ae87f8 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -29,8 +29,8 @@ StringRef ARMInstPrinter::getOpcodeName(unsigned Opcode) const {
   return getInstructionName(Opcode);
 }
 
-StringRef ARMInstPrinter::getRegName(unsigned RegNo) const {
-  return getRegisterName(RegNo);
+void ARMInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << getRegisterName(RegNo);
 }
 
 void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index b3ac03a..bde0eb9 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -28,7 +28,7 @@ public:
 
   virtual void printInst(const MCInst *MI, raw_ostream &O);
   virtual StringRef getOpcodeName(unsigned Opcode) const;
-  virtual StringRef getRegName(unsigned RegNo) const;
+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
 
   static const char *getInstructionName(unsigned Opcode);
 
diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp
index 9a27e2f..f6d0242 100644
--- a/lib/Target/ARM/MLxExpansionPass.cpp
+++ b/lib/Target/ARM/MLxExpansionPass.cpp
@@ -15,11 +15,13 @@
 #define DEBUG_TYPE "mlx-expansion"
 #include "ARM.h"
 #include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -49,15 +51,17 @@ namespace {
     const TargetRegisterInfo *TRI;
     MachineRegisterInfo *MRI;
 
+    bool isA9;
     unsigned MIIdx;
     MachineInstr* LastMIs[4];
+    SmallPtrSet<MachineInstr*, 4> IgnoreStall;
 
     void clearStack();
     void pushStack(MachineInstr *MI);
     MachineInstr *getAccDefMI(MachineInstr *MI) const;
     unsigned getDefReg(MachineInstr *MI) const;
     bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const;
-    bool FindMLxHazard(MachineInstr *MI) const;
+    bool FindMLxHazard(MachineInstr *MI);
     void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
                                 unsigned MulOpc, unsigned AddSubOpc,
                                 bool NegAcc, bool HasLane);
@@ -146,7 +150,7 @@ bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const {
 }
 
 
-bool MLxExpansion::FindMLxHazard(MachineInstr *MI) const {
+bool MLxExpansion::FindMLxHazard(MachineInstr *MI) {
   if (NumExpand >= ExpandLimit)
     return false;
 
@@ -154,7 +158,7 @@ bool MLxExpansion::FindMLxHazard(MachineInstr *MI) const {
     return true;
 
   MachineInstr *DefMI = getAccDefMI(MI);
-  if (TII->isFpMLxInstruction(DefMI->getOpcode()))
+  if (TII->isFpMLxInstruction(DefMI->getOpcode())) {
     // r0 = vmla
     // r3 = vmla r0, r1, r2
     // takes 16 - 17 cycles
@@ -163,24 +167,33 @@ bool MLxExpansion::FindMLxHazard(MachineInstr *MI) const {
     // r4 = vmul r1, r2
     // r3 = vadd r0, r4
     // takes about 14 - 15 cycles even with vmul stalling for 4 cycles.
+    IgnoreStall.insert(DefMI);
     return true;
+  }
+
+  if (IgnoreStall.count(MI))
+    return false;
 
   // If a VMLA.F is followed by an VADD.F or VMUL.F with no RAW hazard, the
   // VADD.F or VMUL.F will stall 4 cycles before issue. The 4 cycle stall
   // preserves the in-order retirement of the instructions.
   // Look at the next few instructions, if *most* of them can cause hazards,
   // then the scheduler can't *fix* this, we'd better break up the VMLA.
+  unsigned Limit1 = isA9 ? 1 : 4;
+  unsigned Limit2 = isA9 ? 1 : 4;
   for (unsigned i = 1; i <= 4; ++i) {
     int Idx = ((int)MIIdx - i + 4) % 4;
     MachineInstr *NextMI = LastMIs[Idx];
     if (!NextMI)
       continue;
 
-    if (TII->canCauseFpMLxStall(NextMI->getOpcode()))
-      return true;
+    if (TII->canCauseFpMLxStall(NextMI->getOpcode())) {
+      if (i <= Limit1)
+        return true;
+    }
 
     // Look for VMLx RAW hazard.
-    if (hasRAWHazard(getDefReg(MI), NextMI))
+    if (i <= Limit2 && hasRAWHazard(getDefReg(MI), NextMI))
       return true;
   }
 
@@ -248,6 +261,7 @@ bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) {
   bool Changed = false;
 
   clearStack();
+  IgnoreStall.clear();
 
   unsigned Skip = 0;
   MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend();
@@ -299,6 +313,8 @@ bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) {
   TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo());
   TRI = Fn.getTarget().getRegisterInfo();
   MRI = &Fn.getRegInfo();
+  const ARMSubtarget *STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
+  isA9 = STI->isCortexA9();
 
   bool Modified = false;
   for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index dee3d27..e56d481 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -136,8 +136,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
     BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
       .addFrameIndex(FramePtrSpillFI).addImm(0)
       .setMIFlags(MachineInstr::FrameSetup);
-    if (NumBytes > 7)
-      // If offset is > 7 then sp cannot be adjusted in a single instruction,
+    if (NumBytes > 508)
+      // If offset is > 508 then sp cannot be adjusted in a single instruction,
       // try restoring from fp instead.
       AFI->setShouldRestoreSPFromFP(true);
   }
diff --git a/lib/Target/ARM/Thumb1FrameLowering.h b/lib/Target/ARM/Thumb1FrameLowering.h
index c592e12..bcfc516 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.h
+++ b/lib/Target/ARM/Thumb1FrameLowering.h
@@ -12,7 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #ifndef __THUMB_FRAMEINFO_H_
-#define __THUMM_FRAMEINFO_H_
+#define __THUMB_FRAMEINFO_H_
 
 #include "ARM.h"
 #include "ARMFrameLowering.h"
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp
index 0cdca11..6bf5650 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp
@@ -31,8 +31,6 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
@@ -48,6 +46,14 @@ Thumb1RegisterInfo::Thumb1RegisterInfo(const ARMBaseInstrInfo &tii,
   : ARMBaseRegisterInfo(tii, sti) {
 }
 
+const TargetRegisterClass*
+Thumb1RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC)
+                                                                         const {
+  if (ARM::tGPRRegClass.hasSubClassEq(RC))
+    return ARM::tGPRRegisterClass;
+  return ARMBaseRegisterInfo::getLargestLegalSuperClass(RC);
+}
+
 const TargetRegisterClass *
 Thumb1RegisterInfo::getPointerRegClass(unsigned Kind) const {
   return ARM::tGPRRegisterClass;
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h
index b4fdd67..9060e59 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.h
+++ b/lib/Target/ARM/Thumb1RegisterInfo.h
@@ -28,6 +28,9 @@ struct Thumb1RegisterInfo : public ARMBaseRegisterInfo {
 public:
   Thumb1RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);
 
+  const TargetRegisterClass*
+  getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
+
   const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const;
 
   /// emitLoadConstPool - Emits a load from constpool to materialize the
diff --git a/lib/Target/ARM/Thumb2RegisterInfo.cpp b/lib/Target/ARM/Thumb2RegisterInfo.cpp
index ce8edbe..355c3bf 100644
--- a/lib/Target/ARM/Thumb2RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb2RegisterInfo.cpp
@@ -13,26 +13,15 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARM.h"
-#include "ARMAddressingModes.h"
-#include "ARMBaseInstrInfo.h"
-#include "ARMMachineFunctionInfo.h"
 #include "ARMSubtarget.h"
 #include "Thumb2InstrInfo.h"
 #include "Thumb2RegisterInfo.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Function.h"
-#include "llvm/LLVMContext.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLocation.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/ErrorHandling.h"
 using namespace llvm;
 
 Thumb2RegisterInfo::Thumb2RegisterInfo(const ARMBaseInstrInfo &tii,
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index be9c150..ce2e966 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -12,6 +12,7 @@
 #include "ARMAddressingModes.h"
 #include "ARMBaseRegisterInfo.h"
 #include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
 #include "Thumb2InstrInfo.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -49,82 +50,86 @@ namespace {
                            // 1 - No cc field.
                            // 2 - Always set CPSR.
     unsigned PredCC2  : 2;
+    unsigned PartFlag : 1; // 16-bit instruction does partial flag update
     unsigned Special  : 1; // Needs to be dealt with specially
   };
 
   static const ReduceEntry ReduceTable[] = {
-    // Wide,        Narrow1,      Narrow2,     imm1,imm2,  lo1, lo2, P/C, S
-    { ARM::t2ADCrr, 0,            ARM::tADC,     0,   0,    0,   1,  0,0, 0 },
-    { ARM::t2ADDri, ARM::tADDi3,  ARM::tADDi8,   3,   8,    1,   1,  0,0, 0 },
-    { ARM::t2ADDrr, ARM::tADDrr,  ARM::tADDhirr, 0,   0,    1,   0,  0,1, 0 },
+    // Wide,        Narrow1,      Narrow2,     imm1,imm2,  lo1, lo2, P/C, PF, S
+    { ARM::t2ADCrr, 0,            ARM::tADC,     0,   0,    0,   1,  0,0, 0,0 },
+    { ARM::t2ADDri, ARM::tADDi3,  ARM::tADDi8,   3,   8,    1,   1,  0,0, 0,0 },
+    { ARM::t2ADDrr, ARM::tADDrr,  ARM::tADDhirr, 0,   0,    1,   0,  0,1, 0,0 },
     // Note: immediate scale is 4.
-    { ARM::t2ADDrSPi,ARM::tADDrSPi,0,            8,   0,    1,   0,  1,0, 1 },
-    { ARM::t2ADDSri,ARM::tADDi3,  ARM::tADDi8,   3,   8,    1,   1,  2,2, 1 },
-    { ARM::t2ADDSrr,ARM::tADDrr,  0,             0,   0,    1,   0,  2,0, 1 },
-    { ARM::t2ANDrr, 0,            ARM::tAND,     0,   0,    0,   1,  0,0, 0 },
-    { ARM::t2ASRri, ARM::tASRri,  0,             5,   0,    1,   0,  0,0, 0 },
-    { ARM::t2ASRrr, 0,            ARM::tASRrr,   0,   0,    0,   1,  0,0, 0 },
-    { ARM::t2BICrr, 0,            ARM::tBIC,     0,   0,    0,   1,  0,0, 0 },
+    { ARM::t2ADDrSPi,ARM::tADDrSPi,0,            8,   0,    1,   0,  1,0, 0,1 },
+    { ARM::t2ADDSri,ARM::tADDi3,  ARM::tADDi8,   3,   8,    1,   1,  2,2, 0,1 },
+    { ARM::t2ADDSrr,ARM::tADDrr,  0,             0,   0,    1,   0,  2,0, 0,1 },
+    { ARM::t2ANDrr, 0,            ARM::tAND,     0,   0,    0,   1,  0,0, 1,0 },
+    { ARM::t2ASRri, ARM::tASRri,  0,             5,   0,    1,   0,  0,0, 1,0 },
+    { ARM::t2ASRrr, 0,            ARM::tASRrr,   0,   0,    0,   1,  0,0, 1,0 },
+    { ARM::t2BICrr, 0,            ARM::tBIC,     0,   0,    0,   1,  0,0, 1,0 },
     //FIXME: Disable CMN, as CCodes are backwards from compare expectations
-    //{ ARM::t2CMNrr, ARM::tCMN,    0,             0,   0,    1,   0,  2,0, 0 },
-    { ARM::t2CMPri, ARM::tCMPi8,  0,             8,   0,    1,   0,  2,0, 0 },
-    { ARM::t2CMPrr, ARM::tCMPhir, 0,             0,   0,    0,   0,  2,0, 1 },
-    { ARM::t2EORrr, 0,            ARM::tEOR,     0,   0,    0,   1,  0,0, 0 },
+    //{ ARM::t2CMNrr, ARM::tCMN,  0,             0,   0,    1,   0,  2,0, 0,0 },
+    { ARM::t2CMPri, ARM::tCMPi8,  0,             8,   0,    1,   0,  2,0, 0,0 },
+    { ARM::t2CMPrr, ARM::tCMPhir, 0,             0,   0,    0,   0,  2,0, 0,1 },
+    { ARM::t2EORrr, 0,            ARM::tEOR,     0,   0,    0,   1,  0,0, 1,0 },
     // FIXME: adr.n immediate offset must be multiple of 4.
-    //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0,     0,   0,    1,   0,  1,0, 0 },
-    { ARM::t2LSLri, ARM::tLSLri,  0,             5,   0,    1,   0,  0,0, 0 },
-    { ARM::t2LSLrr, 0,            ARM::tLSLrr,   0,   0,    0,   1,  0,0, 0 },
-    { ARM::t2LSRri, ARM::tLSRri,  0,             5,   0,    1,   0,  0,0, 0 },
-    { ARM::t2LSRrr, 0,            ARM::tLSRrr,   0,   0,    0,   1,  0,0, 0 },
-    { ARM::t2MOVi,  ARM::tMOVi8,  0,             8,   0,    1,   0,  0,0, 0 },
-    { ARM::t2MOVi16,ARM::tMOVi8,  0,             8,   0,    1,   0,  0,0, 1 },
+    //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0,   0,   0,    1,   0,  1,0, 0,0 },
+    { ARM::t2LSLri, ARM::tLSLri,  0,             5,   0,    1,   0,  0,0, 1,0 },
+    { ARM::t2LSLrr, 0,            ARM::tLSLrr,   0,   0,    0,   1,  0,0, 1,0 },
+    { ARM::t2LSRri, ARM::tLSRri,  0,             5,   0,    1,   0,  0,0, 1,0 },
+    { ARM::t2LSRrr, 0,            ARM::tLSRrr,   0,   0,    0,   1,  0,0, 1,0 },
+    // FIXME: tMOVi8 and tMVN also partially update CPSR but they are less
+    // likely to cause issue in the loop. As a size / performance workaround,
+    // they are not marked as such.
+    { ARM::t2MOVi,  ARM::tMOVi8,  0,             8,   0,    1,   0,  0,0, 0,0 },
+    { ARM::t2MOVi16,ARM::tMOVi8,  0,             8,   0,    1,   0,  0,0, 0,1 },
     // FIXME: Do we need the 16-bit 'S' variant?
-    { ARM::t2MOVr,ARM::tMOVgpr2gpr,0,            0,   0,    0,   0,  1,0, 0 },
-    { ARM::t2MOVCCr,0,            ARM::tMOVCCr,  0,   0,    0,   0,  0,1, 0 },
-    { ARM::t2MOVCCi,0,            ARM::tMOVCCi,  0,   8,    0,   1,  0,1, 0 },
-    { ARM::t2MUL,   0,            ARM::tMUL,     0,   0,    0,   1,  0,0, 0 },
-    { ARM::t2MVNr,  ARM::tMVN,    0,             0,   0,    1,   0,  0,0, 0 },
-    { ARM::t2ORRrr, 0,            ARM::tORR,     0,   0,    0,   1,  0,0, 0 },
-    { ARM::t2REV,   ARM::tREV,    0,             0,   0,    1,   0,  1,0, 0 },
-    { ARM::t2REV16, ARM::tREV16,  0,             0,   0,    1,   0,  1,0, 0 },
-    { ARM::t2REVSH, ARM::tREVSH,  0,             0,   0,    1,   0,  1,0, 0 },
-    { ARM::t2RORrr, 0,            ARM::tROR,     0,   0,    0,   1,  0,0, 0 },
-    { ARM::t2RSBri, ARM::tRSB,    0,             0,   0,    1,   0,  0,0, 1 },
-    { ARM::t2RSBSri,ARM::tRSB,    0,             0,   0,    1,   0,  2,0, 1 },
-    { ARM::t2SBCrr, 0,            ARM::tSBC,     0,   0,    0,   1,  0,0, 0 },
-    { ARM::t2SUBri, ARM::tSUBi3,  ARM::tSUBi8,   3,   8,    1,   1,  0,0, 0 },
-    { ARM::t2SUBrr, ARM::tSUBrr,  0,             0,   0,    1,   0,  0,0, 0 },
-    { ARM::t2SUBSri,ARM::tSUBi3,  ARM::tSUBi8,   3,   8,    1,   1,  2,2, 0 },
-    { ARM::t2SUBSrr,ARM::tSUBrr,  0,             0,   0,    1,   0,  2,0, 0 },
-    { ARM::t2SXTBr, ARM::tSXTB,   0,             0,   0,    1,   0,  1,0, 0 },
-    { ARM::t2SXTHr, ARM::tSXTH,   0,             0,   0,    1,   0,  1,0, 0 },
-    { ARM::t2TSTrr, ARM::tTST,    0,             0,   0,    1,   0,  2,0, 0 },
-    { ARM::t2UXTBr, ARM::tUXTB,   0,             0,   0,    1,   0,  1,0, 0 },
-    { ARM::t2UXTHr, ARM::tUXTH,   0,             0,   0,    1,   0,  1,0, 0 },
+    { ARM::t2MOVr,ARM::tMOVgpr2gpr,0,            0,   0,    0,   0,  1,0, 0,0 },
+    { ARM::t2MOVCCr,0,            ARM::tMOVCCr,  0,   0,    0,   0,  0,1, 0,0 },
+    { ARM::t2MOVCCi,0,            ARM::tMOVCCi,  0,   8,    0,   1,  0,1, 0,0 },
+    { ARM::t2MUL,   0,            ARM::tMUL,     0,   0,    0,   1,  0,0, 1,0 },
+    { ARM::t2MVNr,  ARM::tMVN,    0,             0,   0,    1,   0,  0,0, 0,0 },
+    { ARM::t2ORRrr, 0,            ARM::tORR,     0,   0,    0,   1,  0,0, 1,0 },
+    { ARM::t2REV,   ARM::tREV,    0,             0,   0,    1,   0,  1,0, 0,0 },
+    { ARM::t2REV16, ARM::tREV16,  0,             0,   0,    1,   0,  1,0, 0,0 },
+    { ARM::t2REVSH, ARM::tREVSH,  0,             0,   0,    1,   0,  1,0, 0,0 },
+    { ARM::t2RORrr, 0,            ARM::tROR,     0,   0,    0,   1,  0,0, 1,0 },
+    { ARM::t2RSBri, ARM::tRSB,    0,             0,   0,    1,   0,  0,0, 0,1 },
+    { ARM::t2RSBSri,ARM::tRSB,    0,             0,   0,    1,   0,  2,0, 0,1 },
+    { ARM::t2SBCrr, 0,            ARM::tSBC,     0,   0,    0,   1,  0,0, 0,0 },
+    { ARM::t2SUBri, ARM::tSUBi3,  ARM::tSUBi8,   3,   8,    1,   1,  0,0, 0,0 },
+    { ARM::t2SUBrr, ARM::tSUBrr,  0,             0,   0,    1,   0,  0,0, 0,0 },
+    { ARM::t2SUBSri,ARM::tSUBi3,  ARM::tSUBi8,   3,   8,    1,   1,  2,2, 0,0 },
+    { ARM::t2SUBSrr,ARM::tSUBrr,  0,             0,   0,    1,   0,  2,0, 0,0 },
+    { ARM::t2SXTBr, ARM::tSXTB,   0,             0,   0,    1,   0,  1,0, 0,0 },
+    { ARM::t2SXTHr, ARM::tSXTH,   0,             0,   0,    1,   0,  1,0, 0,0 },
+    { ARM::t2TSTrr, ARM::tTST,    0,             0,   0,    1,   0,  2,0, 0,0 },
+    { ARM::t2UXTBr, ARM::tUXTB,   0,             0,   0,    1,   0,  1,0, 0,0 },
+    { ARM::t2UXTHr, ARM::tUXTH,   0,             0,   0,    1,   0,  1,0, 0,0 },
 
     // FIXME: Clean this up after splitting each Thumb load / store opcode
     // into multiple ones.
-    { ARM::t2LDRi12,ARM::tLDRi,   ARM::tLDRspi,  5,   8,    1,   0,  0,0, 1 },
-    { ARM::t2LDRs,  ARM::tLDRr,   0,             0,   0,    1,   0,  0,0, 1 },
-    { ARM::t2LDRBi12,ARM::tLDRBi, 0,             5,   0,    1,   0,  0,0, 1 },
-    { ARM::t2LDRBs, ARM::tLDRBr,  0,             0,   0,    1,   0,  0,0, 1 },
-    { ARM::t2LDRHi12,ARM::tLDRHi, 0,             5,   0,    1,   0,  0,0, 1 },
-    { ARM::t2LDRHs, ARM::tLDRHr,  0,             0,   0,    1,   0,  0,0, 1 },
-    { ARM::t2LDRSBs,ARM::tLDRSB,  0,             0,   0,    1,   0,  0,0, 1 },
-    { ARM::t2LDRSHs,ARM::tLDRSH,  0,             0,   0,    1,   0,  0,0, 1 },
-    { ARM::t2STRi12,ARM::tSTRi,   ARM::tSTRspi,  5,   8,    1,   0,  0,0, 1 },
-    { ARM::t2STRs,  ARM::tSTRr,   0,             0,   0,    1,   0,  0,0, 1 },
-    { ARM::t2STRBi12,ARM::tSTRBi, 0,             5,   0,    1,   0,  0,0, 1 },
-    { ARM::t2STRBs, ARM::tSTRBr,  0,             0,   0,    1,   0,  0,0, 1 },
-    { ARM::t2STRHi12,ARM::tSTRHi, 0,             5,   0,    1,   0,  0,0, 1 },
-    { ARM::t2STRHs, ARM::tSTRHr,  0,             0,   0,    1,   0,  0,0, 1 },
-
-    { ARM::t2LDMIA, ARM::tLDMIA,  0,             0,   0,    1,   1,  1,1, 1 },
-    { ARM::t2LDMIA_RET,0,         ARM::tPOP_RET, 0,   0,    1,   1,  1,1, 1 },
-    { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0,   0,    1,   1,  1,1, 1 },
+    { ARM::t2LDRi12,ARM::tLDRi,   ARM::tLDRspi,  5,   8,    1,   0,  0,0, 0,1 },
+    { ARM::t2LDRs,  ARM::tLDRr,   0,             0,   0,    1,   0,  0,0, 0,1 },
+    { ARM::t2LDRBi12,ARM::tLDRBi, 0,             5,   0,    1,   0,  0,0, 0,1 },
+    { ARM::t2LDRBs, ARM::tLDRBr,  0,             0,   0,    1,   0,  0,0, 0,1 },
+    { ARM::t2LDRHi12,ARM::tLDRHi, 0,             5,   0,    1,   0,  0,0, 0,1 },
+    { ARM::t2LDRHs, ARM::tLDRHr,  0,             0,   0,    1,   0,  0,0, 0,1 },
+    { ARM::t2LDRSBs,ARM::tLDRSB,  0,             0,   0,    1,   0,  0,0, 0,1 },
+    { ARM::t2LDRSHs,ARM::tLDRSH,  0,             0,   0,    1,   0,  0,0, 0,1 },
+    { ARM::t2STRi12,ARM::tSTRi,   ARM::tSTRspi,  5,   8,    1,   0,  0,0, 0,1 },
+    { ARM::t2STRs,  ARM::tSTRr,   0,             0,   0,    1,   0,  0,0, 0,1 },
+    { ARM::t2STRBi12,ARM::tSTRBi, 0,             5,   0,    1,   0,  0,0, 0,1 },
+    { ARM::t2STRBs, ARM::tSTRBr,  0,             0,   0,    1,   0,  0,0, 0,1 },
+    { ARM::t2STRHi12,ARM::tSTRHi, 0,             5,   0,    1,   0,  0,0, 0,1 },
+    { ARM::t2STRHs, ARM::tSTRHr,  0,             0,   0,    1,   0,  0,0, 0,1 },
+
+    { ARM::t2LDMIA, ARM::tLDMIA,  0,             0,   0,    1,   1,  1,1, 0,1 },
+    { ARM::t2LDMIA_RET,0,         ARM::tPOP_RET, 0,   0,    1,   1,  1,1, 0,1 },
+    { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0,   0,    1,   1,  1,1, 0,1 },
     // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent
-    { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0,       0,   0,    1,   1,  1,1, 1 },
-    { ARM::t2STMDB_UPD, 0,        ARM::tPUSH,    0,   0,    1,   1,  1,1, 1 },
+    { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0,       0,   0,    1,   1,  1,1, 0,1 },
+    { ARM::t2STMDB_UPD, 0,        ARM::tPUSH,    0,   0,    1,   1,  1,1, 0,1 },
   };
 
   class Thumb2SizeReduce : public MachineFunctionPass {
@@ -133,6 +138,7 @@ namespace {
     Thumb2SizeReduce();
 
     const Thumb2InstrInfo *TII;
+    const ARMSubtarget *STI;
 
     virtual bool runOnMachineFunction(MachineFunction &MF);
 
@@ -144,6 +150,8 @@ namespace {
     /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable.
     DenseMap<unsigned, unsigned> ReduceOpcodeMap;
 
+    bool canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use);
+
     bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
                          bool is2Addr, ARMCC::CondCodes Pred,
                          bool LiveCPSR, bool &HasCC, bool &CCDead);
@@ -152,19 +160,20 @@ namespace {
                          const ReduceEntry &Entry);
 
     bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
-                       const ReduceEntry &Entry, bool LiveCPSR);
+                       const ReduceEntry &Entry, bool LiveCPSR,
+                       MachineInstr *CPSRDef);
 
     /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address
     /// instruction.
     bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
                        const ReduceEntry &Entry,
-                       bool LiveCPSR);
+                       bool LiveCPSR, MachineInstr *CPSRDef);
 
     /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit
     /// non-two-address instruction.
     bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
                         const ReduceEntry &Entry,
-                        bool LiveCPSR);
+                        bool LiveCPSR, MachineInstr *CPSRDef);
 
     /// ReduceMBB - Reduce width of instructions in the specified basic block.
     bool ReduceMBB(MachineBasicBlock &MBB);
@@ -187,6 +196,52 @@ static bool HasImplicitCPSRDef(const TargetInstrDesc &TID) {
   return false;
 }
 
+/// canAddPseudoFlagDep - For A9 (and other out-of-order) implementations,
+/// the 's' 16-bit instruction partially update CPSR. Abort the
+/// transformation to avoid adding false dependency on last CPSR setting
+/// instruction which hurts the ability for out-of-order execution engine
+/// to do register renaming magic.
+/// This function checks if there is a read-of-write dependency between the
+/// last instruction that defines the CPSR and the current instruction. If there
+/// is, then there is no harm done since the instruction cannot be retired
+/// before the CPSR setting instruction anyway.
+/// Note, we are not doing full dependency analysis here for the sake of compile
+/// time. We're not looking for cases like:
+/// r0 = muls ...
+/// r1 = add.w r0, ...
+/// ...
+///    = mul.w r1
+/// In this case it would have been ok to narrow the mul.w to muls since there
+/// are indirect RAW dependency between the muls and the mul.w
+bool
+Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Def, MachineInstr *Use) {
+  if (!Def || !STI->avoidCPSRPartialUpdate())
+    return false;
+
+  SmallSet<unsigned, 2> Defs;
+  for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = Def->getOperand(i);
+    if (!MO.isReg() || MO.isUndef() || MO.isUse())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0 || Reg == ARM::CPSR)
+      continue;
+    Defs.insert(Reg);
+  }
+
+  for (unsigned i = 0, e = Use->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = Use->getOperand(i);
+    if (!MO.isReg() || MO.isUndef() || MO.isDef())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Defs.count(Reg))
+      return false;
+  }
+
+  // No read-after-write dependency. The narrowing will add false dependency.
+  return true;
+}
+
 bool
 Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
                                   bool is2Addr, ARMCC::CondCodes Pred,
@@ -410,7 +465,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
     MIB.addOperand(MI->getOperand(OpNum));
 
   // Transfer memoperands.
-  (*MIB).setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
 
   // Transfer MI flags.
   MIB.setMIFlags(MI->getFlags());
@@ -425,7 +480,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
 bool
 Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
                                 const ReduceEntry &Entry,
-                                bool LiveCPSR) {
+                                bool LiveCPSR, MachineInstr *CPSRDef) {
   if (Entry.LowRegs1 && !VerifyLowRegs(MI))
     return false;
 
@@ -443,12 +498,12 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
       switch (Opc) {
       default: break;
       case ARM::t2ADDSri: {
-        if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR))
+        if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef))
           return true;
         // fallthrough
       }
       case ARM::t2ADDSrr:
-        return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
+        return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
       }
     }
     break;
@@ -456,13 +511,13 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
   case ARM::t2RSBri:
   case ARM::t2RSBSri:
     if (MI->getOperand(2).getImm() == 0)
-      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
     break;
   case ARM::t2MOVi16:
     // Can convert only 'pure' immediate operands, not immediates obtained as
     // globals' addresses.
     if (MI->getOperand(1).isImm())
-      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
     break;
   case ARM::t2CMPrr: {
     // Try to reduce to the lo-reg only version first. Why there are two
@@ -471,17 +526,17 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
     // are prioritized, but the table assumes a unique entry for each
     // source insn opcode. So for now, we hack a local entry record to use.
     static const ReduceEntry NarrowEntry =
-      { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 1 };
-    if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR))
+      { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 0,1 };
+    if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, CPSRDef))
       return true;
-    return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
+    return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
   }
   case ARM::t2ADDrSPi: {
     static const ReduceEntry NarrowEntry =
-      { ARM::t2ADDrSPi,ARM::tADDspi, 0, 7, 0, 1, 0, 1, 0, 1 };
+      { ARM::t2ADDrSPi,ARM::tADDspi, 0, 7, 0, 1, 0, 1, 0, 0,1 };
     if (MI->getOperand(0).getReg() == ARM::SP)
-      return ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR);
-    return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
+      return ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, CPSRDef);
+    return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
   }
   }
   return false;
@@ -490,7 +545,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
 bool
 Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
                                 const ReduceEntry &Entry,
-                                bool LiveCPSR) {
+                                bool LiveCPSR, MachineInstr *CPSRDef) {
 
   if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr))
     return false;
@@ -545,6 +600,12 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
   if (!VerifyPredAndCC(MI, Entry, true, Pred, LiveCPSR, HasCC, CCDead))
     return false;
 
+  // Avoid adding a false dependency on partial flag update by some 16-bit
+  // instructions which has the 's' bit set.
+  if (Entry.PartFlag && NewTID.hasOptionalDef() && HasCC &&
+      canAddPseudoFlagDep(CPSRDef, MI))
+    return false;
+
   // Add the 16-bit instruction.
   DebugLoc dl = MI->getDebugLoc();
   MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID);
@@ -579,7 +640,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
 bool
 Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
                                  const ReduceEntry &Entry,
-                                 bool LiveCPSR) {
+                                 bool LiveCPSR, MachineInstr *CPSRDef) {
   if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit))
     return false;
 
@@ -632,6 +693,12 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
   if (!VerifyPredAndCC(MI, Entry, false, Pred, LiveCPSR, HasCC, CCDead))
     return false;
 
+  // Avoid adding a false dependency on partial flag update by some 16-bit
+  // instructions which has the 's' bit set.
+  if (Entry.PartFlag && NewTID.hasOptionalDef() && HasCC &&
+      canAddPseudoFlagDep(CPSRDef, MI))
+    return false;
+
   // Add the 16-bit instruction.
   DebugLoc dl = MI->getDebugLoc();
   MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID);
@@ -679,7 +746,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
   return true;
 }
 
-static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR) {
+static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR, bool &DefCPSR) {
   bool HasDef = false;
   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI.getOperand(i);
@@ -687,6 +754,8 @@ static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR) {
       continue;
     if (MO.getReg() != ARM::CPSR)
       continue;
+
+    DefCPSR = true;
     if (!MO.isDead())
       HasDef = true;
   }
@@ -716,6 +785,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
 
   // Yes, CPSR could be livein.
   bool LiveCPSR = MBB.isLiveIn(ARM::CPSR);
+  MachineInstr *CPSRDef = 0;
 
   MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end();
   MachineBasicBlock::iterator NextMII;
@@ -731,7 +801,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
       const ReduceEntry &Entry = ReduceTable[OPI->second];
       // Ignore "special" cases for now.
       if (Entry.Special) {
-        if (ReduceSpecial(MBB, MI, Entry, LiveCPSR)) {
+        if (ReduceSpecial(MBB, MI, Entry, LiveCPSR, CPSRDef)) {
           Modified = true;
           MachineBasicBlock::iterator I = prior(NextMII);
           MI = &*I;
@@ -740,7 +810,8 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
       }
 
       // Try to transform to a 16-bit two-address instruction.
-      if (Entry.NarrowOpc2 && ReduceTo2Addr(MBB, MI, Entry, LiveCPSR)) {
+      if (Entry.NarrowOpc2 &&
+          ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef)) {
         Modified = true;
         MachineBasicBlock::iterator I = prior(NextMII);
         MI = &*I;
@@ -748,7 +819,8 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
       }
 
       // Try to transform to a 16-bit non-two-address instruction.
-      if (Entry.NarrowOpc1 && ReduceToNarrow(MBB, MI, Entry, LiveCPSR)) {
+      if (Entry.NarrowOpc1 &&
+          ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef)) {
         Modified = true;
         MachineBasicBlock::iterator I = prior(NextMII);
         MI = &*I;
@@ -756,7 +828,14 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
     }
 
   ProcessNext:
-    LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR);
+    bool DefCPSR = false;
+    LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR, DefCPSR);
+    if (MI->getDesc().isCall())
+      // Calls don't really set CPSR.
+      CPSRDef = 0;
+    else if (DefCPSR)
+      // This is the last CPSR defining instruction.
+      CPSRDef = MI;
   }
 
   return Modified;
@@ -765,6 +844,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
 bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
   const TargetMachine &TM = MF.getTarget();
   TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo());
+  STI = &TM.getSubtarget<ARMSubtarget>();
 
   bool Modified = false;
   for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
diff --git a/lib/Target/Alpha/Alpha.td b/lib/Target/Alpha/Alpha.td
index 4508eda..ae79c2e 100644
--- a/lib/Target/Alpha/Alpha.td
+++ b/lib/Target/Alpha/Alpha.td
@@ -21,7 +21,7 @@ include "llvm/Target/Target.td"
 //===----------------------------------------------------------------------===//
 
 def FeatureCIX : SubtargetFeature<"cix", "HasCT", "true",
-                                  "Enable CIX extentions">;
+                                  "Enable CIX extensions">;
 
 //===----------------------------------------------------------------------===//
 // Register File Description
diff --git a/lib/Target/Alpha/AlphaISelLowering.cpp b/lib/Target/Alpha/AlphaISelLowering.cpp
index c4f43ab..0875cfd 100644
--- a/lib/Target/Alpha/AlphaISelLowering.cpp
+++ b/lib/Target/Alpha/AlphaISelLowering.cpp
@@ -155,6 +155,8 @@ AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM)
   setJumpBufSize(272);
   setJumpBufAlignment(16);
 
+  setMinFunctionAlignment(4);
+
   computeRegisterProperties();
 }
 
@@ -180,11 +182,6 @@ const char *AlphaTargetLowering::getTargetNodeName(unsigned Opcode) const {
   }
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned AlphaTargetLowering::getFunctionAlignment(const Function *F) const {
-  return 4;
-}
-
 static SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
   EVT PtrVT = Op.getValueType();
   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
@@ -233,8 +230,8 @@ AlphaTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
 
   CCInfo.AnalyzeCallOperands(Outs, CC_Alpha);
 
@@ -296,7 +293,7 @@ AlphaTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Build a sequence of copy-to-reg nodes chained together with token chain and
   // flag operands which copy the outgoing args into registers.  The InFlag in
-  // necessary since all emited instructions must be stuck together.
+  // necessary since all emitted instructions must be stuck together.
   SDValue InFlag;
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
@@ -347,8 +344,8 @@ AlphaTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
-                 *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
   CCInfo.AnalyzeCallResult(Ins, RetCC_Alpha);
 
diff --git a/lib/Target/Alpha/AlphaISelLowering.h b/lib/Target/Alpha/AlphaISelLowering.h
index cb98f92..d38c314 100644
--- a/lib/Target/Alpha/AlphaISelLowering.h
+++ b/lib/Target/Alpha/AlphaISelLowering.h
@@ -104,9 +104,6 @@ namespace llvm {
 
     virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
 
-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const;
-
     /// isFPImmLegal - Returns true if the target can instruction select the
     /// specified FP immediate natively. If false, the legalizer will
     /// materialize the FP immediate as a load from a constant pool.
diff --git a/lib/Target/Alpha/AlphaInstrInfo.td b/lib/Target/Alpha/AlphaInstrInfo.td
index 099d715..b201712 100644
--- a/lib/Target/Alpha/AlphaInstrInfo.td
+++ b/lib/Target/Alpha/AlphaInstrInfo.td
@@ -1030,7 +1030,7 @@ def : Pat<(brcond (setune F8RC:$RA, immFPZ), bb:$DISP),
 //WMB Mfc 18.4400 Write memory barrier
 //MF_FPCR F-P 17.025 Move from FPCR
 //MT_FPCR F-P 17.024 Move to FPCR
-//There are in the Multimedia extentions, so let's not use them yet
+//There are in the Multimedia extensions, so let's not use them yet
 //def MAXSB8  : OForm<0x1C, 0x3E, "MAXSB8 $RA,$RB,$RC">; //Vector signed byte maximum
 //def MAXSW4 : OForm< 0x1C, 0x3F, "MAXSW4 $RA,$RB,$RC">; //Vector signed word maximum
 //def MAXUB8  : OForm<0x1C, 0x3C, "MAXUB8 $RA,$RB,$RC">; //Vector unsigned byte maximum
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.cpp b/lib/Target/Alpha/AlphaRegisterInfo.cpp
index 7667fd8..d6c3809 100644
--- a/lib/Target/Alpha/AlphaRegisterInfo.cpp
+++ b/lib/Target/Alpha/AlphaRegisterInfo.cpp
@@ -69,6 +69,7 @@ const unsigned* AlphaRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
 BitVector AlphaRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
   Reserved.set(Alpha::R15);
+  Reserved.set(Alpha::R29);
   Reserved.set(Alpha::R30);
   Reserved.set(Alpha::R31);
   return Reserved;
@@ -198,6 +199,11 @@ int AlphaRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   return -1;
 }
 
+int AlphaRegisterInfo::getLLVMRegNum(unsigned DwarfRegNum, bool isEH) const {
+  llvm_unreachable("What is the dwarf register number");
+  return -1;
+}
+
 #include "AlphaGenRegisterInfo.inc"
 
 std::string AlphaRegisterInfo::getPrettyName(unsigned reg)
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.h b/lib/Target/Alpha/AlphaRegisterInfo.h
index b0d4dd0..ffe6cf1 100644
--- a/lib/Target/Alpha/AlphaRegisterInfo.h
+++ b/lib/Target/Alpha/AlphaRegisterInfo.h
@@ -48,6 +48,7 @@ struct AlphaRegisterInfo : public AlphaGenRegisterInfo {
   unsigned getEHHandlerRegister() const;
 
   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 
   static std::string getPrettyName(unsigned reg);
 };
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.td b/lib/Target/Alpha/AlphaRegisterInfo.td
index 35e6804..32120d7 100644
--- a/lib/Target/Alpha/AlphaRegisterInfo.td
+++ b/lib/Target/Alpha/AlphaRegisterInfo.td
@@ -110,10 +110,10 @@ def F31 : FPR<31, "$f31">, DwarfRegNum<[64]>;
   // $28 is undefined after any and all calls
 
 /// Register classes
-def GPRC : RegisterClass<"Alpha", [i64], 64,
+def GPRC : RegisterClass<"Alpha", [i64], 64, (add
      // Volatile
-     [R0, R1, R2, R3, R4, R5, R6, R7, R8, R16, R17, R18, R19, R20, R21, R22,
-      R23, R24, R25, R28, 
+     R0, R1, R2, R3, R4, R5, R6, R7, R8, R16, R17, R18, R19, R20, R21, R22,
+     R23, R24, R25, R28,
      //Special meaning, but volatile
      R27, //procedure address
      R26, //return address
@@ -121,51 +121,13 @@ def GPRC : RegisterClass<"Alpha", [i64], 64,
      // Non-volatile
      R9, R10, R11, R12, R13, R14,
 // Don't allocate 15, 30, 31
-     R15, R30, R31 ]> //zero
-{
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GPRCClass::iterator
-    GPRCClass::allocation_order_end(const MachineFunction &MF) const {
-        return end()-3;
-    }
-  }];
-}
+     R15, R30, R31)>; //zero
 
-def F4RC : RegisterClass<"Alpha", [f32], 64, [F0, F1, 
+def F4RC : RegisterClass<"Alpha", [f32], 64, (add F0, F1,
         F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
         F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30,
         // Saved:
         F2, F3, F4, F5, F6, F7, F8, F9,
-        F31 ]> //zero
-{
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    F4RCClass::iterator
-    F4RCClass::allocation_order_end(const MachineFunction &MF) const {
-        return end()-1;
-    }
-  }];
-}
+        F31)>; //zero
 
-def F8RC : RegisterClass<"Alpha", [f64], 64, [F0, F1, 
-        F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
-        F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30,
-        // Saved:
-        F2, F3, F4, F5, F6, F7, F8, F9,
-        F31 ]> //zero
-{
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    F8RCClass::iterator
-    F8RCClass::allocation_order_end(const MachineFunction &MF) const {
-        return end()-1;
-    }
-  }];
-}
+def F8RC : RegisterClass<"Alpha", [f64], 64, (add F4RC)>;
diff --git a/lib/Target/Alpha/README.txt b/lib/Target/Alpha/README.txt
index 9ae1517..cc170e3 100644
--- a/lib/Target/Alpha/README.txt
+++ b/lib/Target/Alpha/README.txt
@@ -33,9 +33,9 @@ add crazy vector instructions (MVI):
 (MIN|MAX)(U|S)(B8|W4) min and max, signed and unsigned, byte and word
 PKWB, UNPKBW pack/unpack word to byte
 PKLB UNPKBL pack/unpack long to byte
-PERR pixel error (sum accross bytes of bytewise abs(i8v8 a - i8v8 b))
+PERR pixel error (sum across bytes of bytewise abs(i8v8 a - i8v8 b))
 
-cmpbytes bytewise cmpeq of i8v8 a and i8v8 b (not part of MVI extentions)
+cmpbytes bytewise cmpeq of i8v8 a and i8v8 b (not part of MVI extensions)
 
 this has some good examples for other operations that can be synthesised well 
 from these rather meager vector ops (such as saturating add).
diff --git a/lib/Target/Blackfin/BlackfinFrameLowering.cpp b/lib/Target/Blackfin/BlackfinFrameLowering.cpp
index 08bb952..0b0984d 100644
--- a/lib/Target/Blackfin/BlackfinFrameLowering.cpp
+++ b/lib/Target/Blackfin/BlackfinFrameLowering.cpp
@@ -31,6 +31,12 @@ bool BlackfinFrameLowering::hasFP(const MachineFunction &MF) const {
     MFI->adjustsStack() || MFI->hasVarSizedObjects();
 }
 
+// Always reserve a call frame. We dont have enough registers to adjust SP.
+bool BlackfinFrameLowering::
+hasReservedCallFrame(const MachineFunction &MF) const {
+  return true;
+}
+
 // Emit a prologue that sets up a stack frame.
 // On function entry, R0-R2 and P0 may hold arguments.
 // R3, P1, and P2 may be used as scratch registers
diff --git a/lib/Target/Blackfin/BlackfinFrameLowering.h b/lib/Target/Blackfin/BlackfinFrameLowering.h
index 3d2ee25..726fa2c 100644
--- a/lib/Target/Blackfin/BlackfinFrameLowering.h
+++ b/lib/Target/Blackfin/BlackfinFrameLowering.h
@@ -36,6 +36,7 @@ public:
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
 
   bool hasFP(const MachineFunction &MF) const;
+  bool hasReservedCallFrame(const MachineFunction &MF) const;
 
   void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                             RegScavenger *RS) const;
diff --git a/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp b/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp
index 9df2aee..42659ae 100644
--- a/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp
+++ b/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp
@@ -117,11 +117,11 @@ bool BlackfinDAGToDAGISel::SelectADDRspii(SDValue Addr,
 }
 
 static inline bool isCC(const TargetRegisterClass *RC) {
-  return RC == &BF::AnyCCRegClass || BF::AnyCCRegClass.hasSubClass(RC);
+  return BF::AnyCCRegClass.hasSubClassEq(RC);
 }
 
 static inline bool isDCC(const TargetRegisterClass *RC) {
-  return RC == &BF::DRegClass || BF::DRegClass.hasSubClass(RC) || isCC(RC);
+  return BF::DRegClass.hasSubClassEq(RC) || isCC(RC);
 }
 
 static void UpdateNodeOperand(SelectionDAG &DAG,
diff --git a/lib/Target/Blackfin/BlackfinISelLowering.cpp b/lib/Target/Blackfin/BlackfinISelLowering.cpp
index 7c80eec..588d9bd 100644
--- a/lib/Target/Blackfin/BlackfinISelLowering.cpp
+++ b/lib/Target/Blackfin/BlackfinISelLowering.cpp
@@ -121,6 +121,8 @@ BlackfinTargetLowering::BlackfinTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::VAEND, MVT::Other, Expand);
   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+
+  setMinFunctionAlignment(2);
 }
 
 const char *BlackfinTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -169,8 +171,8 @@ BlackfinTargetLowering::LowerFormalArguments(SDValue Chain,
   MachineFrameInfo *MFI = MF.getFrameInfo();
 
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
   CCInfo.AllocateStack(12, 4);  // ABI requires 12 bytes stack space
   CCInfo.AnalyzeFormalArguments(Ins, CC_Blackfin);
 
@@ -227,8 +229,8 @@ BlackfinTargetLowering::LowerReturn(SDValue Chain,
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, DAG.getTarget(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 DAG.getTarget(), RVLocs, *DAG.getContext());
 
   // Analize return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_Blackfin);
@@ -288,8 +290,8 @@ BlackfinTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getTarget(), ArgLocs,
-                 *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 DAG.getTarget(), ArgLocs, *DAG.getContext());
   CCInfo.AllocateStack(12, 4);  // ABI requires 12 bytes stack space
   CCInfo.AnalyzeCallOperands(Outs, CC_Blackfin);
 
@@ -345,7 +347,7 @@ BlackfinTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Build a sequence of copy-to-reg nodes chained together with token
   // chain and flag operands which copy the outgoing args into registers.
-  // The InFlag in necessary since all emited instructions must be
+  // The InFlag in necessary since all emitted instructions must be
   // stuck together.
   SDValue InFlag;
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
@@ -376,8 +378,8 @@ BlackfinTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState RVInfo(CallConv, isVarArg, DAG.getTarget(), RVLocs,
-                 *DAG.getContext());
+  CCState RVInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 DAG.getTarget(), RVLocs, *DAG.getContext());
 
   RVInfo.AnalyzeCallResult(Ins, RetCC_Blackfin);
 
@@ -497,11 +499,6 @@ BlackfinTargetLowering::ReplaceNodeResults(SDNode *N,
   }
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned BlackfinTargetLowering::getFunctionAlignment(const Function *F) const {
-  return 2;
-}
-
 //===----------------------------------------------------------------------===//
 //                         Blackfin Inline Assembly Support
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/Blackfin/BlackfinISelLowering.h b/lib/Target/Blackfin/BlackfinISelLowering.h
index 102c830..9a54557 100644
--- a/lib/Target/Blackfin/BlackfinISelLowering.h
+++ b/lib/Target/Blackfin/BlackfinISelLowering.h
@@ -53,7 +53,6 @@ namespace llvm {
                                       EVT VT) const;
     virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
     const char *getTargetNodeName(unsigned Opcode) const;
-    unsigned getFunctionAlignment(const Function *F) const;
 
   private:
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/Blackfin/BlackfinInstrInfo.cpp b/lib/Target/Blackfin/BlackfinInstrInfo.cpp
index e50d57a..598cf2a 100644
--- a/lib/Target/Blackfin/BlackfinInstrInfo.cpp
+++ b/lib/Target/Blackfin/BlackfinInstrInfo.cpp
@@ -160,7 +160,7 @@ static bool inClass(const TargetRegisterClass &Test,
   if (TargetRegisterInfo::isPhysicalRegister(Reg))
     return Test.contains(Reg);
   else
-    return &Test==RC || Test.hasSubClass(RC);
+    return Test.hasSubClassEq(RC);
 }
 
 void
diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp
index b4a9b84..6ca460e 100644
--- a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp
+++ b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp
@@ -351,5 +351,11 @@ int BlackfinRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   return -1;
 }
 
+int BlackfinRegisterInfo::getLLVMRegNum(unsigned DwarfRegNum,
+                                        bool isEH) const {
+  llvm_unreachable("What is the dwarf register number");
+  return -1;
+}
+
 #include "BlackfinGenRegisterInfo.inc"
 
diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.h b/lib/Target/Blackfin/BlackfinRegisterInfo.h
index 642b8ad..375d277 100644
--- a/lib/Target/Blackfin/BlackfinRegisterInfo.h
+++ b/lib/Target/Blackfin/BlackfinRegisterInfo.h
@@ -41,8 +41,6 @@ namespace llvm {
       return &BF::PRegClass;
     }
 
-    // bool hasReservedCallFrame(MachineFunction &MF) const;
-
     bool requiresRegisterScavenging(const MachineFunction &MF) const;
 
     void eliminateCallFramePseudoInstr(MachineFunction &MF,
@@ -60,6 +58,7 @@ namespace llvm {
     unsigned getEHHandlerRegister() const;
 
     int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+    int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 
     // Utility functions
     void adjustRegister(MachineBasicBlock &MBB,
diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.td b/lib/Target/Blackfin/BlackfinRegisterInfo.td
index f5dd439..9e2f79f 100644
--- a/lib/Target/Blackfin/BlackfinRegisterInfo.td
+++ b/lib/Target/Blackfin/BlackfinRegisterInfo.td
@@ -195,158 +195,66 @@ def LB0 : Ri<6, 2, "lb0">, DwarfRegNum<[48]>;
 def LB1 : Ri<6, 5, "lb1">, DwarfRegNum<[49]>;
 
 // Register classes.
-def D16 : RegisterClass<"BF", [i16], 16,
-    [R0H, R0L, R1H, R1L, R2H, R2L, R3H, R3L,
-     R4H, R4L, R5H, R5L, R6H, R6L, R7H, R7L]>;
+def D16L : RegisterClass<"BF", [i16], 16, (sequence "R%uL", 0, 7)>;
 
-def D16L : RegisterClass<"BF", [i16], 16,
-    [R0L, R1L, R2L, R3L, R4L, R5L, R6L, R7L]>;
+def D16H : RegisterClass<"BF", [i16], 16, (sequence "R%uH", 0, 7)>;
 
-def D16H : RegisterClass<"BF", [i16], 16,
-    [R0H, R1H, R2H, R3H, R4H, R5H, R6H, R7H]>;
-
-def P16 : RegisterClass<"BF", [i16], 16,
-    [P0H, P0L, P1H, P1L, P2H, P2L, P3H, P3L,
-     P4H, P4L, P5H, P5L, SPH, SPL, FPH, FPL]>;
+def D16 : RegisterClass<"BF", [i16], 16, (add D16L, D16H)>;
 
 def P16L : RegisterClass<"BF", [i16], 16,
-    [P0L, P1L, P2L, P3L, P4L, P5L, SPL, FPL]>;
+                         (add (sequence "P%uL", 0, 5), SPL, FPL)>;
 
 def P16H : RegisterClass<"BF", [i16], 16,
-    [P0H, P1H, P2H, P3H, P4H, P5H, SPH, FPH]>;
+                         (add (sequence "P%uH", 0, 5), SPH, FPH)>;
+
+def P16 : RegisterClass<"BF", [i16], 16, (add P16L, P16H)>;
 
-def DP16 : RegisterClass<"BF", [i16], 16,
-    [R0H, R0L, R1H, R1L, R2H, R2L, R3H, R3L,
-     R4H, R4L, R5H, R5L, R6H, R6L, R7H, R7L,
-     P0H, P0L, P1H, P1L, P2H, P2L, P3H, P3L,
-     P4H, P4L, P5H, P5L, SPH, SPL, FPH, FPL]>;
+def DP16 : RegisterClass<"BF", [i16], 16, (add D16, P16)>;
 
-def DP16L : RegisterClass<"BF", [i16], 16,
-    [R0L, R1L, R2L, R3L, R4L, R5L, R6L, R7L,
-     P0L, P1L, P2L, P3L, P4L, P5L, SPL, FPL]>;
+def DP16L : RegisterClass<"BF", [i16], 16, (add D16L, P16L)>;
 
-def DP16H : RegisterClass<"BF", [i16], 16,
-    [R0H, R1H, R2H, R3H, R4H, R5H, R6H, R7H,
-     P0H, P1H, P2H, P3H, P4H, P5H, SPH, FPH]>;
+def DP16H : RegisterClass<"BF", [i16], 16, (add D16H, P16H)>;
 
 def GR16 : RegisterClass<"BF", [i16], 16,
-    [R0H, R0L, R1H, R1L, R2H, R2L, R3H, R3L,
-     R4H, R4L, R5H, R5L, R6H, R6L, R7H, R7L,
-     P0H, P0L, P1H, P1L, P2H, P2L, P3H, P3L,
-     P4H, P4L, P5H, P5L, SPH, SPL, FPH, FPL,
+    (add DP16,
      I0H, I0L, I1H, I1L, I2H, I2L, I3H, I3L,
      M0H, M0L, M1H, M1L, M2H, M2L, M3H, M3L,
      B0H, B0L, B1H, B1L, B2H, B2L, B3H, B3L,
-     L0H, L0L, L1H, L1L, L2H, L2L, L3H, L3L]>;
+     L0H, L0L, L1H, L1L, L2H, L2L, L3H, L3L)>;
 
-def D : RegisterClass<"BF", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> {
+def D : RegisterClass<"BF", [i32], 32, (sequence "R%u", 0, 7)> {
   let SubRegClasses = [(D16L lo16), (D16H hi16)];
 }
 
-def P : RegisterClass<"BF", [i32], 32, [P0, P1, P2, P3, P4, P5, FP, SP]> {
+def P : RegisterClass<"BF", [i32], 32, (add (sequence "P%u", 0, 5), FP, SP)> {
   let SubRegClasses = [(P16L lo16), (P16H hi16)];
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    PClass::iterator
-    PClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      return allocation_order_begin(MF)
-             + (TFI->hasFP(MF) ? 7 : 6);
-    }
-  }];
 }
 
-def I : RegisterClass<"BF", [i32], 32, [I0, I1, I2, I3]>;
-def M : RegisterClass<"BF", [i32], 32, [M0, M1, M2, M3]>;
-def B : RegisterClass<"BF", [i32], 32, [B0, B1, B2, B3]>;
-def L : RegisterClass<"BF", [i32], 32, [L0, L1, L2, L3]>;
-
-def DP : RegisterClass<"BF", [i32], 32,
-    [R0, R1, R2, R3, R4, R5, R6, R7,
-     P0, P1, P2, P3, P4, P5, FP, SP]> {
+def DP : RegisterClass<"BF", [i32], 32, (add D, P)> {
   let SubRegClasses = [(DP16L lo16), (DP16H hi16)];
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    DPClass::iterator
-    DPClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      return allocation_order_begin(MF)
-             + (TFI->hasFP(MF) ? 15 : 14);
-    }
-  }];
 }
 
-def GR : RegisterClass<"BF", [i32], 32,
-    [R0, R1, R2, R3, R4, R5, R6, R7,
-     P0, P1, P2, P3, P4, P5,
-     I0, I1, I2, I3, M0, M1, M2, M3,
-     B0, B1, B2, B3, L0, L1, L2, L3,
-     FP, SP]> {
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GRClass::iterator
-    GRClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      return allocation_order_begin(MF)
-             + (TFI->hasFP(MF) ? 31 : 30);
-    }
-  }];
-}
+def I : RegisterClass<"BF", [i32], 32, (add I0, I1, I2, I3)>;
+def M : RegisterClass<"BF", [i32], 32, (add M0, M1, M2, M3)>;
+def B : RegisterClass<"BF", [i32], 32, (add B0, B1, B2, B3)>;
+def L : RegisterClass<"BF", [i32], 32, (add L0, L1, L2, L3)>;
+
+def GR : RegisterClass<"BF", [i32], 32, (add DP, I, M, B, L)>;
 
 def ALL : RegisterClass<"BF", [i32], 32,
-    [R0, R1, R2, R3, R4, R5, R6, R7,
-     P0, P1, P2, P3, P4, P5,
-     I0, I1, I2, I3, M0, M1, M2, M3,
-     B0, B1, B2, B3, L0, L1, L2, L3,
-     FP, SP,
+    (add GR,
      A0X, A0W, A1X, A1W, ASTAT, RETS,
      LC0, LT0, LB0, LC1, LT1, LB1, CYCLES, CYCLES2,
-     USP, SEQSTAT, SYSCFG, RETI, RETX, RETN, RETE, EMUDAT]> {
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    ALLClass::iterator
-    ALLClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      return allocation_order_begin(MF)
-             + (TFI->hasFP(MF) ? 31 : 30);
-    }
-  }];
-}
+     USP, SEQSTAT, SYSCFG, RETI, RETX, RETN, RETE, EMUDAT)>;
 
-def PI : RegisterClass<"BF", [i32], 32,
-    [P0, P1, P2, P3, P4, P5, I0, I1, I2, I3, FP, SP]> {
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    PIClass::iterator
-    PIClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      return allocation_order_begin(MF)
-             + (TFI->hasFP(MF) ? 11 : 10);
-    }
-  }];
-}
+def PI : RegisterClass<"BF", [i32], 32, (add P, I)>;
 
 // We are going to pretend that CC and !CC are 32-bit registers, even though
 // they only can hold 1 bit.
 let CopyCost = -1, Size = 8 in {
-def JustCC  : RegisterClass<"BF", [i32], 8, [CC]>;
-def NotCC   : RegisterClass<"BF", [i32], 8, [NCC]>;
-def AnyCC   : RegisterClass<"BF", [i32], 8, [CC, NCC]> {
+def JustCC  : RegisterClass<"BF", [i32], 8, (add CC)>;
+def NotCC   : RegisterClass<"BF", [i32], 8, (add NCC)>;
+def AnyCC   : RegisterClass<"BF", [i32], 8, (add CC, NCC)> {
   let MethodProtos = [{
     iterator allocation_order_end(const MachineFunction &MF) const;
   }];
@@ -358,8 +266,8 @@ def AnyCC   : RegisterClass<"BF", [i32], 8, [CC, NCC]> {
   }];
 }
 def StatBit : RegisterClass<"BF", [i1], 8,
-    [AZ, AN, CC, AQ, AC0, AC1, AV0, AV0S, AV1, AV1S, V, VS]>;
+    (add AZ, AN, CC, AQ, AC0, AC1, AV0, AV0S, AV1, AV1S, V, VS)>;
 }
 
 // Should be i40, but that isn't defined. It is not a legal type yet anyway.
-def Accu : RegisterClass<"BF", [i64], 64, [A0, A1]>;
+def Accu : RegisterClass<"BF", [i64], 64, (add A0, A1)>;
diff --git a/lib/Target/CBackend/CBackend.cpp b/lib/Target/CBackend/CBackend.cpp
index 358d1b3..7c24037 100644
--- a/lib/Target/CBackend/CBackend.cpp
+++ b/lib/Target/CBackend/CBackend.cpp
@@ -278,7 +278,7 @@ namespace {
       return AI;
     }
 
-    // isInlineAsm - Check if the instruction is a call to an inline asm chunk
+    // isInlineAsm - Check if the instruction is a call to an inline asm chunk.
     static bool isInlineAsm(const Instruction& I) {
       if (const CallInst *CI = dyn_cast<CallInst>(&I))
         return isa<InlineAsm>(CI->getCalledValue());
@@ -373,7 +373,7 @@ static std::string CBEMangle(const std::string &S) {
 ///
 bool CBackendNameAllUsedStructsAndMergeFunctions::runOnModule(Module &M) {
   // Get a set of types that are used by the program...
-  std::set<const Type *> UT = getAnalysis<FindUsedTypes>().getTypes();
+  SetVector<const Type *> UT = getAnalysis<FindUsedTypes>().getTypes();
 
   // Loop over the module symbol table, removing types from UT that are
   // already named, and removing names for types that are not used.
@@ -390,11 +390,10 @@ bool CBackendNameAllUsedStructsAndMergeFunctions::runOnModule(Module &M) {
       TST.remove(I);
     } else {
       // If this is not used, remove it from the symbol table.
-      std::set<const Type *>::iterator UTI = UT.find(I->second);
-      if (UTI == UT.end())
+      if (!UT.count(I->second))
         TST.remove(I);
       else
-        UT.erase(UTI);    // Only keep one name for this type.
+        UT.remove(I->second); // Only keep one name for this type.
     }
   }
 
@@ -403,7 +402,7 @@ bool CBackendNameAllUsedStructsAndMergeFunctions::runOnModule(Module &M) {
   //
   bool Changed = false;
   unsigned RenameCounter = 0;
-  for (std::set<const Type *>::const_iterator I = UT.begin(), E = UT.end();
+  for (SetVector<const Type *>::const_iterator I = UT.begin(), E = UT.end();
        I != E; ++I)
     if ((*I)->isStructTy() || (*I)->isArrayTy()) {
       while (M.addTypeName("unnamed"+utostr(RenameCounter), *I))
@@ -661,7 +660,7 @@ void CWriter::printConstantArray(ConstantArray *CPA, bool Static) {
 
   if (isString) {
     Out << '\"';
-    // Keep track of whether the last number was a hexadecimal escape
+    // Keep track of whether the last number was a hexadecimal escape.
     bool LastWasHex = false;
 
     // Do not include the last character, which we know is null
diff --git a/lib/Target/CellSPU/SPU64InstrInfo.td b/lib/Target/CellSPU/SPU64InstrInfo.td
index 5ef5716..f340edf 100644
--- a/lib/Target/CellSPU/SPU64InstrInfo.td
+++ b/lib/Target/CellSPU/SPU64InstrInfo.td
@@ -24,7 +24,7 @@
 // 5. The code sequences for r64 and v2i64 are probably overly conservative,
 //    compared to the code that gcc produces.
 //
-// M00$E B!tes Kan be Pretty N@sTi!!!!! (appologies to Monty!)
+// M00$E B!tes Kan be Pretty N@sTi!!!!! (apologies to Monty!)
 //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
 
 // selb instruction definition for i64. Note that the selection mask is
diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp
index 743a4d7..f9b5041 100644
--- a/lib/Target/CellSPU/SPUISelLowering.cpp
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
@@ -445,6 +445,8 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
   setTargetDAGCombine(ISD::SIGN_EXTEND);
   setTargetDAGCombine(ISD::ANY_EXTEND);
 
+  setMinFunctionAlignment(3);
+
   computeRegisterProperties();
 
   // Set pre-RA register scheduler default to BURR, which produces slightly
@@ -489,11 +491,6 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
   return ((i != node_names.end()) ? i->second : 0);
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned SPUTargetLowering::getFunctionAlignment(const Function *) const {
-  return 3;
-}
-
 //===----------------------------------------------------------------------===//
 // Return the Cell SPU's SETCC result type
 //===----------------------------------------------------------------------===//
@@ -705,7 +702,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
                                                  offset
                                                 ));
 
-    // Shift the low similarily
+    // Shift the low similarly
     // TODO: add SPUISD::SHL_BYTES
     low = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, low, offset );
 
@@ -1120,8 +1117,8 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain,
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
 
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
-                 *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
   // FIXME: allow for other calling conventions
   CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU);
 
@@ -1218,7 +1215,7 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain,
       FuncInfo->setVarArgsFrameIndex(
         MFI->CreateFixedObject(StackSlotSize, ArgOffset, true));
       SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
-      unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::R32CRegClass);
+      unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::VECREGRegClass);
       SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8);
       SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(),
                                    false, false, 0);
@@ -1267,8 +1264,8 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();
 
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
-                 *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
   // FIXME: allow for other calling conventions
   CCInfo.AnalyzeCallOperands(Outs, CCC_SPU);
 
@@ -1428,8 +1425,8 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Now handle the return value(s)
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCRetInfo(CallConv, isVarArg, getTargetMachine(),
-                    RVLocs, *DAG.getContext());
+  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		    getTargetMachine(), RVLocs, *DAG.getContext());
   CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU);
 
 
@@ -1455,8 +1452,8 @@ SPUTargetLowering::LowerReturn(SDValue Chain,
                                DebugLoc dl, SelectionDAG &DAG) const {
 
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
   CCInfo.AnalyzeReturn(Outs, RetCC_SPU);
 
   // If this is the first return lowered for this function, add the regs to the
@@ -3207,11 +3204,11 @@ SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
 // LowerAsmOperandForConstraint
 void
 SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
-                                                char ConstraintLetter,
+                                                std::string &Constraint,
                                                 std::vector<SDValue> &Ops,
                                                 SelectionDAG &DAG) const {
   // Default, for the time being, to the base class handler
-  TargetLowering::LowerAsmOperandForConstraint(Op, ConstraintLetter, Ops, DAG);
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
 }
 
 /// isLegalAddressImmediate - Return true if the integer value can be used
diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h
index cf883e2..d23f6cc 100644
--- a/lib/Target/CellSPU/SPUISelLowering.h
+++ b/lib/Target/CellSPU/SPUISelLowering.h
@@ -141,7 +141,7 @@ namespace llvm {
       getRegForInlineAsmConstraint(const std::string &Constraint,
                                    EVT VT) const;
 
-    void LowerAsmOperandForConstraint(SDValue Op, char ConstraintLetter,
+    void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
                                       std::vector<SDValue> &Ops,
                                       SelectionDAG &DAG) const;
 
@@ -152,9 +152,6 @@ namespace llvm {
 
     virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
 
-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const;
-
     virtual SDValue
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv, bool isVarArg,
diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp
index 0bdd50a..623ae76 100644
--- a/lib/Target/CellSPU/SPURegisterInfo.cpp
+++ b/lib/Target/CellSPU/SPURegisterInfo.cpp
@@ -328,6 +328,10 @@ SPURegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   return SPUGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
 }
 
+int SPURegisterInfo::getLLVMRegNum(unsigned RegNum, bool isEH) const {
+  return SPUGenRegisterInfo::getLLVMRegNumFull(RegNum, 0);
+}
+
 int
 SPURegisterInfo::convertDFormToXForm(int dFormOpcode) const
 {
diff --git a/lib/Target/CellSPU/SPURegisterInfo.h b/lib/Target/CellSPU/SPURegisterInfo.h
index 1708c59..6ecf0f2 100644
--- a/lib/Target/CellSPU/SPURegisterInfo.h
+++ b/lib/Target/CellSPU/SPURegisterInfo.h
@@ -83,6 +83,7 @@ namespace llvm {
 
     //! Get DWARF debugging register number
     int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+    int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 
     //! Convert D-form load/store to X-form load/store
     /*!
diff --git a/lib/Target/CellSPU/SPURegisterInfo.td b/lib/Target/CellSPU/SPURegisterInfo.td
index 3e8f097..e16f51f 100644
--- a/lib/Target/CellSPU/SPURegisterInfo.td
+++ b/lib/Target/CellSPU/SPURegisterInfo.td
@@ -155,275 +155,29 @@ def R127 : SPUVecReg<127, "$127">, DwarfRegNum<[127]>;
 
 // The SPU's registers as 128-bit wide entities, and can function as general
 // purpose registers, where the operands are in the "preferred slot":
+// The non-volatile registers are allocated in reverse order, like PPC does it.
 def GPRC : RegisterClass<"SPU", [i128], 128,
- [
-   /* volatile register */
-   R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, 
-   R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
-   R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
-   R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
-   R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
-   R77, R78, R79,
-   /* non-volatile register: take hint from PPC and allocate in reverse order */
-   R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
-   R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
-   R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
-   R86, R85, R84, R83, R82, R81, R80, 
-   /* environment ptr, SP, LR */ 
-   R2, R1, R0 ]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GPRCClass::iterator
-    GPRCClass::allocation_order_begin(const MachineFunction &MF) const {
-      return begin();
-    }
-    GPRCClass::iterator
-    GPRCClass::allocation_order_end(const MachineFunction &MF) const {
-      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
-    }
-  }];
-}
+                         (add (sequence "R%u", 0, 79),
+                              (sequence "R%u", 127, 80))>;
 
 // The SPU's registers as 64-bit wide (double word integer) "preferred slot":
-def R64C : RegisterClass<"SPU", [i64], 128,
- [
-   /* volatile register */
-   R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, 
-   R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
-   R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
-   R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
-   R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
-   R77, R78, R79,
-   /* non-volatile register: take hint from PPC and allocate in reverse order */
-   R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
-   R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
-   R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
-   R86, R85, R84, R83, R82, R81, R80, 
-   /* environment ptr, SP, LR */ 
-   R2, R1, R0 ]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    R64CClass::iterator
-    R64CClass::allocation_order_begin(const MachineFunction &MF) const {
-      return begin();
-    }
-    R64CClass::iterator
-    R64CClass::allocation_order_end(const MachineFunction &MF) const {
-      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
-    }
-  }];
-}
+def R64C : RegisterClass<"SPU", [i64], 128, (add GPRC)>;
 
 // The SPU's registers as 64-bit wide (double word) FP "preferred slot":
-def R64FP : RegisterClass<"SPU", [f64], 128,
- [
-   /* volatile register */
-   R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, 
-   R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
-   R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
-   R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
-   R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
-   R77, R78, R79,
-   /* non-volatile register: take hint from PPC and allocate in reverse order */
-   R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
-   R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
-   R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
-   R86, R85, R84, R83, R82, R81, R80, 
-   /* environment ptr, SP, LR */ 
-   R2, R1, R0 ]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    R64FPClass::iterator
-    R64FPClass::allocation_order_begin(const MachineFunction &MF) const {
-      return begin();
-    }
-    R64FPClass::iterator
-    R64FPClass::allocation_order_end(const MachineFunction &MF) const {
-      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
-    }
-  }];
-}
+def R64FP : RegisterClass<"SPU", [f64], 128, (add GPRC)>;
 
 // The SPU's registers as 32-bit wide (word) "preferred slot":
-def R32C : RegisterClass<"SPU", [i32], 128,
- [
-   /* volatile register */
-   R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, 
-   R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
-   R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
-   R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
-   R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
-   R77, R78, R79,
-   /* non-volatile register: take hint from PPC and allocate in reverse order */
-   R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
-   R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
-   R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
-   R86, R85, R84, R83, R82, R81, R80, 
-   /* environment ptr, SP, LR */ 
-   R2, R1, R0 ]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    R32CClass::iterator
-    R32CClass::allocation_order_begin(const MachineFunction &MF) const {
-      return begin();
-    }
-    R32CClass::iterator
-    R32CClass::allocation_order_end(const MachineFunction &MF) const {
-      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
-    }
-  }];
-}
+def R32C : RegisterClass<"SPU", [i32], 128, (add GPRC)>;
 
 // The SPU's registers as single precision floating point "preferred slot":
-def R32FP : RegisterClass<"SPU", [f32], 128,
- [
-   /* volatile register */
-   R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, 
-   R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
-   R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
-   R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
-   R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
-   R77, R78, R79,
-   /* non-volatile register: take hint from PPC and allocate in reverse order */
-   R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
-   R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
-   R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
-   R86, R85, R84, R83, R82, R81, R80, 
-   /* environment ptr, SP, LR */ 
-   R2, R1, R0 ]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    R32FPClass::iterator
-    R32FPClass::allocation_order_begin(const MachineFunction &MF) const {
-      return begin();
-    }
-    R32FPClass::iterator
-    R32FPClass::allocation_order_end(const MachineFunction &MF) const {
-      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
-    }
-  }];
-}
+def R32FP : RegisterClass<"SPU", [f32], 128, (add GPRC)>;
 
 // The SPU's registers as 16-bit wide (halfword) "preferred slot":
-def R16C : RegisterClass<"SPU", [i16], 128,
- [
-   /* volatile register */
-   R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, 
-   R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
-   R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
-   R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
-   R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
-   R77, R78, R79,
-   /* non-volatile register: take hint from PPC and allocate in reverse order */
-   R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
-   R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
-   R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
-   R86, R85, R84, R83, R82, R81, R80, 
-   /* environment ptr, SP, LR */ 
-   R2, R1, R0 ]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    R16CClass::iterator
-    R16CClass::allocation_order_begin(const MachineFunction &MF) const {
-      return begin();
-    }
-    R16CClass::iterator
-    R16CClass::allocation_order_end(const MachineFunction &MF) const {
-      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
-    }
-  }];
-}
+def R16C : RegisterClass<"SPU", [i16], 128, (add GPRC)>;
 
 // The SPU's registers as 8-bit wide (byte) "preferred slot":
-def R8C : RegisterClass<"SPU", [i8], 128,
- [
-   /* volatile register */
-   R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, 
-   R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
-   R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
-   R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
-   R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
-   R77, R78, R79,
-   /* non-volatile register: take hint from PPC and allocate in reverse order */
-   R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
-   R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
-   R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
-   R86, R85, R84, R83, R82, R81, R80, 
-   /* environment ptr, SP, LR */ 
-   R2, R1, R0 ]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    R8CClass::iterator
-    R8CClass::allocation_order_begin(const MachineFunction &MF) const {
-      return begin();
-    }
-    R8CClass::iterator
-    R8CClass::allocation_order_end(const MachineFunction &MF) const {
-      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
-    }
-  }];
-}
+def R8C : RegisterClass<"SPU", [i8], 128, (add GPRC)>;
 
 // The SPU's registers as vector registers:
-def VECREG : RegisterClass<"SPU",
-                           [v16i8,v8i16,v4i32,v4f32,v2i64,v2f64],
-                           128,
- [
-   /* volatile register */
-   R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, 
-   R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
-   R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
-   R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
-   R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
-   R77, R78, R79,
-   /* non-volatile register: take hint from PPC and allocate in reverse order */
-   R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
-   R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
-   R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
-   R86, R85, R84, R83, R82, R81, R80, 
-   /* environment ptr, SP, LR */ 
-   R2, R1, R0 ]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    VECREGClass::iterator
-    VECREGClass::allocation_order_begin(const MachineFunction &MF) const {
-      return begin();
-    }
-    VECREGClass::iterator
-    VECREGClass::allocation_order_end(const MachineFunction &MF) const {
-      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
-    }
-  }];
-}
+def VECREG : RegisterClass<"SPU", [v16i8,v8i16,v4i32,v4f32,v2i64,v2f64], 128,
+                           (add GPRC)>;
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index 38de3b6..797cfd5 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -1348,7 +1348,7 @@ void CppWriter::printInstruction(const Instruction *I,
     const PHINode* phi = cast<PHINode>(I);
 
     Out << "PHINode* " << iName << " = PHINode::Create("
-        << getCppName(phi->getType()) << ", \""
+        << getCppName(phi->getType()) << ", "
         << phi->getNumIncomingValues() << ", \"";
     printEscapedString(phi->getName());
     Out << "\", " << bbname << ");";
diff --git a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
index 3379ac2..060a87b 100644
--- a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
+++ b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
@@ -57,18 +57,26 @@ static unsigned mblazeBinary2Opcode[] = {
 };
 
 static unsigned getRD(uint32_t insn) {
+  if (!MBlazeRegisterInfo::isRegister((insn>>21)&0x1F))
+    return UNSUPPORTED;
   return MBlazeRegisterInfo::getRegisterFromNumbering((insn>>21)&0x1F);
 }
 
 static unsigned getRA(uint32_t insn) {
+  if (!MBlazeRegisterInfo::getRegisterFromNumbering((insn>>16)&0x1F))
+    return UNSUPPORTED;
   return MBlazeRegisterInfo::getRegisterFromNumbering((insn>>16)&0x1F);
 }
 
 static unsigned getRB(uint32_t insn) {
+  if (!MBlazeRegisterInfo::getRegisterFromNumbering((insn>>11)&0x1F))
+    return UNSUPPORTED;
   return MBlazeRegisterInfo::getRegisterFromNumbering((insn>>11)&0x1F);
 }
 
 static int64_t getRS(uint32_t insn) {
+  if (!MBlazeRegisterInfo::isSpecialRegister(insn&0x3FFF))
+    return UNSUPPORTED;
   return MBlazeRegisterInfo::getSpecialRegisterFromNumbering(insn&0x3FFF);
 }
 
@@ -489,13 +497,14 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
                                         raw_ostream &vStream) const {
   // The machine instruction.
   uint32_t insn;
+  uint64_t read;
   uint8_t bytes[4];
 
-  // We always consume 4 bytes of data
-  size = 4;
+  // By default we consume 1 byte on failure
+  size = 1;
 
   // We want to read exactly 4 bytes of data.
-  if (region.readBytes(address, 4, (uint8_t*)bytes, NULL) == -1)
+  if (region.readBytes(address, 4, (uint8_t*)bytes, &read) == -1 || read < 4)
     return false;
 
   // Encoded as a big-endian 32-bit word in the stream.
@@ -509,44 +518,63 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
 
   instr.setOpcode(opcode);
 
+  unsigned RD = getRD(insn);
+  unsigned RA = getRA(insn);
+  unsigned RB = getRB(insn);
+  unsigned RS = getRS(insn);
+
   uint64_t tsFlags = MBlazeInsts[opcode].TSFlags;
   switch ((tsFlags & MBlazeII::FormMask)) {
-  default: llvm_unreachable("unknown instruction encoding");
+  default: 
+    return false;
 
   case MBlazeII::FRRRR:
-    instr.addOperand(MCOperand::CreateReg(getRD(insn)));
-    instr.addOperand(MCOperand::CreateReg(getRB(insn)));
-    instr.addOperand(MCOperand::CreateReg(getRA(insn)));
+    if (RD == UNSUPPORTED || RA == UNSUPPORTED || RB == UNSUPPORTED)
+      return false;
+    instr.addOperand(MCOperand::CreateReg(RD));
+    instr.addOperand(MCOperand::CreateReg(RB));
+    instr.addOperand(MCOperand::CreateReg(RA));
     break;
 
   case MBlazeII::FRRR:
-    instr.addOperand(MCOperand::CreateReg(getRD(insn)));
-    instr.addOperand(MCOperand::CreateReg(getRA(insn)));
-    instr.addOperand(MCOperand::CreateReg(getRB(insn)));
+    if (RD == UNSUPPORTED || RA == UNSUPPORTED || RB == UNSUPPORTED)
+      return false;
+    instr.addOperand(MCOperand::CreateReg(RD));
+    instr.addOperand(MCOperand::CreateReg(RA));
+    instr.addOperand(MCOperand::CreateReg(RB));
     break;
 
   case MBlazeII::FRI:
     switch (opcode) {
-    default: llvm_unreachable("unknown instruction encoding");
+    default: 
+      return false;
     case MBlaze::MFS:
-      instr.addOperand(MCOperand::CreateReg(getRD(insn)));
+      if (RD == UNSUPPORTED)
+        return false;
+      instr.addOperand(MCOperand::CreateReg(RD));
       instr.addOperand(MCOperand::CreateImm(insn&0x3FFF));
       break;
     case MBlaze::MTS:
+      if (RA == UNSUPPORTED)
+        return false;
       instr.addOperand(MCOperand::CreateImm(insn&0x3FFF));
-      instr.addOperand(MCOperand::CreateReg(getRA(insn)));
+      instr.addOperand(MCOperand::CreateReg(RA));
       break;
     case MBlaze::MSRSET:
     case MBlaze::MSRCLR:
-      instr.addOperand(MCOperand::CreateReg(getRD(insn)));
+      if (RD == UNSUPPORTED)
+        return false;
+      instr.addOperand(MCOperand::CreateReg(RD));
       instr.addOperand(MCOperand::CreateImm(insn&0x7FFF));
       break;
     }
     break;
 
   case MBlazeII::FRRI:
-    instr.addOperand(MCOperand::CreateReg(getRD(insn)));
-    instr.addOperand(MCOperand::CreateReg(getRA(insn)));
+    if (RD == UNSUPPORTED || RA == UNSUPPORTED)
+      return false;
+    instr.addOperand(MCOperand::CreateReg(RD));
+    instr.addOperand(MCOperand::CreateReg(RA));
     switch (opcode) {
     default:
       instr.addOperand(MCOperand::CreateImm(getIMM(insn)));
@@ -560,27 +588,37 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
     break;
 
   case MBlazeII::FCRR:
-    instr.addOperand(MCOperand::CreateReg(getRA(insn)));
-    instr.addOperand(MCOperand::CreateReg(getRB(insn)));
+    if (RA == UNSUPPORTED || RB == UNSUPPORTED)
+      return false;
+    instr.addOperand(MCOperand::CreateReg(RA));
+    instr.addOperand(MCOperand::CreateReg(RB));
     break;
 
   case MBlazeII::FCRI:
-    instr.addOperand(MCOperand::CreateReg(getRA(insn)));
+    if (RA == UNSUPPORTED)
+      return false;
+    instr.addOperand(MCOperand::CreateReg(RA));
     instr.addOperand(MCOperand::CreateImm(getIMM(insn)));
     break;
 
   case MBlazeII::FRCR:
-    instr.addOperand(MCOperand::CreateReg(getRD(insn)));
-    instr.addOperand(MCOperand::CreateReg(getRB(insn)));
+    if (RD == UNSUPPORTED || RB == UNSUPPORTED)
+      return false;
+    instr.addOperand(MCOperand::CreateReg(RD));
+    instr.addOperand(MCOperand::CreateReg(RB));
     break;
 
   case MBlazeII::FRCI:
-    instr.addOperand(MCOperand::CreateReg(getRD(insn)));
+    if (RD == UNSUPPORTED)
+      return false;
+    instr.addOperand(MCOperand::CreateReg(RD));
     instr.addOperand(MCOperand::CreateImm(getIMM(insn)));
     break;
 
   case MBlazeII::FCCR:
-    instr.addOperand(MCOperand::CreateReg(getRB(insn)));
+    if (RB == UNSUPPORTED)
+      return false;
+    instr.addOperand(MCOperand::CreateReg(RB));
     break;
 
   case MBlazeII::FCCI:
@@ -588,33 +626,45 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
     break;
 
   case MBlazeII::FRRCI:
-    instr.addOperand(MCOperand::CreateReg(getRD(insn)));
-    instr.addOperand(MCOperand::CreateReg(getRA(insn)));
+    if (RD == UNSUPPORTED || RA == UNSUPPORTED)
+      return false;
+    instr.addOperand(MCOperand::CreateReg(RD));
+    instr.addOperand(MCOperand::CreateReg(RA));
     instr.addOperand(MCOperand::CreateImm(getSHT(insn)));
     break;
 
   case MBlazeII::FRRC:
-    instr.addOperand(MCOperand::CreateReg(getRD(insn)));
-    instr.addOperand(MCOperand::CreateReg(getRA(insn)));
+    if (RD == UNSUPPORTED || RA == UNSUPPORTED)
+      return false;
+    instr.addOperand(MCOperand::CreateReg(RD));
+    instr.addOperand(MCOperand::CreateReg(RA));
     break;
 
   case MBlazeII::FRCX:
-    instr.addOperand(MCOperand::CreateReg(getRD(insn)));
+    if (RD == UNSUPPORTED)
+      return false;
+    instr.addOperand(MCOperand::CreateReg(RD));
     instr.addOperand(MCOperand::CreateImm(getFSL(insn)));
     break;
 
   case MBlazeII::FRCS:
-    instr.addOperand(MCOperand::CreateReg(getRD(insn)));
-    instr.addOperand(MCOperand::CreateReg(getRS(insn)));
+    if (RD == UNSUPPORTED || RS == UNSUPPORTED)
+      return false;
+    instr.addOperand(MCOperand::CreateReg(RD));
+    instr.addOperand(MCOperand::CreateReg(RS));
     break;
 
   case MBlazeII::FCRCS:
-    instr.addOperand(MCOperand::CreateReg(getRS(insn)));
-    instr.addOperand(MCOperand::CreateReg(getRA(insn)));
+    if (RS == UNSUPPORTED || RA == UNSUPPORTED)
+      return false;
+    instr.addOperand(MCOperand::CreateReg(RS));
+    instr.addOperand(MCOperand::CreateReg(RA));
     break;
 
   case MBlazeII::FCRCX:
-    instr.addOperand(MCOperand::CreateReg(getRA(insn)));
+    if (RA == UNSUPPORTED)
+      return false;
+    instr.addOperand(MCOperand::CreateReg(RA));
     instr.addOperand(MCOperand::CreateImm(getFSL(insn)));
     break;
 
@@ -623,16 +673,23 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
     break;
 
   case MBlazeII::FCR:
-    instr.addOperand(MCOperand::CreateReg(getRB(insn)));
+    if (RB == UNSUPPORTED)
+      return false;
+    instr.addOperand(MCOperand::CreateReg(RB));
     break;
 
   case MBlazeII::FRIR:
-    instr.addOperand(MCOperand::CreateReg(getRD(insn)));
+    if (RD == UNSUPPORTED || RA == UNSUPPORTED)
+      return false;
+    instr.addOperand(MCOperand::CreateReg(RD));
     instr.addOperand(MCOperand::CreateImm(getIMM(insn)));
-    instr.addOperand(MCOperand::CreateReg(getRA(insn)));
+    instr.addOperand(MCOperand::CreateReg(RA));
     break;
   }
 
+  // We always consume 4 bytes of data on success
+  size = 4;
+
   return true;
 }
 
diff --git a/lib/Target/MBlaze/MBlaze.td b/lib/Target/MBlaze/MBlaze.td
index 1fa1e4d..1245658 100644
--- a/lib/Target/MBlaze/MBlaze.td
+++ b/lib/Target/MBlaze/MBlaze.td
@@ -31,49 +31,28 @@ def MBlazeInstrInfo : InstrInfo;
 // Microblaze Subtarget features                                              //
 //===----------------------------------------------------------------------===//
 
-def FeaturePipe3       : SubtargetFeature<"pipe3", "HasPipe3", "true",
-                                "Implements 3-stage pipeline">;
 def FeatureBarrel      : SubtargetFeature<"barrel", "HasBarrel", "true",
                                 "Implements barrel shifter">;
 def FeatureDiv         : SubtargetFeature<"div", "HasDiv", "true",
                                 "Implements hardware divider">;
 def FeatureMul         : SubtargetFeature<"mul", "HasMul", "true",
                                 "Implements hardware multiplier">;
-def FeatureFSL         : SubtargetFeature<"fsl", "HasFSL", "true",
-                                "Implements FSL instructions">;
-def FeatureEFSL        : SubtargetFeature<"efsl", "HasEFSL", "true",
-                                "Implements extended FSL instructions">;
-def FeatureMSRSet      : SubtargetFeature<"msrset", "HasMSRSet", "true",
-                                "Implements MSR register set and clear">;
-def FeatureException   : SubtargetFeature<"exception", "HasException", "true",
-                                "Implements hardware exception support">;
 def FeaturePatCmp      : SubtargetFeature<"patcmp", "HasPatCmp", "true",
                                 "Implements pattern compare instruction">;
 def FeatureFPU         : SubtargetFeature<"fpu", "HasFPU", "true",
                                 "Implements floating point unit">;
-def FeatureESR         : SubtargetFeature<"esr", "HasESR", "true",
-                                "Implements ESR and EAR registers">;
-def FeaturePVR         : SubtargetFeature<"pvr", "HasPVR", "true",
-                                "Implements processor version register">;
 def FeatureMul64       : SubtargetFeature<"mul64", "HasMul64", "true",
                                 "Implements multiplier with 64-bit result">;
 def FeatureSqrt        : SubtargetFeature<"sqrt", "HasSqrt", "true",
                                 "Implements sqrt and floating point convert">;
-def FeatureMMU         : SubtargetFeature<"mmu", "HasMMU", "true",
-                                "Implements memory management unit">;
 
 //===----------------------------------------------------------------------===//
 // MBlaze processors supported.
 //===----------------------------------------------------------------------===//
 
-class Proc<string Name, list<SubtargetFeature> Features>
- : Processor<Name, MBlazeGenericItineraries, Features>;
-
-def : Proc<"v400", []>;
-def : Proc<"v500", []>;
-def : Proc<"v600", []>;
-def : Proc<"v700", []>;
-def : Proc<"v710", []>;
+def : Processor<"mblaze",  MBlazeGenericItineraries, []>;
+def : Processor<"mblaze3", MBlazePipe3Itineraries, []>;
+def : Processor<"mblaze5", MBlazePipe5Itineraries, []>;
 
 //===----------------------------------------------------------------------===//
 // Instruction Descriptions
diff --git a/lib/Target/MBlaze/MBlazeAsmBackend.cpp b/lib/Target/MBlaze/MBlazeAsmBackend.cpp
index a4b21af..08f14c3 100644
--- a/lib/Target/MBlaze/MBlazeAsmBackend.cpp
+++ b/lib/Target/MBlaze/MBlazeAsmBackend.cpp
@@ -150,14 +150,13 @@ void ELFMBlazeAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data,
 
 TargetAsmBackend *llvm::createMBlazeAsmBackend(const Target &T,
                                             const std::string &TT) {
-  switch (Triple(TT).getOS()) {
-  case Triple::Darwin:
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin())
     assert(0 && "Mac not supported on MBlaze");
-  case Triple::MinGW32:
-  case Triple::Cygwin:
-  case Triple::Win32:
+
+  if (TheTriple.isOSWindows())
     assert(0 && "Windows not supported on MBlaze");
-  default:
-    return new ELFMBlazeAsmBackend(T, Triple(TT).getOS());
-  }
+
+  return new ELFMBlazeAsmBackend(T, TheTriple.getOS());
 }
diff --git a/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp b/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp
index 4399ee2..973e968 100644
--- a/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp
+++ b/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp
@@ -77,7 +77,7 @@ static bool hasImmInstruction(MachineBasicBlock::iterator &candidate) {
 
         // We must assume that unknown immediate values require more than
         // 16-bits to represent.
-        if (mop.isGlobal() || mop.isSymbol())
+        if (mop.isGlobal() || mop.isSymbol() || mop.isJTI() || mop.isCPI())
           return true;
 
         // FIXME: we could probably check to see if the FP value happens
diff --git a/lib/Target/MBlaze/MBlazeISelLowering.cpp b/lib/Target/MBlaze/MBlazeISelLowering.cpp
index f39826b..c5e0a89 100644
--- a/lib/Target/MBlaze/MBlazeISelLowering.cpp
+++ b/lib/Target/MBlaze/MBlazeISelLowering.cpp
@@ -180,6 +180,8 @@ MBlazeTargetLowering::MBlazeTargetLowering(MBlazeTargetMachine &TM)
   setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 
+  setMinFunctionAlignment(2);
+
   setStackPointerRegisterToSaveRestore(MBlaze::R1);
   computeRegisterProperties();
 }
@@ -188,11 +190,6 @@ MVT::SimpleValueType MBlazeTargetLowering::getSetCCResultType(EVT VT) const {
   return MVT::i32;
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned MBlazeTargetLowering::getFunctionAlignment(const Function *) const {
-  return 2;
-}
-
 SDValue MBlazeTargetLowering::LowerOperation(SDValue Op,
                                              SelectionDAG &DAG) const {
   switch (Op.getOpcode())
@@ -274,7 +271,7 @@ MBlazeTargetLowering::EmitCustomShift(MachineInstr *MI,
   F->insert(It, loop);
   F->insert(It, finish);
 
-  // Update machine-CFG edges by transfering adding all successors and
+  // Update machine-CFG edges by transferring adding all successors and
   // remaining instructions from the current block to the new block which
   // will contain the Phi node for the select.
   finish->splice(finish->begin(), MBB,
@@ -420,7 +417,7 @@ MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI,
   // All atomic instructions on the Microblaze are implemented using the
   // load-linked / store-conditional style atomic instruction sequences.
   // Thus, all operations will look something like the following:
-  // 
+  //
   //  start:
   //    lwx     RV, RP, 0
   //    <do stuff>
@@ -456,7 +453,7 @@ MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI,
   F->insert(It, start);
   F->insert(It, exit);
 
-  // Update machine-CFG edges by transfering adding all successors and
+  // Update machine-CFG edges by transferring adding all successors and
   // remaining instructions from the current block to the new block which
   // will contain the Phi node for the select.
   exit->splice(exit->begin(), MBB, llvm::next(MachineBasicBlock::iterator(MI)),
@@ -701,8 +698,8 @@ LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
-                 *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
   CCInfo.AnalyzeCallOperands(Outs, CC_MBlaze);
 
   // Get a count of how many bytes are to be pushed on the stack.
@@ -778,7 +775,7 @@ LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
 
   // Build a sequence of copy-to-reg nodes chained together with token
   // chain and flag operands which copy the outgoing args into registers.
-  // The InFlag in necessary since all emited instructions must be
+  // The InFlag in necessary since all emitted instructions must be
   // stuck together.
   SDValue InFlag;
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
@@ -840,8 +837,8 @@ LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv,
                 SmallVectorImpl<SDValue> &InVals) const {
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
   CCInfo.AnalyzeCallResult(Ins, RetCC_MBlaze);
 
@@ -883,8 +880,8 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
 
   CCInfo.AnalyzeFormalArguments(Ins, CC_MBlaze);
   SDValue StackPtr;
@@ -1015,8 +1012,8 @@ LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
   // Analize return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_MBlaze);
@@ -1046,9 +1043,9 @@ LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
 
   // If this function is using the interrupt_handler calling convention
   // then use "rtid r14, 0" otherwise use "rtsd r15, 8"
-  unsigned Ret = (CallConv == llvm::CallingConv::MBLAZE_INTR) ? MBlazeISD::IRet 
+  unsigned Ret = (CallConv == llvm::CallingConv::MBLAZE_INTR) ? MBlazeISD::IRet
                                                               : MBlazeISD::Ret;
-  unsigned Reg = (CallConv == llvm::CallingConv::MBLAZE_INTR) ? MBlaze::R14 
+  unsigned Reg = (CallConv == llvm::CallingConv::MBLAZE_INTR) ? MBlaze::R14
                                                               : MBlaze::R15;
   SDValue DReg = DAG.getRegister(Reg, MVT::i32);
 
@@ -1103,7 +1100,7 @@ MBlazeTargetLowering::getSingleConstraintMatchWeight(
   switch (*constraint) {
   default:
     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
-    break;
+    break;
   case 'd':
   case 'y':
     if (type->isIntegerTy())
diff --git a/lib/Target/MBlaze/MBlazeISelLowering.h b/lib/Target/MBlaze/MBlazeISelLowering.h
index 91649bc..265c1a7 100644
--- a/lib/Target/MBlaze/MBlazeISelLowering.h
+++ b/lib/Target/MBlaze/MBlazeISelLowering.h
@@ -104,7 +104,6 @@ namespace llvm {
     /// getSetCCResultType - get the ISD::SETCC result ValueType
     MVT::SimpleValueType getSetCCResultType(EVT VT) const;
 
-    virtual unsigned getFunctionAlignment(const Function *F) const;
   private:
     // Subtarget Info
     const MBlazeSubtarget *Subtarget;
diff --git a/lib/Target/MBlaze/MBlazeInstrFPU.td b/lib/Target/MBlaze/MBlazeInstrFPU.td
index 094de5c..4acdcfd 100644
--- a/lib/Target/MBlaze/MBlazeInstrFPU.td
+++ b/lib/Target/MBlaze/MBlazeInstrFPU.td
@@ -21,22 +21,22 @@
 class LoadFM<bits<6> op, string instr_asm, PatFrag OpNode> :
              TA<op, 0x000, (outs GPR:$dst), (ins memrr:$addr),
                 !strconcat(instr_asm, "   $dst, $addr"),
-                [(set (f32 GPR:$dst), (OpNode xaddr:$addr))], IILoad>;
+                [(set (f32 GPR:$dst), (OpNode xaddr:$addr))], IIC_MEMl>;
 
 class LoadFMI<bits<6> op, string instr_asm, PatFrag OpNode> :
               TB<op, (outs GPR:$dst), (ins memri:$addr),
                  !strconcat(instr_asm, "   $dst, $addr"),
-                 [(set (f32 GPR:$dst), (OpNode iaddr:$addr))], IILoad>;
+                 [(set (f32 GPR:$dst), (OpNode iaddr:$addr))], IIC_MEMl>;
 
 class StoreFM<bits<6> op, string instr_asm, PatFrag OpNode> :
               TA<op, 0x000, (outs), (ins GPR:$dst, memrr:$addr),
                  !strconcat(instr_asm, "   $dst, $addr"),
-                 [(OpNode (f32 GPR:$dst), xaddr:$addr)], IIStore>;
+                 [(OpNode (f32 GPR:$dst), xaddr:$addr)], IIC_MEMs>;
 
 class StoreFMI<bits<6> op, string instr_asm, PatFrag OpNode> :
                TB<op, (outs), (ins GPR:$dst, memrr:$addr),
                   !strconcat(instr_asm, "   $dst, $addr"),
-                  [(OpNode (f32 GPR:$dst), iaddr:$addr)], IIStore>;
+                  [(OpNode (f32 GPR:$dst), iaddr:$addr)], IIC_MEMs>;
 
 class ArithF<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode,
              InstrItinClass itin> :
@@ -56,15 +56,10 @@ class ArithFR<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode,
                  !strconcat(instr_asm, "   $dst, $c, $b"),
                  [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], itin>;
 
-class LogicF<bits<6> op, string instr_asm> :
-             TB<op, (outs GPR:$dst), (ins GPR:$b, GPR:$c),
-                !strconcat(instr_asm, "   $dst, $b, $c"),
-                [], IIAlu>;
-
 class LogicFI<bits<6> op, string instr_asm> :
              TB<op, (outs GPR:$dst), (ins GPR:$b, fimm:$c),
                 !strconcat(instr_asm, "   $dst, $b, $c"),
-                [], IIAlu>;
+                [], IIC_ALU>;
 
 let rb=0 in {
   class ArithF2<bits<6> op, bits<11> flags, string instr_asm,
@@ -95,10 +90,10 @@ let rb=0 in {
 //===----------------------------------------------------------------------===//
 let Predicates=[HasFPU] in {
   def FORI   : LogicFI<0x28, "ori    ">;
-  def FADD   :  ArithF<0x16, 0x000, "fadd   ", fadd, IIAlu>;
-  def FRSUB  : ArithFR<0x16, 0x080, "frsub  ", fsub, IIAlu>;
-  def FMUL   :  ArithF<0x16, 0x100, "fmul   ", fmul, IIAlu>;
-  def FDIV   :  ArithF<0x16, 0x180, "fdiv   ", fdiv, IIAlu>;
+  def FADD   :  ArithF<0x16, 0x000, "fadd   ", fadd, IIC_FPU>;
+  def FRSUB  : ArithFR<0x16, 0x080, "frsub  ", fsub, IIC_FPU>;
+  def FMUL   :  ArithF<0x16, 0x100, "fmul   ", fmul, IIC_FPU>;
+  def FDIV   :  ArithF<0x16, 0x180, "fdiv   ", fdiv, IIC_FPUd>;
 }
 
 let Predicates=[HasFPU], isCodeGenOnly=1 in {
@@ -110,19 +105,19 @@ let Predicates=[HasFPU], isCodeGenOnly=1 in {
 }
 
 let Predicates=[HasFPU,HasSqrt] in {
-  def FLT    : ArithIF<0x16, 0x280, "flt    ", IIAlu>;
-  def FINT   : ArithFI<0x16, 0x300, "fint   ", IIAlu>;
-  def FSQRT  : ArithF2<0x16, 0x380, "fsqrt  ", IIAlu>;
+  def FLT    : ArithIF<0x16, 0x280, "flt    ", IIC_FPUf>;
+  def FINT   : ArithFI<0x16, 0x300, "fint   ", IIC_FPUi>;
+  def FSQRT  : ArithF2<0x16, 0x380, "fsqrt  ", IIC_FPUs>;
 }
 
 let isAsCheapAsAMove = 1 in {
-  def FCMP_UN : CmpFN<0x16, 0x200, "fcmp.un", IIAlu>;
-  def FCMP_LT : CmpFN<0x16, 0x210, "fcmp.lt", IIAlu>;
-  def FCMP_EQ : CmpFN<0x16, 0x220, "fcmp.eq", IIAlu>;
-  def FCMP_LE : CmpFN<0x16, 0x230, "fcmp.le", IIAlu>;
-  def FCMP_GT : CmpFN<0x16, 0x240, "fcmp.gt", IIAlu>;
-  def FCMP_NE : CmpFN<0x16, 0x250, "fcmp.ne", IIAlu>;
-  def FCMP_GE : CmpFN<0x16, 0x260, "fcmp.ge", IIAlu>;
+  def FCMP_UN : CmpFN<0x16, 0x200, "fcmp.un", IIC_FPUc>;
+  def FCMP_LT : CmpFN<0x16, 0x210, "fcmp.lt", IIC_FPUc>;
+  def FCMP_EQ : CmpFN<0x16, 0x220, "fcmp.eq", IIC_FPUc>;
+  def FCMP_LE : CmpFN<0x16, 0x230, "fcmp.le", IIC_FPUc>;
+  def FCMP_GT : CmpFN<0x16, 0x240, "fcmp.gt", IIC_FPUc>;
+  def FCMP_NE : CmpFN<0x16, 0x250, "fcmp.ne", IIC_FPUc>;
+  def FCMP_GE : CmpFN<0x16, 0x260, "fcmp.ge", IIC_FPUc>;
 }
 
 
diff --git a/lib/Target/MBlaze/MBlazeInstrFSL.td b/lib/Target/MBlaze/MBlazeInstrFSL.td
index 3209845..3082a7e 100644
--- a/lib/Target/MBlaze/MBlazeInstrFSL.td
+++ b/lib/Target/MBlaze/MBlazeInstrFSL.td
@@ -13,7 +13,7 @@
 class FSLGet<bits<6> op, bits<5> flags, string instr_asm, Intrinsic OpNode> :
              MBlazeInst<op, FRCX, (outs GPR:$dst), (ins fslimm:$b),
                         !strconcat(instr_asm, " $dst, $b"),
-                        [(set GPR:$dst, (OpNode immZExt4:$b))],IIAlu>
+                        [(set GPR:$dst, (OpNode immZExt4:$b))],IIC_FSLg>
 {
     bits<5> rd;
     bits<4> fslno;
@@ -29,7 +29,7 @@ class FSLGet<bits<6> op, bits<5> flags, string instr_asm, Intrinsic OpNode> :
 class FSLGetD<bits<6> op, bits<5> flags, string instr_asm, Intrinsic OpNode> :
               MBlazeInst<op, FRCR, (outs GPR:$dst), (ins GPR:$b),
                          !strconcat(instr_asm, " $dst, $b"),
-                         [(set GPR:$dst, (OpNode GPR:$b))], IIAlu>
+                         [(set GPR:$dst, (OpNode GPR:$b))], IIC_FSLg>
 {
     bits<5> rd;
     bits<5> rb;
@@ -45,7 +45,7 @@ class FSLGetD<bits<6> op, bits<5> flags, string instr_asm, Intrinsic OpNode> :
 class FSLPut<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> :
              MBlazeInst<op, FCRCX, (outs), (ins GPR:$v, fslimm:$b),
                         !strconcat(instr_asm, " $v, $b"),
-                        [(OpNode GPR:$v, immZExt4:$b)], IIAlu>
+                        [(OpNode GPR:$v, immZExt4:$b)], IIC_FSLp>
 {
     bits<5> ra;
     bits<4> fslno;
@@ -61,7 +61,7 @@ class FSLPut<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> :
 class FSLPutD<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> :
               MBlazeInst<op, FCRR, (outs), (ins GPR:$v, GPR:$b),
                          !strconcat(instr_asm, " $v, $b"),
-                         [(OpNode GPR:$v, GPR:$b)], IIAlu>
+                         [(OpNode GPR:$v, GPR:$b)], IIC_FSLp>
 {
     bits<5> ra;
     bits<5> rb;
@@ -77,7 +77,7 @@ class FSLPutD<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> :
 class FSLPutT<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> :
               MBlazeInst<op, FCX, (outs), (ins fslimm:$b),
                          !strconcat(instr_asm, " $b"),
-                         [(OpNode immZExt4:$b)], IIAlu>
+                         [(OpNode immZExt4:$b)], IIC_FSLp>
 {
     bits<4> fslno;
 
@@ -92,7 +92,7 @@ class FSLPutT<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> :
 class FSLPutTD<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> :
                MBlazeInst<op, FCR, (outs), (ins GPR:$b),
                           !strconcat(instr_asm, " $b"),
-                          [(OpNode GPR:$b)], IIAlu>
+                          [(OpNode GPR:$b)], IIC_FSLp>
 {
     bits<5> rb;
 
diff --git a/lib/Target/MBlaze/MBlazeInstrFormats.td b/lib/Target/MBlaze/MBlazeInstrFormats.td
index d62574d..54f605f 100644
--- a/lib/Target/MBlaze/MBlazeInstrFormats.td
+++ b/lib/Target/MBlaze/MBlazeInstrFormats.td
@@ -81,7 +81,7 @@ class MBlazeInst<bits<6> op, Format form, dag outs, dag ins, string asmstr,
 // Pseudo instruction class
 //===----------------------------------------------------------------------===//
 class MBlazePseudo<dag outs, dag ins, string asmstr, list<dag> pattern>:
-      MBlazeInst<0x0, FPseudo, outs, ins, asmstr, pattern, IIPseudo>;
+      MBlazeInst<0x0, FPseudo, outs, ins, asmstr, pattern, IIC_Pseudo>;
 
 //===----------------------------------------------------------------------===//
 // Type A instruction class in MBlaze : <|opcode|rd|ra|rb|flags|>
diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.cpp b/lib/Target/MBlaze/MBlazeInstrInfo.cpp
index b353dcd..794ebed 100644
--- a/lib/Target/MBlaze/MBlazeInstrInfo.cpp
+++ b/lib/Target/MBlaze/MBlazeInstrInfo.cpp
@@ -17,6 +17,8 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "MBlazeGenInstrInfo.inc"
 
diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.h b/lib/Target/MBlaze/MBlazeInstrInfo.h
index b7300c1..b717da8 100644
--- a/lib/Target/MBlaze/MBlazeInstrInfo.h
+++ b/lib/Target/MBlaze/MBlazeInstrInfo.h
@@ -261,7 +261,6 @@ public:
   virtual bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond)
     const;
 
-
   virtual void copyPhysReg(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator I, DebugLoc DL,
                            unsigned DestReg, unsigned SrcReg,
diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.td b/lib/Target/MBlaze/MBlazeInstrInfo.td
index 7b8f70a..950f2d7 100644
--- a/lib/Target/MBlaze/MBlazeInstrInfo.td
+++ b/lib/Target/MBlaze/MBlazeInstrInfo.td
@@ -47,22 +47,22 @@ def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_MBCallSeqEnd,
 //===----------------------------------------------------------------------===//
 // MBlaze Instruction Predicate Definitions.
 //===----------------------------------------------------------------------===//
-def HasPipe3     : Predicate<"Subtarget.hasPipe3()">;
+// def HasPipe3     : Predicate<"Subtarget.hasPipe3()">;
 def HasBarrel    : Predicate<"Subtarget.hasBarrel()">;
-def NoBarrel     : Predicate<"!Subtarget.hasBarrel()">;
+// def NoBarrel     : Predicate<"!Subtarget.hasBarrel()">;
 def HasDiv       : Predicate<"Subtarget.hasDiv()">;
 def HasMul       : Predicate<"Subtarget.hasMul()">;
-def HasFSL       : Predicate<"Subtarget.hasFSL()">;
-def HasEFSL      : Predicate<"Subtarget.hasEFSL()">;
-def HasMSRSet    : Predicate<"Subtarget.hasMSRSet()">;
-def HasException : Predicate<"Subtarget.hasException()">;
+// def HasFSL       : Predicate<"Subtarget.hasFSL()">;
+// def HasEFSL      : Predicate<"Subtarget.hasEFSL()">;
+// def HasMSRSet    : Predicate<"Subtarget.hasMSRSet()">;
+// def HasException : Predicate<"Subtarget.hasException()">;
 def HasPatCmp    : Predicate<"Subtarget.hasPatCmp()">;
 def HasFPU       : Predicate<"Subtarget.hasFPU()">;
-def HasESR       : Predicate<"Subtarget.hasESR()">;
-def HasPVR       : Predicate<"Subtarget.hasPVR()">;
+// def HasESR       : Predicate<"Subtarget.hasESR()">;
+// def HasPVR       : Predicate<"Subtarget.hasPVR()">;
 def HasMul64     : Predicate<"Subtarget.hasMul64()">;
 def HasSqrt      : Predicate<"Subtarget.hasSqrt()">;
-def HasMMU       : Predicate<"Subtarget.hasMMU()">;
+// def HasMMU       : Predicate<"Subtarget.hasMMU()">;
 
 //===----------------------------------------------------------------------===//
 // MBlaze Operand, Complex Patterns and Transformations Definitions.
@@ -170,18 +170,18 @@ class ArithI<bits<6> op, string instr_asm, SDNode OpNode,
              Operand Od, PatLeaf imm_type> :
              TB<op, (outs GPR:$dst), (ins GPR:$b, Od:$c),
                 !strconcat(instr_asm, "   $dst, $b, $c"),
-                [(set GPR:$dst, (OpNode GPR:$b, imm_type:$c))], IIAlu>;
+                [(set GPR:$dst, (OpNode GPR:$b, imm_type:$c))], IIC_ALU>;
 
 class ArithI32<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> :
                TB<op, (outs GPR:$dst), (ins GPR:$b, Od:$c),
                   !strconcat(instr_asm, "   $dst, $b, $c"),
-                  [], IIAlu>;
+                  [], IIC_ALU>;
 
 class ShiftI<bits<6> op, bits<2> flags, string instr_asm, SDNode OpNode,
              Operand Od, PatLeaf imm_type> :
              SHT<op, flags, (outs GPR:$dst), (ins GPR:$b, Od:$c),
                  !strconcat(instr_asm, "   $dst, $b, $c"),
-                 [(set GPR:$dst, (OpNode GPR:$b, imm_type:$c))], IIAlu>;
+                 [(set GPR:$dst, (OpNode GPR:$b, imm_type:$c))], IIC_SHT>;
 
 class ArithR<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode,
             InstrItinClass itin> :
@@ -193,7 +193,7 @@ class ArithRI<bits<6> op, string instr_asm, SDNode OpNode,
              Operand Od, PatLeaf imm_type> :
              TBR<op, (outs GPR:$dst), (ins Od:$b, GPR:$c),
                  !strconcat(instr_asm, "   $dst, $c, $b"),
-                 [(set GPR:$dst, (OpNode imm_type:$b, GPR:$c))], IIAlu>;
+                 [(set GPR:$dst, (OpNode imm_type:$b, GPR:$c))], IIC_ALU>;
 
 class ArithN<bits<6> op, bits<11> flags, string instr_asm,
             InstrItinClass itin> :
@@ -204,7 +204,7 @@ class ArithN<bits<6> op, bits<11> flags, string instr_asm,
 class ArithNI<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> :
              TB<op, (outs GPR:$dst), (ins GPR:$b, Od:$c),
                 !strconcat(instr_asm, "   $dst, $b, $c"),
-                [], IIAlu>;
+                [], IIC_ALU>;
 
 class ArithRN<bits<6> op, bits<11> flags, string instr_asm,
             InstrItinClass itin> :
@@ -215,7 +215,7 @@ class ArithRN<bits<6> op, bits<11> flags, string instr_asm,
 class ArithRNI<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> :
              TBR<op, (outs GPR:$dst), (ins Od:$c, GPR:$b),
                  !strconcat(instr_asm, "   $dst, $b, $c"),
-                 [], IIAlu>;
+                 [], IIC_ALU>;
 
 //===----------------------------------------------------------------------===//
 // Misc Arithmetic Instructions
@@ -224,46 +224,51 @@ class ArithRNI<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> :
 class Logic<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode> :
             TA<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c),
                !strconcat(instr_asm, "   $dst, $b, $c"),
-               [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], IIAlu>;
+               [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], IIC_ALU>;
 
 class LogicI<bits<6> op, string instr_asm, SDNode OpNode> :
              TB<op, (outs GPR:$dst), (ins GPR:$b, uimm16:$c),
                 !strconcat(instr_asm, "   $dst, $b, $c"),
                 [(set GPR:$dst, (OpNode GPR:$b, immZExt16:$c))],
-                IIAlu>;
+                IIC_ALU>;
 
 class LogicI32<bits<6> op, string instr_asm> :
                TB<op, (outs GPR:$dst), (ins GPR:$b, uimm16:$c),
                   !strconcat(instr_asm, "   $dst, $b, $c"),
-                  [], IIAlu>;
+                  [], IIC_ALU>;
 
 class PatCmp<bits<6> op, bits<11> flags, string instr_asm> :
              TA<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c),
                 !strconcat(instr_asm, "   $dst, $b, $c"),
-                 [], IIAlu>;
+                 [], IIC_ALU>;
 
 //===----------------------------------------------------------------------===//
 // Memory Access Instructions
 //===----------------------------------------------------------------------===//
+
+let mayLoad = 1 in {
 class LoadM<bits<6> op, bits<11> flags, string instr_asm> :
             TA<op, flags, (outs GPR:$dst), (ins memrr:$addr),
                !strconcat(instr_asm, "   $dst, $addr"),
-               [], IILoad>;
+               [], IIC_MEMl>;
+}
 
 class LoadMI<bits<6> op, string instr_asm, PatFrag OpNode> :
              TB<op, (outs GPR:$dst), (ins memri:$addr),
                 !strconcat(instr_asm, "   $dst, $addr"),
-                [(set (i32 GPR:$dst), (OpNode iaddr:$addr))], IILoad>;
+                [(set (i32 GPR:$dst), (OpNode iaddr:$addr))], IIC_MEMl>;
 
+let mayStore = 1 in {
 class StoreM<bits<6> op, bits<11> flags, string instr_asm> :
              TA<op, flags, (outs), (ins GPR:$dst, memrr:$addr),
                 !strconcat(instr_asm, "   $dst, $addr"),
-                [], IIStore>;
+                [], IIC_MEMs>;
+}
 
 class StoreMI<bits<6> op, string instr_asm, PatFrag OpNode> :
               TB<op, (outs), (ins GPR:$dst, memri:$addr),
                  !strconcat(instr_asm, "   $dst, $addr"),
-                 [(OpNode (i32 GPR:$dst), iaddr:$addr)], IIStore>;
+                 [(OpNode (i32 GPR:$dst), iaddr:$addr)], IIC_MEMs>;
 
 //===----------------------------------------------------------------------===//
 // Branch Instructions
@@ -271,7 +276,7 @@ class StoreMI<bits<6> op, string instr_asm, PatFrag OpNode> :
 class Branch<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> :
              TA<op, flags, (outs), (ins GPR:$target),
                 !strconcat(instr_asm, "   $target"),
-                [], IIBranch> {
+                [], IIC_BR> {
   let rd = 0x0;
   let ra = br;
   let Form = FCCR;
@@ -280,7 +285,7 @@ class Branch<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> :
 class BranchI<bits<6> op, bits<5> br, string instr_asm> :
               TB<op, (outs), (ins brtarget:$target),
                  !strconcat(instr_asm, "   $target"),
-                 [], IIBranch> {
+                 [], IIC_BR> {
   let rd = 0;
   let ra = br;
   let Form = FCCI;
@@ -292,7 +297,7 @@ class BranchI<bits<6> op, bits<5> br, string instr_asm> :
 class BranchL<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> :
               TA<op, flags, (outs), (ins GPR:$link, GPR:$target, variable_ops),
                  !strconcat(instr_asm, "   $link, $target"),
-                 [], IIBranch> {
+                 [], IIC_BRl> {
   let ra = br;
   let Form = FRCR;
 }
@@ -300,7 +305,7 @@ class BranchL<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> :
 class BranchLI<bits<6> op, bits<5> br, string instr_asm> :
                TB<op, (outs), (ins GPR:$link, calltarget:$target, variable_ops),
                   !strconcat(instr_asm, "   $link, $target"),
-                  [], IIBranch> {
+                  [], IIC_BRl> {
   let ra = br;
   let Form = FRCI;
 }
@@ -312,7 +317,7 @@ class BranchC<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> :
               TA<op, flags, (outs),
                  (ins GPR:$a, GPR:$b),
                  !strconcat(instr_asm, "   $a, $b"),
-                 [], IIBranch> {
+                 [], IIC_BRc> {
   let rd = br;
   let Form = FCRR;
 }
@@ -320,7 +325,7 @@ class BranchC<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> :
 class BranchCI<bits<6> op, bits<5> br, string instr_asm> :
                TB<op, (outs), (ins GPR:$a, brtarget:$offset),
                   !strconcat(instr_asm, "   $a, $offset"),
-                  [], IIBranch> {
+                  [], IIC_BRc> {
   let rd = br;
   let Form = FCRI;
 }
@@ -330,71 +335,74 @@ class BranchCI<bits<6> op, bits<5> br, string instr_asm> :
 //===----------------------------------------------------------------------===//
 
 let isCommutable = 1, isAsCheapAsAMove = 1 in {
-  def ADDK   :  Arith<0x04, 0x000, "addk   ", add,  IIAlu>;
+  def ADDK   :  Arith<0x04, 0x000, "addk   ", add,  IIC_ALU>;
   def AND    :  Logic<0x21, 0x000, "and    ", and>;
   def OR     :  Logic<0x20, 0x000, "or     ", or>;
   def XOR    :  Logic<0x22, 0x000, "xor    ", xor>;
-  def PCMPBF : PatCmp<0x20, 0x400, "pcmpbf ">;
-  def PCMPEQ : PatCmp<0x22, 0x400, "pcmpeq ">;
-  def PCMPNE : PatCmp<0x23, 0x400, "pcmpne ">;
+
+  let Predicates=[HasPatCmp] in {
+    def PCMPBF : PatCmp<0x20, 0x400, "pcmpbf ">;
+    def PCMPEQ : PatCmp<0x22, 0x400, "pcmpeq ">;
+    def PCMPNE : PatCmp<0x23, 0x400, "pcmpne ">;
+  }
 
   let Defs = [CARRY] in {
-    def ADD    :  Arith<0x00, 0x000, "add    ", addc, IIAlu>;
+    def ADD    :  Arith<0x00, 0x000, "add    ", addc, IIC_ALU>;
 
     let Uses = [CARRY] in {
-      def ADDC   :  Arith<0x02, 0x000, "addc   ", adde, IIAlu>;
+      def ADDC   :  Arith<0x02, 0x000, "addc   ", adde, IIC_ALU>;
     }
   }
 
   let Uses = [CARRY] in {
-    def ADDKC  : ArithN<0x06, 0x000, "addkc  ", IIAlu>;
+    def ADDKC  : ArithN<0x06, 0x000, "addkc  ", IIC_ALU>;
   }
 }
 
 let isAsCheapAsAMove = 1 in {
-  def ANDN   :  ArithN<0x23, 0x000, "andn   ", IIAlu>;
-  def CMP    :  ArithN<0x05, 0x001, "cmp    ", IIAlu>;
-  def CMPU   :  ArithN<0x05, 0x003, "cmpu   ", IIAlu>;
-  def RSUBK  :  ArithR<0x05, 0x000, "rsubk  ", sub,  IIAlu>;
+  def ANDN   :  ArithN<0x23, 0x000, "andn   ", IIC_ALU>;
+  def CMP    :  ArithN<0x05, 0x001, "cmp    ", IIC_ALU>;
+  def CMPU   :  ArithN<0x05, 0x003, "cmpu   ", IIC_ALU>;
+  def RSUBK  :  ArithR<0x05, 0x000, "rsubk  ", sub,  IIC_ALU>;
 
   let Defs = [CARRY] in {
-    def RSUB   :  ArithR<0x01, 0x000, "rsub   ", subc, IIAlu>;
+    def RSUB   :  ArithR<0x01, 0x000, "rsub   ", subc, IIC_ALU>;
 
     let Uses = [CARRY] in {
-      def RSUBC  :  ArithR<0x03, 0x000, "rsubc  ", sube, IIAlu>;
+      def RSUBC  :  ArithR<0x03, 0x000, "rsubc  ", sube, IIC_ALU>;
     }
   }
 
   let Uses = [CARRY] in {
-    def RSUBKC : ArithRN<0x07, 0x000, "rsubkc ", IIAlu>;
+    def RSUBKC : ArithRN<0x07, 0x000, "rsubkc ", IIC_ALU>;
   }
 }
 
 let isCommutable = 1, Predicates=[HasMul] in {
-  def MUL    : Arith<0x10, 0x000, "mul    ", mul,   IIAlu>;
+  def MUL    : Arith<0x10, 0x000, "mul    ", mul,   IIC_ALUm>;
 }
 
 let isCommutable = 1, Predicates=[HasMul,HasMul64] in {
-  def MULH   : Arith<0x10, 0x001, "mulh   ", mulhs, IIAlu>;
-  def MULHU  : Arith<0x10, 0x003, "mulhu  ", mulhu, IIAlu>;
+  def MULH   : Arith<0x10, 0x001, "mulh   ", mulhs, IIC_ALUm>;
+  def MULHU  : Arith<0x10, 0x003, "mulhu  ", mulhu, IIC_ALUm>;
 }
 
 let Predicates=[HasMul,HasMul64] in {
-  def MULHSU : ArithN<0x10, 0x002, "mulhsu ", IIAlu>;
+  def MULHSU : ArithN<0x10, 0x002, "mulhsu ", IIC_ALUm>;
 }
 
 let Predicates=[HasBarrel] in {
-  def BSRL   :   Arith<0x11, 0x000, "bsrl   ", srl, IIAlu>;
-  def BSRA   :   Arith<0x11, 0x200, "bsra   ", sra, IIAlu>;
-  def BSLL   :   Arith<0x11, 0x400, "bsll   ", shl, IIAlu>;
+  def BSRL   :   Arith<0x11, 0x000, "bsrl   ", srl, IIC_SHT>;
+  def BSRA   :   Arith<0x11, 0x200, "bsra   ", sra, IIC_SHT>;
+  def BSLL   :   Arith<0x11, 0x400, "bsll   ", shl, IIC_SHT>;
   def BSRLI  :  ShiftI<0x19, 0x0, "bsrli  ", srl, uimm5, immZExt5>;
   def BSRAI  :  ShiftI<0x19, 0x1, "bsrai  ", sra, uimm5, immZExt5>;
   def BSLLI  :  ShiftI<0x19, 0x2, "bslli  ", shl, uimm5, immZExt5>;
 }
 
 let Predicates=[HasDiv] in {
-  def IDIV   :  ArithR<0x12, 0x000, "idiv   ", sdiv, IIAlu>;
-  def IDIVU  :  ArithR<0x12, 0x002, "idivu  ", udiv, IIAlu>;
+  def IDIV   :  ArithR<0x12, 0x000, "idiv   ", sdiv, IIC_ALUd>;
+  def IDIVU  :  ArithR<0x12, 0x002, "idivu  ", udiv, IIC_ALUd>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -552,7 +560,7 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1,
   def RTSD   : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm),
                   "rtsd      $target, $imm",
                   [],
-                  IIBranch>;
+                  IIC_BR>;
 }
 
 let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1,
@@ -560,7 +568,7 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1,
   def RTID   : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm),
                   "rtid      $target, $imm",
                   [],
-                  IIBranch>;
+                  IIC_BR>;
 }
 
 let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1,
@@ -568,7 +576,7 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1,
   def RTBD   : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm),
                   "rtbd      $target, $imm",
                   [],
-                  IIBranch>;
+                  IIC_BR>;
 }
 
 let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1,
@@ -576,7 +584,7 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1,
   def RTED   : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm),
                   "rted      $target, $imm",
                   [],
-                  IIBranch>;
+                  IIC_BR>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -584,7 +592,7 @@ let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1,
 //===----------------------------------------------------------------------===//
 
 let neverHasSideEffects = 1 in {
-  def NOP :  MBlazeInst< 0x20, FC, (outs), (ins), "nop    ", [], IIAlu>;
+  def NOP :  MBlazeInst< 0x20, FC, (outs), (ins), "nop    ", [], IIC_ALU>;
 }
 
 let usesCustomInserter = 1 in {
@@ -611,17 +619,17 @@ let usesCustomInserter = 1 in {
 
 let rb = 0 in {
   def SEXT16 : TA<0x24, 0x061, (outs GPR:$dst), (ins GPR:$src),
-                  "sext16    $dst, $src", [], IIAlu>;
+                  "sext16    $dst, $src", [], IIC_ALU>;
   def SEXT8  : TA<0x24, 0x060, (outs GPR:$dst), (ins GPR:$src),
-                  "sext8     $dst, $src", [], IIAlu>;
+                  "sext8     $dst, $src", [], IIC_ALU>;
   let Defs = [CARRY] in {
     def SRL    : TA<0x24, 0x041, (outs GPR:$dst), (ins GPR:$src),
-                    "srl       $dst, $src", [], IIAlu>;
+                    "srl       $dst, $src", [], IIC_ALU>;
     def SRA    : TA<0x24, 0x001, (outs GPR:$dst), (ins GPR:$src),
-                    "sra       $dst, $src", [], IIAlu>;
+                    "sra       $dst, $src", [], IIC_ALU>;
     let Uses = [CARRY] in {
       def SRC    : TA<0x24, 0x021, (outs GPR:$dst), (ins GPR:$src),
-                      "src       $dst, $src", [], IIAlu>;
+                      "src       $dst, $src", [], IIC_ALU>;
     }
   }
 }
@@ -637,36 +645,36 @@ let isCodeGenOnly=1 in {
 //===----------------------------------------------------------------------===//
 let Form=FRCS in {
   def MFS : SPC<0x25, 0x2, (outs GPR:$dst), (ins SPR:$src),
-                "mfs       $dst, $src", [], IIAlu>;
+                "mfs       $dst, $src", [], IIC_ALU>;
 }
 
 let Form=FCRCS in {
   def MTS : SPC<0x25, 0x3, (outs SPR:$dst), (ins GPR:$src),
-                "mts       $dst, $src", [], IIAlu>;
+                "mts       $dst, $src", [], IIC_ALU>;
 }
 
 def MSRSET : MSR<0x25, 0x20, (outs GPR:$dst), (ins uimm15:$set),
-                 "msrset    $dst, $set", [], IIAlu>;
+                 "msrset    $dst, $set", [], IIC_ALU>;
 
 def MSRCLR : MSR<0x25, 0x22, (outs GPR:$dst), (ins uimm15:$clr),
-                 "msrclr    $dst, $clr", [], IIAlu>;
+                 "msrclr    $dst, $clr", [], IIC_ALU>;
 
 let rd=0x0, Form=FCRR in {
   def WDC  : TA<0x24, 0x64, (outs), (ins GPR:$a, GPR:$b),
-                "wdc       $a, $b", [], IIAlu>;
+                "wdc       $a, $b", [], IIC_WDC>;
   def WDCF : TA<0x24, 0x74, (outs), (ins GPR:$a, GPR:$b),
-                "wdc.flush $a, $b", [], IIAlu>;
+                "wdc.flush $a, $b", [], IIC_WDC>;
   def WDCC : TA<0x24, 0x66, (outs), (ins GPR:$a, GPR:$b),
-                "wdc.clear $a, $b", [], IIAlu>;
+                "wdc.clear $a, $b", [], IIC_WDC>;
   def WIC  : TA<0x24, 0x68, (outs), (ins GPR:$a, GPR:$b),
-                "wic       $a, $b", [], IIAlu>;
+                "wic       $a, $b", [], IIC_WDC>;
 }
 
 def BRK  :  BranchL<0x26, 0x0C, 0x000, "brk    ">;
 def BRKI : BranchLI<0x2E, 0x0C, "brki   ">;
 
 def IMM : MBlazeInst<0x2C, FCCI, (outs), (ins simm16:$imm),
-                     "imm       $imm", [], IIAlu>;
+                     "imm       $imm", [], IIC_ALU>;
 
 //===----------------------------------------------------------------------===//
 // Pseudo instructions for atomic operations
@@ -848,11 +856,6 @@ def : Pat<(MBWrapper tconstpool:$in),  (ORI (i32 R0), tconstpool:$in)>;
 // Misc instructions
 def : Pat<(and (i32 GPR:$lh), (not (i32 GPR:$rh))),(ANDN GPR:$lh, GPR:$rh)>;
 
-// Arithmetic with immediates
-def : Pat<(add (i32 GPR:$in), imm:$imm),(ADDIK GPR:$in, imm:$imm)>;
-def : Pat<(or (i32 GPR:$in), imm:$imm),(ORI GPR:$in, imm:$imm)>;
-def : Pat<(xor (i32 GPR:$in), imm:$imm),(XORI GPR:$in, imm:$imm)>;
-
 // Convert any extend loads into zero extend loads
 def : Pat<(extloadi8  iaddr:$src), (i32 (LBUI iaddr:$src))>;
 def : Pat<(extloadi16 iaddr:$src), (i32 (LHUI iaddr:$src))>;
diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
index fa9140d..517279f 100644
--- a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
+++ b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
@@ -181,6 +181,26 @@ unsigned MBlazeRegisterInfo::getSpecialRegisterFromNumbering(unsigned Reg) {
   return 0; // Not reached
 }
 
+bool MBlazeRegisterInfo::isRegister(unsigned Reg) {
+  return Reg <= 31;
+}
+
+bool MBlazeRegisterInfo::isSpecialRegister(unsigned Reg) {
+  switch (Reg) {
+    case 0x0000 : case 0x0001 : case 0x0003 : case 0x0005 : 
+    case 0x0007 : case 0x000B : case 0x000D : case 0x1000 : 
+    case 0x1001 : case 0x1002 : case 0x1003 : case 0x1004 : 
+    case 0x2000 : case 0x2001 : case 0x2002 : case 0x2003 : 
+    case 0x2004 : case 0x2005 : case 0x2006 : case 0x2007 : 
+    case 0x2008 : case 0x2009 : case 0x200A : case 0x200B : 
+      return true;
+
+    default:
+      return false;
+  }
+  return false; // Not reached
+}
+
 unsigned MBlazeRegisterInfo::getPICCallReg() {
   return MBlaze::R20;
 }
@@ -336,5 +356,9 @@ int MBlazeRegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const {
   return MBlazeGenRegisterInfo::getDwarfRegNumFull(RegNo,0);
 }
 
+int MBlazeRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
+  return MBlazeGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
+}
+
 #include "MBlazeGenRegisterInfo.inc"
 
diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.h b/lib/Target/MBlaze/MBlazeRegisterInfo.h
index 839536d..3807839 100644
--- a/lib/Target/MBlaze/MBlazeRegisterInfo.h
+++ b/lib/Target/MBlaze/MBlazeRegisterInfo.h
@@ -45,6 +45,8 @@ struct MBlazeRegisterInfo : public MBlazeGenRegisterInfo {
   static unsigned getRegisterNumbering(unsigned RegEnum);
   static unsigned getRegisterFromNumbering(unsigned RegEnum);
   static unsigned getSpecialRegisterFromNumbering(unsigned RegEnum);
+  static bool isRegister(unsigned RegEnum);
+  static bool isSpecialRegister(unsigned RegEnum);
 
   /// Get PIC indirect call register
   static unsigned getPICCallReg();
@@ -73,6 +75,7 @@ struct MBlazeRegisterInfo : public MBlazeGenRegisterInfo {
   unsigned getEHHandlerRegister() const;
 
   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.td b/lib/Target/MBlaze/MBlazeRegisterInfo.td
index fbefb22..13c46ba 100644
--- a/lib/Target/MBlaze/MBlazeRegisterInfo.td
+++ b/lib/Target/MBlaze/MBlazeRegisterInfo.td
@@ -43,7 +43,7 @@ let Namespace = "MBlaze" in {
   def R1  : MBlazeGPRReg< 1,  "r1">,   DwarfRegNum<[1]>;
   def R2  : MBlazeGPRReg< 2,  "r2">,   DwarfRegNum<[2]>;
   def R3  : MBlazeGPRReg< 3,  "r3">,   DwarfRegNum<[3]>;
-  def R4  : MBlazeGPRReg< 4,  "r4">,   DwarfRegNum<[5]>;
+  def R4  : MBlazeGPRReg< 4,  "r4">,   DwarfRegNum<[4]>;
   def R5  : MBlazeGPRReg< 5,  "r5">,   DwarfRegNum<[5]>;
   def R6  : MBlazeGPRReg< 6,  "r6">,   DwarfRegNum<[6]>;
   def R7  : MBlazeGPRReg< 7,  "r7">,   DwarfRegNum<[7]>;
@@ -85,67 +85,33 @@ let Namespace = "MBlaze" in {
   def RTLBX  : MBlazeSPRReg<0x1002, "rtlbx">,  DwarfRegNum<[41]>;
   def RTLBLO : MBlazeSPRReg<0x1003, "rtlblo">, DwarfRegNum<[42]>;
   def RTLBHI : MBlazeSPRReg<0x1004, "rtlbhi">, DwarfRegNum<[43]>;
-  def RPVR0  : MBlazeSPRReg<0x2000, "rpvr0">,  DwarfRegNum<[44]>;
-  def RPVR1  : MBlazeSPRReg<0x2001, "rpvr1">,  DwarfRegNum<[45]>;
-  def RPVR2  : MBlazeSPRReg<0x2002, "rpvr2">,  DwarfRegNum<[46]>;
-  def RPVR3  : MBlazeSPRReg<0x2003, "rpvr3">,  DwarfRegNum<[47]>;
-  def RPVR4  : MBlazeSPRReg<0x2004, "rpvr4">,  DwarfRegNum<[48]>;
-  def RPVR5  : MBlazeSPRReg<0x2005, "rpvr5">,  DwarfRegNum<[49]>;
-  def RPVR6  : MBlazeSPRReg<0x2006, "rpvr6">,  DwarfRegNum<[50]>;
-  def RPVR7  : MBlazeSPRReg<0x2007, "rpvr7">,  DwarfRegNum<[51]>;
-  def RPVR8  : MBlazeSPRReg<0x2008, "rpvr8">,  DwarfRegNum<[52]>;
-  def RPVR9  : MBlazeSPRReg<0x2009, "rpvr9">,  DwarfRegNum<[53]>;
-  def RPVR10 : MBlazeSPRReg<0x200A, "rpvr10">, DwarfRegNum<[54]>;
-  def RPVR11 : MBlazeSPRReg<0x200B, "rpvr11">, DwarfRegNum<[55]>;
+  def RTLBSX : MBlazeSPRReg<0x1004, "rtlbsx">, DwarfRegNum<[44]>;
+  def RPVR0  : MBlazeSPRReg<0x2000, "rpvr0">,  DwarfRegNum<[45]>;
+  def RPVR1  : MBlazeSPRReg<0x2001, "rpvr1">,  DwarfRegNum<[46]>;
+  def RPVR2  : MBlazeSPRReg<0x2002, "rpvr2">,  DwarfRegNum<[47]>;
+  def RPVR3  : MBlazeSPRReg<0x2003, "rpvr3">,  DwarfRegNum<[48]>;
+  def RPVR4  : MBlazeSPRReg<0x2004, "rpvr4">,  DwarfRegNum<[49]>;
+  def RPVR5  : MBlazeSPRReg<0x2005, "rpvr5">,  DwarfRegNum<[50]>;
+  def RPVR6  : MBlazeSPRReg<0x2006, "rpvr6">,  DwarfRegNum<[51]>;
+  def RPVR7  : MBlazeSPRReg<0x2007, "rpvr7">,  DwarfRegNum<[52]>;
+  def RPVR8  : MBlazeSPRReg<0x2008, "rpvr8">,  DwarfRegNum<[53]>;
+  def RPVR9  : MBlazeSPRReg<0x2009, "rpvr9">,  DwarfRegNum<[54]>;
+  def RPVR10 : MBlazeSPRReg<0x200A, "rpvr10">, DwarfRegNum<[55]>;
+  def RPVR11 : MBlazeSPRReg<0x200B, "rpvr11">, DwarfRegNum<[56]>;
 
   // The carry bit. In the Microblaze this is really bit 29 of the
   // MSR register but this is the only bit of that register that we
   // are interested in modeling.
-  def CARRY  : MBlazeSPRReg<0x0000, "rmsr[c]">, DwarfRegNum<[33]>;
+  def CARRY  : MBlazeSPRReg<0x0000, "rmsr[c]">;
 }
 
 //===----------------------------------------------------------------------===//
 // Register Classes
 //===----------------------------------------------------------------------===//
 
-def GPR : RegisterClass<"MBlaze", [i32,f32], 32,
-  [
-  // Return Values and Arguments
-  R3, R4, R5, R6, R7, R8, R9, R10,
+def GPR : RegisterClass<"MBlaze", [i32,f32], 32, (sequence "R%u", 0, 31)>;
 
-  // Not preserved across procedure calls
-  R11, R12,
-
-  // Callee save
-  R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
-
-  // Reserved
-  R0,  // Always zero
-  R1,  // The stack pointer
-  R2,  // Read-only small data area anchor
-  R13, // Read-write small data area anchor
-  R14, // Return address for interrupts
-  R15, // Return address for sub-routines
-  R16, // Return address for trap
-  R17, // Return address for exceptions
-  R18, // Reserved for assembler
-  R19  // The frame-pointer
-  ]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GPRClass::iterator
-    GPRClass::allocation_order_end(const MachineFunction &MF) const {
-      // The last 10 registers on the list above are reserved
-      return end()-10;
-    }
-  }];
-}
-
-def SPR : RegisterClass<"MBlaze", [i32], 32,
-  [
+def SPR : RegisterClass<"MBlaze", [i32], 32, (add
   // Reserved
   RPC,
   RMSR,
@@ -171,20 +137,12 @@ def SPR : RegisterClass<"MBlaze", [i32], 32,
   RPVR9,
   RPVR10,
   RPVR11
-  ]>
+  )>
 {
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    SPRClass::iterator
-    SPRClass::allocation_order_end(const MachineFunction &MF) const {
-      // None of the special purpose registers are allocatable.
-      return end()-24;
-    }
-  }];
+  // None of the special purpose registers are allocatable.
+  let isAllocatable = 0;
 }
 
-def CRC : RegisterClass<"MBlaze", [i32], 32, [CARRY]> {
+def CRC : RegisterClass<"MBlaze", [i32], 32, (add CARRY)> {
   let CopyCost = -1;
 }
diff --git a/lib/Target/MBlaze/MBlazeSchedule.td b/lib/Target/MBlaze/MBlazeSchedule.td
index ac4d98c..4662f25 100644
--- a/lib/Target/MBlaze/MBlazeSchedule.td
+++ b/lib/Target/MBlaze/MBlazeSchedule.td
@@ -8,57 +8,48 @@
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
-// Functional units across MBlaze chips sets. Based on GCC/MBlaze backend files.
+// MBlaze functional units.
 //===----------------------------------------------------------------------===//
-def ALU     : FuncUnit;
-def IMULDIV : FuncUnit;
+def IF : FuncUnit;
+def ID : FuncUnit;
+def EX : FuncUnit;
+def MA : FuncUnit;
+def WB : FuncUnit;
 
 //===----------------------------------------------------------------------===//
 // Instruction Itinerary classes used for MBlaze
 //===----------------------------------------------------------------------===//
-def IIAlu              : InstrItinClass;
-def IILoad             : InstrItinClass;
-def IIStore            : InstrItinClass;
-def IIXfer             : InstrItinClass;
-def IIBranch           : InstrItinClass;
-def IIHiLo             : InstrItinClass;
-def IIImul             : InstrItinClass;
-def IIIdiv             : InstrItinClass;
-def IIFcvt             : InstrItinClass;
-def IIFmove            : InstrItinClass;
-def IIFcmp             : InstrItinClass;
-def IIFadd             : InstrItinClass;
-def IIFmulSingle       : InstrItinClass;
-def IIFmulDouble       : InstrItinClass;
-def IIFdivSingle       : InstrItinClass;
-def IIFdivDouble       : InstrItinClass;
-def IIFsqrtSingle      : InstrItinClass;
-def IIFsqrtDouble      : InstrItinClass;
-def IIFrecipFsqrtStep  : InstrItinClass;
-def IIPseudo           : InstrItinClass;
+def IIC_ALU    : InstrItinClass;
+def IIC_ALUm   : InstrItinClass;
+def IIC_ALUd   : InstrItinClass;
+def IIC_SHT    : InstrItinClass;
+def IIC_FSLg   : InstrItinClass;
+def IIC_FSLp   : InstrItinClass;
+def IIC_MEMs   : InstrItinClass;
+def IIC_MEMl   : InstrItinClass;
+def IIC_FPU    : InstrItinClass;
+def IIC_FPUd   : InstrItinClass;
+def IIC_FPUf   : InstrItinClass;
+def IIC_FPUi   : InstrItinClass;
+def IIC_FPUs   : InstrItinClass;
+def IIC_FPUc   : InstrItinClass;
+def IIC_BR     : InstrItinClass;
+def IIC_BRc    : InstrItinClass;
+def IIC_BRl    : InstrItinClass;
+def IIC_WDC    : InstrItinClass;
+def IIC_Pseudo : InstrItinClass;
 
 //===----------------------------------------------------------------------===//
-// MBlaze Generic instruction itineraries.
+// MBlaze generic instruction itineraries.
 //===----------------------------------------------------------------------===//
-def MBlazeGenericItineraries : ProcessorItineraries<
-  [ALU, IMULDIV], [], [
-  InstrItinData<IIAlu              , [InstrStage<1,  [ALU]>]>,
-  InstrItinData<IILoad             , [InstrStage<3,  [ALU]>]>,
-  InstrItinData<IIStore            , [InstrStage<1,  [ALU]>]>,
-  InstrItinData<IIXfer             , [InstrStage<2,  [ALU]>]>,
-  InstrItinData<IIBranch           , [InstrStage<1,  [ALU]>]>,
-  InstrItinData<IIHiLo             , [InstrStage<1,  [IMULDIV]>]>,
-  InstrItinData<IIImul             , [InstrStage<17, [IMULDIV]>]>,
-  InstrItinData<IIIdiv             , [InstrStage<38, [IMULDIV]>]>,
-  InstrItinData<IIFcvt             , [InstrStage<1,  [ALU]>]>,
-  InstrItinData<IIFmove            , [InstrStage<2,  [ALU]>]>,
-  InstrItinData<IIFcmp             , [InstrStage<3,  [ALU]>]>,
-  InstrItinData<IIFadd             , [InstrStage<4,  [ALU]>]>,
-  InstrItinData<IIFmulSingle       , [InstrStage<7,  [ALU]>]>,
-  InstrItinData<IIFmulDouble       , [InstrStage<8,  [ALU]>]>,
-  InstrItinData<IIFdivSingle       , [InstrStage<23, [ALU]>]>,
-  InstrItinData<IIFdivDouble       , [InstrStage<36, [ALU]>]>,
-  InstrItinData<IIFsqrtSingle      , [InstrStage<54, [ALU]>]>,
-  InstrItinData<IIFsqrtDouble      , [InstrStage<12, [ALU]>]>,
-  InstrItinData<IIFrecipFsqrtStep  , [InstrStage<5,  [ALU]>]>
-]>;
+def MBlazeGenericItineraries : ProcessorItineraries<[], [], []>;
+
+//===----------------------------------------------------------------------===//
+// MBlaze instruction itineraries for three stage pipeline.
+//===----------------------------------------------------------------------===//
+include "MBlazeSchedule3.td"
+
+//===----------------------------------------------------------------------===//
+// MBlaze instruction itineraries for five stage pipeline.
+//===----------------------------------------------------------------------===//
+include "MBlazeSchedule5.td"
diff --git a/lib/Target/MBlaze/MBlazeSchedule3.td b/lib/Target/MBlaze/MBlazeSchedule3.td
new file mode 100644
index 0000000..ccbf99d
--- /dev/null
+++ b/lib/Target/MBlaze/MBlazeSchedule3.td
@@ -0,0 +1,236 @@
+//===- MBlazeSchedule3.td - MBlaze Scheduling Definitions --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MBlaze instruction itineraries for the three stage pipeline.
+//===----------------------------------------------------------------------===//
+def MBlazePipe3Itineraries : ProcessorItineraries<
+  [IF,ID,EX], [], [
+
+  // ALU instruction with one destination register and either two register
+  // source operands or one register source operand and one immediate operand.
+  // The instruction takes one cycle to execute in each of the stages. The
+  // two source operands are read during the decode stage and the result is
+  // ready after the execute stage.
+  InstrItinData< IIC_ALU,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>], // one cycle in execute stage
+               [ 2                    // result ready after two cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // ALU multiply instruction with one destination register and either two
+  // register source operands or one register source operand and one immediate
+  // operand.  The instruction takes one cycle to execute in each of the
+  // pipeline stages except the execute stage, which takes three cycles. The
+  // two source operands are read during the decode stage and the result is
+  // ready after the execute stage.
+  InstrItinData< IIC_ALUm,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<3,[EX]>], // three cycles in execute stage
+               [ 4                    // result ready after four cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // ALU divide instruction with one destination register two register source
+  // operands. The instruction takes one cycle to execute in each the pipeline
+  // stages except the execute stage, which takes 34 cycles. The two
+  // source operands are read during the decode stage and the result is ready
+  // after the execute stage.
+  InstrItinData< IIC_ALUd,
+               [ InstrStage<1,[IF]>    // one cycle in fetch stage
+               , InstrStage<1,[ID]>    // one cycle in decode stage
+               , InstrStage<34,[EX]>], // 34 cycles in execute stage
+               [ 35                    // result ready after 35 cycles
+               , 1                     // first operand read after one cycle
+               , 1 ]>,                 // second operand read after one cycle
+
+  // Shift instruction with one destination register and either two register
+  // source operands or one register source operand and one immediate operand.
+  // The instruction takes one cycle to execute in each of the pipeline stages
+  // except the execute stage, which takes two cycles.  The two source operands
+  // are read during the decode stage and the result is ready after the execute
+  // stage.
+  InstrItinData< IIC_SHT,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<2,[EX]>], // two cycles in execute stage
+               [ 3                    // result ready after three cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Branch instruction with one source operand register. The instruction takes
+  // one cycle to execute in each of the pipeline stages. The source operand is
+  // read during the decode stage.
+  InstrItinData< IIC_BR,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>], // one cycle in execute stage
+               [ 1 ]>,                // first operand read after one cycle
+
+  // Conditional branch instruction with two source operand registers. The
+  // instruction takes one cycle to execute in each of the pipeline stages. The
+  // two source operands are read during the decode stage.
+  InstrItinData< IIC_BRc,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>], // one cycle in execute stage
+               [ 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Branch and link instruction with one destination register and one source
+  // operand register. The instruction takes one cycle to execute in each of
+  // the pipeline stages. The source operand is read during the decode stage
+  // and the destination register is ready after the execute stage.
+  InstrItinData< IIC_BRl,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>], // one cycle in execute stage
+               [ 2                    // result ready after two cycles
+               , 1 ]>,                // first operand read after one cycle
+
+  // Cache control instruction with two source operand registers. The
+  // instruction takes one cycle to execute in each of the pipeline stages
+  // except the memory access stage, which takes two cycles. The source
+  // operands are read during the decode stage.
+  InstrItinData< IIC_WDC,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<2,[EX]>], // two cycles in execute stage
+               [ 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Floating point instruction with one destination register and two source
+  // operand registers. The instruction takes one cycle to execute in each of
+  // the pipeline stages except the execute stage, which takes six cycles. The
+  // source operands are read during the decode stage and the results are ready
+  // after the execute stage.
+  InstrItinData< IIC_FPU,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<6,[EX]>], // six cycles in execute stage
+               [ 7                    // result ready after seven cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Floating point divide instruction with one destination register and two
+  // source operand registers. The instruction takes one cycle to execute in
+  // each of the pipeline stages except the execute stage, which takes 30
+  // cycles. The source operands are read during the decode stage and the
+  // results are ready after the execute stage.
+  InstrItinData< IIC_FPUd,
+               [ InstrStage<1,[IF]>    // one cycle in fetch stage
+               , InstrStage<1,[ID]>    // one cycle in decode stage
+               , InstrStage<30,[EX]>], // one cycle in execute stage
+               [ 31                    // result ready after 31 cycles
+               , 1                     // first operand read after one cycle
+               , 1 ]>,                 // second operand read after one cycle
+
+  // Convert floating point to integer instruction with one destination
+  // register and one source operand register. The instruction takes one cycle
+  // to execute in each of the pipeline stages except the execute stage,
+  // which takes seven cycles. The source operands are read during the decode
+  // stage and the results are ready after the execute stage.
+  InstrItinData< IIC_FPUi,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<7,[EX]>], // seven cycles in execute stage
+               [ 8                    // result ready after eight cycles
+               , 1 ]>,                // first operand read after one cycle
+
+  // Convert integer to floating point instruction with one destination
+  // register and one source operand register. The instruction takes one cycle
+  // to execute in each of the pipeline stages except the execute stage,
+  // which takes six cycles. The source operands are read during the decode
+  // stage and the results are ready after the execute stage.
+  InstrItinData< IIC_FPUf,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<6,[EX]>], // six cycles in execute stage
+               [ 7                    // result ready after seven cycles
+               , 1 ]>,                // first operand read after one cycle
+
+  // Floating point square root instruction with one destination register and
+  // one source operand register. The instruction takes one cycle to execute in
+  // each of the pipeline stages except the execute stage, which takes 29
+  // cycles. The source operands are read during the decode stage and the
+  // results are ready after the execute stage.
+  InstrItinData< IIC_FPUs,
+               [ InstrStage<1,[IF]>    // one cycle in fetch stage
+               , InstrStage<1,[ID]>    // one cycle in decode stage
+               , InstrStage<29,[EX]>], // 29 cycles in execute stage
+               [ 30                    // result ready after 30 cycles
+               , 1 ]>,                 // first operand read after one cycle
+
+  // Floating point comparison instruction with one destination register and
+  // two source operand registers. The instruction takes one cycle to execute
+  // in each of the pipeline stages except the execute stage, which takes three
+  // cycles. The source operands are read during the decode stage and the
+  // results are ready after the execute stage.
+  InstrItinData< IIC_FPUc,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<3,[EX]>], // three cycles in execute stage
+               [ 4                    // result ready after four cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // FSL get instruction with one register or immediate source operand and one
+  // destination register. The instruction takes one cycle to execute in each
+  // of the pipeline stages except the execute stage, which takes two cycles.
+  // The one source operand is read during the decode stage and the result is
+  // ready after the execute stage.
+  InstrItinData< IIC_FSLg,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<2,[EX]>], // two cycles in execute stage
+               [ 3                    // result ready after two cycles
+               , 1 ]>,                // first operand read after one cycle
+
+  // FSL put instruction with either two register source operands or one
+  // register source operand and one immediate operand. There is no result
+  // produced by the instruction. The instruction takes one cycle to execute in
+  // each of the pipeline stages except the execute stage, which takes two
+  // cycles. The two source operands are read during the decode stage.
+  InstrItinData< IIC_FSLp,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<2,[EX]>], // two cycles in execute stage
+               [ 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Memory store instruction with either three register source operands or two
+  // register source operands and one immediate operand. There is no result
+  // produced by the instruction. The instruction takes one cycle to execute in
+  // each of the pipeline stages except the execute stage, which takes two
+  // cycles. All of the source operands are read during the decode stage.
+  InstrItinData< IIC_MEMs,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<2,[EX]>], // two cycles in execute stage
+               [ 1                    // first operand read after one cycle
+               , 1                    // second operand read after one cycle
+               , 1 ]>,                // third operand read after one cycle
+
+  // Memory load instruction with one destination register and either two
+  // register source operands or one register source operand and one immediate
+  // operand. The instruction takes one cycle to execute in each of the
+  // pipeline stages except the execute stage, which takes two cycles. All of
+  // the source operands are read during the decode stage and the result is
+  // ready after the execute stage.
+  InstrItinData< IIC_MEMl,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<2,[EX]>], // two cycles in execute stage
+               [ 3                    // result ready after four cycles
+               , 1                    // second operand read after one cycle
+               , 1 ]>                 // third operand read after one cycle
+]>;
diff --git a/lib/Target/MBlaze/MBlazeSchedule5.td b/lib/Target/MBlaze/MBlazeSchedule5.td
new file mode 100644
index 0000000..fa88766
--- /dev/null
+++ b/lib/Target/MBlaze/MBlazeSchedule5.td
@@ -0,0 +1,267 @@
+//===- MBlazeSchedule5.td - MBlaze Scheduling Definitions --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MBlaze instruction itineraries for the five stage pipeline.
+//===----------------------------------------------------------------------===//
+def MBlazePipe5Itineraries : ProcessorItineraries<
+  [IF,ID,EX,MA,WB], [], [
+
+  // ALU instruction with one destination register and either two register
+  // source operands or one register source operand and one immediate operand.
+  // The instruction takes one cycle to execute in each of the stages. The
+  // two source operands are read during the decode stage and the result is
+  // ready after the execute stage.
+  InstrItinData< IIC_ALU,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 2                    // result ready after two cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // ALU multiply instruction with one destination register and either two
+  // register source operands or one register source operand and one immediate
+  // operand.  The instruction takes one cycle to execute in each of the
+  // pipeline stages. The two source operands are read during the decode stage
+  // and the result is ready after the execute stage.
+  InstrItinData< IIC_ALUm,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 2                    // result ready after two cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // ALU divide instruction with one destination register two register source
+  // operands. The instruction takes one cycle to execute in each the pipeline
+  // stages except the memory access stage, which takes 31 cycles. The two
+  // source operands are read during the decode stage and the result is ready
+  // after the memory access stage.
+  InstrItinData< IIC_ALUd,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<31,[MA]>  // 31 cycles in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 33                   // result ready after 33 cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Shift instruction with one destination register and either two register
+  // source operands or one register source operand and one immediate operand.
+  // The instruction takes one cycle to execute in each of the pipeline stages.
+  // The two source operands are read during the decode stage and the result is
+  // ready after the memory access stage.
+  InstrItinData< IIC_SHT,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 3                    // result ready after three cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Branch instruction with one source operand register. The instruction takes
+  // one cycle to execute in each of the pipeline stages. The source operand is
+  // read during the decode stage.
+  InstrItinData< IIC_BR,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 1 ]>,                // first operand read after one cycle
+
+  // Conditional branch instruction with two source operand registers. The
+  // instruction takes one cycle to execute in each of the pipeline stages. The
+  // two source operands are read during the decode stage.
+  InstrItinData< IIC_BRc,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Branch and link instruction with one destination register and one source
+  // operand register. The instruction takes one cycle to execute in each of
+  // the pipeline stages. The source operand is read during the decode stage
+  // and the destination register is ready after the writeback stage.
+  InstrItinData< IIC_BRl,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 4                    // result ready after four cycles
+               , 1 ]>,                // first operand read after one cycle
+
+  // Cache control instruction with two source operand registers. The
+  // instruction takes one cycle to execute in each of the pipeline stages
+  // except the memory access stage, which takes two cycles. The source
+  // operands are read during the decode stage.
+  InstrItinData< IIC_WDC,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<2,[MA]>   // two cycles in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Floating point instruction with one destination register and two source
+  // operand registers. The instruction takes one cycle to execute in each of
+  // the pipeline stages except the memory access stage, which takes two
+  // cycles. The source operands are read during the decode stage and the
+  // results are ready after the writeback stage.
+  InstrItinData< IIC_FPU,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<2,[MA]>   // two cycles in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 5                    // result ready after five cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Floating point divide instruction with one destination register and two
+  // source operand registers. The instruction takes one cycle to execute in
+  // each of the pipeline stages except the memory access stage, which takes 26
+  // cycles. The source operands are read during the decode stage and the
+  // results are ready after the writeback stage.
+  InstrItinData< IIC_FPUd,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<26,[MA]>  // 26 cycles in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 29                   // result ready after 29 cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Convert floating point to integer instruction with one destination
+  // register and one source operand register. The instruction takes one cycle
+  // to execute in each of the pipeline stages except the memory access stage,
+  // which takes three cycles. The source operands are read during the decode
+  // stage and the results are ready after the writeback stage.
+  InstrItinData< IIC_FPUi,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<3,[MA]>   // three cycles in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 6                   // result ready after six cycles
+               , 1 ]>,                // first operand read after one cycle
+
+  // Convert integer to floating point instruction with one destination
+  // register and one source operand register. The instruction takes one cycle
+  // to execute in each of the pipeline stages except the memory access stage,
+  // which takes two cycles. The source operands are read during the decode
+  // stage and the results are ready after the writeback stage.
+  InstrItinData< IIC_FPUf,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<2,[MA]>   // two cycles in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 5                    // result ready after five cycles
+               , 1 ]>,                // first operand read after one cycle
+
+  // Floating point square root instruction with one destination register and
+  // one source operand register. The instruction takes one cycle to execute in
+  // each of the pipeline stages except the memory access stage, which takes 25
+  // cycles. The source operands are read during the decode stage and the
+  // results are ready after the writeback stage.
+  InstrItinData< IIC_FPUs,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<25,[MA]>  // 25 cycles in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 28                   // result ready after 28 cycles
+               , 1 ]>,                // first operand read after one cycle
+
+  // Floating point comparison instruction with one destination register and
+  // two source operand registers. The instruction takes one cycle to execute
+  // in each of the pipeline stages. The source operands are read during the
+  // decode stage and the results are ready after the execute stage.
+  InstrItinData< IIC_FPUc,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 2                    // result ready after two cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // FSL get instruction with one register or immediate source operand and one
+  // destination register. The instruction takes one cycle to execute in each
+  // of the pipeline stages. The one source operand is read during the decode
+  // stage and the result is ready after the execute stage.
+  InstrItinData< IIC_FSLg,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 2                    // result ready after two cycles
+               , 1 ]>,                // first operand read after one cycle
+
+  // FSL put instruction with either two register source operands or one
+  // register source operand and one immediate operand. There is no result
+  // produced by the instruction. The instruction takes one cycle to execute in
+  // each of the pipeline stages. The two source operands are read during the
+  // decode stage.
+  InstrItinData< IIC_FSLp,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Memory store instruction with either three register source operands or two
+  // register source operands and one immediate operand. There is no result
+  // produced by the instruction. The instruction takes one cycle to execute in
+  // each of the pipeline stages. All of the source operands are read during
+  // the decode stage.
+  InstrItinData< IIC_MEMs,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 1                    // first operand read after one cycle
+               , 1                    // second operand read after one cycle
+               , 1 ]>,                // third operand read after one cycle
+
+  // Memory load instruction with one destination register and either two
+  // register source operands or one register source operand and one immediate
+  // operand. The instruction takes one cycle to execute in each of the
+  // pipeline stages. All of the source operands are read during the decode
+  // stage and the result is ready after the writeback stage.
+  InstrItinData< IIC_MEMl,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 4                    // result ready after four cycles
+               , 1                    // second operand read after one cycle
+               , 1 ]>                 // third operand read after one cycle
+]>;
diff --git a/lib/Target/MBlaze/MBlazeSubtarget.cpp b/lib/Target/MBlaze/MBlazeSubtarget.cpp
index 3440521..a80744a 100644
--- a/lib/Target/MBlaze/MBlazeSubtarget.cpp
+++ b/lib/Target/MBlaze/MBlazeSubtarget.cpp
@@ -13,19 +13,39 @@
 
 #include "MBlazeSubtarget.h"
 #include "MBlaze.h"
+#include "MBlazeRegisterInfo.h"
 #include "MBlazeGenSubtarget.inc"
 #include "llvm/Support/CommandLine.h"
 using namespace llvm;
 
 MBlazeSubtarget::MBlazeSubtarget(const std::string &TT, const std::string &FS):
-  HasPipe3(false), HasBarrel(false), HasDiv(false), HasMul(false),
-  HasFSL(false), HasEFSL(false), HasMSRSet(false), HasException(false),
-  HasPatCmp(false), HasFPU(false), HasESR(false), HasPVR(false),
-  HasMul64(false), HasSqrt(false), HasMMU(false)
+  HasBarrel(false), HasDiv(false), HasMul(false), HasPatCmp(false),
+  HasFPU(false), HasMul64(false), HasSqrt(false)
 {
-  std::string CPU = "v400";
-  MBlazeArchVersion = V400;
-
   // Parse features string.
-  ParseSubtargetFeatures(FS, CPU);
+  std::string CPU = "mblaze";
+  CPU = ParseSubtargetFeatures(FS, CPU);
+
+  // Only use instruction scheduling if the selected CPU has an instruction
+  // itinerary (the default CPU is the only one that doesn't).
+  HasItin = CPU != "mblaze";
+  DEBUG(dbgs() << "CPU " << CPU << "(" << HasItin << ")\n");
+
+  // Compute the issue width of the MBlaze itineraries
+  computeIssueWidth();
+}
+
+void MBlazeSubtarget::computeIssueWidth() {
+  InstrItins.IssueWidth = 1;
+}
+
+bool MBlazeSubtarget::
+enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                      TargetSubtarget::AntiDepBreakMode& Mode,
+                      RegClassVector& CriticalPathRCs) const {
+  Mode = TargetSubtarget::ANTIDEP_CRITICAL;
+  CriticalPathRCs.clear();
+  CriticalPathRCs.push_back(&MBlaze::GPRRegClass);
+  return HasItin && OptLevel >= CodeGenOpt::Default;
 }
+
diff --git a/lib/Target/MBlaze/MBlazeSubtarget.h b/lib/Target/MBlaze/MBlazeSubtarget.h
index bebb3f7..2255b28 100644
--- a/lib/Target/MBlaze/MBlazeSubtarget.h
+++ b/lib/Target/MBlaze/MBlazeSubtarget.h
@@ -24,29 +24,14 @@ namespace llvm {
 class MBlazeSubtarget : public TargetSubtarget {
 
 protected:
-
-  enum MBlazeArchEnum {
-    V400, V500, V600, V700, V710
-  };
-
-  // MBlaze architecture version
-  MBlazeArchEnum MBlazeArchVersion;
-
-  bool HasPipe3;
   bool HasBarrel;
   bool HasDiv;
   bool HasMul;
-  bool HasFSL;
-  bool HasEFSL;
-  bool HasMSRSet;
-  bool HasException;
   bool HasPatCmp;
   bool HasFPU;
-  bool HasESR;
-  bool HasPVR;
   bool HasMul64;
   bool HasSqrt;
-  bool HasMMU;
+  bool HasItin;
 
   InstrItineraryData InstrItins;
 
@@ -61,18 +46,26 @@ public:
   std::string ParseSubtargetFeatures(const std::string &FS,
                                      const std::string &CPU);
 
+  /// Compute the number of maximum number of issues per cycle for the
+  /// MBlaze scheduling itineraries.
+  void computeIssueWidth();
+
+  /// enablePostRAScheduler - True at 'More' optimization.
+  bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                             TargetSubtarget::AntiDepBreakMode& Mode,
+                             RegClassVector& CriticalPathRCs) const;
+
+  /// getInstrItins - Return the instruction itineraies based on subtarget.
+  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+
+  bool hasItin()   const { return HasItin; }
+  bool hasPCMP()   const { return HasPatCmp; }
   bool hasFPU()    const { return HasFPU; }
   bool hasSqrt()   const { return HasSqrt; }
   bool hasMul()    const { return HasMul; }
   bool hasMul64()  const { return HasMul64; }
   bool hasDiv()    const { return HasDiv; }
   bool hasBarrel() const { return HasBarrel; }
-
-  bool isV400() const { return MBlazeArchVersion == V400; }
-  bool isV500() const { return MBlazeArchVersion == V500; }
-  bool isV600() const { return MBlazeArchVersion == V600; }
-  bool isV700() const { return MBlazeArchVersion == V700; }
-  bool isV710() const { return MBlazeArchVersion == V710; }
 };
 } // End llvm namespace
 
diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.cpp b/lib/Target/MBlaze/MBlazeTargetMachine.cpp
index cd949e1..df34a83 100644
--- a/lib/Target/MBlaze/MBlazeTargetMachine.cpp
+++ b/lib/Target/MBlaze/MBlazeTargetMachine.cpp
@@ -36,19 +36,18 @@ static MCStreamer *createMCStreamer(const Target &T, const std::string &TT,
                                     bool RelaxAll,
                                     bool NoExecStack) {
   Triple TheTriple(TT);
-  switch (TheTriple.getOS()) {
-  case Triple::Darwin:
+
+  if (TheTriple.isOSDarwin()) {
     llvm_unreachable("MBlaze does not support Darwin MACH-O format");
     return NULL;
-  case Triple::MinGW32:
-  case Triple::Cygwin:
-  case Triple::Win32:
+  }
+
+  if (TheTriple.isOSWindows()) {
     llvm_unreachable("MBlaze does not support Windows COFF format");
     return NULL;
-  default:
-    return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll,
-                             NoExecStack);
   }
+
+  return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, NoExecStack);
 }
 
 
@@ -87,7 +86,8 @@ MBlazeTargetMachine(const Target &T, const std::string &TT,
   DataLayout("E-p:32:32:32-i8:8:8-i16:16:16"),
   InstrInfo(*this),
   FrameLowering(Subtarget),
-  TLInfo(*this), TSInfo(*this), ELFWriterInfo(*this) {
+  TLInfo(*this), TSInfo(*this), ELFWriterInfo(*this),
+  InstrItins(Subtarget.getInstrItineraryData()) {
   if (getRelocationModel() == Reloc::Default) {
       setRelocationModel(Reloc::Static);
   }
diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.h b/lib/Target/MBlaze/MBlazeTargetMachine.h
index 45ad078..48ce37a 100644
--- a/lib/Target/MBlaze/MBlazeTargetMachine.h
+++ b/lib/Target/MBlaze/MBlazeTargetMachine.h
@@ -38,13 +38,18 @@ namespace llvm {
     MBlazeSelectionDAGInfo TSInfo;
     MBlazeIntrinsicInfo    IntrinsicInfo;
     MBlazeELFWriterInfo    ELFWriterInfo;
+    InstrItineraryData     InstrItins;
+
   public:
     MBlazeTargetMachine(const Target &T, const std::string &TT,
-                      const std::string &FS);
+                        const std::string &FS);
 
     virtual const MBlazeInstrInfo *getInstrInfo() const
     { return &InstrInfo; }
 
+    virtual const InstrItineraryData *getInstrItineraryData() const
+    {  return &InstrItins; }
+
     virtual const TargetFrameLowering *getFrameLowering() const
     { return &FrameLowering; }
 
diff --git a/lib/Target/MBlaze/TODO b/lib/Target/MBlaze/TODO
index 2e613eb..317d7c0 100644
--- a/lib/Target/MBlaze/TODO
+++ b/lib/Target/MBlaze/TODO
@@ -9,8 +9,6 @@
   needs to be examined more closely:
     - The stack layout needs to be examined to make sure it meets
       the standard, especially in regards to var arg functions.
-    - The processor itineraries are copied from a different backend
-      and need to be updated to model the MicroBlaze correctly.
     - Look at the MBlazeGenFastISel.inc stuff and make use of it
       if appropriate.
 
@@ -18,9 +16,6 @@
   There are a few things that need to be looked at:
     - There are some instructions that are not generated by the backend
       and have not been tested as far as the parser is concerned.
-    - The assembly parser does not use any MicroBlaze specific directives.
+    - The assembly parser does not use many MicroBlaze specific directives.
       I should investigate if there are MicroBlaze specific directive and,
       if there are, add them.
-    - The instruction MFS and MTS use special names for some of the
-      special registers that can be accessed. These special register
-      names should be parsed by the assembly parser.
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index a95d59c..0a3eab1 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -170,6 +170,9 @@ MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) :
     setLibcallName(RTLIB::MUL_I8,  "__mulqi3hw_noint");
     setLibcallName(RTLIB::MUL_I16, "__mulhi3hw_noint");
   }
+
+  setMinFunctionAlignment(1);
+  setPrefFunctionAlignment(2);
 }
 
 SDValue MSP430TargetLowering::LowerOperation(SDValue Op,
@@ -193,11 +196,6 @@ SDValue MSP430TargetLowering::LowerOperation(SDValue Op,
   }
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned MSP430TargetLowering::getFunctionAlignment(const Function *F) const {
-  return F->hasFnAttr(Attribute::OptimizeForSize) ? 1 : 2;
-}
-
 //===----------------------------------------------------------------------===//
 //                       MSP430 Inline Assembly Support
 //===----------------------------------------------------------------------===//
@@ -314,8 +312,8 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
   CCInfo.AnalyzeFormalArguments(Ins, CC_MSP430);
 
   assert(!isVarArg && "Varargs not supported yet");
@@ -397,8 +395,8 @@ MSP430TargetLowering::LowerReturn(SDValue Chain,
   }
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
   // Analize return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_MSP430);
@@ -451,8 +449,8 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
                                      SmallVectorImpl<SDValue> &InVals) const {
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
 
   CCInfo.AnalyzeCallOperands(Outs, CC_MSP430);
 
@@ -515,7 +513,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
 
   // Build a sequence of copy-to-reg nodes chained together with token chain and
   // flag operands which copy the outgoing args into registers.  The InFlag in
-  // necessary since all emited instructions must be stuck together.
+  // necessary since all emitted instructions must be stuck together.
   SDValue InFlag;
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
@@ -574,8 +572,8 @@ MSP430TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
   CCInfo.AnalyzeCallResult(Ins, RetCC_MSP430);
 
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index 19c9eac..bd660a0 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -82,9 +82,6 @@ namespace llvm {
     /// DAG node.
     virtual const char *getTargetNodeName(unsigned Opcode) const;
 
-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const;
-
     SDValue LowerShifts(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index 1da6d8d..53f4c2e 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -76,7 +76,11 @@ BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
 
-  // Mark 4 special registers as reserved.
+  // Mark 4 special registers with subregisters as reserved.
+  Reserved.set(MSP430::PCB);
+  Reserved.set(MSP430::SPB);
+  Reserved.set(MSP430::SRB);
+  Reserved.set(MSP430::CGB);
   Reserved.set(MSP430::PCW);
   Reserved.set(MSP430::SPW);
   Reserved.set(MSP430::SRW);
@@ -242,4 +246,9 @@ int MSP430RegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   return 0;
 }
 
+int MSP430RegisterInfo::getLLVMRegNum(unsigned RegNum, bool isEH) const {
+  llvm_unreachable("Not implemented yet!");
+  return 0;
+}
+
 #include "MSP430GenRegisterInfo.inc"
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h
index 56744fa..e820558 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.h
+++ b/lib/Target/MSP430/MSP430RegisterInfo.h
@@ -39,6 +39,13 @@ public:
   BitVector getReservedRegs(const MachineFunction &MF) const;
   const TargetRegisterClass* getPointerRegClass(unsigned Kind = 0) const;
 
+  const TargetRegisterClass *
+  getMatchingSuperRegClass(const TargetRegisterClass *A,
+                           const TargetRegisterClass *B, unsigned Idx) const {
+    // No sub-classes makes this really easy.
+    return A;
+  }
+
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
                                      MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator I) const;
@@ -54,6 +61,7 @@ public:
 
   //! Get DWARF debugging register number
   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.td b/lib/Target/MSP430/MSP430RegisterInfo.td
index ab7b59b..d1c2e3f 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.td
+++ b/lib/Target/MSP430/MSP430RegisterInfo.td
@@ -66,54 +66,20 @@ def R15W : MSP430RegWithSubregs<15, "r15", [R15B]>;
 
 def GR8 : RegisterClass<"MSP430", [i8], 8,
    // Volatile registers
-  [R12B, R13B, R14B, R15B, R11B, R10B, R9B, R8B, R7B, R6B, R5B,
+  (add R12B, R13B, R14B, R15B, R11B, R10B, R9B, R8B, R7B, R6B, R5B,
    // Frame pointer, sometimes allocable
    FPB,
    // Volatile, but not allocable
-   PCB, SPB, SRB, CGB]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GR8Class::iterator
-    GR8Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      // Depending on whether the function uses frame pointer or not, last 5 or 4
-      // registers on the list above are reserved
-      if (TFI->hasFP(MF))
-        return end()-5;
-      else
-        return end()-4;
-    }
-  }];
-}
+   PCB, SPB, SRB, CGB)>;
 
 def GR16 : RegisterClass<"MSP430", [i16], 16,
    // Volatile registers
-  [R12W, R13W, R14W, R15W, R11W, R10W, R9W, R8W, R7W, R6W, R5W,
+  (add R12W, R13W, R14W, R15W, R11W, R10W, R9W, R8W, R7W, R6W, R5W,
    // Frame pointer, sometimes allocable
    FPW,
    // Volatile, but not allocable
-   PCW, SPW, SRW, CGW]>
+   PCW, SPW, SRW, CGW)>
 {
   let SubRegClasses = [(GR8 subreg_8bit)];
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GR16Class::iterator
-    GR16Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      // Depending on whether the function uses frame pointer or not, last 5 or 4
-      // registers on the list above are reserved
-      if (TFI->hasFP(MF))
-        return end()-5;
-      else
-        return end()-4;
-    }
-  }];
 }
 
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index 26df1a0..fd16516 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -13,6 +13,8 @@ tablegen(MipsGenSubtarget.inc -gen-subtarget)
 add_llvm_target(MipsCodeGen
   MipsAsmPrinter.cpp
   MipsDelaySlotFiller.cpp
+  MipsEmitGPRestore.cpp
+  MipsExpandPseudo.cpp
   MipsInstrInfo.cpp
   MipsISelDAGToDAG.cpp
   MipsISelLowering.cpp
diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
index fb3c492..76a26a9 100644
--- a/lib/Target/Mips/Mips.h
+++ b/lib/Target/Mips/Mips.h
@@ -25,6 +25,8 @@ namespace llvm {
 
   FunctionPass *createMipsISelDag(MipsTargetMachine &TM);
   FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM);
+  FunctionPass *createMipsExpandPseudoPass(MipsTargetMachine &TM);
+  FunctionPass *createMipsEmitGPRestorePass(MipsTargetMachine &TM);
 
   extern Target TheMipsTarget;
   extern Target TheMipselTarget;
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index 5102c69..b79016d 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td
@@ -81,7 +81,7 @@ def : Proc<"r6000", [FeatureMips2]>;
 
 def : Proc<"4ke", [FeatureMips32r2]>;
 
-// Allegrex is a 32bit subset of r4000, both for interger and fp registers,
+// Allegrex is a 32bit subset of r4000, both for integer and fp registers,
 // but much more similar to Mips2 than Mips3. It also contains some of
 // Mips32/Mips32r2 instructions and a custom vector fpu processor.
 def : Proc<"allegrex", [FeatureMips2, FeatureSingleFloat, FeatureEABI,
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 718110f..8caa7cd 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -77,7 +77,8 @@ namespace {
     }
     virtual void EmitFunctionBodyStart();
     virtual void EmitFunctionBodyEnd();
-    virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const;
+    virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock*
+                                                   MBB) const;
     static const char *getRegisterName(unsigned RegNo);
 
     virtual void EmitFunctionEntryLabel();
@@ -125,44 +126,60 @@ namespace {
 // Create a bitmask with all callee saved registers for CPU or Floating Point
 // registers. For CPU registers consider RA, GP and FP for saving if necessary.
 void MipsAsmPrinter::printSavedRegsBitmask(raw_ostream &O) {
-  const TargetFrameLowering *TFI = TM.getFrameLowering();
-  const TargetRegisterInfo *RI = TM.getRegisterInfo();
-  const MipsFunctionInfo *MipsFI = MF->getInfo<MipsFunctionInfo>();
-
   // CPU and FPU Saved Registers Bitmasks
-  unsigned int CPUBitmask = 0;
-  unsigned int FPUBitmask = 0;
+  unsigned CPUBitmask = 0, FPUBitmask = 0;
+  int CPUTopSavedRegOff, FPUTopSavedRegOff;
 
   // Set the CPU and FPU Bitmasks
   const MachineFrameInfo *MFI = MF->getFrameInfo();
   const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
-  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+  // size of stack area to which FP callee-saved regs are saved.
+  unsigned CPURegSize = Mips::CPURegsRegisterClass->getSize();
+  unsigned FGR32RegSize = Mips::FGR32RegisterClass->getSize();
+  unsigned AFGR64RegSize = Mips::AFGR64RegisterClass->getSize();
+  bool HasAFGR64Reg = false;
+  unsigned CSFPRegsSize = 0;
+  unsigned i, e = CSI.size();
+
+  // Set FPU Bitmask.
+  for (i = 0; i != e; ++i) {
     unsigned Reg = CSI[i].getReg();
-    unsigned RegNum = MipsRegisterInfo::getRegisterNumbering(Reg);
     if (Mips::CPURegsRegisterClass->contains(Reg))
-      CPUBitmask |= (1 << RegNum);
-    else
-      FPUBitmask |= (1 << RegNum);
+      break;
+
+    unsigned RegNum = MipsRegisterInfo::getRegisterNumbering(Reg);
+    if (Mips::AFGR64RegisterClass->contains(Reg)) {
+      FPUBitmask |= (3 << RegNum);
+      CSFPRegsSize += AFGR64RegSize;
+      HasAFGR64Reg = true;
+      continue;
+    }
+
+    FPUBitmask |= (1 << RegNum);
+    CSFPRegsSize += FGR32RegSize;
+  }
+
+  // Set CPU Bitmask.
+  for (; i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    unsigned RegNum = MipsRegisterInfo::getRegisterNumbering(Reg);
+    CPUBitmask |= (1 << RegNum);
   }
 
-  // Return Address and Frame registers must also be set in CPUBitmask.
-  // FIXME: Do we really need hasFP() call here? When no FP is present SP is
-  // just returned -- will it be ok?
-  if (TFI->hasFP(*MF))
-    CPUBitmask |= (1 << MipsRegisterInfo::
-                getRegisterNumbering(RI->getFrameRegister(*MF)));
+  // FP Regs are saved right below where the virtual frame pointer points to.
+  FPUTopSavedRegOff = FPUBitmask ?
+    (HasAFGR64Reg ? -AFGR64RegSize : -FGR32RegSize) : 0;
 
-  if (MFI->adjustsStack())
-    CPUBitmask |= (1 << MipsRegisterInfo::
-                getRegisterNumbering(RI->getRARegister()));
+  // CPU Regs are saved below FP Regs.
+  CPUTopSavedRegOff = CPUBitmask ? -CSFPRegsSize - CPURegSize : 0;
 
   // Print CPUBitmask
   O << "\t.mask \t"; printHex32(CPUBitmask, O);
-  O << ',' << MipsFI->getCPUTopSavedRegOff() << '\n';
+  O << ',' << CPUTopSavedRegOff << '\n';
 
   // Print FPUBitmask
-  O << "\t.fmask\t"; printHex32(FPUBitmask, O); O << ","
-    << MipsFI->getFPUTopSavedRegOff() << '\n';
+  O << "\t.fmask\t"; printHex32(FPUBitmask, O);
+  O << "," << FPUTopSavedRegOff << '\n';
 }
 
 // Print a 32 bit hex number with all numbers.
@@ -236,8 +253,8 @@ void MipsAsmPrinter::EmitFunctionBodyEnd() {
 /// isBlockOnlyReachableByFallthough - Return true if the basic block has
 /// exactly one predecessor and the control transfer mechanism between
 /// the predecessor and this block is a fall-through.
-bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB)
-    const {
+bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock*
+                                                       MBB) const {
   // The predecessor has to be immediately before this block.
   const MachineBasicBlock *Pred = *MBB->pred_begin();
 
@@ -301,6 +318,10 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
   case MipsII::MO_GOT:      O << "%got(";    break;
   case MipsII::MO_ABS_HI:   O << "%hi(";     break;
   case MipsII::MO_ABS_LO:   O << "%lo(";     break;
+  case MipsII::MO_TLSGD:    O << "%tlsgd(";  break;
+  case MipsII::MO_GOTTPREL: O << "%gottprel("; break;
+  case MipsII::MO_TPREL_HI: O << "%tprel_hi("; break;
+  case MipsII::MO_TPREL_LO: O << "%tprel_lo("; break;
   }
 
   switch (MO.getType()) {
@@ -309,7 +330,7 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
       break;
 
     case MachineOperand::MO_Immediate:
-      O << (short int)MO.getImm();
+      O << MO.getImm();
       break;
 
     case MachineOperand::MO_MachineBasicBlock:
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
index 8e4b216..57aeb1d 100644
--- a/lib/Target/Mips/MipsCallingConv.td
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -48,7 +48,7 @@ def CC_MipsEABI : CallingConv<[
   CCIfType<[f32], CCIfSubtarget<"isNotSingleFloat()",
                   CCAssignToReg<[F12, F14, F16, F18]>>>,
 
-  // The first 4 doubl fp arguments are passed in single fp registers.
+  // The first 4 double fp arguments are passed in single fp registers.
   CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()",
                   CCAssignToReg<[D6, D7, D8, D9]>>>,
 
diff --git a/lib/Target/Mips/MipsEmitGPRestore.cpp b/lib/Target/Mips/MipsEmitGPRestore.cpp
new file mode 100644
index 0000000..f49d490
--- /dev/null
+++ b/lib/Target/Mips/MipsEmitGPRestore.cpp
@@ -0,0 +1,94 @@
+//===-- MipsEmitGPRestore.cpp - Emit GP restore instruction----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass emits instructions that restore $gp right
+// after jalr instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "emit-gp-restore"
+
+#include "Mips.h"
+#include "MipsTargetMachine.h"
+#include "MipsMachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+
+using namespace llvm;
+
+namespace {
+  struct Inserter : public MachineFunctionPass {
+
+    TargetMachine &TM;
+    const TargetInstrInfo *TII;
+
+    static char ID;
+    Inserter(TargetMachine &tm)
+      : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { }
+
+    virtual const char *getPassName() const {
+      return "Mips Emit GP Restore";
+    }
+
+    bool runOnMachineFunction(MachineFunction &F);
+  };
+  char Inserter::ID = 0;
+} // end of anonymous namespace
+
+bool Inserter::runOnMachineFunction(MachineFunction &F) {
+  if (TM.getRelocationModel() != Reloc::PIC_)
+    return false;
+
+  bool Changed = false;
+  int FI =  F.getInfo<MipsFunctionInfo>()->getGPFI();
+
+  for (MachineFunction::iterator MFI = F.begin(), MFE = F.end();
+       MFI != MFE; ++MFI) {
+    MachineBasicBlock& MBB = *MFI;
+    MachineBasicBlock::iterator I = MFI->begin();
+
+    // If MBB is a landing pad, insert instruction that restores $gp after
+    // EH_LABEL.
+    if (MBB.isLandingPad()) {
+      // Find EH_LABEL first.
+      for (; I->getOpcode() != TargetOpcode::EH_LABEL; ++I) ;
+      
+      // Insert lw.
+      ++I;
+      DebugLoc dl = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+      BuildMI(MBB, I, dl, TII->get(Mips::LW), Mips::GP).addImm(0)
+                                                       .addFrameIndex(FI);
+      Changed = true;
+    }
+
+    while (I != MFI->end()) {
+      if (I->getOpcode() != Mips::JALR) {
+        ++I;
+        continue;
+      }
+
+      DebugLoc dl = I->getDebugLoc();
+      // emit lw $gp, ($gp save slot on stack) after jalr
+      BuildMI(MBB, ++I, dl, TII->get(Mips::LW), Mips::GP).addImm(0)
+        .addFrameIndex(FI);
+      Changed = true;
+    }
+  } 
+
+  return Changed;
+}
+
+/// createMipsEmitGPRestorePass - Returns a pass that emits instructions that
+/// restores $gp clobbered by jalr instructions.
+FunctionPass *llvm::createMipsEmitGPRestorePass(MipsTargetMachine &tm) {
+  return new Inserter(tm);
+}
+
diff --git a/lib/Target/Mips/MipsExpandPseudo.cpp b/lib/Target/Mips/MipsExpandPseudo.cpp
new file mode 100644
index 0000000..4423f51
--- /dev/null
+++ b/lib/Target/Mips/MipsExpandPseudo.cpp
@@ -0,0 +1,117 @@
+//===--  MipsExpandPseudo.cpp - Expand pseudo instructions ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass expands pseudo instructions into target instructions after register
+// allocation but before post-RA scheduling.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-expand-pseudo"
+
+#include "Mips.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+
+using namespace llvm;
+
+namespace {
+  struct MipsExpandPseudo : public MachineFunctionPass {
+
+    TargetMachine &TM;
+    const TargetInstrInfo *TII;
+
+    static char ID;
+    MipsExpandPseudo(TargetMachine &tm)
+      : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { }
+
+    virtual const char *getPassName() const {
+      return "Mips PseudoInstrs Expansion";
+    }
+
+    bool runOnMachineFunction(MachineFunction &F);
+    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+
+  private:
+    void ExpandBuildPairF64(MachineBasicBlock&, MachineBasicBlock::iterator);
+    void ExpandExtractElementF64(MachineBasicBlock&,
+                                 MachineBasicBlock::iterator);
+  };
+  char MipsExpandPseudo::ID = 0;
+} // end of anonymous namespace
+
+bool MipsExpandPseudo::runOnMachineFunction(MachineFunction& F) {
+  bool Changed = false;
+
+  for (MachineFunction::iterator I = F.begin(); I != F.end(); ++I)
+    Changed |= runOnMachineBasicBlock(*I);
+
+  return Changed;
+}
+
+bool MipsExpandPseudo::runOnMachineBasicBlock(MachineBasicBlock& MBB) {
+
+  bool Changed = false;
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end();) {
+    const TargetInstrDesc& Tid = I->getDesc();
+
+    switch(Tid.getOpcode()) {
+    default: 
+      ++I;
+      continue;
+    case Mips::BuildPairF64:
+      ExpandBuildPairF64(MBB, I);
+      break;
+    case Mips::ExtractElementF64:
+      ExpandExtractElementF64(MBB, I);
+      break;
+    } 
+
+    // delete original instr
+    MBB.erase(I++);
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+void MipsExpandPseudo::ExpandBuildPairF64(MachineBasicBlock& MBB,
+                                            MachineBasicBlock::iterator I) {  
+  unsigned DstReg = I->getOperand(0).getReg();
+  unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg();
+  const TargetInstrDesc& Mtc1Tdd = TII->get(Mips::MTC1);
+  DebugLoc dl = I->getDebugLoc();
+  const unsigned* SubReg =
+    TM.getRegisterInfo()->getSubRegisters(DstReg);
+
+  // mtc1 Lo, $fp
+  // mtc1 Hi, $fp + 1
+  BuildMI(MBB, I, dl, Mtc1Tdd, *SubReg).addReg(LoReg);
+  BuildMI(MBB, I, dl, Mtc1Tdd, *(SubReg + 1)).addReg(HiReg);
+}
+
+void MipsExpandPseudo::ExpandExtractElementF64(MachineBasicBlock& MBB,
+                                               MachineBasicBlock::iterator I) {
+  unsigned DstReg = I->getOperand(0).getReg();
+  unsigned SrcReg = I->getOperand(1).getReg();
+  unsigned N = I->getOperand(2).getImm();
+  const TargetInstrDesc& Mfc1Tdd = TII->get(Mips::MFC1);
+  DebugLoc dl = I->getDebugLoc();
+  const unsigned* SubReg = TM.getRegisterInfo()->getSubRegisters(SrcReg);
+
+  BuildMI(MBB, I, dl, Mfc1Tdd, DstReg).addReg(*(SubReg + N));
+}
+
+/// createMipsMipsExpandPseudoPass - Returns a pass that expands pseudo 
+/// instrs into real instrs
+FunctionPass *llvm::createMipsExpandPseudoPass(MipsTargetMachine &tm) {
+  return new MipsExpandPseudo(tm);
+}
diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp
index 5e4a7da..a0f90a0 100644
--- a/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/lib/Target/Mips/MipsFrameLowering.cpp
@@ -84,128 +84,20 @@ using namespace llvm;
 // if frame pointer elimination is disabled.
 bool MipsFrameLowering::hasFP(const MachineFunction &MF) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
-  return DisableFramePointerElim(MF) || MFI->hasVarSizedObjects();
+  return DisableFramePointerElim(MF) || MFI->hasVarSizedObjects()
+      || MFI->isFrameAddressTaken();
 }
 
-void MipsFrameLowering::adjustMipsStackFrame(MachineFunction &MF) const {
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
-  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
-  unsigned StackAlign = getStackAlignment();
-  unsigned RegSize = STI.isGP32bit() ? 4 : 8;
-  bool HasGP = MipsFI->needGPSaveRestore();
-
-  // Min and Max CSI FrameIndex.
-  int MinCSFI = -1, MaxCSFI = -1;
-
-  // See the description at MipsMachineFunction.h
-  int TopCPUSavedRegOff = -1, TopFPUSavedRegOff = -1;
-
-  // Replace the dummy '0' SPOffset by the negative offsets, as explained on
-  // LowerFormalArguments. Leaving '0' for while is necessary to avoid the
-  // approach done by calculateFrameObjectOffsets to the stack frame.
-  MipsFI->adjustLoadArgsFI(MFI);
-  MipsFI->adjustStoreVarArgsFI(MFI);
-
-  // It happens that the default stack frame allocation order does not directly
-  // map to the convention used for mips. So we must fix it. We move the callee
-  // save register slots after the local variables area, as described in the
-  // stack frame above.
-  unsigned CalleeSavedAreaSize = 0;
-  if (!CSI.empty()) {
-    MinCSFI = CSI[0].getFrameIdx();
-    MaxCSFI = CSI[CSI.size()-1].getFrameIdx();
-  }
-  for (unsigned i = 0, e = CSI.size(); i != e; ++i)
-    CalleeSavedAreaSize += MFI->getObjectAlignment(CSI[i].getFrameIdx());
-
-  unsigned StackOffset = HasGP ? (MipsFI->getGPStackOffset()+RegSize)
-                : (STI.isABI_O32() ? 16 : 0);
-
-  // Adjust local variables. They should come on the stack right
-  // after the arguments.
-  int LastOffsetFI = -1;
-  for (int i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
-    if (i >= MinCSFI && i <= MaxCSFI)
-      continue;
-    if (MFI->isDeadObjectIndex(i))
-      continue;
-    unsigned Offset =
-      StackOffset + MFI->getObjectOffset(i) - CalleeSavedAreaSize;
-    if (LastOffsetFI == -1)
-      LastOffsetFI = i;
-    if (Offset > MFI->getObjectOffset(LastOffsetFI))
-      LastOffsetFI = i;
-    MFI->setObjectOffset(i, Offset);
-  }
-
-  // Adjust CPU Callee Saved Registers Area. Registers RA and FP must
-  // be saved in this CPU Area. This whole area must be aligned to the
-  // default Stack Alignment requirements.
-  if (LastOffsetFI >= 0)
-    StackOffset = MFI->getObjectOffset(LastOffsetFI)+
-                  MFI->getObjectSize(LastOffsetFI);
-  StackOffset = ((StackOffset+StackAlign-1)/StackAlign*StackAlign);
-
-  for (unsigned i = 0, e = CSI.size(); i != e ; ++i) {
-    unsigned Reg = CSI[i].getReg();
-    if (!Mips::CPURegsRegisterClass->contains(Reg))
-      break;
-    MFI->setObjectOffset(CSI[i].getFrameIdx(), StackOffset);
-    TopCPUSavedRegOff = StackOffset;
-    StackOffset += MFI->getObjectAlignment(CSI[i].getFrameIdx());
-  }
-
-  // Stack locations for FP and RA. If only one of them is used,
-  // the space must be allocated for both, otherwise no space at all.
-  if (hasFP(MF) || MFI->adjustsStack()) {
-    // FP stack location
-    MFI->setObjectOffset(MFI->CreateStackObject(RegSize, RegSize, true),
-                         StackOffset);
-    MipsFI->setFPStackOffset(StackOffset);
-    TopCPUSavedRegOff = StackOffset;
-    StackOffset += RegSize;
-
-    // SP stack location
-    MFI->setObjectOffset(MFI->CreateStackObject(RegSize, RegSize, true),
-                         StackOffset);
-    MipsFI->setRAStackOffset(StackOffset);
-    StackOffset += RegSize;
-
-    if (MFI->adjustsStack())
-      TopCPUSavedRegOff += RegSize;
-  }
-
-  StackOffset = ((StackOffset+StackAlign-1)/StackAlign*StackAlign);
-
-  // Adjust FPU Callee Saved Registers Area. This Area must be
-  // aligned to the default Stack Alignment requirements.
-  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
-    unsigned Reg = CSI[i].getReg();
-    if (Mips::CPURegsRegisterClass->contains(Reg))
-      continue;
-    MFI->setObjectOffset(CSI[i].getFrameIdx(), StackOffset);
-    TopFPUSavedRegOff = StackOffset;
-    StackOffset += MFI->getObjectAlignment(CSI[i].getFrameIdx());
-  }
-  StackOffset = ((StackOffset+StackAlign-1)/StackAlign*StackAlign);
-
-  // Update frame info
-  MFI->setStackSize(StackOffset);
-
-  // Recalculate the final tops offset. The final values must be '0'
-  // if there isn't a callee saved register for CPU or FPU, otherwise
-  // a negative offset is needed.
-  if (TopCPUSavedRegOff >= 0)
-    MipsFI->setCPUTopSavedRegOff(TopCPUSavedRegOff-StackOffset);
-
-  if (TopFPUSavedRegOff >= 0)
-    MipsFI->setFPUTopSavedRegOff(TopFPUSavedRegOff-StackOffset);
+bool MipsFrameLowering::targetHandlesStackFrameRounding() const {
+  return true;
 }
 
+static unsigned AlignOffset(unsigned Offset, unsigned Align) {
+  return (Offset + Align - 1) / Align * Align; 
+} 
 
-// expand pair of register and immediate if the immediate doesn't fit in the 16-bit
-// offset field.
+// expand pair of register and immediate if the immediate doesn't fit in the
+// 16-bit offset field.
 // e.g.
 //  if OrigImm = 0x10000, OrigReg = $sp:
 //    generate the following sequence of instrs:
@@ -216,7 +108,8 @@ void MipsFrameLowering::adjustMipsStackFrame(MachineFunction &MF) const {
 //    return true
 static bool expandRegLargeImmPair(unsigned OrigReg, int OrigImm,
                                   unsigned& NewReg, int& NewImm,
-                                  MachineBasicBlock& MBB, MachineBasicBlock::iterator I) {
+                                  MachineBasicBlock& MBB,
+                                  MachineBasicBlock::iterator I) {
   // OrigImm fits in the 16-bit field
   if (OrigImm < 0x8000 && OrigImm >= -0x8000) {
     NewReg = OrigReg;
@@ -227,13 +120,15 @@ static bool expandRegLargeImmPair(unsigned OrigReg, int OrigImm,
   MachineFunction* MF = MBB.getParent();
   const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
   DebugLoc DL = I->getDebugLoc();
-  int ImmLo = OrigImm & 0xffff;
-  int ImmHi = (((unsigned)OrigImm & 0xffff0000) >> 16) + ((OrigImm & 0x8000) != 0);
+  int ImmLo = (short)(OrigImm & 0xffff);
+  int ImmHi = (((unsigned)OrigImm & 0xffff0000) >> 16) +
+              ((OrigImm & 0x8000) != 0);
 
   // FIXME: change this when mips goes MC".
   BuildMI(MBB, I, DL, TII->get(Mips::NOAT));
   BuildMI(MBB, I, DL, TII->get(Mips::LUi), Mips::AT).addImm(ImmHi);
-  BuildMI(MBB, I, DL, TII->get(Mips::ADDu), Mips::AT).addReg(OrigReg).addReg(Mips::AT);
+  BuildMI(MBB, I, DL, TII->get(Mips::ADDu), Mips::AT).addReg(OrigReg)
+                                                     .addReg(Mips::AT);
   NewReg = Mips::AT;
   NewImm = ImmLo;
 
@@ -255,18 +150,18 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const {
   int NewImm = 0;
   bool ATUsed;
 
-  // Get the right frame order for Mips.
-  adjustMipsStackFrame(MF);
-
-  // Get the number of bytes to allocate from the FrameInfo.
-  unsigned StackSize = MFI->getStackSize();
-
-  // No need to allocate space on the stack.
-  if (StackSize == 0 && !MFI->adjustsStack()) return;
-
-  int FPOffset = MipsFI->getFPStackOffset();
-  int RAOffset = MipsFI->getRAStackOffset();
-
+  // First, compute final stack size.
+  unsigned RegSize = STI.isGP32bit() ? 4 : 8;
+  unsigned StackAlign = getStackAlignment();
+  unsigned LocalVarAreaOffset = MipsFI->needGPSaveRestore() ? 
+    (MFI->getObjectOffset(MipsFI->getGPFI()) + RegSize) :
+    MipsFI->getMaxCallFrameSize();
+  unsigned StackSize = AlignOffset(LocalVarAreaOffset, StackAlign) +
+    AlignOffset(MFI->getStackSize(), StackAlign);
+
+   // Update stack size
+  MFI->setStackSize(StackSize); 
+  
   BuildMI(MBB, MBBI, dl, TII.get(Mips::NOREORDER));
 
   // TODO: check need from GP here.
@@ -275,6 +170,13 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const {
       .addReg(RegInfo->getPICCallReg());
   BuildMI(MBB, MBBI, dl, TII.get(Mips::NOMACRO));
 
+  // No need to allocate space on the stack.
+  if (StackSize == 0 && !MFI->adjustsStack()) return;
+
+  MachineModuleInfo &MMI = MF.getMMI();
+  std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+  MachineLocation DstML, SrcML;
+
   // Adjust stack : addi sp, sp, (-imm)
   ATUsed = expandRegLargeImmPair(Mips::SP, -StackSize, NewReg, NewImm, MBB,
                                  MBBI);
@@ -285,97 +187,109 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const {
   if (ATUsed)
     BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
 
-  // Save the return address only if the function isnt a leaf one.
-  // sw  $ra, stack_loc($sp)
-  if (MFI->adjustsStack()) {
-    ATUsed = expandRegLargeImmPair(Mips::SP, RAOffset, NewReg, NewImm, MBB,
-                                   MBBI);
-    BuildMI(MBB, MBBI, dl, TII.get(Mips::SW))
-      .addReg(Mips::RA).addImm(NewImm).addReg(NewReg);
+  // emit ".cfi_def_cfa_offset StackSize"
+  MCSymbol *AdjustSPLabel = MMI.getContext().CreateTempSymbol();
+  BuildMI(MBB, MBBI, dl,
+          TII.get(TargetOpcode::PROLOG_LABEL)).addSym(AdjustSPLabel);
+  DstML = MachineLocation(MachineLocation::VirtualFP);
+  SrcML = MachineLocation(MachineLocation::VirtualFP, -StackSize);
+  Moves.push_back(MachineMove(AdjustSPLabel, DstML, SrcML));
 
-    // FIXME: change this when mips goes MC".
-    if (ATUsed)
-      BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
-  }
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
 
-  // if framepointer enabled, save it and set it
-  // to point to the stack pointer
+  if (CSI.size()) {
+    // Find the instruction past the last instruction that saves a callee-saved
+    // register to the stack.
+    for (unsigned i = 0; i < CSI.size(); ++i)
+      ++MBBI;
+ 
+    // Iterate over list of callee-saved registers and emit .cfi_offset
+    // directives.
+    MCSymbol *CSLabel = MMI.getContext().CreateTempSymbol();
+    BuildMI(MBB, MBBI, dl,
+            TII.get(TargetOpcode::PROLOG_LABEL)).addSym(CSLabel);
+ 
+    for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+           E = CSI.end(); I != E; ++I) {
+      int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
+      unsigned Reg = I->getReg();
+
+      // If Reg is a double precision register, emit two cfa_offsets,
+      // one for each of the paired single precision registers.
+      if (Mips::AFGR64RegisterClass->contains(Reg)) {
+        const unsigned *SubRegs = RegInfo->getSubRegisters(Reg);
+        MachineLocation DstML0(MachineLocation::VirtualFP, Offset);
+        MachineLocation DstML1(MachineLocation::VirtualFP, Offset + 4);
+        MachineLocation SrcML0(*SubRegs);
+        MachineLocation SrcML1(*(SubRegs + 1));
+
+        if (!STI.isLittle())
+          std::swap(SrcML0, SrcML1);
+
+        Moves.push_back(MachineMove(CSLabel, DstML0, SrcML0));
+        Moves.push_back(MachineMove(CSLabel, DstML1, SrcML1));
+      }
+      else {
+        // Reg is either in CPURegs or FGR32.
+        DstML = MachineLocation(MachineLocation::VirtualFP, Offset);
+        SrcML = MachineLocation(Reg);
+        Moves.push_back(MachineMove(CSLabel, DstML, SrcML));
+      }
+    }
+  }    
+
+  // if framepointer enabled, set it to point to the stack pointer.
   if (hasFP(MF)) {
-    // sw  $fp,stack_loc($sp)
-    ATUsed = expandRegLargeImmPair(Mips::SP, FPOffset, NewReg, NewImm, MBB,
-                                   MBBI);
-    BuildMI(MBB, MBBI, dl, TII.get(Mips::SW))
-      .addReg(Mips::FP).addImm(NewImm).addReg(NewReg);
-
-    // FIXME: change this when mips goes MC".
-    if (ATUsed)
-      BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
-
-    // move $fp, $sp
+    // Insert instruction "move $fp, $sp" at this location.    
     BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDu), Mips::FP)
       .addReg(Mips::SP).addReg(Mips::ZERO);
+
+    // emit ".cfi_def_cfa_register $fp" 
+    MCSymbol *SetFPLabel = MMI.getContext().CreateTempSymbol();
+    BuildMI(MBB, MBBI, dl,
+            TII.get(TargetOpcode::PROLOG_LABEL)).addSym(SetFPLabel);
+    DstML = MachineLocation(Mips::FP);
+    SrcML = MachineLocation(MachineLocation::VirtualFP);
+    Moves.push_back(MachineMove(SetFPLabel, DstML, SrcML));
   }
 
   // Restore GP from the saved stack location
   if (MipsFI->needGPSaveRestore())
     BuildMI(MBB, MBBI, dl, TII.get(Mips::CPRESTORE))
-      .addImm(MipsFI->getGPStackOffset());
+      .addImm(MFI->getObjectOffset(MipsFI->getGPFI()));
 }
 
 void MipsFrameLowering::emitEpilogue(MachineFunction &MF,
                                  MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   MachineFrameInfo *MFI            = MF.getFrameInfo();
-  MipsFunctionInfo *MipsFI         = MF.getInfo<MipsFunctionInfo>();
   const MipsInstrInfo &TII =
     *static_cast<const MipsInstrInfo*>(MF.getTarget().getInstrInfo());
   DebugLoc dl = MBBI->getDebugLoc();
 
   // Get the number of bytes from FrameInfo
-  int NumBytes = (int) MFI->getStackSize();
-
-  // Get the FI's where RA and FP are saved.
-  int FPOffset = MipsFI->getFPStackOffset();
-  int RAOffset = MipsFI->getRAStackOffset();
+  unsigned StackSize = MFI->getStackSize();
 
   unsigned NewReg = 0;
   int NewImm = 0;
   bool ATUsed = false;
 
-  // if framepointer enabled, restore it and restore the
-  // stack pointer
+  // if framepointer enabled, restore the stack pointer.
   if (hasFP(MF)) {
-    // move $sp, $fp
-    BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDu), Mips::SP)
+    // Find the first instruction that restores a callee-saved register.
+    MachineBasicBlock::iterator I = MBBI;
+    
+    for (unsigned i = 0; i < MFI->getCalleeSavedInfo().size(); ++i)
+      --I;
+
+    // Insert instruction "move $sp, $fp" at this location.
+    BuildMI(MBB, I, dl, TII.get(Mips::ADDu), Mips::SP)
       .addReg(Mips::FP).addReg(Mips::ZERO);
-
-    // lw  $fp,stack_loc($sp)
-    ATUsed = expandRegLargeImmPair(Mips::SP, FPOffset, NewReg, NewImm, MBB,
-                                   MBBI);
-    BuildMI(MBB, MBBI, dl, TII.get(Mips::LW), Mips::FP)
-      .addImm(NewImm).addReg(NewReg);
-
-    // FIXME: change this when mips goes MC".
-    if (ATUsed)
-      BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
-  }
-
-  // Restore the return address only if the function isnt a leaf one.
-  // lw  $ra, stack_loc($sp)
-  if (MFI->adjustsStack()) {
-    ATUsed = expandRegLargeImmPair(Mips::SP, RAOffset, NewReg, NewImm, MBB,
-                                   MBBI);
-    BuildMI(MBB, MBBI, dl, TII.get(Mips::LW), Mips::RA)
-      .addImm(NewImm).addReg(NewReg);
-
-    // FIXME: change this when mips goes MC".
-    if (ATUsed)
-      BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
   }
 
   // adjust stack  : insert addi sp, sp, (imm)
-  if (NumBytes) {
-    ATUsed = expandRegLargeImmPair(Mips::SP, NumBytes, NewReg, NewImm, MBB,
+  if (StackSize) {
+    ATUsed = expandRegLargeImmPair(Mips::SP, StackSize, NewReg, NewImm, MBB,
                                    MBBI);
     BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDiu), Mips::SP)
       .addReg(NewReg).addImm(NewImm);
@@ -386,9 +300,32 @@ void MipsFrameLowering::emitEpilogue(MachineFunction &MF,
   }
 }
 
+void
+MipsFrameLowering::getInitialFrameState(std::vector<MachineMove> &Moves) const {
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(Mips::SP, 0);
+  Moves.push_back(MachineMove(0, Dst, Src));
+}
+
 void MipsFrameLowering::
-processFunctionBeforeFrameFinalized(MachineFunction &MF) const {
-  const MipsRegisterInfo *RegInfo =
-    static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
-  RegInfo->processFunctionBeforeFrameFinalized(MF);
+processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                     RegScavenger *RS) const {
+  MachineRegisterInfo& MRI = MF.getRegInfo();
+
+  // FIXME: remove this code if register allocator can correctly mark
+  //        $fp and $ra used or unused.
+
+  // Mark $fp and $ra as used or unused.
+  if (hasFP(MF))
+    MRI.setPhysRegUsed(Mips::FP);
+
+  // The register allocator might determine $ra is used after seeing 
+  // instruction "jr $ra", but we do not want PrologEpilogInserter to insert
+  // instructions to save/restore $ra unless there is a function call.
+  // To correct this, $ra is explicitly marked unused if there is no
+  // function call.
+  if (MF.getFrameInfo()->hasCalls())
+    MRI.setPhysRegUsed(Mips::RA);
+  else
+    MRI.setPhysRegUnused(Mips::RA);
 }
diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h
index 34647df..78c78ee 100644
--- a/lib/Target/Mips/MipsFrameLowering.h
+++ b/lib/Target/Mips/MipsFrameLowering.h
@@ -27,11 +27,10 @@ protected:
 
 public:
   explicit MipsFrameLowering(const MipsSubtarget &sti)
-    // FIXME: Is this correct at all?
-    : TargetFrameLowering(StackGrowsUp, 8, 0), STI(sti) {
+    : TargetFrameLowering(StackGrowsDown, 8, 0), STI(sti) {
   }
 
-  void adjustMipsStackFrame(MachineFunction &MF) const;
+  bool targetHandlesStackFrameRounding() const;
 
   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
@@ -40,7 +39,10 @@ public:
 
   bool hasFP(const MachineFunction &MF) const;
 
-  void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+  void getInitialFrameState(std::vector<MachineMove> &Moves) const;
+  
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS) const;
 };
 
 } // End llvm namespace
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 0382964..d8a84ce 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -119,39 +119,41 @@ SelectAddr(SDValue Addr, SDValue &Offset, SDValue &Base) {
 
   // on PIC code Load GA
   if (TM.getRelocationModel() == Reloc::PIC_) {
-    if ((Addr.getOpcode() == ISD::TargetGlobalAddress) ||
-        (Addr.getOpcode() == ISD::TargetConstantPool) ||
-        (Addr.getOpcode() == ISD::TargetJumpTable) ||
-        (Addr.getOpcode() == ISD::TargetBlockAddress) ||
-        (Addr.getOpcode() == ISD::TargetExternalSymbol)) {
+    if (Addr.getOpcode() == MipsISD::WrapperPIC) {
       Base   = CurDAG->getRegister(Mips::GP, MVT::i32);
-      Offset = Addr;
+      Offset = Addr.getOperand(0);
       return true;
     }
   } else {
     if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
         Addr.getOpcode() == ISD::TargetGlobalAddress))
       return false;
+    else if (Addr.getOpcode() == ISD::TargetGlobalTLSAddress) {
+      Base   = CurDAG->getRegister(Mips::GP, MVT::i32);
+      Offset = Addr;
+      return true;
+    }
   }
 
-  // Operand is a result from an ADD.
-  if (Addr.getOpcode() == ISD::ADD) {
-    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
-      if (isInt<16>(CN->getSExtValue())) {
-
-        // If the first operand is a FI, get the TargetFI Node
-        if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
-                                    (Addr.getOperand(0))) {
-          Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
-        } else {
-          Base = Addr.getOperand(0);
-        }
-
-        Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
-        return true;
-      }
+  // Addresses of the form FI+const or FI|const
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+    if (isInt<16>(CN->getSExtValue())) {
+
+      // If the first operand is a FI, get the TargetFI Node
+      if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
+                                  (Addr.getOperand(0)))
+        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+      else
+        Base = Addr.getOperand(0);
+
+      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
+      return true;
     }
+  }
 
+  // Operand is a result from an ADD.
+  if (Addr.getOpcode() == ISD::ADD) {
     // When loading from constant pools, load the lower address part in
     // the instruction itself. Example, instead of:
     //  lui $2, %hi($CPI1_0)
@@ -321,7 +323,6 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
   // tablegen selection should be handled here.
   ///
   switch(Opcode) {
-
     default: break;
 
     case ISD::SUBE:
@@ -355,10 +356,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
                                   LHS, SDValue(AddCarry,0));
     }
 
-    /// Mul/Div with two results
-    case ISD::SDIVREM:
-    case ISD::UDIVREM:
-      break;
+    /// Mul with two results
     case ISD::SMUL_LOHI:
     case ISD::UMUL_LOHI: {
       SDValue Op1 = Node->getOperand(0);
@@ -405,13 +403,6 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
         return CurDAG->getMachineNode(Mips::MFHI, dl, MVT::i32, InFlag);
     }
 
-    /// Div/Rem operations
-    case ISD::SREM:
-    case ISD::UREM:
-    case ISD::SDIV:
-    case ISD::UDIV:
-      break;
-
     // Get target GOT address.
     case ISD::GLOBAL_OFFSET_TABLE:
       return getGlobalBaseReg();
@@ -445,6 +436,18 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
         return ResNode;
       // Other cases are autogenerated.
       break;
+
+    case MipsISD::ThreadPointer: {
+      unsigned SrcReg = Mips::HWR29;
+      unsigned DestReg = Mips::V1;
+      SDNode *Rdhwr = CurDAG->getMachineNode(Mips::RDHWR, Node->getDebugLoc(),
+          Node->getValueType(0), CurDAG->getRegister(SrcReg, MVT::i32));
+      SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, DestReg,
+          SDValue(Rdhwr, 0));
+      SDValue ResNode = CurDAG->getCopyFromReg(Chain, dl, DestReg, MVT::i32);
+      ReplaceUses(SDValue(Node, 0), ResNode);
+      return ResNode.getNode();
+    }
   }
 
   // Select the default instruction
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 0e193f2..c42054e 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -36,23 +36,30 @@ using namespace llvm;
 
 const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch (Opcode) {
-    case MipsISD::JmpLink    : return "MipsISD::JmpLink";
-    case MipsISD::Hi         : return "MipsISD::Hi";
-    case MipsISD::Lo         : return "MipsISD::Lo";
-    case MipsISD::GPRel      : return "MipsISD::GPRel";
-    case MipsISD::Ret        : return "MipsISD::Ret";
-    case MipsISD::FPBrcond   : return "MipsISD::FPBrcond";
-    case MipsISD::FPCmp      : return "MipsISD::FPCmp";
-    case MipsISD::CMovFP_T   : return "MipsISD::CMovFP_T";
-    case MipsISD::CMovFP_F   : return "MipsISD::CMovFP_F";
-    case MipsISD::FPRound    : return "MipsISD::FPRound";
-    case MipsISD::MAdd       : return "MipsISD::MAdd";
-    case MipsISD::MAddu      : return "MipsISD::MAddu";
-    case MipsISD::MSub       : return "MipsISD::MSub";
-    case MipsISD::MSubu      : return "MipsISD::MSubu";
-    case MipsISD::DivRem     : return "MipsISD::DivRem";
-    case MipsISD::DivRemU    : return "MipsISD::DivRemU";
-    default                  : return NULL;
+  case MipsISD::JmpLink:           return "MipsISD::JmpLink";
+  case MipsISD::Hi:                return "MipsISD::Hi";
+  case MipsISD::Lo:                return "MipsISD::Lo";
+  case MipsISD::GPRel:             return "MipsISD::GPRel";
+  case MipsISD::TlsGd:             return "MipsISD::TlsGd";
+  case MipsISD::TprelHi:           return "MipsISD::TprelHi";
+  case MipsISD::TprelLo:           return "MipsISD::TprelLo";
+  case MipsISD::ThreadPointer:     return "MipsISD::ThreadPointer";
+  case MipsISD::Ret:               return "MipsISD::Ret";
+  case MipsISD::FPBrcond:          return "MipsISD::FPBrcond";
+  case MipsISD::FPCmp:             return "MipsISD::FPCmp";
+  case MipsISD::CMovFP_T:          return "MipsISD::CMovFP_T";
+  case MipsISD::CMovFP_F:          return "MipsISD::CMovFP_F";
+  case MipsISD::FPRound:           return "MipsISD::FPRound";
+  case MipsISD::MAdd:              return "MipsISD::MAdd";
+  case MipsISD::MAddu:             return "MipsISD::MAddu";
+  case MipsISD::MSub:              return "MipsISD::MSub";
+  case MipsISD::MSubu:             return "MipsISD::MSubu";
+  case MipsISD::DivRem:            return "MipsISD::DivRem";
+  case MipsISD::DivRemU:           return "MipsISD::DivRemU";
+  case MipsISD::BuildPairF64:      return "MipsISD::BuildPairF64";
+  case MipsISD::ExtractElementF64: return "MipsISD::ExtractElementF64";
+  case MipsISD::WrapperPIC:        return "MipsISD::WrapperPIC";
+  default:                         return NULL;
   }
 }
 
@@ -100,7 +107,6 @@ MipsTargetLowering(MipsTargetMachine &TM)
   setOperationAction(ISD::SELECT,             MVT::i32,   Custom);
   setOperationAction(ISD::BRCOND,             MVT::Other, Custom);
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32,   Custom);
-  setOperationAction(ISD::FP_TO_SINT,         MVT::i32,   Custom);
   setOperationAction(ISD::VASTART,            MVT::Other, Custom);
 
   setOperationAction(ISD::SDIV, MVT::i32, Expand);
@@ -125,20 +131,22 @@ MipsTargetLowering(MipsTargetMachine &TM)
   setOperationAction(ISD::SHL_PARTS,         MVT::i32,   Expand);
   setOperationAction(ISD::SRA_PARTS,         MVT::i32,   Expand);
   setOperationAction(ISD::SRL_PARTS,         MVT::i32,   Expand);
-  setOperationAction(ISD::FCOPYSIGN,         MVT::f32,   Expand);
-  setOperationAction(ISD::FCOPYSIGN,         MVT::f64,   Expand);
+  setOperationAction(ISD::FCOPYSIGN,         MVT::f32,   Custom);
+  setOperationAction(ISD::FCOPYSIGN,         MVT::f64,   Custom);
   setOperationAction(ISD::FSIN,              MVT::f32,   Expand);
   setOperationAction(ISD::FSIN,              MVT::f64,   Expand);
   setOperationAction(ISD::FCOS,              MVT::f32,   Expand);
   setOperationAction(ISD::FCOS,              MVT::f64,   Expand);
   setOperationAction(ISD::FPOWI,             MVT::f32,   Expand);
   setOperationAction(ISD::FPOW,              MVT::f32,   Expand);
+  setOperationAction(ISD::FPOW,              MVT::f64,   Expand);
   setOperationAction(ISD::FLOG,              MVT::f32,   Expand);
   setOperationAction(ISD::FLOG2,             MVT::f32,   Expand);
   setOperationAction(ISD::FLOG10,            MVT::f32,   Expand);
   setOperationAction(ISD::FEXP,              MVT::f32,   Expand);
 
-  setOperationAction(ISD::EH_LABEL,          MVT::Other, Expand);
+  setOperationAction(ISD::EXCEPTIONADDR,     MVT::i32, Expand);
+  setOperationAction(ISD::EHSELECTION,       MVT::i32, Expand);
 
   setOperationAction(ISD::VAARG,             MVT::Other, Expand);
   setOperationAction(ISD::VACOPY,            MVT::Other, Expand);
@@ -169,19 +177,19 @@ MipsTargetLowering(MipsTargetMachine &TM)
   setTargetDAGCombine(ISD::UDIVREM);
   setTargetDAGCombine(ISD::SETCC);
 
+  setMinFunctionAlignment(2);
+
   setStackPointerRegisterToSaveRestore(Mips::SP);
   computeRegisterProperties();
+
+  setExceptionPointerRegister(Mips::A0);
+  setExceptionSelectorRegister(Mips::A1);
 }
 
 MVT::SimpleValueType MipsTargetLowering::getSetCCResultType(EVT VT) const {
   return MVT::i32;
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned MipsTargetLowering::getFunctionAlignment(const Function *) const {
-  return 2;
-}
-
 // SelectMadd -
 // Transforms a subgraph in CurDAG if the following pattern is found:
 //  (addc multLo, Lo0), (adde multHi, Hi0),
@@ -381,7 +389,7 @@ static SDValue PerformDivRemCombine(SDNode *N, SelectionDAG& DAG,
   // insert MFHI
   if (N->hasAnyUseOfValue(1)) {
     SDValue CopyFromHi = DAG.getCopyFromReg(InChain, dl,
-                                               Mips::HI, MVT::i32, InGlue);
+                                            Mips::HI, MVT::i32, InGlue);
     DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), CopyFromHi);
   }
 
@@ -442,8 +450,8 @@ static SDValue CreateFPCmp(SelectionDAG& DAG, const SDValue& Op) {
   SDValue RHS = Op.getOperand(1);
   DebugLoc dl = Op.getDebugLoc();
 
-  // Assume the 3rd operand is a CondCodeSDNode. Add code to check the type of node
-  // if necessary.
+  // Assume the 3rd operand is a CondCodeSDNode. Add code to check the type of
+  // node if necessary.
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
 
   return DAG.getNode(MipsISD::FPCmp, dl, MVT::Glue, LHS, RHS,
@@ -507,13 +515,14 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const
     case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
     case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
     case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
-    case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
     case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
     case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
     case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
     case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
     case ISD::SELECT:             return LowerSELECT(Op, DAG);
     case ISD::VASTART:            return LowerVASTART(Op, DAG);
+    case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
+    case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
   }
   return SDValue();
 }
@@ -545,45 +554,16 @@ static Mips::FPBranchCode GetFPBranchCodeFromCond(Mips::CondCode CC) {
   return Mips::BRANCH_INVALID;
 }
 
-MachineBasicBlock *
-MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
-                                                MachineBasicBlock *BB) const {
+static MachineBasicBlock* ExpandCondMov(MachineInstr *MI, MachineBasicBlock *BB,
+                                        DebugLoc dl,
+                                        const MipsSubtarget* Subtarget,
+                                        const TargetInstrInfo *TII,
+                                        bool isFPCmp, unsigned Opc) {
   // There is no need to expand CMov instructions if target has
   // conditional moves.
   if (Subtarget->hasCondMov())
     return BB;
 
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  bool isFPCmp = false;
-  DebugLoc dl = MI->getDebugLoc();
-  unsigned Opc;
-
-  switch (MI->getOpcode()) {
-  default: assert(false && "Unexpected instr type to insert");
-  case Mips::MOVT:
-  case Mips::MOVT_S:
-  case Mips::MOVT_D:
-    isFPCmp = true;
-    Opc = Mips::BC1F;
-    break;
-  case Mips::MOVF:
-  case Mips::MOVF_S:
-  case Mips::MOVF_D:
-    isFPCmp = true;
-    Opc = Mips::BC1T;
-    break;
-  case Mips::MOVZ_I:
-  case Mips::MOVZ_S:
-  case Mips::MOVZ_D:
-    Opc = Mips::BNE;
-    break;
-  case Mips::MOVN_I:
-  case Mips::MOVN_S:
-  case Mips::MOVN_D:
-    Opc = Mips::BEQ;
-    break;
-  }
-
   // To "insert" a SELECT_CC instruction, we actually have to insert the
   // diamond control-flow pattern.  The incoming instruction knows the
   // destination vreg to set, the condition code register to branch on, the
@@ -622,7 +602,6 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     BuildMI(BB, dl, TII->get(Opc)).addReg(MI->getOperand(2).getReg())
       .addReg(Mips::ZERO).addMBB(sinkMBB);
 
-
   //  copy0MBB:
   //   %FalseValue = ...
   //   # fallthrough to sinkMBB
@@ -651,46 +630,572 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   return BB;
 }
 
-//===----------------------------------------------------------------------===//
-//  Misc Lower Operation implementation
-//===----------------------------------------------------------------------===//
+MachineBasicBlock *
+MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
 
-SDValue MipsTargetLowering::
-LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const
-{
-  if (!Subtarget->isMips1())
-    return Op;
+  switch (MI->getOpcode()) {
+  default:
+    assert(false && "Unexpected instr type to insert");
+    return NULL;
+  case Mips::MOVT:
+  case Mips::MOVT_S:
+  case Mips::MOVT_D:
+    return ExpandCondMov(MI, BB, dl, Subtarget, TII, true, Mips::BC1F);
+  case Mips::MOVF:
+  case Mips::MOVF_S:
+  case Mips::MOVF_D:
+    return ExpandCondMov(MI, BB, dl, Subtarget, TII, true, Mips::BC1T);
+  case Mips::MOVZ_I:
+  case Mips::MOVZ_S:
+  case Mips::MOVZ_D:
+    return ExpandCondMov(MI, BB, dl, Subtarget, TII, false, Mips::BNE);
+  case Mips::MOVN_I:
+  case Mips::MOVN_S:
+  case Mips::MOVN_D:
+    return ExpandCondMov(MI, BB, dl, Subtarget, TII, false, Mips::BEQ);
+
+  case Mips::ATOMIC_LOAD_ADD_I8:
+    return EmitAtomicBinaryPartword(MI, BB, 1, Mips::ADDu);
+  case Mips::ATOMIC_LOAD_ADD_I16:
+    return EmitAtomicBinaryPartword(MI, BB, 2, Mips::ADDu);
+  case Mips::ATOMIC_LOAD_ADD_I32:
+    return EmitAtomicBinary(MI, BB, 4, Mips::ADDu);
+
+  case Mips::ATOMIC_LOAD_AND_I8:
+    return EmitAtomicBinaryPartword(MI, BB, 1, Mips::AND);
+  case Mips::ATOMIC_LOAD_AND_I16:
+    return EmitAtomicBinaryPartword(MI, BB, 2, Mips::AND);
+  case Mips::ATOMIC_LOAD_AND_I32:
+    return EmitAtomicBinary(MI, BB, 4, Mips::AND);
+
+  case Mips::ATOMIC_LOAD_OR_I8:
+    return EmitAtomicBinaryPartword(MI, BB, 1, Mips::OR);
+  case Mips::ATOMIC_LOAD_OR_I16:
+    return EmitAtomicBinaryPartword(MI, BB, 2, Mips::OR);
+  case Mips::ATOMIC_LOAD_OR_I32:
+    return EmitAtomicBinary(MI, BB, 4, Mips::OR);
+
+  case Mips::ATOMIC_LOAD_XOR_I8:
+    return EmitAtomicBinaryPartword(MI, BB, 1, Mips::XOR);
+  case Mips::ATOMIC_LOAD_XOR_I16:
+    return EmitAtomicBinaryPartword(MI, BB, 2, Mips::XOR);
+  case Mips::ATOMIC_LOAD_XOR_I32:
+    return EmitAtomicBinary(MI, BB, 4, Mips::XOR);
+
+  case Mips::ATOMIC_LOAD_NAND_I8:
+    return EmitAtomicBinaryPartword(MI, BB, 1, 0, true);
+  case Mips::ATOMIC_LOAD_NAND_I16:
+    return EmitAtomicBinaryPartword(MI, BB, 2, 0, true);
+  case Mips::ATOMIC_LOAD_NAND_I32:
+    return EmitAtomicBinary(MI, BB, 4, 0, true);
+
+  case Mips::ATOMIC_LOAD_SUB_I8:
+    return EmitAtomicBinaryPartword(MI, BB, 1, Mips::SUBu);
+  case Mips::ATOMIC_LOAD_SUB_I16:
+    return EmitAtomicBinaryPartword(MI, BB, 2, Mips::SUBu);
+  case Mips::ATOMIC_LOAD_SUB_I32:
+    return EmitAtomicBinary(MI, BB, 4, Mips::SUBu);
+
+  case Mips::ATOMIC_SWAP_I8:
+    return EmitAtomicBinaryPartword(MI, BB, 1, 0);
+  case Mips::ATOMIC_SWAP_I16:
+    return EmitAtomicBinaryPartword(MI, BB, 2, 0);
+  case Mips::ATOMIC_SWAP_I32:
+    return EmitAtomicBinary(MI, BB, 4, 0);
+
+  case Mips::ATOMIC_CMP_SWAP_I8:
+    return EmitAtomicCmpSwapPartword(MI, BB, 1);
+  case Mips::ATOMIC_CMP_SWAP_I16:
+    return EmitAtomicCmpSwapPartword(MI, BB, 2);
+  case Mips::ATOMIC_CMP_SWAP_I32:
+    return EmitAtomicCmpSwap(MI, BB, 4);
+  }
+}
 
-  MachineFunction &MF = DAG.getMachineFunction();
-  unsigned CCReg = AddLiveIn(MF, Mips::FCR31, Mips::CCRRegisterClass);
+// This function also handles Mips::ATOMIC_SWAP_I32 (when BinOpcode == 0), and
+// Mips::ATOMIC_LOAD_NAND_I32 (when Nand == true)
+MachineBasicBlock *
+MipsTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+                                     unsigned Size, unsigned BinOpcode,
+                                     bool Nand) const {
+  assert(Size == 4 && "Unsupported size for EmitAtomicBinary.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
 
-  SDValue Chain = DAG.getEntryNode();
-  DebugLoc dl = Op.getDebugLoc();
-  SDValue Src = Op.getOperand(0);
-
-  // Set the condition register
-  SDValue CondReg = DAG.getCopyFromReg(Chain, dl, CCReg, MVT::i32);
-  CondReg = DAG.getCopyToReg(Chain, dl, Mips::AT, CondReg);
-  CondReg = DAG.getCopyFromReg(CondReg, dl, Mips::AT, MVT::i32);
-
-  SDValue Cst = DAG.getConstant(3, MVT::i32);
-  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i32, CondReg, Cst);
-  Cst = DAG.getConstant(2, MVT::i32);
-  SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i32, Or, Cst);
-
-  SDValue InFlag(0, 0);
-  CondReg = DAG.getCopyToReg(Chain, dl, Mips::FCR31, Xor, InFlag);
-
-  // Emit the round instruction and bit convert to integer
-  SDValue Trunc = DAG.getNode(MipsISD::FPRound, dl, MVT::f32,
-                              Src, CondReg.getValue(1));
-  SDValue BitCvt = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Trunc);
-  return BitCvt;
+  unsigned Dest = MI->getOperand(0).getReg();
+  unsigned Ptr = MI->getOperand(1).getReg();
+  unsigned Incr = MI->getOperand(2).getReg();
+
+  unsigned Oldval = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp1 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp2 = RegInfo.createVirtualRegister(RC);
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = BB;
+  ++It;
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  //  thisMBB:
+  //    ...
+  //    sw incr, fi(sp)           // store incr to stack (when BinOpcode == 0)
+  //    fallthrough --> loopMBB
+
+  // Note: for atomic.swap (when BinOpcode == 0), storing incr to stack before
+  // the loop and then loading it from stack in block loopMBB is necessary to
+  // prevent MachineLICM pass to hoist "or" instruction out of the block
+  // loopMBB.
+
+  int fi = 0;
+  if (BinOpcode == 0 && !Nand) {
+    // Get or create a temporary stack location.
+    MipsFunctionInfo *MipsFI = MF->getInfo<MipsFunctionInfo>();
+    fi = MipsFI->getAtomicFrameIndex();
+    if (fi == -1) {
+      fi = MF->getFrameInfo()->CreateStackObject(Size, Size, false);
+      MipsFI->setAtomicFrameIndex(fi);
+    }
+
+    BuildMI(BB, dl, TII->get(Mips::SW))
+        .addReg(Incr).addImm(0).addFrameIndex(fi);
+  }
+  BB->addSuccessor(loopMBB);
+
+  //  loopMBB:
+  //    ll oldval, 0(ptr)
+  //    or dest, $0, oldval
+  //    <binop> tmp1, oldval, incr
+  //    sc tmp1, 0(ptr)
+  //    beq tmp1, $0, loopMBB
+  BB = loopMBB;
+  BuildMI(BB, dl, TII->get(Mips::LL), Oldval).addImm(0).addReg(Ptr);
+  BuildMI(BB, dl, TII->get(Mips::OR), Dest).addReg(Mips::ZERO).addReg(Oldval);
+  if (Nand) {
+    //  and tmp2, oldval, incr
+    //  nor tmp1, $0, tmp2
+    BuildMI(BB, dl, TII->get(Mips::AND), Tmp2).addReg(Oldval).addReg(Incr);
+    BuildMI(BB, dl, TII->get(Mips::NOR), Tmp1).addReg(Mips::ZERO).addReg(Tmp2);
+  } else if (BinOpcode) {
+    //  <binop> tmp1, oldval, incr
+    BuildMI(BB, dl, TII->get(BinOpcode), Tmp1).addReg(Oldval).addReg(Incr);
+  } else {
+    //  lw tmp2, fi(sp)              // load incr from stack
+    //  or tmp1, $zero, tmp2
+    BuildMI(BB, dl, TII->get(Mips::LW), Tmp2).addImm(0).addFrameIndex(fi);;
+    BuildMI(BB, dl, TII->get(Mips::OR), Tmp1).addReg(Mips::ZERO).addReg(Tmp2);
+  }
+  BuildMI(BB, dl, TII->get(Mips::SC), Tmp1).addReg(Tmp1).addImm(0).addReg(Ptr);
+  BuildMI(BB, dl, TII->get(Mips::BEQ))
+    .addReg(Tmp1).addReg(Mips::ZERO).addMBB(loopMBB);
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
+
+MachineBasicBlock *
+MipsTargetLowering::EmitAtomicBinaryPartword(MachineInstr *MI,
+                                             MachineBasicBlock *BB,
+                                             unsigned Size, unsigned BinOpcode,
+                                             bool Nand) const {
+  assert((Size == 1 || Size == 2) &&
+      "Unsupported size for EmitAtomicBinaryPartial.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+
+  unsigned Dest = MI->getOperand(0).getReg();
+  unsigned Ptr = MI->getOperand(1).getReg();
+  unsigned Incr = MI->getOperand(2).getReg();
+
+  unsigned Addr = RegInfo.createVirtualRegister(RC);
+  unsigned Shift = RegInfo.createVirtualRegister(RC);
+  unsigned Mask = RegInfo.createVirtualRegister(RC);
+  unsigned Mask2 = RegInfo.createVirtualRegister(RC);
+  unsigned Newval = RegInfo.createVirtualRegister(RC);
+  unsigned Oldval = RegInfo.createVirtualRegister(RC);
+  unsigned Incr2 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp1 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp2 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp3 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp4 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp5 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp6 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp7 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp8 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp9 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp10 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp11 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp12 = RegInfo.createVirtualRegister(RC);
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = BB;
+  ++It;
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  //  thisMBB:
+  //    addiu   tmp1,$0,-4                # 0xfffffffc
+  //    and     addr,ptr,tmp1
+  //    andi    tmp2,ptr,3
+  //    sll     shift,tmp2,3
+  //    ori     tmp3,$0,255               # 0xff
+  //    sll     mask,tmp3,shift
+  //    nor     mask2,$0,mask
+  //    andi    tmp4,incr,255
+  //    sll     incr2,tmp4,shift
+  //    sw      incr2, fi(sp)      // store incr2 to stack (when BinOpcode == 0)
+
+  // Note: for atomic.swap (when BinOpcode == 0), storing incr2 to stack before
+  // the loop and then loading it from stack in block loopMBB is necessary to
+  // prevent MachineLICM pass to hoist "or" instruction out of the block
+  // loopMBB.
+
+  int64_t MaskImm = (Size == 1) ? 255 : 65535;
+  BuildMI(BB, dl, TII->get(Mips::ADDiu), Tmp1).addReg(Mips::ZERO).addImm(-4);
+  BuildMI(BB, dl, TII->get(Mips::AND), Addr).addReg(Ptr).addReg(Tmp1);
+  BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp2).addReg(Ptr).addImm(3);
+  BuildMI(BB, dl, TII->get(Mips::SLL), Shift).addReg(Tmp2).addImm(3);
+  BuildMI(BB, dl, TII->get(Mips::ORi), Tmp3).addReg(Mips::ZERO).addImm(MaskImm);
+  BuildMI(BB, dl, TII->get(Mips::SLL), Mask).addReg(Tmp3).addReg(Shift);
+  BuildMI(BB, dl, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
+  if (BinOpcode != Mips::SUBu) {
+    BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp4).addReg(Incr).addImm(MaskImm);
+    BuildMI(BB, dl, TII->get(Mips::SLL), Incr2).addReg(Tmp4).addReg(Shift);
+  } else {
+    BuildMI(BB, dl, TII->get(Mips::SUBu), Tmp4).addReg(Mips::ZERO).addReg(Incr);
+    BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp5).addReg(Tmp4).addImm(MaskImm);
+    BuildMI(BB, dl, TII->get(Mips::SLL), Incr2).addReg(Tmp5).addReg(Shift);
+  }
+
+  int fi = 0;
+  if (BinOpcode == 0 && !Nand) {
+    // Get or create a temporary stack location.
+    MipsFunctionInfo *MipsFI = MF->getInfo<MipsFunctionInfo>();
+    fi = MipsFI->getAtomicFrameIndex();
+    if (fi == -1) {
+      fi = MF->getFrameInfo()->CreateStackObject(Size, Size, false);
+      MipsFI->setAtomicFrameIndex(fi);
+    }
+
+    BuildMI(BB, dl, TII->get(Mips::SW))
+        .addReg(Incr2).addImm(0).addFrameIndex(fi);
+  }
+  BB->addSuccessor(loopMBB);
+
+  // loopMBB:
+  //   ll      oldval,0(addr)
+  //   binop   tmp7,oldval,incr2
+  //   and     newval,tmp7,mask
+  //   and     tmp8,oldval,mask2
+  //   or      tmp9,tmp8,newval
+  //   sc      tmp9,0(addr)
+  //   beq     tmp9,$0,loopMBB
+  BB = loopMBB;
+  BuildMI(BB, dl, TII->get(Mips::LL), Oldval).addImm(0).addReg(Addr);
+  if (Nand) {
+    //  and tmp6, oldval, incr2
+    //  nor tmp7, $0, tmp6
+    BuildMI(BB, dl, TII->get(Mips::AND), Tmp6).addReg(Oldval).addReg(Incr2);
+    BuildMI(BB, dl, TII->get(Mips::NOR), Tmp7).addReg(Mips::ZERO).addReg(Tmp6);
+  } else if (BinOpcode == Mips::SUBu) {
+    //  addu tmp7, oldval, incr2
+    BuildMI(BB, dl, TII->get(Mips::ADDu), Tmp7).addReg(Oldval).addReg(Incr2);
+  } else if (BinOpcode) {
+    //  <binop> tmp7, oldval, incr2
+    BuildMI(BB, dl, TII->get(BinOpcode), Tmp7).addReg(Oldval).addReg(Incr2);
+  } else {
+    //  lw tmp6, fi(sp)              // load incr2 from stack
+    //  or tmp7, $zero, tmp6
+    BuildMI(BB, dl, TII->get(Mips::LW), Tmp6).addImm(0).addFrameIndex(fi);;
+    BuildMI(BB, dl, TII->get(Mips::OR), Tmp7).addReg(Mips::ZERO).addReg(Tmp6);
+  }
+  BuildMI(BB, dl, TII->get(Mips::AND), Newval).addReg(Tmp7).addReg(Mask);
+  BuildMI(BB, dl, TII->get(Mips::AND), Tmp8).addReg(Oldval).addReg(Mask2);
+  BuildMI(BB, dl, TII->get(Mips::OR), Tmp9).addReg(Tmp8).addReg(Newval);
+  BuildMI(BB, dl, TII->get(Mips::SC), Tmp9).addReg(Tmp9).addImm(0).addReg(Addr);
+  BuildMI(BB, dl, TII->get(Mips::BEQ))
+      .addReg(Tmp9).addReg(Mips::ZERO).addMBB(loopMBB);
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //    and     tmp10,oldval,mask
+  //    srl     tmp11,tmp10,shift
+  //    sll     tmp12,tmp11,24
+  //    sra     dest,tmp12,24
+  BB = exitMBB;
+  int64_t ShiftImm = (Size == 1) ? 24 : 16;
+  // reverse order
+  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SRA), Dest)
+      .addReg(Tmp12).addImm(ShiftImm);
+  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SLL), Tmp12)
+      .addReg(Tmp11).addImm(ShiftImm);
+  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SRL), Tmp11)
+      .addReg(Tmp10).addReg(Shift);
+  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::AND), Tmp10)
+    .addReg(Oldval).addReg(Mask);
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
+
+MachineBasicBlock *
+MipsTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
+                                      MachineBasicBlock *BB,
+                                      unsigned Size) const {
+  assert(Size == 4 && "Unsupported size for EmitAtomicCmpSwap.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+
+  unsigned Dest    = MI->getOperand(0).getReg();
+  unsigned Ptr     = MI->getOperand(1).getReg();
+  unsigned Oldval  = MI->getOperand(2).getReg();
+  unsigned Newval  = MI->getOperand(3).getReg();
+
+  unsigned Tmp1 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp2 = RegInfo.createVirtualRegister(RC);
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = BB;
+  ++It;
+  MF->insert(It, loop1MBB);
+  MF->insert(It, loop2MBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Get or create a temporary stack location.
+  MipsFunctionInfo *MipsFI = MF->getInfo<MipsFunctionInfo>();
+  int fi = MipsFI->getAtomicFrameIndex();
+  if (fi == -1) {
+    fi = MF->getFrameInfo()->CreateStackObject(Size, Size, false);
+    MipsFI->setAtomicFrameIndex(fi);
+  }
+
+  //  thisMBB:
+  //    ...
+  //    sw newval, fi(sp)           // store newval to stack
+  //    fallthrough --> loop1MBB
+
+  // Note: storing newval to stack before the loop and then loading it from
+  // stack in block loop2MBB is necessary to prevent MachineLICM pass to
+  // hoist "or" instruction out of the block loop2MBB.
+
+  BuildMI(BB, dl, TII->get(Mips::SW))
+      .addReg(Newval).addImm(0).addFrameIndex(fi);
+  BB->addSuccessor(loop1MBB);
+
+  // loop1MBB:
+  //   ll dest, 0(ptr)
+  //   bne dest, oldval, exitMBB
+  BB = loop1MBB;
+  BuildMI(BB, dl, TII->get(Mips::LL), Dest).addImm(0).addReg(Ptr);
+  BuildMI(BB, dl, TII->get(Mips::BNE))
+    .addReg(Dest).addReg(Oldval).addMBB(exitMBB);
+  BB->addSuccessor(exitMBB);
+  BB->addSuccessor(loop2MBB);
+
+  // loop2MBB:
+  //   lw tmp2, fi(sp)              // load newval from stack
+  //   or tmp1, $0, tmp2
+  //   sc tmp1, 0(ptr)
+  //   beq tmp1, $0, loop1MBB
+  BB = loop2MBB;
+  BuildMI(BB, dl, TII->get(Mips::LW), Tmp2).addImm(0).addFrameIndex(fi);;
+  BuildMI(BB, dl, TII->get(Mips::OR), Tmp1).addReg(Mips::ZERO).addReg(Tmp2);
+  BuildMI(BB, dl, TII->get(Mips::SC), Tmp1).addReg(Tmp1).addImm(0).addReg(Ptr);
+  BuildMI(BB, dl, TII->get(Mips::BEQ))
+    .addReg(Tmp1).addReg(Mips::ZERO).addMBB(loop1MBB);
+  BB->addSuccessor(loop1MBB);
+  BB->addSuccessor(exitMBB);
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
 }
 
+MachineBasicBlock *
+MipsTargetLowering::EmitAtomicCmpSwapPartword(MachineInstr *MI,
+                                              MachineBasicBlock *BB,
+                                              unsigned Size) const {
+  assert((Size == 1 || Size == 2) &&
+      "Unsupported size for EmitAtomicCmpSwapPartial.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+
+  unsigned Dest    = MI->getOperand(0).getReg();
+  unsigned Ptr     = MI->getOperand(1).getReg();
+  unsigned Oldval  = MI->getOperand(2).getReg();
+  unsigned Newval  = MI->getOperand(3).getReg();
+
+  unsigned Addr = RegInfo.createVirtualRegister(RC);
+  unsigned Shift = RegInfo.createVirtualRegister(RC);
+  unsigned Mask = RegInfo.createVirtualRegister(RC);
+  unsigned Mask2 = RegInfo.createVirtualRegister(RC);
+  unsigned Oldval2 = RegInfo.createVirtualRegister(RC);
+  unsigned Oldval3 = RegInfo.createVirtualRegister(RC);
+  unsigned Oldval4 = RegInfo.createVirtualRegister(RC);
+  unsigned Newval2 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp1 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp2 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp3 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp4 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp5 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp6 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp7 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp8 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp9 = RegInfo.createVirtualRegister(RC);
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = BB;
+  ++It;
+  MF->insert(It, loop1MBB);
+  MF->insert(It, loop2MBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  //  thisMBB:
+  //    addiu   tmp1,$0,-4                # 0xfffffffc
+  //    and     addr,ptr,tmp1
+  //    andi    tmp2,ptr,3
+  //    sll     shift,tmp2,3
+  //    ori     tmp3,$0,255               # 0xff
+  //    sll     mask,tmp3,shift
+  //    nor     mask2,$0,mask
+  //    andi    tmp4,oldval,255
+  //    sll     oldval2,tmp4,shift
+  //    andi    tmp5,newval,255
+  //    sll     newval2,tmp5,shift
+  int64_t MaskImm = (Size == 1) ? 255 : 65535;
+  BuildMI(BB, dl, TII->get(Mips::ADDiu), Tmp1).addReg(Mips::ZERO).addImm(-4);
+  BuildMI(BB, dl, TII->get(Mips::AND), Addr).addReg(Ptr).addReg(Tmp1);
+  BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp2).addReg(Ptr).addImm(3);
+  BuildMI(BB, dl, TII->get(Mips::SLL), Shift).addReg(Tmp2).addImm(3);
+  BuildMI(BB, dl, TII->get(Mips::ORi), Tmp3).addReg(Mips::ZERO).addImm(MaskImm);
+  BuildMI(BB, dl, TII->get(Mips::SLL), Mask).addReg(Tmp3).addReg(Shift);
+  BuildMI(BB, dl, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
+  BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp4).addReg(Oldval).addImm(MaskImm);
+  BuildMI(BB, dl, TII->get(Mips::SLL), Oldval2).addReg(Tmp4).addReg(Shift);
+  BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp5).addReg(Newval).addImm(MaskImm);
+  BuildMI(BB, dl, TII->get(Mips::SLL), Newval2).addReg(Tmp5).addReg(Shift);
+  BB->addSuccessor(loop1MBB);
+
+  //  loop1MBB:
+  //    ll      oldval3,0(addr)
+  //    and     oldval4,oldval3,mask
+  //    bne     oldval4,oldval2,exitMBB
+  BB = loop1MBB;
+  BuildMI(BB, dl, TII->get(Mips::LL), Oldval3).addImm(0).addReg(Addr);
+  BuildMI(BB, dl, TII->get(Mips::AND), Oldval4).addReg(Oldval3).addReg(Mask);
+  BuildMI(BB, dl, TII->get(Mips::BNE))
+      .addReg(Oldval4).addReg(Oldval2).addMBB(exitMBB);
+  BB->addSuccessor(exitMBB);
+  BB->addSuccessor(loop2MBB);
+
+  //  loop2MBB:
+  //    and     tmp6,oldval3,mask2
+  //    or      tmp7,tmp6,newval2
+  //    sc      tmp7,0(addr)
+  //    beq     tmp7,$0,loop1MBB
+  BB = loop2MBB;
+  BuildMI(BB, dl, TII->get(Mips::AND), Tmp6).addReg(Oldval3).addReg(Mask2);
+  BuildMI(BB, dl, TII->get(Mips::OR), Tmp7).addReg(Tmp6).addReg(Newval2);
+  BuildMI(BB, dl, TII->get(Mips::SC), Tmp7)
+      .addReg(Tmp7).addImm(0).addReg(Addr);
+  BuildMI(BB, dl, TII->get(Mips::BEQ))
+      .addReg(Tmp7).addReg(Mips::ZERO).addMBB(loop1MBB);
+  BB->addSuccessor(loop1MBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //    srl     tmp8,oldval4,shift
+  //    sll     tmp9,tmp8,24
+  //    sra     dest,tmp9,24
+  BB = exitMBB;
+  int64_t ShiftImm = (Size == 1) ? 24 : 16;
+  // reverse order
+  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SRA), Dest)
+      .addReg(Tmp9).addImm(ShiftImm);
+  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SLL), Tmp9)
+      .addReg(Tmp8).addImm(ShiftImm);
+  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SRL), Tmp8)
+      .addReg(Oldval4).addReg(Shift);
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
+
+//===----------------------------------------------------------------------===//
+//  Misc Lower Operation implementation
+//===----------------------------------------------------------------------===//
 SDValue MipsTargetLowering::
 LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
 {
+  unsigned StackAlignment =
+    getTargetMachine().getFrameLowering()->getStackAlignment();
+  assert(StackAlignment >=
+         cast<ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue() &&
+         "Cannot lower if the alignment of the allocated space is larger than \
+          that of the stack.");
+
   SDValue Chain = Op.getOperand(0);
   SDValue Size = Op.getOperand(1);
   DebugLoc dl = Op.getDebugLoc();
@@ -704,11 +1209,25 @@ LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
 
   // The Sub result contains the new stack start address, so it
   // must be placed in the stack pointer register.
-  Chain = DAG.getCopyToReg(StackPointer.getValue(1), dl, Mips::SP, Sub);
+  Chain = DAG.getCopyToReg(StackPointer.getValue(1), dl, Mips::SP, Sub,
+                           SDValue());
+  // Retrieve updated $sp. There is a glue input to prevent instructions that
+  // clobber $sp from being inserted between copytoreg and copyfromreg.
+  SDValue NewSP = DAG.getCopyFromReg(Chain, dl, Mips::SP, MVT::i32,
+                                     Chain.getValue(1));
+
+  // The stack space reserved by alloca is located right above the argument
+  // area. It is aligned on a boundary that is a multiple of StackAlignment.
+  MachineFunction &MF = DAG.getMachineFunction();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+  unsigned SPOffset = (MipsFI->getMaxCallFrameSize() + StackAlignment - 1) /
+                      StackAlignment * StackAlignment;
+  SDValue AllocPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, NewSP,
+                                 DAG.getConstant(SPOffset, MVT::i32));
 
   // This node always has two return values: a new stack pointer
   // value and a chain
-  SDValue Ops[2] = { Sub, Chain };
+  SDValue Ops[2] = { AllocPtr, NewSP.getValue(1) };
   return DAG.getMergeValues(Ops, 2, dl);
 }
 
@@ -723,7 +1242,7 @@ LowerBRCOND(SDValue Op, SelectionDAG &DAG) const
 
   SDValue CondRes = CreateFPCmp(DAG, Op.getOperand(1));
 
-  // Return if flag is not set by a floating point comparision.
+  // Return if flag is not set by a floating point comparison.
   if (CondRes.getOpcode() != MipsISD::FPCmp)
     return Op;
 
@@ -741,7 +1260,7 @@ LowerSELECT(SDValue Op, SelectionDAG &DAG) const
 {
   SDValue Cond = CreateFPCmp(DAG, Op.getOperand(0));
 
-  // Return if flag is not set by a floating point comparision.
+  // Return if flag is not set by a floating point comparison.
   if (Cond.getOpcode() != MipsISD::FPCmp)
     return Op;
 
@@ -776,54 +1295,111 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op,
     SDValue HiPart = DAG.getNode(MipsISD::Hi, dl, VTs, &GAHi, 1);
     SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GALo);
     return DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo);
-  } else {
-    SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
-                                            MipsII::MO_GOT);
-    SDValue ResNode = DAG.getLoad(MVT::i32, dl,
-                                  DAG.getEntryNode(), GA, MachinePointerInfo(),
-                                  false, false, 0);
-    // On functions and global targets not internal linked only
-    // a load from got/GP is necessary for PIC to work.
-    if (!GV->hasInternalLinkage() &&
-        (!GV->hasLocalLinkage() || isa<Function>(GV)))
-      return ResNode;
-    SDValue GALo = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
-                                              MipsII::MO_ABS_LO);
-    SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GALo);
-    return DAG.getNode(ISD::ADD, dl, MVT::i32, ResNode, Lo);
   }
 
-  llvm_unreachable("Dont know how to handle GlobalAddress");
-  return SDValue(0,0);
+  SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
+                                          MipsII::MO_GOT);
+  GA = DAG.getNode(MipsISD::WrapperPIC, dl, MVT::i32, GA);
+  SDValue ResNode = DAG.getLoad(MVT::i32, dl,
+                                DAG.getEntryNode(), GA, MachinePointerInfo(),
+                                false, false, 0);
+  // On functions and global targets not internal linked only
+  // a load from got/GP is necessary for PIC to work.
+  if (!GV->hasInternalLinkage() &&
+      (!GV->hasLocalLinkage() || isa<Function>(GV)))
+    return ResNode;
+  SDValue GALo = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
+                                            MipsII::MO_ABS_LO);
+  SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GALo);
+  return DAG.getNode(ISD::ADD, dl, MVT::i32, ResNode, Lo);
 }
 
 SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op,
                                               SelectionDAG &DAG) const {
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  // FIXME there isn't actually debug info here
+  DebugLoc dl = Op.getDebugLoc();
+
   if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
-    assert(false && "implement LowerBlockAddress for -static");
-    return SDValue(0, 0);
-  }
-  else {
-    // FIXME there isn't actually debug info here
-    DebugLoc dl = Op.getDebugLoc();
-    const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
-    SDValue BAGOTOffset = DAG.getBlockAddress(BA, MVT::i32, true,
-                                              MipsII::MO_GOT);
-    SDValue BALOOffset = DAG.getBlockAddress(BA, MVT::i32, true,
-                                             MipsII::MO_ABS_LO);
-    SDValue Load = DAG.getLoad(MVT::i32, dl,
-                               DAG.getEntryNode(), BAGOTOffset,
-                               MachinePointerInfo(), false, false, 0);
-    SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, BALOOffset);
-    return DAG.getNode(ISD::ADD, dl, MVT::i32, Load, Lo);
+    // %hi/%lo relocation
+    SDValue BAHi = DAG.getBlockAddress(BA, MVT::i32, true,
+                                       MipsII::MO_ABS_HI);
+    SDValue BALo = DAG.getBlockAddress(BA, MVT::i32, true,
+                                       MipsII::MO_ABS_LO);
+    SDValue Hi = DAG.getNode(MipsISD::Hi, dl, MVT::i32, BAHi);
+    SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, BALo);
+    return DAG.getNode(ISD::ADD, dl, MVT::i32, Hi, Lo);
   }
+
+  SDValue BAGOTOffset = DAG.getBlockAddress(BA, MVT::i32, true,
+                                            MipsII::MO_GOT);
+  BAGOTOffset = DAG.getNode(MipsISD::WrapperPIC, dl, MVT::i32, BAGOTOffset);
+  SDValue BALOOffset = DAG.getBlockAddress(BA, MVT::i32, true,
+                                           MipsII::MO_ABS_LO);
+  SDValue Load = DAG.getLoad(MVT::i32, dl,
+                             DAG.getEntryNode(), BAGOTOffset,
+                             MachinePointerInfo(), false, false, 0);
+  SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, BALOOffset);
+  return DAG.getNode(ISD::ADD, dl, MVT::i32, Load, Lo);
 }
 
 SDValue MipsTargetLowering::
 LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
 {
-  llvm_unreachable("TLS not implemented for MIPS.");
-  return SDValue(); // Not reached
+  // If the relocation model is PIC, use the General Dynamic TLS Model,
+  // otherwise use the Initial Exec or Local Exec TLS Model.
+  // TODO: implement Local Dynamic TLS model
+
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  DebugLoc dl = GA->getDebugLoc();
+  const GlobalValue *GV = GA->getGlobal();
+  EVT PtrVT = getPointerTy();
+
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+    // General Dynamic TLS Model
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32,
+                                                 0, MipsII::MO_TLSGD);
+    SDValue Tlsgd = DAG.getNode(MipsISD::TlsGd, dl, MVT::i32, TGA);
+    SDValue GP = DAG.getRegister(Mips::GP, MVT::i32);
+    SDValue Argument = DAG.getNode(ISD::ADD, dl, MVT::i32, GP, Tlsgd);
+
+    ArgListTy Args;
+    ArgListEntry Entry;
+    Entry.Node = Argument;
+    Entry.Ty = (const Type *) Type::getInt32Ty(*DAG.getContext());
+    Args.push_back(Entry);
+    std::pair<SDValue, SDValue> CallResult =
+        LowerCallTo(DAG.getEntryNode(),
+                 (const Type *) Type::getInt32Ty(*DAG.getContext()),
+                 false, false, false, false,
+                 0, CallingConv::C, false, true,
+                 DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl);
+
+    return CallResult.first;
+  } else {
+    SDValue Offset;
+    if (GV->isDeclaration()) {
+      // Initial Exec TLS Model
+      SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
+                                              MipsII::MO_GOTTPREL);
+      Offset = DAG.getLoad(MVT::i32, dl,
+                                  DAG.getEntryNode(), TGA, MachinePointerInfo(),
+                                  false, false, 0);
+    } else {
+      // Local Exec TLS Model
+      SDVTList VTs = DAG.getVTList(MVT::i32);
+      SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
+                                              MipsII::MO_TPREL_HI);
+      SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
+                                              MipsII::MO_TPREL_LO);
+      SDValue Hi = DAG.getNode(MipsISD::TprelHi, dl, VTs, &TGAHi, 1);
+      SDValue Lo = DAG.getNode(MipsISD::TprelLo, dl, MVT::i32, TGALo);
+      Offset = DAG.getNode(ISD::ADD, dl, MVT::i32, Hi, Lo);
+    }
+
+    SDValue ThreadPointer = DAG.getNode(MipsISD::ThreadPointer, dl, PtrVT);
+    return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+  }
 }
 
 SDValue MipsTargetLowering::
@@ -844,12 +1420,15 @@ LowerJumpTable(SDValue Op, SelectionDAG &DAG) const
   if (!IsPIC) {
     SDValue Ops[] = { JTI };
     HiPart = DAG.getNode(MipsISD::Hi, dl, DAG.getVTList(MVT::i32), Ops, 1);
-  } else // Emit Load from Global Pointer
+  } else {// Emit Load from Global Pointer
+    JTI = DAG.getNode(MipsISD::WrapperPIC, dl, MVT::i32, JTI);
     HiPart = DAG.getLoad(MVT::i32, dl, DAG.getEntryNode(), JTI,
                          MachinePointerInfo(),
                          false, false, 0);
+  }
 
-  SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MipsII::MO_ABS_LO);
+  SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
+                                         MipsII::MO_ABS_LO);
   SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, JTILo);
   ResNode = DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo);
 
@@ -867,7 +1446,7 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG) const
 
   // gp_rel relocation
   // FIXME: we should reference the constant pool using small data sections,
-  // but the asm printer currently doens't support this feature without
+  // but the asm printer currently doesn't support this feature without
   // hacking it. This feature should come soon so we can uncomment the
   // stuff below.
   //if (IsInSmallSection(C->getType())) {
@@ -886,6 +1465,7 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG) const
   } else {
     SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(),
                                            N->getOffset(), MipsII::MO_GOT);
+    CP = DAG.getNode(MipsISD::WrapperPIC, dl, MVT::i32, CP);
     SDValue Load = DAG.getLoad(MVT::i32, dl, DAG.getEntryNode(),
                                CP, MachinePointerInfo::getConstantPool(),
                                false, false, 0);
@@ -914,6 +1494,74 @@ SDValue MipsTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
                       false, false, 0);
 }
 
+static SDValue LowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG) {
+  // FIXME: Use ext/ins instructions if target architecture is Mips32r2.
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op.getOperand(0));
+  SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op.getOperand(1));
+  SDValue And0 = DAG.getNode(ISD::AND, dl, MVT::i32, Op0,
+                             DAG.getConstant(0x7fffffff, MVT::i32));
+  SDValue And1 = DAG.getNode(ISD::AND, dl, MVT::i32, Op1,
+                             DAG.getConstant(0x80000000, MVT::i32));
+  SDValue Result = DAG.getNode(ISD::OR, dl, MVT::i32, And0, And1);
+  return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Result);
+}
+
+static SDValue LowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG, bool isLittle) {
+  // FIXME:
+  //  Use ext/ins instructions if target architecture is Mips32r2.
+  //  Eliminate redundant mfc1 and mtc1 instructions.
+  unsigned LoIdx = 0, HiIdx = 1;
+
+  if (!isLittle)
+    std::swap(LoIdx, HiIdx);
+
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Word0 = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32,
+                              Op.getOperand(0),
+                              DAG.getConstant(LoIdx, MVT::i32));
+  SDValue Hi0 = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32,
+                            Op.getOperand(0), DAG.getConstant(HiIdx, MVT::i32));
+  SDValue Hi1 = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32,
+                            Op.getOperand(1), DAG.getConstant(HiIdx, MVT::i32));
+  SDValue And0 = DAG.getNode(ISD::AND, dl, MVT::i32, Hi0,
+                             DAG.getConstant(0x7fffffff, MVT::i32));
+  SDValue And1 = DAG.getNode(ISD::AND, dl, MVT::i32, Hi1,
+                             DAG.getConstant(0x80000000, MVT::i32));
+  SDValue Word1 = DAG.getNode(ISD::OR, dl, MVT::i32, And0, And1);
+
+  if (!isLittle)
+    std::swap(Word0, Word1);
+
+  return DAG.getNode(MipsISD::BuildPairF64, dl, MVT::f64, Word0, Word1);
+}
+
+SDValue MipsTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
+  const {
+  EVT Ty = Op.getValueType();
+
+  assert(Ty == MVT::f32 || Ty == MVT::f64);
+
+  if (Ty == MVT::f32)
+    return LowerFCOPYSIGN32(Op, DAG);
+  else
+    return LowerFCOPYSIGN64(Op, DAG, Subtarget->isLittle());
+}
+
+SDValue MipsTargetLowering::
+LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+  // check the depth
+  assert((cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0) &&
+         "Frame address can only be determined for current frame.");
+
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, Mips::FP, VT);
+  return FrameAddr;
+}
+
 //===----------------------------------------------------------------------===//
 //                      Calling Convention Implementation
 //===----------------------------------------------------------------------===//
@@ -931,6 +1579,8 @@ SDValue MipsTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
 //       yet to hold an argument. Otherwise, use A2, A3 and stack. If A1 is
 //       not used, it must be shadowed. If only A3 is avaiable, shadow it and
 //       go to stack.
+//
+//  For vararg functions, all arguments are passed in A0, A1, A2, A3 and stack.
 //===----------------------------------------------------------------------===//
 
 static bool CC_MipsO32(unsigned ValNo, MVT ValVT,
@@ -949,90 +1599,17 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT,
       Mips::D6, Mips::D7
   };
 
-  unsigned Reg = 0;
-  static bool IntRegUsed = false;
-
-  // This must be the first arg of the call if no regs have been allocated.
-  // Initialize IntRegUsed in that case.
-  if (IntRegs[State.getFirstUnallocated(IntRegs, IntRegsSize)] == Mips::A0 &&
-      F32Regs[State.getFirstUnallocated(F32Regs, FloatRegsSize)] == Mips::F12 &&
-      F64Regs[State.getFirstUnallocated(F64Regs, FloatRegsSize)] == Mips::D6)
-    IntRegUsed = false;
-
-  // Promote i8 and i16
-  if (LocVT == MVT::i8 || LocVT == MVT::i16) {
-    LocVT = MVT::i32;
-    if (ArgFlags.isSExt())
-      LocInfo = CCValAssign::SExt;
-    else if (ArgFlags.isZExt())
-      LocInfo = CCValAssign::ZExt;
-    else
-      LocInfo = CCValAssign::AExt;
+  // ByVal Args
+  if (ArgFlags.isByVal()) {
+    State.HandleByVal(ValNo, ValVT, LocVT, LocInfo,
+                      1 /*MinSize*/, 4 /*MinAlign*/, ArgFlags);
+    unsigned NextReg = (State.getNextStackOffset() + 3) / 4;
+    for (unsigned r = State.getFirstUnallocated(IntRegs, IntRegsSize);
+         r < std::min(IntRegsSize, NextReg); ++r)
+      State.AllocateReg(IntRegs[r]);
+    return false;
   }
 
-  if (ValVT == MVT::i32) {
-    Reg = State.AllocateReg(IntRegs, IntRegsSize);
-    IntRegUsed = true;
-  } else if (ValVT == MVT::f32) {
-    // An int reg has to be marked allocated regardless of whether or not
-    // IntRegUsed is true.
-    Reg = State.AllocateReg(IntRegs, IntRegsSize);
-
-    if (IntRegUsed) {
-      if (Reg) // Int reg is available
-        LocVT = MVT::i32;
-    } else {
-      unsigned FReg = State.AllocateReg(F32Regs, FloatRegsSize);
-      if (FReg) // F32 reg is available
-        Reg = FReg;
-      else if (Reg) // No F32 regs are available, but an int reg is available.
-        LocVT = MVT::i32;
-    }
-  } else if (ValVT == MVT::f64) {
-    // Int regs have to be marked allocated regardless of whether or not
-    // IntRegUsed is true.
-    Reg = State.AllocateReg(IntRegs, IntRegsSize);
-    if (Reg == Mips::A1)
-      Reg = State.AllocateReg(IntRegs, IntRegsSize);
-    else if (Reg == Mips::A3)
-      Reg = 0;
-    State.AllocateReg(IntRegs, IntRegsSize);
-
-    // At this point, Reg is A0, A2 or 0, and all the unavailable integer regs
-    // are marked as allocated.
-    if (IntRegUsed) {
-      if (Reg)// if int reg is available
-        LocVT = MVT::i32;
-    } else {
-      unsigned FReg = State.AllocateReg(F64Regs, FloatRegsSize);
-      if (FReg) // F64 reg is available.
-        Reg = FReg;
-      else if (Reg) // No F64 regs are available, but an int reg is available.
-        LocVT = MVT::i32;
-    }
-  } else
-    assert(false && "cannot handle this ValVT");
-
-  if (!Reg) {
-    unsigned SizeInBytes = ValVT.getSizeInBits() >> 3;
-    unsigned Offset = State.AllocateStack(SizeInBytes, SizeInBytes);
-    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
-  } else
-    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
-
-  return false; // CC must always match
-}
-
-static bool CC_MipsO32_VarArgs(unsigned ValNo, MVT ValVT,
-                       MVT LocVT, CCValAssign::LocInfo LocInfo,
-                       ISD::ArgFlagsTy ArgFlags, CCState &State) {
-
-  static const unsigned IntRegsSize=4;
-
-  static const unsigned IntRegs[] = {
-      Mips::A0, Mips::A1, Mips::A2, Mips::A3
-  };
-
   // Promote i8 and i16
   if (LocVT == MVT::i8 || LocVT == MVT::i16) {
     LocVT = MVT::i32;
@@ -1046,23 +1623,52 @@ static bool CC_MipsO32_VarArgs(unsigned ValNo, MVT ValVT,
 
   unsigned Reg;
 
-  if (ValVT == MVT::i32 || ValVT == MVT::f32) {
+  // f32 and f64 are allocated in A0, A1, A2, A3 when either of the following
+  // is true: function is vararg, argument is 3rd or higher, there is previous
+  // argument which is not f32 or f64.
+  bool AllocateFloatsInIntReg = State.isVarArg() || ValNo > 1
+      || State.getFirstUnallocated(F32Regs, FloatRegsSize) != ValNo;
+  unsigned OrigAlign = ArgFlags.getOrigAlign();
+  bool isI64 = (ValVT == MVT::i32 && OrigAlign == 8);
+
+  if (ValVT == MVT::i32 || (ValVT == MVT::f32 && AllocateFloatsInIntReg)) {
     Reg = State.AllocateReg(IntRegs, IntRegsSize);
+    // If this is the first part of an i64 arg,
+    // the allocated register must be either A0 or A2.
+    if (isI64 && (Reg == Mips::A1 || Reg == Mips::A3))
+      Reg = State.AllocateReg(IntRegs, IntRegsSize);
     LocVT = MVT::i32;
-  } else if (ValVT == MVT::f64) {
+  } else if (ValVT == MVT::f64 && AllocateFloatsInIntReg) {
+    // Allocate int register and shadow next int register. If first
+    // available register is Mips::A1 or Mips::A3, shadow it too.
     Reg = State.AllocateReg(IntRegs, IntRegsSize);
     if (Reg == Mips::A1 || Reg == Mips::A3)
       Reg = State.AllocateReg(IntRegs, IntRegsSize);
     State.AllocateReg(IntRegs, IntRegsSize);
     LocVT = MVT::i32;
+  } else if (ValVT.isFloatingPoint() && !AllocateFloatsInIntReg) {
+    // we are guaranteed to find an available float register
+    if (ValVT == MVT::f32) {
+      Reg = State.AllocateReg(F32Regs, FloatRegsSize);
+      // Shadow int register
+      State.AllocateReg(IntRegs, IntRegsSize);
+    } else {
+      Reg = State.AllocateReg(F64Regs, FloatRegsSize);
+      // Shadow int registers
+      unsigned Reg2 = State.AllocateReg(IntRegs, IntRegsSize);
+      if (Reg2 == Mips::A1 || Reg2 == Mips::A3)
+        State.AllocateReg(IntRegs, IntRegsSize);
+      State.AllocateReg(IntRegs, IntRegsSize);
+    }
   } else
     llvm_unreachable("Cannot handle this ValVT.");
 
-  if (!Reg) {
-    unsigned SizeInBytes = ValVT.getSizeInBits() >> 3;
-    unsigned Offset = State.AllocateStack(SizeInBytes, SizeInBytes);
+  unsigned SizeInBytes = ValVT.getSizeInBits() >> 3;
+  unsigned Offset = State.AllocateStack(SizeInBytes, OrigAlign);
+
+  if (!Reg)
     State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
-  } else
+  else
     State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
 
   return false; // CC must always match
@@ -1072,6 +1678,56 @@ static bool CC_MipsO32_VarArgs(unsigned ValNo, MVT ValVT,
 //                  Call Calling Convention Implementation
 //===----------------------------------------------------------------------===//
 
+static const unsigned O32IntRegsSize = 4;
+
+static const unsigned O32IntRegs[] = {
+  Mips::A0, Mips::A1, Mips::A2, Mips::A3
+};
+
+// Write ByVal Arg to arg registers and stack.
+static void
+WriteByValArg(SDValue& Chain, DebugLoc dl,
+              SmallVector<std::pair<unsigned, SDValue>, 16>& RegsToPass,
+              SmallVector<SDValue, 8>& MemOpChains, int& LastFI,
+              MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
+              const CCValAssign &VA, const ISD::ArgFlagsTy& Flags,
+              MVT PtrType) {
+  unsigned FirstWord = VA.getLocMemOffset() / 4;
+  unsigned NumWords = (Flags.getByValSize() + 3) / 4;
+  unsigned LastWord = FirstWord + NumWords;
+  unsigned CurWord;
+
+  // copy the first 4 words of byval arg to registers A0 - A3
+  for (CurWord = FirstWord; CurWord < std::min(LastWord, O32IntRegsSize);
+       ++CurWord) {
+    SDValue LoadPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg,
+                                  DAG.getConstant((CurWord - FirstWord) * 4,
+                                                  MVT::i32));
+    SDValue LoadVal = DAG.getLoad(MVT::i32, dl, Chain, LoadPtr,
+                                  MachinePointerInfo(),
+                                  false, false, 0);
+    MemOpChains.push_back(LoadVal.getValue(1));
+    unsigned DstReg = O32IntRegs[CurWord];
+    RegsToPass.push_back(std::make_pair(DstReg, LoadVal));
+  }
+
+  // copy remaining part of byval arg to stack.
+  if (CurWord < LastWord) {
+    unsigned SizeInBytes = (LastWord - CurWord) * 4;
+    SDValue Src = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg,
+                              DAG.getConstant((CurWord - FirstWord) * 4,
+                                              MVT::i32));
+    LastFI = MFI->CreateFixedObject(SizeInBytes, CurWord * 4, true);
+    SDValue Dst = DAG.getFrameIndex(LastFI, PtrType);
+    Chain = DAG.getMemcpy(Chain, dl, Dst, Src,
+                          DAG.getConstant(SizeInBytes, MVT::i32),
+                          /*Align*/4,
+                          /*isVolatile=*/false, /*AlwaysInline=*/false,
+                          MachinePointerInfo(0), MachinePointerInfo(0));
+    MemOpChains.push_back(Chain);
+  }
+}
+
 /// LowerCall - functions arguments are copied from virtual regs to
 /// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
 /// TODO: isTailCall.
@@ -1089,35 +1745,57 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
+  const TargetFrameLowering *TFL = MF.getTarget().getFrameLowering();
   bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
-                 *DAG.getContext());
-
-  // To meet O32 ABI, Mips must always allocate 16 bytes on
-  // the stack (even if less than 4 are used as arguments)
-  if (Subtarget->isABI_O32()) {
-    int VTsize = MVT(MVT::i32).getSizeInBits()/8;
-    MFI->CreateFixedObject(VTsize, (VTsize*3), true);
-    CCInfo.AnalyzeCallOperands(Outs,
-                     isVarArg ? CC_MipsO32_VarArgs : CC_MipsO32);
-  } else
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  if (Subtarget->isABI_O32())
+    CCInfo.AnalyzeCallOperands(Outs, CC_MipsO32);
+  else
     CCInfo.AnalyzeCallOperands(Outs, CC_Mips);
 
   // Get a count of how many bytes are to be pushed on the stack.
-  unsigned NumBytes = CCInfo.getNextStackOffset();
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+  unsigned NextStackOffset = CCInfo.getNextStackOffset();
+
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NextStackOffset,
+                                                            true));
+
+  // If this is the first call, create a stack frame object that points to
+  // a location to which .cprestore saves $gp.
+  if (IsPIC && !MipsFI->getGPFI())
+    MipsFI->setGPFI(MFI->CreateFixedObject(4, 0, true));
+
+  // Update size of the maximum argument space.
+  // For O32, a minimum of four words (16 bytes) of argument space is
+  // allocated.
+  if (Subtarget->isABI_O32())
+    NextStackOffset = std::max(NextStackOffset, (unsigned)16);
+
+  unsigned MaxCallFrameSize = MipsFI->getMaxCallFrameSize();
+
+  if (MaxCallFrameSize < NextStackOffset) {
+    MipsFI->setMaxCallFrameSize(NextStackOffset);
+
+    if (IsPIC) {
+      // $gp restore slot must be aligned.
+      unsigned StackAlignment = TFL->getStackAlignment();
+      NextStackOffset = (NextStackOffset + StackAlignment - 1) /
+                        StackAlignment * StackAlignment;
+      int GPFI = MipsFI->getGPFI();
+      MFI->setObjectOffset(GPFI, NextStackOffset);
+    }
+  }
 
   // With EABI is it possible to have 16 args on registers.
   SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass;
   SmallVector<SDValue, 8> MemOpChains;
 
-  // First/LastArgStackLoc contains the first/last
-  // "at stack" argument location.
-  int LastArgStackLoc = 0;
-  unsigned FirstStackArgLoc = (Subtarget->isABI_EABI() ? 0 : 16);
+  int FirstFI = -MFI->getNumFixedObjects() - 1, LastFI = 0;
 
   // Walk the register/memloc assignments, inserting copies/loads.
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
@@ -1132,11 +1810,12 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
         if (VA.getValVT() == MVT::f32 && VA.getLocVT() == MVT::i32)
           Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
         if (VA.getValVT() == MVT::f64 && VA.getLocVT() == MVT::i32) {
-          Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
-          SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Arg,
-                                   DAG.getConstant(0, getPointerTy()));
-          SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Arg,
-                                   DAG.getConstant(1, getPointerTy()));
+          SDValue Lo = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32,
+                                   Arg, DAG.getConstant(0, MVT::i32));
+          SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32,
+                                   Arg, DAG.getConstant(1, MVT::i32));
+          if (!Subtarget->isLittle())
+            std::swap(Lo, Hi);
           RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
           RegsToPass.push_back(std::make_pair(VA.getLocReg()+1, Hi));
           continue;
@@ -1164,15 +1843,22 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     // Register can't get to this point...
     assert(VA.isMemLoc());
 
-    // Create the frame index object for this incoming parameter
-    // This guarantees that when allocating Local Area the firsts
-    // 16 bytes which are alwayes reserved won't be overwritten
-    // if O32 ABI is used. For EABI the first address is zero.
-    LastArgStackLoc = (FirstStackArgLoc + VA.getLocMemOffset());
-    int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
-                                    LastArgStackLoc, true);
+    // ByVal Arg.
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    if (Flags.isByVal()) {
+      assert(Subtarget->isABI_O32() &&
+             "No support for ByVal args by ABIs other than O32 yet.");
+      assert(Flags.getByValSize() &&
+             "ByVal args of size 0 should have been ignored by front-end.");
+      WriteByValArg(Chain, dl, RegsToPass, MemOpChains, LastFI, MFI, DAG, Arg,
+                    VA, Flags, getPointerTy());
+      continue;
+    }
 
-    SDValue PtrOff = DAG.getFrameIndex(FI,getPointerTy());
+    // Create the frame index object for this incoming parameter
+    LastFI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
+                                    VA.getLocMemOffset(), true);
+    SDValue PtrOff = DAG.getFrameIndex(LastFI, getPointerTy());
 
     // emit ISD::STORE whichs stores the
     // parameter value to a stack Location
@@ -1181,23 +1867,18 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                                        false, false, 0));
   }
 
+  // Extend range of indices of frame objects for outgoing arguments that were
+  // created during this function call. Skip this step if no such objects were
+  // created.
+  if (LastFI)
+    MipsFI->extendOutArgFIRange(FirstFI, LastFI);
+
   // Transform all store nodes into one single node because all store
   // nodes are independent of each other.
   if (!MemOpChains.empty())
     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                         &MemOpChains[0], MemOpChains.size());
 
-  // Build a sequence of copy-to-reg nodes chained together with token
-  // chain and flag operands which copy the outgoing args into registers.
-  // The InFlag in necessary since all emited instructions must be
-  // stuck together.
-  SDValue InFlag;
-  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
-    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
-                             RegsToPass[i].second, InFlag);
-    InFlag = Chain.getValue(1);
-  }
-
   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
   // node so that legalize doesn't hack it.
@@ -1224,10 +1905,13 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     LoadSymAddr = true;
   }
 
+  SDValue InFlag;
+
   // Create nodes that load address of callee and copy it to T9
   if (IsPIC) {
     if (LoadSymAddr) {
       // Load callee address
+      Callee = DAG.getNode(MipsISD::WrapperPIC, dl, MVT::i32, Callee);
       SDValue LoadValue = DAG.getLoad(MVT::i32, dl, Chain, Callee,
                                       MachinePointerInfo::getGOT(),
                                       false, false, 0);
@@ -1239,7 +1923,7 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
       } else
         Callee = LoadValue;
 
-      // Use chain output from LoadValue 
+      // Use chain output from LoadValue
       Chain = LoadValue.getValue(1);
     }
 
@@ -1249,6 +1933,16 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     Callee = DAG.getRegister(Mips::T9, MVT::i32);
   }
 
+  // Build a sequence of copy-to-reg nodes chained together with token
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag in necessary since all emitted instructions must be
+  // stuck together.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
   // MipsJmpLink = #chain, #target_address, #opt_in_flags...
   //             = Chain, Callee, Reg#1, Reg#2, ...
   //
@@ -1270,39 +1964,8 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   Chain  = DAG.getNode(MipsISD::JmpLink, dl, NodeTys, &Ops[0], Ops.size());
   InFlag = Chain.getValue(1);
 
-  // Create a stack location to hold GP when PIC is used. This stack
-  // location is used on function prologue to save GP and also after all
-  // emited CALL's to restore GP.
-  if (IsPIC) {
-      // Function can have an arbitrary number of calls, so
-      // hold the LastArgStackLoc with the biggest offset.
-      int FI;
-      MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
-      if (LastArgStackLoc >= MipsFI->getGPStackOffset()) {
-        LastArgStackLoc = (!LastArgStackLoc) ? (16) : (LastArgStackLoc+4);
-        // Create the frame index only once. SPOffset here can be anything
-        // (this will be fixed on processFunctionBeforeFrameFinalized)
-        if (MipsFI->getGPStackOffset() == -1) {
-          FI = MFI->CreateFixedObject(4, 0, true);
-          MipsFI->setGPFI(FI);
-        }
-        MipsFI->setGPStackOffset(LastArgStackLoc);
-      }
-
-      // Reload GP value.
-      FI = MipsFI->getGPFI();
-      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
-      SDValue GPLoad = DAG.getLoad(MVT::i32, dl, Chain, FIN,
-                                   MachinePointerInfo::getFixedStack(FI),
-                                   false, false, 0);
-      Chain = GPLoad.getValue(1);
-      Chain = DAG.getCopyToReg(Chain, dl, DAG.getRegister(Mips::GP, MVT::i32),
-                               GPLoad, SDValue(0,0));
-      InFlag = Chain.getValue(1);
-  }
-
   // Create the CALLSEQ_END node.
-  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NextStackOffset, true),
                              DAG.getIntPtrConstant(0, true), InFlag);
   InFlag = Chain.getValue(1);
 
@@ -1320,11 +1983,10 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
                                     DebugLoc dl, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &InVals) const {
-
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
   CCInfo.AnalyzeCallResult(Ins, RetCC_Mips);
 
@@ -1342,18 +2004,41 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 //===----------------------------------------------------------------------===//
 //             Formal Arguments Calling Convention Implementation
 //===----------------------------------------------------------------------===//
+static void ReadByValArg(MachineFunction &MF, SDValue Chain, DebugLoc dl,
+                         std::vector<SDValue>& OutChains,
+                         SelectionDAG &DAG, unsigned NumWords, SDValue FIN,
+                         const CCValAssign &VA, const ISD::ArgFlagsTy& Flags) {
+  unsigned LocMem = VA.getLocMemOffset();
+  unsigned FirstWord = LocMem / 4;
+
+  // copy register A0 - A3 to frame object
+  for (unsigned i = 0; i < NumWords; ++i) {
+    unsigned CurWord = FirstWord + i;
+    if (CurWord >= O32IntRegsSize)
+      break;
+
+    unsigned SrcReg = O32IntRegs[CurWord];
+    unsigned Reg = AddLiveIn(MF, SrcReg, Mips::CPURegsRegisterClass);
+    SDValue StorePtr = DAG.getNode(ISD::ADD, dl, MVT::i32, FIN,
+                                   DAG.getConstant(i * 4, MVT::i32));
+    SDValue Store = DAG.getStore(Chain, dl, DAG.getRegister(Reg, MVT::i32),
+                                 StorePtr, MachinePointerInfo(), false,
+                                 false, 0);
+    OutChains.push_back(Store);
+  }
+}
 
 /// LowerFormalArguments - transform physical registers into virtual registers
 /// and generate load operations for arguments places on the stack.
 SDValue
 MipsTargetLowering::LowerFormalArguments(SDValue Chain,
-                                        CallingConv::ID CallConv, bool isVarArg,
-                                        const SmallVectorImpl<ISD::InputArg>
-                                        &Ins,
-                                        DebugLoc dl, SelectionDAG &DAG,
-                                        SmallVectorImpl<SDValue> &InVals)
+                                         CallingConv::ID CallConv,
+                                         bool isVarArg,
+                                         const SmallVectorImpl<ISD::InputArg>
+                                         &Ins,
+                                         DebugLoc dl, SelectionDAG &DAG,
+                                         SmallVectorImpl<SDValue> &InVals)
                                           const {
-
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
@@ -1363,23 +2048,17 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
   // Used with vargs to acumulate store chains.
   std::vector<SDValue> OutChains;
 
-  // Keep track of the last register used for arguments
-  unsigned ArgRegEnd = 0;
-
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
 
   if (Subtarget->isABI_O32())
-    CCInfo.AnalyzeFormalArguments(Ins,
-                        isVarArg ? CC_MipsO32_VarArgs : CC_MipsO32);
+    CCInfo.AnalyzeFormalArguments(Ins, CC_MipsO32);
   else
     CCInfo.AnalyzeFormalArguments(Ins, CC_Mips);
 
-  unsigned FirstStackArgLoc = (Subtarget->isABI_EABI() ? 0 : 16);
-  unsigned LastStackArgEndOffset = 0;
-  EVT LastRegArgValVT;
+  int LastFI = 0;// MipsFI->LastInArgFI is 0 at the entry of this function.
 
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
@@ -1387,8 +2066,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
     // Arguments stored on registers
     if (VA.isRegLoc()) {
       EVT RegVT = VA.getLocVT();
-      ArgRegEnd = VA.getLocReg();
-      LastRegArgValVT = VA.getValVT();
+      unsigned ArgReg = VA.getLocReg();
       TargetRegisterClass *RC = 0;
 
       if (RegVT == MVT::i32)
@@ -1403,7 +2081,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
 
       // Transform the arguments stored on
       // physical registers into virtual ones
-      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), ArgRegEnd, RC);
+      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), ArgReg, RC);
       SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
 
       // If this is an 8 or 16-bit value, it has been passed promoted
@@ -1429,9 +2107,10 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
           unsigned Reg2 = AddLiveIn(DAG.getMachineFunction(),
                                     VA.getLocReg()+1, RC);
           SDValue ArgValue2 = DAG.getCopyFromReg(Chain, dl, Reg2, RegVT);
-          SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, ArgValue, 
-                                       ArgValue2);
-          ArgValue = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Pair);
+          if (!Subtarget->isLittle())
+            std::swap(ArgValue, ArgValue2);
+          ArgValue = DAG.getNode(MipsISD::BuildPairF64, dl, MVT::f64,
+                                 ArgValue, ArgValue2);
         }
       }
 
@@ -1441,26 +2120,31 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
       // sanity check
       assert(VA.isMemLoc());
 
-      // The last argument is not a register anymore
-      ArgRegEnd = 0;
+      ISD::ArgFlagsTy Flags = Ins[i].Flags;
+
+      if (Flags.isByVal()) {
+        assert(Subtarget->isABI_O32() &&
+               "No support for ByVal args by ABIs other than O32 yet.");
+        assert(Flags.getByValSize() &&
+               "ByVal args of size 0 should have been ignored by front-end.");
+        unsigned NumWords = (Flags.getByValSize() + 3) / 4;
+        LastFI = MFI->CreateFixedObject(NumWords * 4, VA.getLocMemOffset(),
+                                        true);
+        SDValue FIN = DAG.getFrameIndex(LastFI, getPointerTy());
+        InVals.push_back(FIN);
+        ReadByValArg(MF, Chain, dl, OutChains, DAG, NumWords, FIN, VA, Flags);
+
+        continue;
+      }
 
       // The stack pointer offset is relative to the caller stack frame.
-      // Since the real stack size is unknown here, a negative SPOffset
-      // is used so there's a way to adjust these offsets when the stack
-      // size get known (on EliminateFrameIndex). A dummy SPOffset is
-      // used instead of a direct negative address (which is recorded to
-      // be used on emitPrologue) to avoid mis-calc of the first stack
-      // offset on PEI::calculateFrameObjectOffsets.
-      unsigned ArgSize = VA.getValVT().getSizeInBits()/8;
-      LastStackArgEndOffset = FirstStackArgLoc + VA.getLocMemOffset() + ArgSize;
-      int FI = MFI->CreateFixedObject(ArgSize, 0, true);
-      MipsFI->recordLoadArgsFI(FI, -(4 +
-        (FirstStackArgLoc + VA.getLocMemOffset())));
+      LastFI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
+                                      VA.getLocMemOffset(), true);
 
       // Create load nodes to retrieve arguments from the stack
-      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+      SDValue FIN = DAG.getFrameIndex(LastFI, getPointerTy());
       InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
-                                   MachinePointerInfo::getFixedStack(FI),
+                                   MachinePointerInfo::getFixedStack(LastFI),
                                    false, false, 0));
     }
   }
@@ -1478,58 +2162,33 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
   }
 
-  // To meet ABI, when VARARGS are passed on registers, the registers
-  // must have their values written to the caller stack frame. If the last
-  // argument was placed in the stack, there's no need to save any register.
   if (isVarArg && Subtarget->isABI_O32()) {
-    if (ArgRegEnd) {
-      // Last named formal argument is passed in register.
-
-      // The last register argument that must be saved is Mips::A3
+    // Record the frame index of the first variable argument
+    // which is a value necessary to VASTART.
+    unsigned NextStackOffset = CCInfo.getNextStackOffset();
+    assert(NextStackOffset % 4 == 0 &&
+           "NextStackOffset must be aligned to 4-byte boundaries.");
+    LastFI = MFI->CreateFixedObject(4, NextStackOffset, true);
+    MipsFI->setVarArgsFrameIndex(LastFI);
+
+    // If NextStackOffset is smaller than o32's 16-byte reserved argument area,
+    // copy the integer registers that have not been used for argument passing
+    // to the caller's stack frame.
+    for (; NextStackOffset < 16; NextStackOffset += 4) {
       TargetRegisterClass *RC = Mips::CPURegsRegisterClass;
-      if (LastRegArgValVT == MVT::f64)
-        ArgRegEnd++;
-
-      if (ArgRegEnd < Mips::A3) {
-        // Both the last named formal argument and the first variable
-        // argument are passed in registers.
-        for (++ArgRegEnd; ArgRegEnd <= Mips::A3; ++ArgRegEnd) {
-          unsigned Reg = AddLiveIn(DAG.getMachineFunction(), ArgRegEnd, RC);
-          SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, MVT::i32);
-
-          int FI = MFI->CreateFixedObject(4, 0, true);
-          MipsFI->recordStoreVarArgsFI(FI, -(4+(ArgRegEnd-Mips::A0)*4));
-          SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy());
-          OutChains.push_back(DAG.getStore(Chain, dl, ArgValue, PtrOff,
-                                           MachinePointerInfo(),
-                                           false, false, 0));
-
-          // Record the frame index of the first variable argument
-          // which is a value necessary to VASTART.
-          if (!MipsFI->getVarArgsFrameIndex()) {
-            MFI->setObjectAlignment(FI, 4);
-            MipsFI->setVarArgsFrameIndex(FI);
-          }
-        }
-      } else {
-        // Last named formal argument is in register Mips::A3, and the first
-        // variable argument is on stack. Record the frame index of the first
-        // variable argument.
-        int FI = MFI->CreateFixedObject(4, 0, true);
-        MFI->setObjectAlignment(FI, 4);
-        MipsFI->recordStoreVarArgsFI(FI, -20);
-        MipsFI->setVarArgsFrameIndex(FI);
-      }
-    } else {
-      // Last named formal argument and all the variable arguments are passed
-      // on stack. Record the frame index of the first variable argument.
-      int FI = MFI->CreateFixedObject(4, 0, true);
-      MFI->setObjectAlignment(FI, 4);
-      MipsFI->recordStoreVarArgsFI(FI, -(4+LastStackArgEndOffset));
-      MipsFI->setVarArgsFrameIndex(FI);
+      unsigned Idx = NextStackOffset / 4;
+      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), O32IntRegs[Idx], RC);
+      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, MVT::i32);
+      LastFI = MFI->CreateFixedObject(4, NextStackOffset, true);
+      SDValue PtrOff = DAG.getFrameIndex(LastFI, getPointerTy());
+      OutChains.push_back(DAG.getStore(Chain, dl, ArgValue, PtrOff,
+                                       MachinePointerInfo(),
+                                       false, false, 0));
     }
   }
 
+  MipsFI->setLastInArgFI(LastFI);
+
   // All stores are grouped in one node to allow the matching between
   // the size of Ins and InVals. This only happens when on varg functions
   if (!OutChains.empty()) {
@@ -1557,8 +2216,8 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
   // Analize return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_Mips);
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index 95f8d77..fbcedfd 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -40,6 +40,16 @@ namespace llvm {
       // Handle gp_rel (small data/bss sections) relocation.
       GPRel,
 
+      // General Dynamic TLS
+      TlsGd,
+
+      // Local Exec TLS
+      TprelHi,
+      TprelLo,
+
+      // Thread Pointer
+      ThreadPointer,
+
       // Floating Point Branch Conditional
       FPBrcond,
 
@@ -64,7 +74,12 @@ namespace llvm {
 
       // DivRem(u)
       DivRem,
-      DivRemU
+      DivRemU,
+
+      BuildPairF64,
+      ExtractElementF64,
+
+      WrapperPIC
     };
   }
 
@@ -86,9 +101,6 @@ namespace llvm {
     /// getSetCCResultType - get the ISD::SETCC result ValueType
     MVT::SimpleValueType getSetCCResultType(EVT VT) const;
 
-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const;
-
     virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   private:
     // Subtarget Info
@@ -106,13 +118,14 @@ namespace llvm {
     SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
 
     virtual SDValue
       LowerFormalArguments(SDValue Chain,
@@ -164,6 +177,16 @@ namespace llvm {
     /// specified FP immediate natively. If false, the legalizer will
     /// materialize the FP immediate as a load from a constant pool.
     virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+
+    MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+                    unsigned Size, unsigned BinOpcode, bool Nand = false) const;
+    MachineBasicBlock *EmitAtomicBinaryPartword(MachineInstr *MI,
+                    MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode,
+                    bool Nand = false) const;
+    MachineBasicBlock *EmitAtomicCmpSwap(MachineInstr *MI,
+                                  MachineBasicBlock *BB, unsigned Size) const;
+    MachineBasicBlock *EmitAtomicCmpSwapPartword(MachineInstr *MI,
+                                  MachineBasicBlock *BB, unsigned Size) const;
   };
 }
 
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index 251f377..021c167 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file contains the Mips implementation of the TargetInstrInfo class.
+// This file describes the Mips FPU instruction set.
 //
 //===----------------------------------------------------------------------===//
 
@@ -30,6 +30,12 @@ def SDT_MipsFPCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, SDTCisFP<1>,
                                          SDTCisInt<2>]>;
 def SDT_MipsCMovFP : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
                                           SDTCisSameAs<1, 2>]>;
+def SDT_MipsBuildPairF64 : SDTypeProfile<1, 2, [SDTCisVT<0, f64>,
+                                                SDTCisVT<1, i32>,
+                                                SDTCisSameAs<1, 2>]>;
+def SDT_MipsExtractElementF64 : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+                                                     SDTCisVT<1, f64>,
+                                                     SDTCisVT<0, i32>]>;
 
 def MipsFPCmp : SDNode<"MipsISD::FPCmp", SDT_MipsFPCmp, [SDNPOutGlue]>;
 def MipsCMovFP_T : SDNode<"MipsISD::CMovFP_T", SDT_MipsCMovFP, [SDNPInGlue]>;
@@ -37,6 +43,9 @@ def MipsCMovFP_F : SDNode<"MipsISD::CMovFP_F", SDT_MipsCMovFP, [SDNPInGlue]>;
 def MipsFPRound : SDNode<"MipsISD::FPRound", SDTFPRoundOp, [SDNPOptInGlue]>;
 def MipsFPBrcond : SDNode<"MipsISD::FPBrcond", SDT_MipsFPBrcond,
                           [SDNPHasChain, SDNPOptInGlue]>;
+def MipsBuildPairF64 : SDNode<"MipsISD::BuildPairF64", SDT_MipsBuildPairF64>;
+def MipsExtractElementF64 : SDNode<"MipsISD::ExtractElementF64",
+                                   SDT_MipsExtractElementF64>;
 
 // Operand for printing out a condition code.
 let PrintMethod = "printFCCOperand" in
@@ -68,40 +77,42 @@ def IsNotMipsI       : Predicate<"!Subtarget.isMips1()">;
 multiclass FFR1_1<bits<6> funct, string asmstr>
 {
   def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), (ins FGR32:$fs),
-      !strconcat(asmstr, ".s $fd, $fs"), []>;
+      !strconcat(asmstr, ".s\t$fd, $fs"), []>;
 
   def _D32  : FFR<0x11, funct, 0x1, (outs FGR32:$fd), (ins AFGR64:$fs),
-      !strconcat(asmstr, ".d $fd, $fs"), []>, Requires<[In32BitMode]>;
+      !strconcat(asmstr, ".d\t$fd, $fs"), []>, Requires<[In32BitMode]>;
 }
 
 multiclass FFR1_2<bits<6> funct, string asmstr, SDNode FOp>
 {
   def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), (ins FGR32:$fs),
-                 !strconcat(asmstr, ".s $fd, $fs"),
+                 !strconcat(asmstr, ".s\t$fd, $fs"),
                  [(set FGR32:$fd, (FOp FGR32:$fs))]>;
 
   def _D32  : FFR<0x11, funct, 0x1, (outs AFGR64:$fd), (ins AFGR64:$fs),
-                 !strconcat(asmstr, ".d $fd, $fs"),
+                 !strconcat(asmstr, ".d\t$fd, $fs"),
                  [(set AFGR64:$fd, (FOp AFGR64:$fs))]>, Requires<[In32BitMode]>;
 }
 
 class FFR1_3<bits<6> funct, bits<5> fmt, RegisterClass RcSrc,
               RegisterClass RcDst, string asmstr>:
   FFR<0x11, funct, fmt, (outs RcSrc:$fd), (ins RcDst:$fs),
-      !strconcat(asmstr, " $fd, $fs"), []>;
+      !strconcat(asmstr, "\t$fd, $fs"), []>;
 
 
-multiclass FFR1_4<bits<6> funct, string asmstr, SDNode FOp> {
+multiclass FFR1_4<bits<6> funct, string asmstr, SDNode FOp, bit isComm = 0> {
+  let isCommutable = isComm in {
   def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd),
                  (ins FGR32:$fs, FGR32:$ft),
-                 !strconcat(asmstr, ".s $fd, $fs, $ft"),
+                 !strconcat(asmstr, ".s\t$fd, $fs, $ft"),
                  [(set FGR32:$fd, (FOp FGR32:$fs, FGR32:$ft))]>;
 
   def _D32 : FFR<0x11, funct, 0x1, (outs AFGR64:$fd),
                  (ins AFGR64:$fs, AFGR64:$ft),
-                 !strconcat(asmstr, ".d $fd, $fs, $ft"),
+                 !strconcat(asmstr, ".d\t$fd, $fs, $ft"),
                  [(set AFGR64:$fd, (FOp AFGR64:$fs, AFGR64:$ft))]>,
                  Requires<[In32BitMode]>;
+  }
 }
 
 //===----------------------------------------------------------------------===//
@@ -161,42 +172,42 @@ let ft = 0 in {
 let fd = 0 in {
   /// Move Control Registers From/To CPU Registers
   def CFC1  : FFR<0x11, 0x0, 0x2, (outs CPURegs:$rt), (ins CCR:$fs),
-                  "cfc1 $rt, $fs", []>;
+                  "cfc1\t$rt, $fs", []>;
 
   def CTC1  : FFR<0x11, 0x0, 0x6, (outs CCR:$rt), (ins CPURegs:$fs),
-                  "ctc1 $fs, $rt", []>;
+                  "ctc1\t$fs, $rt", []>;
 
   def MFC1  : FFR<0x11, 0x00, 0x00, (outs CPURegs:$rt), (ins FGR32:$fs),
-                  "mfc1 $rt, $fs", []>;
+                  "mfc1\t$rt, $fs", []>;
 
   def MTC1  : FFR<0x11, 0x00, 0x04, (outs FGR32:$fs), (ins CPURegs:$rt),
-                  "mtc1 $rt, $fs", []>;
+                  "mtc1\t$rt, $fs", []>;
 }
 
 def FMOV_S32 : FFR<0x11, 0b000110, 0x0, (outs FGR32:$fd), (ins FGR32:$fs),
-                   "mov.s $fd, $fs", []>;
+                   "mov.s\t$fd, $fs", []>;
 def FMOV_D32 : FFR<0x11, 0b000110, 0x1, (outs AFGR64:$fd), (ins AFGR64:$fs),
-                   "mov.d $fd, $fs", []>;
+                   "mov.d\t$fd, $fs", []>;
 
 /// Floating Point Memory Instructions
 let Predicates = [IsNotSingleFloat, IsNotMipsI] in {
   def LDC1 : FFI<0b110101, (outs AFGR64:$ft), (ins mem:$addr),
-                 "ldc1 $ft, $addr", [(set AFGR64:$ft, (load addr:$addr))]>;
+                 "ldc1\t$ft, $addr", [(set AFGR64:$ft, (load addr:$addr))]>;
 
   def SDC1 : FFI<0b111101, (outs), (ins AFGR64:$ft, mem:$addr),
-                 "sdc1 $ft, $addr", [(store AFGR64:$ft, addr:$addr)]>;
+                 "sdc1\t$ft, $addr", [(store AFGR64:$ft, addr:$addr)]>;
 }
 
-// LWC1 and SWC1 can always be emited with odd registers.
-def LWC1  : FFI<0b110001, (outs FGR32:$ft), (ins mem:$addr), "lwc1 $ft, $addr",
+// LWC1 and SWC1 can always be emitted with odd registers.
+def LWC1  : FFI<0b110001, (outs FGR32:$ft), (ins mem:$addr), "lwc1\t$ft, $addr",
                [(set FGR32:$ft, (load addr:$addr))]>;
-def SWC1  : FFI<0b111001, (outs), (ins FGR32:$ft, mem:$addr), "swc1 $ft, $addr",
-               [(store FGR32:$ft, addr:$addr)]>;
+def SWC1  : FFI<0b111001, (outs), (ins FGR32:$ft, mem:$addr),
+               "swc1\t$ft, $addr", [(store FGR32:$ft, addr:$addr)]>;
 
 /// Floating-point Aritmetic
-defm FADD : FFR1_4<0x10, "add", fadd>;
+defm FADD : FFR1_4<0x10, "add", fadd, 1>;
 defm FDIV : FFR1_4<0x03, "div", fdiv>;
-defm FMUL : FFR1_4<0x02, "mul", fmul>;
+defm FMUL : FFR1_4<0x02, "mul", fmul, 1>;
 defm FSUB : FFR1_4<0x01, "sub", fsub>;
 
 //===----------------------------------------------------------------------===//
@@ -212,7 +223,7 @@ def MIPS_BRANCH_TL : PatLeaf<(i32 3)>;
 /// Floating Point Branch of False/True (Likely)
 let isBranch=1, isTerminator=1, hasDelaySlot=1, base=0x8, Uses=[FCR31] in
   class FBRANCH<PatLeaf op, string asmstr> : FFI<0x11, (outs),
-        (ins brtarget:$dst), !strconcat(asmstr, " $dst"),
+        (ins brtarget:$dst), !strconcat(asmstr, "\t$dst"),
         [(MipsFPBrcond op, bb:$dst)]>;
 
 def BC1F  : FBRANCH<MIPS_BRANCH_F,  "bc1f">;
@@ -245,19 +256,20 @@ def MIPS_FCOND_NGT  : PatLeaf<(i32 15)>;
 /// Floating Point Compare
 let hasDelaySlot = 1, Defs=[FCR31] in {
   def FCMP_S32 : FCC<0x0, (outs), (ins FGR32:$fs, FGR32:$ft, condcode:$cc),
-                     "c.$cc.s $fs, $ft",
+                     "c.$cc.s\t$fs, $ft",
                      [(MipsFPCmp FGR32:$fs, FGR32:$ft, imm:$cc)]>;
 
   def FCMP_D32 : FCC<0x1, (outs), (ins AFGR64:$fs, AFGR64:$ft, condcode:$cc),
-                     "c.$cc.d $fs, $ft",
+                     "c.$cc.d\t$fs, $ft",
                      [(MipsFPCmp AFGR64:$fs, AFGR64:$ft, imm:$cc)]>,
                      Requires<[In32BitMode]>;
 }
 
 
 // Conditional moves:
-// These instructions are expanded in MipsISelLowering::EmitInstrWithCustomInserter
-// if target does not have conditional move instructions.
+// These instructions are expanded in
+// MipsISelLowering::EmitInstrWithCustomInserter if target does not have
+// conditional move instructions.
 // flag:int, data:float
 let usesCustomInserter = 1, Constraints = "$F = $dst" in
 class CondMovIntFP<RegisterClass RC, bits<5> fmt, bits<6> func,
@@ -312,6 +324,23 @@ let Predicates = [In32BitMode] in {
 def MOVCCRToCCR : MipsPseudo<(outs CCR:$dst), (ins CCR:$src),
                              "# MOVCCRToCCR", []>;
 
+// This pseudo instr gets expanded into 2 mtc1 instrs after register
+// allocation.
+def BuildPairF64 :
+  MipsPseudo<(outs AFGR64:$dst),
+             (ins CPURegs:$lo, CPURegs:$hi), "",
+             [(set AFGR64:$dst, (MipsBuildPairF64 CPURegs:$lo, CPURegs:$hi))]>;
+
+// This pseudo instr gets expanded into 2 mfc1 instrs after register
+// allocation.
+// if n is 0, lower part of src is extracted.
+// if n is 1, higher part of src is extracted.
+def ExtractElementF64 :
+  MipsPseudo<(outs CPURegs:$dst),
+             (ins AFGR64:$src, i32imm:$n), "",
+             [(set CPURegs:$dst,
+               (MipsExtractElementF64 AFGR64:$src, imm:$n))]>;
+
 //===----------------------------------------------------------------------===//
 // Floating Point Patterns
 //===----------------------------------------------------------------------===//
@@ -330,6 +359,7 @@ def : Pat<(f32 (sint_to_fp CPURegs:$src)), (CVTS_W32 (MTC1 CPURegs:$src))>;
 def : Pat<(f64 (sint_to_fp CPURegs:$src)), (CVTD_W32 (MTC1 CPURegs:$src))>;
 
 def : Pat<(i32 (fp_to_sint FGR32:$src)), (MFC1 (TRUNC_W_S32 FGR32:$src))>;
+def : Pat<(i32 (fp_to_sint AFGR64:$src)), (MFC1 (TRUNC_W_D32 AFGR64:$src))>;
 
 def : Pat<(i32 (bitconvert FGR32:$src)),  (MFC1 FGR32:$src)>;
 def : Pat<(f32 (bitconvert CPURegs:$src)), (MTC1 CPURegs:$src)>;
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index 9dfcdfb..9f55fb3 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -1,4 +1,4 @@
-//===- MipsRegisterInfo.td - Mips Register defs ------------*- tablegen -*-===//
+//===- MipsInstrFormats.td - Mips Instruction Formats ------*- tablegen -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index 5fdbf1f..abf6773 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -146,7 +146,21 @@ namespace MipsII {
     /// MO_ABS_HI/LO - Represents the hi or low part of an absolute symbol
     /// address.
     MO_ABS_HI,
-    MO_ABS_LO
+    MO_ABS_LO,
+
+    /// MO_TLSGD - Represents the offset into the global offset table at which
+    // the module ID and TSL block offset reside during execution (General
+    // Dynamic TLS).
+    MO_TLSGD,
+
+    /// MO_GOTTPREL - Represents the offset from the thread pointer (Initial
+    // Exec TLS).
+    MO_GOTTPREL,
+
+    /// MO_TPREL_HI/LO - Represents the hi and low part of the offset from
+    // the thread pointer (Local Exec TLS).
+    MO_TPREL_HI,
+    MO_TPREL_LO
   };
 }
 
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index c14dc9c..329a002 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -1,4 +1,4 @@
-//===- MipsInstrInfo.td - Mips Register defs ---------------*- tablegen -*-===//
+//===- MipsInstrInfo.td - Target Description for Mips Target -*- tablegen -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,6 +6,10 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
 // Instruction format superclass
@@ -20,8 +24,9 @@ include "MipsInstrFormats.td"
 def SDT_MipsRet          : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
 def SDT_MipsJmpLink      : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
 def SDT_MipsCMov         : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
-                                         SDTCisSameAs<1, 2>, SDTCisSameAs<3, 4>,
-                                         SDTCisInt<4>]>;
+                                                SDTCisSameAs<1, 2>,
+                                                SDTCisSameAs<3, 4>,
+                                                SDTCisInt<4>]>;
 def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
 def SDT_MipsCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
 def SDT_MipsMAddMSub     : SDTypeProfile<0, 4,
@@ -32,6 +37,8 @@ def SDT_MipsDivRem       : SDTypeProfile<0, 2,
                                          [SDTCisVT<0, i32>,
                                           SDTCisSameAs<0, 1>]>;
 
+def SDT_MipsThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
+
 // Call
 def MipsJmpLink : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink,
                          [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
@@ -44,6 +51,16 @@ def MipsHi    : SDNode<"MipsISD::Hi", SDTIntUnaryOp>;
 def MipsLo    : SDNode<"MipsISD::Lo", SDTIntUnaryOp>;
 def MipsGPRel : SDNode<"MipsISD::GPRel", SDTIntUnaryOp>;
 
+// TlsGd node is used to handle General Dynamic TLS
+def MipsTlsGd : SDNode<"MipsISD::TlsGd", SDTIntUnaryOp>;
+
+// TprelHi and TprelLo nodes are used to handle Local Exec TLS
+def MipsTprelHi    : SDNode<"MipsISD::TprelHi", SDTIntUnaryOp>;
+def MipsTprelLo    : SDNode<"MipsISD::TprelLo", SDTIntUnaryOp>;
+
+// Thread pointer
+def MipsThreadPointer: SDNode<"MipsISD::ThreadPointer", SDT_MipsThreadPointer>;
+
 // Return
 def MipsRet : SDNode<"MipsISD::Ret", SDT_MipsRet, [SDNPHasChain,
                      SDNPOptInGlue]>;
@@ -70,6 +87,18 @@ def MipsDivRem    : SDNode<"MipsISD::DivRem", SDT_MipsDivRem,
 def MipsDivRemU   : SDNode<"MipsISD::DivRemU", SDT_MipsDivRem,
                            [SDNPOutGlue]>;
 
+// Target constant nodes that are not part of any isel patterns and remain
+// unchanged can cause instructions with illegal operands to be emitted.
+// Wrapper node patterns give the instruction selector a chance to replace
+// target constant nodes that would otherwise remain unchanged with ADDiu
+// nodes. Without these wrapper node patterns, the following conditional move
+// instrucion is emitted when function cmov2 in test/CodeGen/Mips/cmov.ll is
+// compiled: 
+//  movn  %got(d)($gp), %got(c)($gp), $4
+// This instruction is illegal since movn can take only register operands.
+
+def MipsWrapperPIC    : SDNode<"MipsISD::WrapperPIC",  SDTIntUnaryOp>;
+
 //===----------------------------------------------------------------------===//
 // Mips Instruction Predicate Definitions.
 //===----------------------------------------------------------------------===//
@@ -140,17 +169,20 @@ def addr : ComplexPattern<iPTR, 2, "SelectAddr", [frameindex], []>;
 //===----------------------------------------------------------------------===//
 
 // Arithmetic 3 register operands
-let isCommutable = 1 in
 class ArithR<bits<6> op, bits<6> func, string instr_asm, SDNode OpNode,
-             InstrItinClass itin>:
+             InstrItinClass itin, bit isComm = 0>:
   FR<op, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
      !strconcat(instr_asm, "\t$dst, $b, $c"),
-     [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], itin>;
+     [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], itin> {
+  let isCommutable = isComm;
+}
 
-let isCommutable = 1 in
-class ArithOverflowR<bits<6> op, bits<6> func, string instr_asm>:
+class ArithOverflowR<bits<6> op, bits<6> func, string instr_asm,
+                     bit isComm = 0>:
   FR<op, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
-     !strconcat(instr_asm, "\t$dst, $b, $c"), [], IIAlu>;
+     !strconcat(instr_asm, "\t$dst, $b, $c"), [], IIAlu> {
+  let isCommutable = isComm;
+}
 
 // Arithmetic 2 register operands
 class ArithI<bits<6> op, string instr_asm, SDNode OpNode,
@@ -166,12 +198,15 @@ class ArithOverflowI<bits<6> op, string instr_asm, SDNode OpNode,
 
 // Arithmetic Multiply ADD/SUB
 let rd = 0, shamt = 0, Defs = [HI, LO], Uses = [HI, LO] in
-class MArithR<bits<6> func, string instr_asm, SDNode op> :
+class MArithR<bits<6> func, string instr_asm, SDNode op, bit isComm = 0> :
   FR<0x1c, func, (outs), (ins CPURegs:$rs, CPURegs:$rt),
      !strconcat(instr_asm, "\t$rs, $rt"),
-     [(op CPURegs:$rs, CPURegs:$rt, LO, HI)], IIImul>;
+     [(op CPURegs:$rs, CPURegs:$rt, LO, HI)], IIImul> {
+  let isCommutable = isComm;
+}
 
 //  Logical
+let isCommutable = 1 in
 class LogicR<bits<6> func, string instr_asm, SDNode OpNode>:
   FR<0x00, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
      !strconcat(instr_asm, "\t$dst, $b, $c"),
@@ -182,6 +217,7 @@ class LogicI<bits<6> op, string instr_asm, SDNode OpNode>:
      !strconcat(instr_asm, "\t$dst, $b, $c"),
      [(set CPURegs:$dst, (OpNode CPURegs:$b, immZExt16:$c))], IIAlu>;
 
+let isCommutable = 1 in
 class LogicNOR<bits<6> op, bits<6> func, string instr_asm>:
   FR<op, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
      !strconcat(instr_asm, "\t$dst, $b, $c"),
@@ -287,6 +323,7 @@ let isCall=1, hasDelaySlot=1,
 
 // Mul, Div
 let Defs = [HI, LO] in {
+  let isCommutable = 1 in
   class Mul<bits<6> func, string instr_asm, InstrItinClass itin>:
     FR<0x00, func, (outs), (ins CPURegs:$a, CPURegs:$b),
        !strconcat(instr_asm, "\t$a, $b"), [], itin>;
@@ -337,6 +374,13 @@ class CondMov<bits<6> func, string instr_asm, PatLeaf MovCode>:
      CPURegs:$cond), !strconcat(instr_asm, "\t$dst, $T, $cond"),
      [], NoItinerary>;
 
+// Read Hardware
+class ReadHardware: FR<0x1f, 0x3b, (outs CPURegs:$dst), (ins HWRegs:$src),
+    "rdhwr\t$dst, $src", [], IIAlu> {
+  let rs = 0;
+  let shamt = 0;
+}
+
 //===----------------------------------------------------------------------===//
 // Pseudo instructions
 //===----------------------------------------------------------------------===//
@@ -368,7 +412,116 @@ def ATMACRO   : MipsPseudo<(outs), (ins), ".set\tat", []>;
 // are used, we have the same behavior, but get also a bunch of warnings
 // from the assembler.
 def CPLOAD : MipsPseudo<(outs), (ins CPURegs:$picreg), ".cpload\t$picreg", []>;
-def CPRESTORE : MipsPseudo<(outs), (ins uimm16:$loc), ".cprestore\t$loc\n", []>;
+def CPRESTORE : MipsPseudo<(outs), (ins i32imm:$loc), ".cprestore\t$loc\n", []>;
+
+let usesCustomInserter = 1 in {
+  def ATOMIC_LOAD_ADD_I8 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_add_8\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_add_8 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_ADD_I16 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_add_16\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_add_16 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_ADD_I32 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_add_32\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_add_32 CPURegs:$ptr, CPURegs:$incr))]>;
+
+  def ATOMIC_LOAD_SUB_I8 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_sub_8\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_sub_8 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_SUB_I16 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_sub_16\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_sub_16 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_SUB_I32 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_sub_32\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_sub_32 CPURegs:$ptr, CPURegs:$incr))]>;
+
+  def ATOMIC_LOAD_AND_I8 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_and_8\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_and_8 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_AND_I16 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_and_16\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_and_16 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_AND_I32 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_and_32\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_and_32 CPURegs:$ptr, CPURegs:$incr))]>;
+
+  def ATOMIC_LOAD_OR_I8 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_or_8\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_or_8 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_OR_I16 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_or_16\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_or_16 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_OR_I32 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_or_32\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_or_32 CPURegs:$ptr, CPURegs:$incr))]>;
+
+  def ATOMIC_LOAD_XOR_I8 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_xor_8\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_xor_8 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_XOR_I16 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_xor_16\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_xor_16 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_XOR_I32 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_xor_32\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_xor_32 CPURegs:$ptr, CPURegs:$incr))]>;
+
+  def ATOMIC_LOAD_NAND_I8 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_nand_8\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_nand_8 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_NAND_I16 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_nand_16\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_nand_16 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_NAND_I32 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_nand_32\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_nand_32 CPURegs:$ptr, CPURegs:$incr))]>;
+
+  def ATOMIC_SWAP_I8 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$val),
+    "atomic_swap_8\t$dst, $ptr, $val",
+    [(set CPURegs:$dst, (atomic_swap_8 CPURegs:$ptr, CPURegs:$val))]>;
+  def ATOMIC_SWAP_I16 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$val),
+    "atomic_swap_16\t$dst, $ptr, $val",
+    [(set CPURegs:$dst, (atomic_swap_16 CPURegs:$ptr, CPURegs:$val))]>;
+  def ATOMIC_SWAP_I32 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$val),
+    "atomic_swap_32\t$dst, $ptr, $val",
+    [(set CPURegs:$dst, (atomic_swap_32 CPURegs:$ptr, CPURegs:$val))]>;
+
+  def ATOMIC_CMP_SWAP_I8 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval),
+    "atomic_cmp_swap_8\t$dst, $ptr, $oldval, $newval",
+    [(set CPURegs:$dst,
+         (atomic_cmp_swap_8 CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval))]>;
+  def ATOMIC_CMP_SWAP_I16 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval),
+    "atomic_cmp_swap_16\t$dst, $ptr, $oldval, $newval",
+    [(set CPURegs:$dst,
+         (atomic_cmp_swap_16 CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval))]>;
+  def ATOMIC_CMP_SWAP_I32 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval),
+    "atomic_cmp_swap_32\t$dst, $ptr, $oldval, $newval",
+    [(set CPURegs:$dst,
+         (atomic_cmp_swap_32 CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval))]>;
+}
 
 //===----------------------------------------------------------------------===//
 // Instruction definition
@@ -389,9 +542,9 @@ def XORi    : LogicI<0x0e, "xori",  xor>;
 def LUi     : LoadUpper<0x0f, "lui">;
 
 /// Arithmetic Instructions (3-Operand, R-Type)
-def ADDu    : ArithR<0x00, 0x21, "addu", add, IIAlu>;
+def ADDu    : ArithR<0x00, 0x21, "addu", add, IIAlu, 1>;
 def SUBu    : ArithR<0x00, 0x23, "subu", sub, IIAlu>;
-def ADD     : ArithOverflowR<0x00, 0x20, "add">;
+def ADD     : ArithOverflowR<0x00, 0x20, "add", 1>;
 def SUB     : ArithOverflowR<0x00, 0x22, "sub">;
 def SLT     : SetCC_R<0x00, 0x2a, "slt", setlt>;
 def SLTu    : SetCC_R<0x00, 0x2b, "sltu", setult>;
@@ -424,6 +577,14 @@ def SB      : StoreM<0x28, "sb", truncstorei8>;
 def SH      : StoreM<0x29, "sh", truncstorei16>;
 def SW      : StoreM<0x2b, "sw", store>;
 
+/// Load-linked, Store-conditional
+let hasDelaySlot = 1 in
+  def LL    : FI<0x30, (outs CPURegs:$dst), (ins mem:$addr),
+              "ll\t$dst, $addr", [], IILoad>;
+let Constraints = "$src = $dst" in
+  def SC    : FI<0x38, (outs CPURegs:$dst), (ins CPURegs:$src, mem:$addr),
+              "sc\t$src, $addr", [], IIStore>;
+
 /// Jump and Branch Instructions
 def J       : JumpFJ<0x02, "j">;
 def JR      : JumpFR<0x00, 0x08, "jr">;
@@ -491,8 +652,9 @@ def MIPS_CMOV_ZERO  : PatLeaf<(i32 0)>;
 def MIPS_CMOV_NZERO : PatLeaf<(i32 1)>;
 
 // Conditional moves:
-// These instructions are expanded in MipsISelLowering::EmitInstrWithCustomInserter
-// if target does not have conditional move instructions.
+// These instructions are expanded in
+// MipsISelLowering::EmitInstrWithCustomInserter if target does not have
+// conditional move instructions.
 // flag:int, data:int
 let usesCustomInserter = 1, shamt = 0, Constraints = "$F = $dst" in
   class CondMovIntInt<bits<6> funct, string instr_asm> :
@@ -514,14 +676,16 @@ let addr=0 in
 def LEA_ADDiu : EffectiveAddress<"addiu\t$dst, ${addr:stackloc}">;
 
 // MADD*/MSUB*
-def MADD  : MArithR<0, "madd", MipsMAdd>;
-def MADDU : MArithR<1, "maddu", MipsMAddu>;
+def MADD  : MArithR<0, "madd", MipsMAdd, 1>;
+def MADDU : MArithR<1, "maddu", MipsMAddu, 1>;
 def MSUB  : MArithR<4, "msub", MipsMSub>;
 def MSUBU : MArithR<5, "msubu", MipsMSubu>;
 
 // MUL is a assembly macro in the current used ISAs. In recent ISA's
 // it is a real instruction.
-def MUL   : ArithR<0x1c, 0x02, "mul", mul, IIImul>, Requires<[IsMips32]>;
+def MUL   : ArithR<0x1c, 0x02, "mul", mul, IIImul, 1>, Requires<[IsMips32]>;
+
+def RDHWR : ReadHardware;
 
 //===----------------------------------------------------------------------===//
 //  Arbitrary patterns that map to one or more instructions
@@ -555,6 +719,7 @@ def : Pat<(MipsJmpLink (i32 texternalsym:$dst)),
 
 // hi/lo relocs
 def : Pat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>;
+def : Pat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>;
 def : Pat<(add CPURegs:$hi, (MipsLo tglobaladdr:$lo)),
           (ADDiu CPURegs:$hi, tglobaladdr:$lo)>;
 def : Pat<(add CPURegs:$hi, (MipsLo tblockaddress:$lo)),
@@ -574,6 +739,26 @@ def : Pat<(add CPURegs:$gp, (MipsGPRel tglobaladdr:$in)),
 def : Pat<(add CPURegs:$gp, (MipsGPRel tconstpool:$in)),
           (ADDiu CPURegs:$gp, tconstpool:$in)>;
 
+// tlsgd
+def : Pat<(add CPURegs:$gp, (MipsTlsGd tglobaltlsaddr:$in)),
+          (ADDiu CPURegs:$gp, tglobaltlsaddr:$in)>;
+
+// tprel hi/lo
+def : Pat<(MipsTprelHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>;
+def : Pat<(add CPURegs:$hi, (MipsTprelLo tglobaltlsaddr:$lo)),
+          (ADDiu CPURegs:$hi, tglobaltlsaddr:$lo)>;
+
+// wrapper_pic
+class WrapperPICPat<SDNode node>:
+      Pat<(MipsWrapperPIC node:$in),
+          (ADDiu GP, node:$in)>;
+
+def : WrapperPICPat<tglobaladdr>;
+def : WrapperPICPat<tconstpool>;
+def : WrapperPICPat<texternalsym>;
+def : WrapperPICPat<tblockaddress>;
+def : WrapperPICPat<tjumptable>;
+
 // Mips does not have "not", so we expand our way
 def : Pat<(not CPURegs:$in),
           (NOR CPURegs:$in, ZERO)>;
@@ -641,13 +826,6 @@ multiclass MovnPats<RegisterClass RC, Instruction MOVNInst> {
 defm : MovzPats<CPURegs, MOVZ_I>;
 defm : MovnPats<CPURegs, MOVN_I>;
 
-// select patterns with got access
-let AddedComplexity = 10 in
-  def : Pat<(select (setne CPURegs:$lhs, CPURegs:$rhs),
-                    (i32 tglobaladdr:$T), CPURegs:$F),
-            (MOVN_I CPURegs:$F, (ADDiu GP, tglobaladdr:$T),
-                    (XOR CPURegs:$lhs, CPURegs:$rhs))>;
-
 // setcc patterns
 def : Pat<(seteq CPURegs:$lhs, CPURegs:$rhs),
           (SLTu (XOR CPURegs:$lhs, CPURegs:$rhs), 1)>;
diff --git a/lib/Target/Mips/MipsMCAsmInfo.cpp b/lib/Target/Mips/MipsMCAsmInfo.cpp
index fe48ab7..c86bf40 100644
--- a/lib/Target/Mips/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MipsMCAsmInfo.cpp
@@ -17,11 +17,15 @@ using namespace llvm;
 MipsMCAsmInfo::MipsMCAsmInfo(const Target &T, StringRef TT) {
   AlignmentIsInBytes          = false;
   Data16bitsDirective         = "\t.half\t";
-  Data32bitsDirective         = "\t.word\t";
+  Data32bitsDirective         = "\t.4byte\t";
   Data64bitsDirective         = 0;
   PrivateGlobalPrefix         = "$";
   CommentString               = "#";
   ZeroDirective               = "\t.space\t";
   GPRel32Directive            = "\t.gpword\t";
-  HasSetDirective             = false;
+  WeakRefDirective            = "\t.weak\t";
+
+  SupportsDebugInformation = true;
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+  HasLEB128 = true;
 }
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index 1e8e4fe..df40e6c 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -14,6 +14,7 @@
 #ifndef MIPS_MACHINE_FUNCTION_INFO_H
 #define MIPS_MACHINE_FUNCTION_INFO_H
 
+#include <utility>
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/VectorExtras.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -26,50 +27,6 @@ namespace llvm {
 class MipsFunctionInfo : public MachineFunctionInfo {
 
 private:
-  /// Holds for each function where on the stack the Frame Pointer must be
-  /// saved. This is used on Prologue and Epilogue to emit FP save/restore
-  int FPStackOffset;
-
-  /// Holds for each function where on the stack the Return Address must be
-  /// saved. This is used on Prologue and Epilogue to emit RA save/restore
-  int RAStackOffset;
-
-  /// At each function entry, two special bitmask directives must be emitted
-  /// to help debugging, for CPU and FPU callee saved registers. Both need
-  /// the negative offset from the final stack size and its higher registers
-  /// location on the stack.
-  int CPUTopSavedRegOff;
-  int FPUTopSavedRegOff;
-
-  /// MipsFIHolder - Holds a FrameIndex and it's Stack Pointer Offset
-  struct MipsFIHolder {
-
-    int FI;
-    int SPOffset;
-
-    MipsFIHolder(int FrameIndex, int StackPointerOffset)
-      : FI(FrameIndex), SPOffset(StackPointerOffset) {}
-  };
-
-  /// When PIC is used the GP must be saved on the stack on the function
-  /// prologue and must be reloaded from this stack location after every
-  /// call. A reference to its stack location and frame index must be kept
-  /// to be used on emitPrologue and processFunctionBeforeFrameFinalized.
-  MipsFIHolder GPHolder;
-
-  /// On LowerFormalArguments the stack size is unknown, so the Stack
-  /// Pointer Offset calculation of "not in register arguments" must be
-  /// postponed to emitPrologue.
-  SmallVector<MipsFIHolder, 16> FnLoadArgs;
-  bool HasLoadArgs;
-
-  // When VarArgs, we must write registers back to caller stack, preserving
-  // on register arguments. Since the stack size is unknown on
-  // LowerFormalArguments, the Stack Pointer Offset calculation must be
-  // postponed to emitPrologue.
-  SmallVector<MipsFIHolder, 4> FnStoreVarArgs;
-  bool HasStoreVarArgs;
-
   /// SRetReturnReg - Some subtargets require that sret lowering includes
   /// returning the value of the returned struct in a register. This field
   /// holds the virtual register into which the sret argument is passed.
@@ -83,55 +40,47 @@ private:
   /// VarArgsFrameIndex - FrameIndex for start of varargs area.
   int VarArgsFrameIndex;
 
+  // Range of frame object indices.
+  // InArgFIRange: Range of indices of all frame objects created during call to
+  //               LowerFormalArguments.
+  // OutArgFIRange: Range of indices of all frame objects created during call to
+  //                LowerCall except for the frame object for restoring $gp. 
+  std::pair<int, int> InArgFIRange, OutArgFIRange;
+  int GPFI; // Index of the frame object for restoring $gp 
+  unsigned MaxCallFrameSize;
+
+  /// AtomicFrameIndex - To implement atomic.swap and atomic.cmp.swap
+  /// intrinsics, it is necessary to use a temporary stack location.
+  /// This field holds the frame index of this location.
+  int AtomicFrameIndex;
 public:
   MipsFunctionInfo(MachineFunction& MF)
-  : FPStackOffset(0), RAStackOffset(0), CPUTopSavedRegOff(0),
-    FPUTopSavedRegOff(0), GPHolder(-1,-1), HasLoadArgs(false),
-    HasStoreVarArgs(false), SRetReturnReg(0), GlobalBaseReg(0),
-    VarArgsFrameIndex(0)
+  : SRetReturnReg(0), GlobalBaseReg(0),
+    VarArgsFrameIndex(0), InArgFIRange(std::make_pair(-1, 0)),
+    OutArgFIRange(std::make_pair(-1, 0)), GPFI(0), MaxCallFrameSize(0),
+    AtomicFrameIndex(-1)
   {}
 
-  int getFPStackOffset() const { return FPStackOffset; }
-  void setFPStackOffset(int Off) { FPStackOffset = Off; }
-
-  int getRAStackOffset() const { return RAStackOffset; }
-  void setRAStackOffset(int Off) { RAStackOffset = Off; }
-
-  int getCPUTopSavedRegOff() const { return CPUTopSavedRegOff; }
-  void setCPUTopSavedRegOff(int Off) { CPUTopSavedRegOff = Off; }
-
-  int getFPUTopSavedRegOff() const { return FPUTopSavedRegOff; }
-  void setFPUTopSavedRegOff(int Off) { FPUTopSavedRegOff = Off; }
-
-  int getGPStackOffset() const { return GPHolder.SPOffset; }
-  int getGPFI() const { return GPHolder.FI; }
-  void setGPStackOffset(int Off) { GPHolder.SPOffset = Off; }
-  void setGPFI(int FI) { GPHolder.FI = FI; }
-  bool needGPSaveRestore() const { return GPHolder.SPOffset != -1; }
-
-  bool hasLoadArgs() const { return HasLoadArgs; }
-  bool hasStoreVarArgs() const { return HasStoreVarArgs; }
-
-  void recordLoadArgsFI(int FI, int SPOffset) {
-    if (!HasLoadArgs) HasLoadArgs=true;
-    FnLoadArgs.push_back(MipsFIHolder(FI, SPOffset));
-  }
-  void recordStoreVarArgsFI(int FI, int SPOffset) {
-    if (!HasStoreVarArgs) HasStoreVarArgs=true;
-    FnStoreVarArgs.push_back(MipsFIHolder(FI, SPOffset));
+  bool isInArgFI(int FI) const {
+    return FI <= InArgFIRange.first && FI >= InArgFIRange.second;
   }
+  void setLastInArgFI(int FI) { InArgFIRange.second = FI; }
 
-  void adjustLoadArgsFI(MachineFrameInfo *MFI) const {
-    if (!hasLoadArgs()) return;
-    for (unsigned i = 0, e = FnLoadArgs.size(); i != e; ++i)
-      MFI->setObjectOffset( FnLoadArgs[i].FI, FnLoadArgs[i].SPOffset );
+  bool isOutArgFI(int FI) const { 
+    return FI <= OutArgFIRange.first && FI >= OutArgFIRange.second;
   }
-  void adjustStoreVarArgsFI(MachineFrameInfo *MFI) const {
-    if (!hasStoreVarArgs()) return;
-    for (unsigned i = 0, e = FnStoreVarArgs.size(); i != e; ++i)
-      MFI->setObjectOffset( FnStoreVarArgs[i].FI, FnStoreVarArgs[i].SPOffset );
+  void extendOutArgFIRange(int FirstFI, int LastFI) {
+    if (!OutArgFIRange.second)
+      // this must be the first time this function was called.
+      OutArgFIRange.first = FirstFI;
+    OutArgFIRange.second = LastFI;
   }
 
+  int getGPFI() const { return GPFI; }
+  void setGPFI(int FI) { GPFI = FI; }
+  bool needGPSaveRestore() const { return getGPFI(); }
+  bool isGPFI(int FI) const { return GPFI && GPFI == FI; }
+
   unsigned getSRetReturnReg() const { return SRetReturnReg; }
   void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
 
@@ -140,6 +89,12 @@ public:
 
   int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
   void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+
+  unsigned getMaxCallFrameSize() const { return MaxCallFrameSize; }
+  void setMaxCallFrameSize(unsigned S) { MaxCallFrameSize = S; }
+
+  int getAtomicFrameIndex() const { return AtomicFrameIndex; }
+  void setAtomicFrameIndex(int Index) { AtomicFrameIndex = Index; }
 };
 
 } // end of namespace llvm
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index acea7da..b0984af 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -65,16 +65,16 @@ getRegisterNumbering(unsigned RegEnum)
     case Mips::T5   : case Mips::F13: return 13;
     case Mips::T6   : case Mips::F14: case Mips::D7: return 14;
     case Mips::T7   : case Mips::F15: return 15;
-    case Mips::T8   : case Mips::F16: case Mips::D8: return 16;
-    case Mips::T9   : case Mips::F17: return 17;
-    case Mips::S0   : case Mips::F18: case Mips::D9: return 18;
-    case Mips::S1   : case Mips::F19: return 19;
-    case Mips::S2   : case Mips::F20: case Mips::D10: return 20;
-    case Mips::S3   : case Mips::F21: return 21;
-    case Mips::S4   : case Mips::F22: case Mips::D11: return 22;
-    case Mips::S5   : case Mips::F23: return 23;
-    case Mips::S6   : case Mips::F24: case Mips::D12: return 24;
-    case Mips::S7   : case Mips::F25: return 25;
+    case Mips::S0   : case Mips::F16: case Mips::D8: return 16;
+    case Mips::S1   : case Mips::F17: return 17;
+    case Mips::S2   : case Mips::F18: case Mips::D9: return 18;
+    case Mips::S3   : case Mips::F19: return 19;
+    case Mips::S4   : case Mips::F20: case Mips::D10: return 20;
+    case Mips::S5   : case Mips::F21: return 21;
+    case Mips::S6   : case Mips::F22: case Mips::D11: return 22;
+    case Mips::S7   : case Mips::F23: return 23;
+    case Mips::T8   : case Mips::F24: case Mips::D12: return 24;
+    case Mips::T9   : case Mips::F25: return 25;
     case Mips::K0   : case Mips::F26: case Mips::D13: return 26;
     case Mips::K1   : case Mips::F27: return 27;
     case Mips::GP   : case Mips::F28: case Mips::D14: return 28;
@@ -98,22 +98,22 @@ getCalleeSavedRegs(const MachineFunction *MF) const
 {
   // Mips callee-save register range is $16-$23, $f20-$f30
   static const unsigned SingleFloatOnlyCalleeSavedRegs[] = {
-    Mips::S0, Mips::S1, Mips::S2, Mips::S3,
-    Mips::S4, Mips::S5, Mips::S6, Mips::S7,
-    Mips::F20, Mips::F21, Mips::F22, Mips::F23, Mips::F24, Mips::F25,
-    Mips::F26, Mips::F27, Mips::F28, Mips::F29, Mips::F30, 0
+    Mips::F30, Mips::F29, Mips::F28, Mips::F27, Mips::F26,
+    Mips::F25, Mips::F24, Mips::F23, Mips::F22, Mips::F21, Mips::F20,
+    Mips::RA, Mips::FP, Mips::S7, Mips::S6, Mips::S5, Mips::S4,
+    Mips::S3, Mips::S2, Mips::S1, Mips::S0, 0
   };
 
-  static const unsigned BitMode32CalleeSavedRegs[] = {
-    Mips::S0, Mips::S1, Mips::S2, Mips::S3,
-    Mips::S4, Mips::S5, Mips::S6, Mips::S7,
-    Mips::F20, Mips::F22, Mips::F24, Mips::F26, Mips::F28, Mips::F30, 0
+  static const unsigned Mips32CalleeSavedRegs[] = {
+    Mips::D15, Mips::D14, Mips::D13, Mips::D12, Mips::D11, Mips::D10,
+    Mips::RA, Mips::FP, Mips::S7, Mips::S6, Mips::S5, Mips::S4,
+    Mips::S3, Mips::S2, Mips::S1, Mips::S0, 0
   };
 
   if (Subtarget.isSingleFloat())
     return SingleFloatOnlyCalleeSavedRegs;
   else
-    return BitMode32CalleeSavedRegs;
+    return Mips32CalleeSavedRegs;
 }
 
 BitVector MipsRegisterInfo::
@@ -127,9 +127,11 @@ getReservedRegs(const MachineFunction &MF) const {
   Reserved.set(Mips::SP);
   Reserved.set(Mips::FP);
   Reserved.set(Mips::RA);
+  Reserved.set(Mips::F31);
+  Reserved.set(Mips::D15);
 
   // SRV4 requires that odd register can't be used.
-  if (!Subtarget.isSingleFloat())
+  if (!Subtarget.isSingleFloat() && !Subtarget.isMips32())
     for (unsigned FReg=(Mips::F0)+1; FReg < Mips::F30; FReg+=2)
       Reserved.set(FReg);
 
@@ -153,6 +155,8 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
                     RegScavenger *RS) const {
   MachineInstr &MI = *II;
   MachineFunction &MF = *MI.getParent()->getParent();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
 
   unsigned i = 0;
   while (!MI.getOperand(i).isFI()) {
@@ -172,9 +176,19 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
                << "spOffset   : " << spOffset << "\n"
                << "stackSize  : " << stackSize << "\n");
 
-  // as explained on LowerFormalArguments, detect negative offsets
-  // and adjust SPOffsets considering the final stack size.
-  int Offset = ((spOffset < 0) ? (stackSize + (-(spOffset+4))) : (spOffset));
+  int Offset;
+
+  // Calculate final offset.
+  // - There is no need to change the offset if the frame object is an outgoing
+  //   argument or a $gp restore location,
+  // - If the frame object is any of the following, its offset must be adjusted
+  //   by adding the size of the stack:
+  //   incoming argument, callee-saved register location or local variable.  
+  if (MipsFI->isOutArgFI(FrameIndex) || MipsFI->isGPFI(FrameIndex))
+    Offset = spOffset;
+  else
+    Offset = spOffset + stackSize;
+
   Offset    += MI.getOperand(i-1).getImm();
 
   DEBUG(errs() << "Offset     : " << Offset << "\n" << "<--------->\n");
@@ -183,25 +197,46 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
   int NewImm = 0;
   MachineBasicBlock &MBB = *MI.getParent();
   bool ATUsed;
-  unsigned OrigReg = getFrameRegister(MF);
-  int OrigImm = Offset;
+  unsigned FrameReg;
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  int MinCSFI = 0;
+  int MaxCSFI = -1;
+
+  if (CSI.size()) {
+    MinCSFI = CSI[0].getFrameIdx();
+    MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+  }
 
-// OrigImm fits in the 16-bit field
-  if (OrigImm < 0x8000 && OrigImm >= -0x8000) {
-    NewReg = OrigReg;
-    NewImm = OrigImm;
+  // The following stack frame objects are always referenced relative to $sp:
+  //  1. Outgoing arguments.
+  //  2. Pointer to dynamically allocated stack space.
+  //  3. Locations for callee-saved registers.
+  // Everything else is referenced relative to whatever register 
+  // getFrameRegister() returns.
+  if (MipsFI->isOutArgFI(FrameIndex) ||
+      (FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI))
+    FrameReg = Mips::SP;
+  else
+    FrameReg = getFrameRegister(MF); 
+  
+  // Offset fits in the 16-bit field
+  if (Offset < 0x8000 && Offset >= -0x8000) {
+    NewReg = FrameReg;
+    NewImm = Offset;
     ATUsed = false;
   }
   else {
     const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
     DebugLoc DL = II->getDebugLoc();
-    int ImmLo = OrigImm & 0xffff;
-    int ImmHi = (((unsigned)OrigImm & 0xffff0000) >> 16) + ((OrigImm & 0x8000) != 0);
+    int ImmLo = (short)(Offset & 0xffff);
+    int ImmHi = (((unsigned)Offset & 0xffff0000) >> 16) +
+                ((Offset & 0x8000) != 0);
 
     // FIXME: change this when mips goes MC".
     BuildMI(MBB, II, DL, TII->get(Mips::NOAT));
     BuildMI(MBB, II, DL, TII->get(Mips::LUi), Mips::AT).addImm(ImmHi);
-    BuildMI(MBB, II, DL, TII->get(Mips::ADDu), Mips::AT).addReg(OrigReg).addReg(Mips::AT);
+    BuildMI(MBB, II, DL, TII->get(Mips::ADDu), Mips::AT).addReg(FrameReg)
+                                                        .addReg(Mips::AT);
     NewReg = Mips::AT;
     NewImm = ImmLo;
     
@@ -216,15 +251,6 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
   MI.getOperand(i-1).ChangeToImmediate(NewImm);
 }
 
-void MipsRegisterInfo::
-processFunctionBeforeFrameFinalized(MachineFunction &MF) const {
-  // Set the stack offset where GP must be saved/loaded from.
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
-  if (MipsFI->needGPSaveRestore())
-    MFI->setObjectOffset(MipsFI->getGPFI(), MipsFI->getGPStackOffset());
-}
-
 unsigned MipsRegisterInfo::
 getRARegister() const {
   return Mips::RA;
@@ -251,8 +277,11 @@ getEHHandlerRegister() const {
 
 int MipsRegisterInfo::
 getDwarfRegNum(unsigned RegNum, bool isEH) const {
-  llvm_unreachable("What is the dwarf register number");
-  return -1;
+  return MipsGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
+}
+
+int MipsRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
+  return MipsGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
 }
 
 #include "MipsGenRegisterInfo.inc"
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index 767359f..76b0035 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -63,6 +63,7 @@ struct MipsRegisterInfo : public MipsGenRegisterInfo {
   unsigned getEHHandlerRegister() const;
 
   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index 9f9cae7..f0db518 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -44,6 +44,11 @@ class AFPR<bits<5> num, string n, list<Register> subregs>
   let SubRegIndices = [sub_fpeven, sub_fpodd];
 }
 
+// Mips Hardware Registers
+class HWR<bits<5> num, string n> : MipsReg<n> {
+  let Num = num;
+}
+
 //===----------------------------------------------------------------------===//
 //  Registers
 //===----------------------------------------------------------------------===//
@@ -55,7 +60,7 @@ let Namespace = "Mips" in {
   def AT   : MipsGPRReg< 1, "AT">,   DwarfRegNum<[1]>;
   def V0   : MipsGPRReg< 2, "2">,    DwarfRegNum<[2]>;
   def V1   : MipsGPRReg< 3, "3">,    DwarfRegNum<[3]>;
-  def A0   : MipsGPRReg< 4, "4">,    DwarfRegNum<[5]>;
+  def A0   : MipsGPRReg< 4, "4">,    DwarfRegNum<[4]>;
   def A1   : MipsGPRReg< 5, "5">,    DwarfRegNum<[5]>;
   def A2   : MipsGPRReg< 6, "6">,    DwarfRegNum<[6]>;
   def A3   : MipsGPRReg< 7, "7">,    DwarfRegNum<[7]>;
@@ -120,22 +125,22 @@ let Namespace = "Mips" in {
 
   /// Mips Double point precision FPU Registers (aliased
   /// with the single precision to hold 64 bit values)
-  def D0  : AFPR< 0,  "F0", [F0,   F1]>, DwarfRegNum<[32]>;
-  def D1  : AFPR< 2,  "F2", [F2,   F3]>, DwarfRegNum<[34]>;
-  def D2  : AFPR< 4,  "F4", [F4,   F5]>, DwarfRegNum<[36]>;
-  def D3  : AFPR< 6,  "F6", [F6,   F7]>, DwarfRegNum<[38]>;
-  def D4  : AFPR< 8,  "F8", [F8,   F9]>, DwarfRegNum<[40]>;
-  def D5  : AFPR<10, "F10", [F10, F11]>, DwarfRegNum<[42]>;
-  def D6  : AFPR<12, "F12", [F12, F13]>, DwarfRegNum<[44]>;
-  def D7  : AFPR<14, "F14", [F14, F15]>, DwarfRegNum<[46]>;
-  def D8  : AFPR<16, "F16", [F16, F17]>, DwarfRegNum<[48]>;
-  def D9  : AFPR<18, "F18", [F18, F19]>, DwarfRegNum<[50]>;
-  def D10 : AFPR<20, "F20", [F20, F21]>, DwarfRegNum<[52]>;
-  def D11 : AFPR<22, "F22", [F22, F23]>, DwarfRegNum<[54]>;
-  def D12 : AFPR<24, "F24", [F24, F25]>, DwarfRegNum<[56]>;
-  def D13 : AFPR<26, "F26", [F26, F27]>, DwarfRegNum<[58]>;
-  def D14 : AFPR<28, "F28", [F28, F29]>, DwarfRegNum<[60]>;
-  def D15 : AFPR<30, "F30", [F30, F31]>, DwarfRegNum<[62]>;
+  def D0  : AFPR< 0,  "F0", [F0,   F1]>;
+  def D1  : AFPR< 2,  "F2", [F2,   F3]>;
+  def D2  : AFPR< 4,  "F4", [F4,   F5]>;
+  def D3  : AFPR< 6,  "F6", [F6,   F7]>;
+  def D4  : AFPR< 8,  "F8", [F8,   F9]>;
+  def D5  : AFPR<10, "F10", [F10, F11]>;
+  def D6  : AFPR<12, "F12", [F12, F13]>;
+  def D7  : AFPR<14, "F14", [F14, F15]>;
+  def D8  : AFPR<16, "F16", [F16, F17]>;
+  def D9  : AFPR<18, "F18", [F18, F19]>;
+  def D10 : AFPR<20, "F20", [F20, F21]>;
+  def D11 : AFPR<22, "F22", [F22, F23]>;
+  def D12 : AFPR<24, "F24", [F24, F25]>;
+  def D13 : AFPR<26, "F26", [F26, F27]>;
+  def D14 : AFPR<28, "F28", [F28, F29]>;
+  def D15 : AFPR<30, "F30", [F30, F31]>;
 
   // Hi/Lo registers
   def HI  : Register<"hi">, DwarfRegNum<[64]>;
@@ -143,33 +148,24 @@ let Namespace = "Mips" in {
 
   // Status flags register
   def FCR31 : Register<"31">;
+
+  // Hardware register $29
+  def HWR29 : Register<"29">;
 }
 
 //===----------------------------------------------------------------------===//
 // Register Classes
 //===----------------------------------------------------------------------===//
 
-def CPURegs : RegisterClass<"Mips", [i32], 32,
+def CPURegs : RegisterClass<"Mips", [i32], 32, (add
   // Return Values and Arguments
-  [V0, V1, A0, A1, A2, A3,
+  V0, V1, A0, A1, A2, A3,
   // Not preserved across procedure calls
   T0, T1, T2, T3, T4, T5, T6, T7, T8, T9,
   // Callee save
   S0, S1, S2, S3, S4, S5, S6, S7,
   // Reserved
-  ZERO, AT, K0, K1, GP, SP, FP, RA]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    CPURegsClass::iterator
-    CPURegsClass::allocation_order_end(const MachineFunction &MF) const {
-      // The last 8 registers on the list above are reserved
-      return end()-8;
-    }
-  }];
-}
+  ZERO, AT, K0, K1, GP, SP, FP, RA)>;
 
 // 64bit fp:
 // * FGR64  - 32 64-bit registers
@@ -178,87 +174,25 @@ def CPURegs : RegisterClass<"Mips", [i32], 32,
 // 32bit fp:
 // * FGR32 - 16 32-bit even registers
 // * FGR32 - 32 32-bit registers (single float only mode)
-def FGR32 : RegisterClass<"Mips", [f32], 32,
-  // Return Values and Arguments
-  [F0, F1, F2, F3, F12, F13, F14, F15,
-  // Not preserved across procedure calls
-  F4, F5, F6, F7, F8, F9, F10, F11, F16, F17, F18, F19,
-  // Callee save
-  F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30,
-  // Reserved
-  F31]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-
-    static const unsigned MIPS_FGR32[] = {
-      Mips::F0,  Mips::F1,  Mips::F2,  Mips::F3,  Mips::F12,  Mips::F13,
-      Mips::F14, Mips::F15, Mips::F4,  Mips::F5,  Mips::F6,   Mips::F7,
-      Mips::F8,  Mips::F9,  Mips::F10, Mips::F11, Mips::F16,  Mips::F17,
-      Mips::F18, Mips::F19, Mips::F20, Mips::F21, Mips::F22,  Mips::F23,
-      Mips::F24, Mips::F25, Mips::F26, Mips::F27, Mips::F28,  Mips::F29,
-      Mips::F30
-    };
-
-    static const unsigned MIPS_SVR4_FGR32[] = {
-      Mips::F0,  Mips::F2,  Mips::F12, Mips::F14, Mips::F4,
-      Mips::F6,  Mips::F8,  Mips::F10, Mips::F16, Mips::F18,
-      Mips::F20, Mips::F22, Mips::F24, Mips::F26, Mips::F28, Mips::F30,
-    };
-
-    FGR32Class::iterator
-    FGR32Class::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
-
-      if (Subtarget.isSingleFloat())
-        return MIPS_FGR32;
-      else
-        return MIPS_SVR4_FGR32;
-    }
-
-    FGR32Class::iterator
-    FGR32Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
-
-      if (Subtarget.isSingleFloat())
-        return MIPS_FGR32 + (sizeof(MIPS_FGR32) / sizeof(unsigned));
-      else
-        return MIPS_SVR4_FGR32 + (sizeof(MIPS_SVR4_FGR32) / sizeof(unsigned));
-    }
-  }];
-}
+def FGR32 : RegisterClass<"Mips", [f32], 32, (sequence "F%u", 0, 31)>;
 
-def AFGR64 : RegisterClass<"Mips", [f64], 64,
+def AFGR64 : RegisterClass<"Mips", [f64], 64, (add
   // Return Values and Arguments
-  [D0, D1, D6, D7,
+  D0, D1, D6, D7,
   // Not preserved across procedure calls
   D2, D3, D4, D5, D8, D9,
   // Callee save
   D10, D11, D12, D13, D14,
   // Reserved
-  D15]>
-{
+  D15)> {
   let SubRegClasses = [(FGR32 sub_fpeven, sub_fpodd)];
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    AFGR64Class::iterator
-    AFGR64Class::allocation_order_end(const MachineFunction &MF) const {
-      // The last register on the list above is reserved
-      return end()-1;
-    }
-  }];
 }
 
 // Condition Register for floating point operations
-def CCR  : RegisterClass<"Mips", [i32], 32, [FCR31]>;
+def CCR  : RegisterClass<"Mips", [i32], 32, (add FCR31)>;
 
 // Hi/Lo Registers
-def HILO : RegisterClass<"Mips", [i32], 32, [HI, LO]>;
+def HILO : RegisterClass<"Mips", [i32], 32, (add HI, LO)>;
 
+// Hardware registers
+def HWRegs : RegisterClass<"Mips", [i32], 32, (add HWR29)>;
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 7a2dd1f..cfbb92c 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -38,8 +38,9 @@ MipsTargetMachine(const Target &T, const std::string &TT, const std::string &FS,
                   bool isLittle=false):
   LLVMTargetMachine(T, TT),
   Subtarget(TT, FS, isLittle),
-  DataLayout(isLittle ? std::string("e-p:32:32:32-i8:8:32-i16:16:32-n32") :
-                        std::string("E-p:32:32:32-i8:8:32-i16:16:32-n32")),
+  DataLayout(isLittle ? 
+             std::string("e-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32") :
+             std::string("E-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32")),
   InstrInfo(*this),
   FrameLowering(Subtarget),
   TLInfo(*this), TSInfo(*this) {
@@ -75,3 +76,15 @@ addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel)
   PM.add(createMipsDelaySlotFillerPass(*this));
   return true;
 }
+
+bool MipsTargetMachine::
+addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel) {
+  PM.add(createMipsEmitGPRestorePass(*this));
+  return true;
+}
+
+bool MipsTargetMachine::
+addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel) {
+  PM.add(createMipsExpandPseudoPass(*this));
+  return true;
+}
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index 43ab798..102dd85 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -63,6 +63,9 @@ namespace llvm {
                                  CodeGenOpt::Level OptLevel);
     virtual bool addPreEmitPass(PassManagerBase &PM,
                                 CodeGenOpt::Level OptLevel);
+    virtual bool addPreRegAlloc(PassManagerBase &PM,
+                                CodeGenOpt::Level OptLevel);
+    virtual bool addPostRegAlloc(PassManagerBase &, CodeGenOpt::Level);
   };
 
 /// MipselTargetMachine - Mipsel target machine.
diff --git a/lib/Target/PTX/CMakeLists.txt b/lib/Target/PTX/CMakeLists.txt
index 331266d..c4448d6 100644
--- a/lib/Target/PTX/CMakeLists.txt
+++ b/lib/Target/PTX/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(LLVM_TARGET_DEFINITIONS PTX.td)
 
 tablegen(PTXGenAsmWriter.inc -gen-asm-writer)
+tablegen(PTXGenCallingConv.inc -gen-callingconv)
 tablegen(PTXGenDAGISel.inc -gen-dag-isel)
 tablegen(PTXGenInstrInfo.inc -gen-instr-desc)
 tablegen(PTXGenInstrNames.inc -gen-instr-enums)
diff --git a/lib/Target/PTX/Makefile b/lib/Target/PTX/Makefile
index 2c40d69..844480f 100644
--- a/lib/Target/PTX/Makefile
+++ b/lib/Target/PTX/Makefile
@@ -13,6 +13,7 @@ TARGET = PTX
 
 # Make sure that tblgen is run, first thing.
 BUILT_SOURCES = PTXGenAsmWriter.inc \
+		PTXGenCallingConv.inc \
 		PTXGenDAGISel.inc \
 		PTXGenInstrInfo.inc \
 		PTXGenInstrNames.inc \
diff --git a/lib/Target/PTX/PTX.h b/lib/Target/PTX/PTX.h
index 49045cd..ec2be92 100644
--- a/lib/Target/PTX/PTX.h
+++ b/lib/Target/PTX/PTX.h
@@ -42,7 +42,8 @@ namespace llvm {
   FunctionPass *createPTXMFInfoExtract(PTXTargetMachine &TM,
                                        CodeGenOpt::Level OptLevel);
 
-  extern Target ThePTXTarget;
+  extern Target ThePTX32Target;
+  extern Target ThePTX64Target;
 } // namespace llvm;
 
 // Defines symbolic names for PTX registers.
diff --git a/lib/Target/PTX/PTX.td b/lib/Target/PTX/PTX.td
index dbc6f57..2c7bd3b 100644
--- a/lib/Target/PTX/PTX.td
+++ b/lib/Target/PTX/PTX.td
@@ -24,8 +24,8 @@ include "llvm/Target/Target.td"
 def FeatureDouble : SubtargetFeature<"double", "SupportsDouble", "true",
                                      "Do not demote .f64 to .f32">;
 
-def Feature64Bit : SubtargetFeature<"64bit", "Use64BitAddresses", "true",
-                                    "Use 64-bit integer types for addresses.">;
+def FeatureNoFMA  : SubtargetFeature<"no-fma","SupportsFMA", "false",
+                                     "Disable Fused-Multiply Add">;
 
 //===- PTX Version --------------------------------------------------------===//
 
@@ -41,6 +41,10 @@ def FeaturePTX22 : SubtargetFeature<"ptx22", "PTXVersion", "PTX_VERSION_2_2",
                                     "Use PTX Language Version 2.2",
                                     [FeaturePTX21]>;
 
+def FeaturePTX23 : SubtargetFeature<"ptx23", "PTXVersion", "PTX_VERSION_2_3",
+                                    "Use PTX Language Version 2.3",
+                                    [FeaturePTX22]>;
+
 //===- PTX Shader Model ---------------------------------------------------===//
 
 def FeatureSM10 : SubtargetFeature<"sm10", "PTXShaderModel", "PTX_SM_1_0",
@@ -68,6 +72,12 @@ def : Proc<"generic", []>;
 include "PTXRegisterInfo.td"
 
 //===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "PTXCallingConv.td"
+
+//===----------------------------------------------------------------------===//
 // Instruction Descriptions
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/PTX/PTXAsmPrinter.cpp b/lib/Target/PTX/PTXAsmPrinter.cpp
index 27c9605..1142144 100644
--- a/lib/Target/PTX/PTXAsmPrinter.cpp
+++ b/lib/Target/PTX/PTXAsmPrinter.cpp
@@ -79,12 +79,12 @@ static const char PARAM_PREFIX[] = "__param_";
 static const char *getRegisterTypeName(unsigned RegNo) {
 #define TEST_REGCLS(cls, clsstr)                \
   if (PTX::cls ## RegisterClass->contains(RegNo)) return # clsstr;
-  TEST_REGCLS(Preds, pred);
-  TEST_REGCLS(RRegu16, u16);
-  TEST_REGCLS(RRegu32, u32);
-  TEST_REGCLS(RRegu64, u64);
-  TEST_REGCLS(RRegf32, f32);
-  TEST_REGCLS(RRegf64, f64);
+  TEST_REGCLS(RegPred, pred);
+  TEST_REGCLS(RegI16, b16);
+  TEST_REGCLS(RegI32, b32);
+  TEST_REGCLS(RegI64, b64);
+  TEST_REGCLS(RegF32, b32);
+  TEST_REGCLS(RegF64, b64);
 #undef TEST_REGCLS
 
   llvm_unreachable("Not in any register class!");
@@ -226,7 +226,7 @@ void PTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
       OS << *Mang->getSymbol(MO.getGlobal());
       break;
     case MachineOperand::MO_Immediate:
-      OS << (int) MO.getImm();
+      OS << (long) MO.getImm();
       break;
     case MachineOperand::MO_MachineBasicBlock:
       OS << *MO.getMBB()->getSymbol();
@@ -308,34 +308,60 @@ void PTXAsmPrinter::EmitVariableDeclaration(const GlobalVariable *gv) {
     const PointerType* pointerTy = dyn_cast<const PointerType>(gv->getType());
     const Type* elementTy = pointerTy->getElementType();
 
-    assert(elementTy->isArrayTy() && "Only pointers to arrays are supported");
+    decl += ".b8 ";
+    decl += gvsym->getName();
+    decl += "[";
 
-    const ArrayType* arrayTy = dyn_cast<const ArrayType>(elementTy);
-    elementTy = arrayTy->getElementType();
+    if (elementTy->isArrayTy())
+    {
+      assert(elementTy->isArrayTy() && "Only pointers to arrays are supported");
 
-    unsigned numElements = arrayTy->getNumElements();
+      const ArrayType* arrayTy = dyn_cast<const ArrayType>(elementTy);
+      elementTy = arrayTy->getElementType();
 
-    while (elementTy->isArrayTy()) {
+      unsigned numElements = arrayTy->getNumElements();
 
-      arrayTy = dyn_cast<const ArrayType>(elementTy);
-      elementTy = arrayTy->getElementType();
+      while (elementTy->isArrayTy()) {
 
-      numElements *= arrayTy->getNumElements();
-    }
+        arrayTy = dyn_cast<const ArrayType>(elementTy);
+        elementTy = arrayTy->getElementType();
 
-    // FIXME: isPrimitiveType() == false for i16?
-    assert(elementTy->isSingleValueType() &&
-           "Non-primitive types are not handled");
+        numElements *= arrayTy->getNumElements();
+      }
 
-    // Compute the size of the array, in bytes.
-    uint64_t arraySize = (elementTy->getPrimitiveSizeInBits() >> 3)
-                       * numElements;
+      // FIXME: isPrimitiveType() == false for i16?
+      assert(elementTy->isSingleValueType() &&
+              "Non-primitive types are not handled");
+
+      // Compute the size of the array, in bytes.
+      uint64_t arraySize = (elementTy->getPrimitiveSizeInBits() >> 3)
+                        * numElements;
+
+      decl += utostr(arraySize);
+    }
 
-    decl += ".b8 ";
-    decl += gvsym->getName();
-    decl += "[";
-    decl += utostr(arraySize);
     decl += "]";
+
+    // handle string constants (assume ConstantArray means string)
+
+    if (gv->hasInitializer())
+    {
+      Constant *C = gv->getInitializer();  
+      if (const ConstantArray *CA = dyn_cast<ConstantArray>(C))
+      {
+        decl += " = {";
+
+        for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i)
+        {
+          if (i > 0)   decl += ",";
+
+          decl += "0x" +
+                utohexstr(cast<ConstantInt>(CA->getOperand(i))->getZExtValue());
+        }
+
+        decl += "}";
+      }
+    }
   }
   else {
     // Note: this is currently the fall-through case and most likely generates
@@ -368,17 +394,23 @@ void PTXAsmPrinter::EmitFunctionDeclaration() {
 
   const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>();
   const bool isKernel = MFI->isKernel();
-  unsigned reg;
 
   std::string decl = isKernel ? ".entry" : ".func";
 
-  // Print return register
-  reg = MFI->retReg();
-  if (!isKernel && reg != PTX::NoRegister) {
-    decl += " (.reg ."; // FIXME: could it return in .param space?
-    decl += getRegisterTypeName(reg);
-    decl += " ";
-    decl += getRegisterName(reg);
+  if (!isKernel) {
+    decl += " (";
+
+    for (PTXMachineFunctionInfo::ret_iterator
+         i = MFI->retRegBegin(), e = MFI->retRegEnd(), b = i;
+         i != e; ++i) {
+      if (i != b) {
+        decl += ", ";
+      }
+      decl += ".reg .";
+      decl += getRegisterTypeName(*i);
+      decl += " ";
+      decl += getRegisterName(*i);
+    }
     decl += ")";
   }
 
@@ -386,40 +418,66 @@ void PTXAsmPrinter::EmitFunctionDeclaration() {
   decl += " ";
   decl += CurrentFnSym->getName().str();
 
-  // Print parameter list
-  if (!MFI->argRegEmpty()) {
-    decl += " (";
+  decl += " (";
+
+  unsigned cnt = 0;
+
+  // Print parameters
+  for (PTXMachineFunctionInfo::reg_iterator
+       i = MFI->argRegBegin(), e = MFI->argRegEnd(), b = i;
+       i != e; ++i) {
+    if (i != b) {
+      decl += ", ";
+    }
     if (isKernel) {
-      unsigned cnt = 0;
-      for(PTXMachineFunctionInfo::reg_iterator
-          i = MFI->argRegBegin(), e = MFI->argRegEnd(), b = i;
-          i != e; ++i) {
-        reg = *i;
-        assert(reg != PTX::NoRegister && "Not a valid register!");
-        if (i != b)
-          decl += ", ";
-        decl += ".param .";
-        decl += getRegisterTypeName(reg);
-        decl += " ";
-        decl += PARAM_PREFIX;
-        decl += utostr(++cnt);
-      }
+      decl += ".param .b";
+      decl += utostr(*i);
+      decl += " ";
+      decl += PARAM_PREFIX;
+      decl += utostr(++cnt);
     } else {
-      for (PTXMachineFunctionInfo::reg_iterator
-           i = MFI->argRegBegin(), e = MFI->argRegEnd(), b = i;
-           i != e; ++i) {
-        reg = *i;
-        assert(reg != PTX::NoRegister && "Not a valid register!");
-        if (i != b)
-          decl += ", ";
-        decl += ".reg .";
-        decl += getRegisterTypeName(reg);
-        decl += " ";
-        decl += getRegisterName(reg);
-      }
+      decl += ".reg .";
+      decl += getRegisterTypeName(*i);
+      decl += " ";
+      decl += getRegisterName(*i);
     }
-    decl += ")";
   }
+  decl += ")";
+
+  // // Print parameter list
+  // if (!MFI->argRegEmpty()) {
+  //   decl += " (";
+  //   if (isKernel) {
+  //     unsigned cnt = 0;
+  //     for(PTXMachineFunctionInfo::reg_iterator
+  //         i = MFI->argRegBegin(), e = MFI->argRegEnd(), b = i;
+  //         i != e; ++i) {
+  //       reg = *i;
+  //       assert(reg != PTX::NoRegister && "Not a valid register!");
+  //       if (i != b)
+  //         decl += ", ";
+  //       decl += ".param .";
+  //       decl += getRegisterTypeName(reg);
+  //       decl += " ";
+  //       decl += PARAM_PREFIX;
+  //       decl += utostr(++cnt);
+  //     }
+  //   } else {
+  //     for (PTXMachineFunctionInfo::reg_iterator
+  //          i = MFI->argRegBegin(), e = MFI->argRegEnd(), b = i;
+  //          i != e; ++i) {
+  //       reg = *i;
+  //       assert(reg != PTX::NoRegister && "Not a valid register!");
+  //       if (i != b)
+  //         decl += ", ";
+  //       decl += ".reg .";
+  //       decl += getRegisterTypeName(reg);
+  //       decl += " ";
+  //       decl += getRegisterName(reg);
+  //     }
+  //   }
+  //   decl += ")";
+  // }
 
   OutStreamer.EmitRawText(Twine(decl));
 }
@@ -447,5 +505,6 @@ printPredicateOperand(const MachineInstr *MI, raw_ostream &O) {
 
 // Force static initialization.
 extern "C" void LLVMInitializePTXAsmPrinter() {
-  RegisterAsmPrinter<PTXAsmPrinter> X(ThePTXTarget);
+  RegisterAsmPrinter<PTXAsmPrinter> X(ThePTX32Target);
+  RegisterAsmPrinter<PTXAsmPrinter> Y(ThePTX64Target);
 }
diff --git a/lib/Target/PTX/PTXCallingConv.td b/lib/Target/PTX/PTXCallingConv.td
new file mode 100644
index 0000000..4d7759b
--- /dev/null
+++ b/lib/Target/PTX/PTXCallingConv.td
@@ -0,0 +1,36 @@
+//===--- PTXCallingConv.td - Calling Conventions -----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the PTX architecture.
+//
+//===----------------------------------------------------------------------===//
+
+// Currently, we reserve one register of each type for return values and let
+// the rest be used for parameters.  This is a dirty hack, but I am not sure
+// how to tell LLVM that registers used for parameter passing cannot be used
+// for return values.
+
+// PTX Calling Conventions
+def CC_PTX : CallingConv<[
+  CCIfType<[i1], CCAssignToReg<[P1, P2, P3, P4, P5, P6, P7]>>,
+  CCIfType<[i16], CCAssignToReg<[RH1, RH2, RH3, RH4, RH5, RH6, RH7]>>,
+  CCIfType<[i32, f32], CCAssignToReg<[R1, R2, R3, R4, R5, R6, R7]>>,
+  CCIfType<[i64, f64], CCAssignToReg<[RD1, RD2, RD3, RD4, RD5, RD6, RD7]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+def RetCC_PTX : CallingConv<[
+  CCIfType<[i1], CCAssignToReg<[P0]>>,
+  CCIfType<[i16], CCAssignToReg<[RH0]>>,
+  CCIfType<[i32, f32], CCAssignToReg<[R0]>>,
+  CCIfType<[i64, f64], CCAssignToReg<[RD0]>>
+]>;
diff --git a/lib/Target/PTX/PTXISelLowering.cpp b/lib/Target/PTX/PTXISelLowering.cpp
index 7187518..c3cdaba 100644
--- a/lib/Target/PTX/PTXISelLowering.cpp
+++ b/lib/Target/PTX/PTXISelLowering.cpp
@@ -16,6 +16,7 @@
 #include "PTXMachineFunctionInfo.h"
 #include "PTXRegisterInfo.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
@@ -24,21 +25,43 @@
 
 using namespace llvm;
 
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "PTXGenCallingConv.inc"
+
+//===----------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===----------------------------------------------------------------------===//
+
 PTXTargetLowering::PTXTargetLowering(TargetMachine &TM)
   : TargetLowering(TM, new TargetLoweringObjectFileELF()) {
   // Set up the register classes.
-  addRegisterClass(MVT::i1,  PTX::PredsRegisterClass);
-  addRegisterClass(MVT::i16, PTX::RRegu16RegisterClass);
-  addRegisterClass(MVT::i32, PTX::RRegu32RegisterClass);
-  addRegisterClass(MVT::i64, PTX::RRegu64RegisterClass);
-  addRegisterClass(MVT::f32, PTX::RRegf32RegisterClass);
-  addRegisterClass(MVT::f64, PTX::RRegf64RegisterClass);
+  addRegisterClass(MVT::i1,  PTX::RegPredRegisterClass);
+  addRegisterClass(MVT::i16, PTX::RegI16RegisterClass);
+  addRegisterClass(MVT::i32, PTX::RegI32RegisterClass);
+  addRegisterClass(MVT::i64, PTX::RegI64RegisterClass);
+  addRegisterClass(MVT::f32, PTX::RegF32RegisterClass);
+  addRegisterClass(MVT::f64, PTX::RegF64RegisterClass);
+
+  setBooleanContents(ZeroOrOneBooleanContent);
 
   setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
 
   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
 
+  // Turn i16 (z)extload into load + (z)extend
+  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Expand);
+
+  // Turn f32 extload into load + fextend
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+
+  // Turn f64 truncstore into trunc + store.
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
   // Customize translation of memory addresses
   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
@@ -46,14 +69,30 @@ PTXTargetLowering::PTXTargetLowering(TargetMachine &TM)
   // Expand BR_CC into BRCOND
   setOperationAction(ISD::BR_CC, MVT::Other, Expand);
 
+  // Expand SELECT_CC into SETCC
+  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+
+  // need to lower SETCC of RegPred into bitwise logic
+  setOperationAction(ISD::SETCC, MVT::i1, Custom);
+
+  setMinFunctionAlignment(2);
+
   // Compute derived properties from the register classes
   computeRegisterProperties();
 }
 
+MVT::SimpleValueType PTXTargetLowering::getSetCCResultType(EVT VT) const {
+  return MVT::i1;
+}
+
 SDValue PTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
     default:
       llvm_unreachable("Unimplemented operand");
+    case ISD::SETCC:
+      return LowerSETCC(Op, DAG);
     case ISD::GlobalAddress:
       return LowerGlobalAddress(Op, DAG);
   }
@@ -78,6 +117,28 @@ const char *PTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
 //                      Custom Lower Operation
 //===----------------------------------------------------------------------===//
 
+SDValue PTXTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getValueType() == MVT::i1 && "SetCC type must be 1-bit integer");
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue Op2 = Op.getOperand(2);
+  DebugLoc dl = Op.getDebugLoc();
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+  // Look for X == 0, X == 1, X != 0, or X != 1  
+  // We can simplify these to bitwise logic
+
+  if (Op1.getOpcode() == ISD::Constant &&
+      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
+       cast<ConstantSDNode>(Op1)->isNullValue()) &&
+      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+
+    return DAG.getNode(ISD::AND, dl, MVT::i1, Op0, Op1);
+  }
+
+  return DAG.getNode(ISD::SETCC, dl, MVT::i1, Op0, Op1, Op2);
+}
+
 SDValue PTXTargetLowering::
 LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
   EVT PtrVT = getPointerTy();
@@ -111,12 +172,12 @@ struct argmap_entry {
   void reset() { loc = RC->begin(); }
   bool operator==(MVT::SimpleValueType _VT) const { return VT == _VT; }
 } argmap[] = {
-  argmap_entry(MVT::i1,  PTX::PredsRegisterClass),
-  argmap_entry(MVT::i16, PTX::RRegu16RegisterClass),
-  argmap_entry(MVT::i32, PTX::RRegu32RegisterClass),
-  argmap_entry(MVT::i64, PTX::RRegu64RegisterClass),
-  argmap_entry(MVT::f32, PTX::RRegf32RegisterClass),
-  argmap_entry(MVT::f64, PTX::RRegf64RegisterClass)
+  argmap_entry(MVT::i1,  PTX::RegPredRegisterClass),
+  argmap_entry(MVT::i16, PTX::RegI16RegisterClass),
+  argmap_entry(MVT::i32, PTX::RegI32RegisterClass),
+  argmap_entry(MVT::i64, PTX::RegI64RegisterClass),
+  argmap_entry(MVT::f32, PTX::RegF32RegisterClass),
+  argmap_entry(MVT::f64, PTX::RegF64RegisterClass)
 };
 }                               // end anonymous namespace
 
@@ -145,44 +206,72 @@ SDValue PTXTargetLowering::
       break;
   }
 
-  // Make sure we don't add argument registers twice
-  if (MFI->isDoneAddArg())
-    llvm_unreachable("cannot add argument registers twice");
-
-  // Reset argmap before allocation
-  for (struct argmap_entry *i = argmap, *e = argmap + array_lengthof(argmap);
-       i != e; ++ i)
-    i->reset();
-
-  for (int i = 0, e = Ins.size(); i != e; ++ i) {
-    MVT::SimpleValueType VT = Ins[i].VT.SimpleTy;
+  if (MFI->isKernel()) {
+    // For kernel functions, we just need to emit the proper READ_PARAM ISDs
+    for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
 
-    struct argmap_entry *entry = std::find(argmap,
-                                           argmap + array_lengthof(argmap), VT);
-    if (entry == argmap + array_lengthof(argmap))
-      llvm_unreachable("Type of argument is not supported");
+      assert(Ins[i].VT != MVT::i1 && "Kernels cannot take pred operands");
 
-    if (MFI->isKernel() && entry->RC == PTX::PredsRegisterClass)
-      llvm_unreachable("cannot pass preds to kernel");
+      SDValue ArgValue = DAG.getNode(PTXISD::READ_PARAM, dl, Ins[i].VT, Chain,
+                                     DAG.getTargetConstant(i, MVT::i32));
+      InVals.push_back(ArgValue);
 
-    MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo();
-
-    unsigned preg = *++(entry->loc); // allocate start from register 1
-    unsigned vreg = RegInfo.createVirtualRegister(entry->RC);
-    RegInfo.addLiveIn(preg, vreg);
-
-    MFI->addArgReg(preg);
-
-    SDValue inval;
-    if (MFI->isKernel())
-      inval = DAG.getNode(PTXISD::READ_PARAM, dl, VT, Chain,
-                          DAG.getTargetConstant(i, MVT::i32));
-    else
-      inval = DAG.getCopyFromReg(Chain, dl, vreg, VT);
-    InVals.push_back(inval);
+      // Instead of storing a physical register in our argument list, we just
+      // store the total size of the parameter, in bits.  The ASM printer
+      // knows how to process this.
+      MFI->addArgReg(Ins[i].VT.getStoreSizeInBits());
+    }
+  }
+  else {
+    // For device functions, we use the PTX calling convention to do register
+    // assignments then create CopyFromReg ISDs for the allocated registers
+
+    SmallVector<CCValAssign, 16> ArgLocs;
+    CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), ArgLocs,
+                   *DAG.getContext());
+
+    CCInfo.AnalyzeFormalArguments(Ins, CC_PTX);
+
+    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+
+      CCValAssign&         VA    = ArgLocs[i];
+      EVT                  RegVT = VA.getLocVT();
+      TargetRegisterClass* TRC   = 0;
+
+      assert(VA.isRegLoc() && "CCValAssign must be RegLoc");
+
+      // Determine which register class we need
+      if (RegVT == MVT::i1) {
+        TRC = PTX::RegPredRegisterClass;
+      }
+      else if (RegVT == MVT::i16) {
+        TRC = PTX::RegI16RegisterClass;
+      }
+      else if (RegVT == MVT::i32) {
+        TRC = PTX::RegI32RegisterClass;
+      }
+      else if (RegVT == MVT::i64) {
+        TRC = PTX::RegI64RegisterClass;
+      }
+      else if (RegVT == MVT::f32) {
+        TRC = PTX::RegF32RegisterClass;
+      }
+      else if (RegVT == MVT::f64) {
+        TRC = PTX::RegF64RegisterClass;
+      }
+      else {
+        llvm_unreachable("Unknown parameter type");
+      }
+
+      unsigned Reg = MF.getRegInfo().createVirtualRegister(TRC);
+      MF.getRegInfo().addLiveIn(VA.getLocReg(), Reg);
+
+      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+      InVals.push_back(ArgValue);
+
+      MFI->addArgReg(VA.getLocReg());
+    }
   }
-
-  MFI->doneAddArg();
 
   return Chain;
 }
@@ -204,51 +293,43 @@ SDValue PTXTargetLowering::
       assert(Outs.size() == 0 && "Kernel must return void.");
       return DAG.getNode(PTXISD::EXIT, dl, MVT::Other, Chain);
     case CallingConv::PTX_Device:
-      assert(Outs.size() <= 1 && "Can at most return one value.");
+      //assert(Outs.size() <= 1 && "Can at most return one value.");
       break;
   }
 
-  // PTX_Device
-
-  // return void
-  if (Outs.size() == 0)
-    return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain);
+  MachineFunction& MF = DAG.getMachineFunction();
+  PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
 
   SDValue Flag;
-  unsigned reg;
 
-  if (Outs[0].VT == MVT::i16) {
-    reg = PTX::RH0;
-  }
-  else if (Outs[0].VT == MVT::i32) {
-    reg = PTX::R0;
-  }
-  else if (Outs[0].VT == MVT::i64) {
-    reg = PTX::RD0;
-  }
-  else if (Outs[0].VT == MVT::f32) {
-    reg = PTX::F0;
-  }
-  else {
-    assert(Outs[0].VT == MVT::f64 && "Can return only basic types");
-    reg = PTX::FD0;
-  }
+  CCInfo.AnalyzeReturn(Outs, RetCC_PTX);
 
-  MachineFunction &MF = DAG.getMachineFunction();
-  PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
-  MFI->setRetReg(reg);
+  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+
+    CCValAssign& VA  = RVLocs[i];
+
+    assert(VA.isRegLoc() && "CCValAssign must be RegLoc");
+
+    unsigned Reg = VA.getLocReg();
 
-  // If this is the first return lowered for this function, add the regs to the
-  // liveout set for the function
-  if (DAG.getMachineFunction().getRegInfo().liveout_empty())
-    DAG.getMachineFunction().getRegInfo().addLiveOut(reg);
+    DAG.getMachineFunction().getRegInfo().addLiveOut(Reg);
 
-  // Copy the result values into the output registers
-  Chain = DAG.getCopyToReg(Chain, dl, reg, OutVals[0], Flag);
+    Chain = DAG.getCopyToReg(Chain, dl, Reg, OutVals[i], Flag);
 
-  // Guarantee that all emitted copies are stuck together,
-  // avoiding something bad
-  Flag = Chain.getValue(1);
+    // Guarantee that all emitted copies are stuck together,
+    // avoiding something bad
+    Flag = Chain.getValue(1);
 
-  return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain, Flag);
+    MFI->addRetReg(Reg);
+  }
+
+  if (Flag.getNode() == 0) {
+    return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain);
+  }
+  else {
+    return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain, Flag);
+  }
 }
diff --git a/lib/Target/PTX/PTXISelLowering.h b/lib/Target/PTX/PTXISelLowering.h
index c69c416..ead17ed 100644
--- a/lib/Target/PTX/PTXISelLowering.h
+++ b/lib/Target/PTX/PTXISelLowering.h
@@ -37,11 +37,10 @@ class PTXTargetLowering : public TargetLowering {
 
     virtual const char *getTargetNodeName(unsigned Opcode) const;
 
-    virtual unsigned getFunctionAlignment(const Function *F) const {
-      return 2; }
-
     virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
 
+    virtual SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+
     virtual SDValue
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv,
@@ -60,6 +59,8 @@ class PTXTargetLowering : public TargetLowering {
                   DebugLoc dl,
                   SelectionDAG &DAG) const;
 
+    virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+
   private:
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
 }; // class PTXTargetLowering
diff --git a/lib/Target/PTX/PTXInstrFormats.td b/lib/Target/PTX/PTXInstrFormats.td
index e4e0999..8cee351 100644
--- a/lib/Target/PTX/PTXInstrFormats.td
+++ b/lib/Target/PTX/PTXInstrFormats.td
@@ -9,7 +9,7 @@
 
 // PTX Predicate operand, default to (0, 0) = (zero-reg, always).
 // Leave PrintMethod empty; predicate printing is defined elsewhere.
-def pred : PredicateOperand<OtherVT, (ops Preds, i32imm),
+def pred : PredicateOperand<OtherVT, (ops RegPred, i32imm),
                                      (ops (i1 zero_reg), (i32 0))>;
 
 let Namespace = "PTX" in {
diff --git a/lib/Target/PTX/PTXInstrInfo.cpp b/lib/Target/PTX/PTXInstrInfo.cpp
index a12a6d0..c305c05 100644
--- a/lib/Target/PTX/PTXInstrInfo.cpp
+++ b/lib/Target/PTX/PTXInstrInfo.cpp
@@ -33,12 +33,12 @@ static const struct map_entry {
   const TargetRegisterClass *cls;
   const int opcode;
 } map[] = {
-  { &PTX::RRegu16RegClass, PTX::MOVU16rr },
-  { &PTX::RRegu32RegClass, PTX::MOVU32rr },
-  { &PTX::RRegu64RegClass, PTX::MOVU64rr },
-  { &PTX::RRegf32RegClass, PTX::MOVF32rr },
-  { &PTX::RRegf64RegClass, PTX::MOVF64rr },
-  { &PTX::PredsRegClass,   PTX::MOVPREDrr }
+  { &PTX::RegI16RegClass, PTX::MOVU16rr },
+  { &PTX::RegI32RegClass, PTX::MOVU32rr },
+  { &PTX::RegI64RegClass, PTX::MOVU64rr },
+  { &PTX::RegF32RegClass, PTX::MOVF32rr },
+  { &PTX::RegF64RegClass, PTX::MOVF64rr },
+  { &PTX::RegPredRegClass,   PTX::MOVPREDrr }
 };
 
 void PTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
@@ -155,7 +155,7 @@ DefinesPredicate(MachineInstr *MI,
 
   const MachineOperand &MO = MI->getOperand(0);
 
-  if (!MO.isReg() || RI.getRegClass(MO.getReg()) != &PTX::PredsRegClass)
+  if (!MO.isReg() || RI.getRegClass(MO.getReg()) != &PTX::RegPredRegClass)
     return false;
 
   Pred.push_back(MO);
diff --git a/lib/Target/PTX/PTXInstrInfo.td b/lib/Target/PTX/PTXInstrInfo.td
index c075512..71f7cc3 100644
--- a/lib/Target/PTX/PTXInstrInfo.td
+++ b/lib/Target/PTX/PTXInstrInfo.td
@@ -22,8 +22,8 @@ include "PTXInstrFormats.td"
 //===----------------------------------------------------------------------===//
 
 // Addressing
-def Use32BitAddresses : Predicate<"!getSubtarget().use64BitAddresses()">;
-def Use64BitAddresses : Predicate<"getSubtarget().use64BitAddresses()">;
+def Use32BitAddresses : Predicate<"!getSubtarget().is64Bit()">;
+def Use64BitAddresses : Predicate<"getSubtarget().is64Bit()">;
 
 // Shader Model Support
 def SupportsSM13       : Predicate<"getSubtarget().supportsSM13()">;
@@ -36,6 +36,12 @@ def SupportsPTX21       : Predicate<"getSubtarget().supportsPTX21()">;
 def DoesNotSupportPTX21 : Predicate<"!getSubtarget().supportsPTX21()">;
 def SupportsPTX22       : Predicate<"getSubtarget().supportsPTX22()">;
 def DoesNotSupportPTX22 : Predicate<"!getSubtarget().supportsPTX22()">;
+def SupportsPTX23       : Predicate<"getSubtarget().supportsPTX23()">;
+def DoesNotSupportPTX23 : Predicate<"!getSubtarget().supportsPTX23()">;
+
+// Fused-Multiply Add
+def SupportsFMA         : Predicate<"getSubtarget().supportsFMA()">;
+def DoesNotSupportFMA   : Predicate<"!getSubtarget().supportsFMA()">;
 
 //===----------------------------------------------------------------------===//
 // Instruction Pattern Stuff
@@ -137,11 +143,11 @@ def ADDRii64 : ComplexPattern<i64, 2, "SelectADDRii", [], []>;
 // Address operands
 def MEMri32 : Operand<i32> {
   let PrintMethod = "printMemOperand";
-  let MIOperandInfo = (ops RRegu32, i32imm);
+  let MIOperandInfo = (ops RegI32, i32imm);
 }
 def MEMri64 : Operand<i64> {
   let PrintMethod = "printMemOperand";
-  let MIOperandInfo = (ops RRegu64, i64imm);
+  let MIOperandInfo = (ops RegI64, i64imm);
 }
 def MEMii32 : Operand<i32> {
   let PrintMethod = "printMemOperand";
@@ -182,209 +188,312 @@ def PTXcopyaddress
 // Instruction Class Templates
 //===----------------------------------------------------------------------===//
 
+//===- Floating-Point Instructions - 2 Operand Form -----------------------===//
+multiclass PTX_FLOAT_2OP<string opcstr, SDNode opnode> {
+  def rr32 : InstPTX<(outs RegF32:$d),
+                     (ins RegF32:$a),
+                     !strconcat(opcstr, ".f32\t$d, $a"),
+                     [(set RegF32:$d, (opnode RegF32:$a))]>;
+  def ri32 : InstPTX<(outs RegF32:$d),
+                     (ins f32imm:$a),
+                     !strconcat(opcstr, ".f32\t$d, $a"),
+                     [(set RegF32:$d, (opnode fpimm:$a))]>;
+  def rr64 : InstPTX<(outs RegF64:$d),
+                     (ins RegF64:$a),
+                     !strconcat(opcstr, ".f64\t$d, $a"),
+                     [(set RegF64:$d, (opnode RegF64:$a))]>;
+  def ri64 : InstPTX<(outs RegF64:$d),
+                     (ins f64imm:$a),
+                     !strconcat(opcstr, ".f64\t$d, $a"),
+                     [(set RegF64:$d, (opnode fpimm:$a))]>;
+}
+
 //===- Floating-Point Instructions - 3 Operand Form -----------------------===//
 multiclass PTX_FLOAT_3OP<string opcstr, SDNode opnode> {
-  def rr32 : InstPTX<(outs RRegf32:$d),
-                     (ins RRegf32:$a, RRegf32:$b),
+  def rr32 : InstPTX<(outs RegF32:$d),
+                     (ins RegF32:$a, RegF32:$b),
                      !strconcat(opcstr, ".f32\t$d, $a, $b"),
-                     [(set RRegf32:$d, (opnode RRegf32:$a, RRegf32:$b))]>;
-  def ri32 : InstPTX<(outs RRegf32:$d),
-                     (ins RRegf32:$a, f32imm:$b),
+                     [(set RegF32:$d, (opnode RegF32:$a, RegF32:$b))]>;
+  def ri32 : InstPTX<(outs RegF32:$d),
+                     (ins RegF32:$a, f32imm:$b),
                      !strconcat(opcstr, ".f32\t$d, $a, $b"),
-                     [(set RRegf32:$d, (opnode RRegf32:$a, fpimm:$b))]>;
-  def rr64 : InstPTX<(outs RRegf64:$d),
-                     (ins RRegf64:$a, RRegf64:$b),
+                     [(set RegF32:$d, (opnode RegF32:$a, fpimm:$b))]>;
+  def rr64 : InstPTX<(outs RegF64:$d),
+                     (ins RegF64:$a, RegF64:$b),
                      !strconcat(opcstr, ".f64\t$d, $a, $b"),
-                     [(set RRegf64:$d, (opnode RRegf64:$a, RRegf64:$b))]>;
-  def ri64 : InstPTX<(outs RRegf64:$d),
-                     (ins RRegf64:$a, f64imm:$b),
+                     [(set RegF64:$d, (opnode RegF64:$a, RegF64:$b))]>;
+  def ri64 : InstPTX<(outs RegF64:$d),
+                     (ins RegF64:$a, f64imm:$b),
                      !strconcat(opcstr, ".f64\t$d, $a, $b"),
-                     [(set RRegf64:$d, (opnode RRegf64:$a, fpimm:$b))]>;
+                     [(set RegF64:$d, (opnode RegF64:$a, fpimm:$b))]>;
 }
 
 //===- Floating-Point Instructions - 4 Operand Form -----------------------===//
 multiclass PTX_FLOAT_4OP<string opcstr, SDNode opnode1, SDNode opnode2> {
-  def rrr32 : InstPTX<(outs RRegf32:$d),
-                      (ins RRegf32:$a, RRegf32:$b, RRegf32:$c),
+  def rrr32 : InstPTX<(outs RegF32:$d),
+                      (ins RegF32:$a, RegF32:$b, RegF32:$c),
                       !strconcat(opcstr, ".f32\t$d, $a, $b, $c"),
-                      [(set RRegf32:$d, (opnode2 (opnode1 RRegf32:$a,
-                                                          RRegf32:$b),
-                                                 RRegf32:$c))]>;
-  def rri32 : InstPTX<(outs RRegf32:$d),
-                      (ins RRegf32:$a, RRegf32:$b, f32imm:$c),
+                      [(set RegF32:$d, (opnode2 (opnode1 RegF32:$a,
+                                                          RegF32:$b),
+                                                 RegF32:$c))]>;
+  def rri32 : InstPTX<(outs RegF32:$d),
+                      (ins RegF32:$a, RegF32:$b, f32imm:$c),
                       !strconcat(opcstr, ".f32\t$d, $a, $b, $c"),
-                      [(set RRegf32:$d, (opnode2 (opnode1 RRegf32:$a,
-                                                          RRegf32:$b),
+                      [(set RegF32:$d, (opnode2 (opnode1 RegF32:$a,
+                                                          RegF32:$b),
                                                  fpimm:$c))]>;
-  def rrr64 : InstPTX<(outs RRegf64:$d),
-                      (ins RRegf64:$a, RRegf64:$b, RRegf64:$c),
+  def rrr64 : InstPTX<(outs RegF64:$d),
+                      (ins RegF64:$a, RegF64:$b, RegF64:$c),
                       !strconcat(opcstr, ".f64\t$d, $a, $b, $c"),
-                      [(set RRegf64:$d, (opnode2 (opnode1 RRegf64:$a,
-                                                          RRegf64:$b),
-                                                 RRegf64:$c))]>;
-  def rri64 : InstPTX<(outs RRegf64:$d),
-                      (ins RRegf64:$a, RRegf64:$b, f64imm:$c),
+                      [(set RegF64:$d, (opnode2 (opnode1 RegF64:$a,
+                                                          RegF64:$b),
+                                                 RegF64:$c))]>;
+  def rri64 : InstPTX<(outs RegF64:$d),
+                      (ins RegF64:$a, RegF64:$b, f64imm:$c),
                       !strconcat(opcstr, ".f64\t$d, $a, $b, $c"),
-                      [(set RRegf64:$d, (opnode2 (opnode1 RRegf64:$a,
-                                                          RRegf64:$b),
+                      [(set RegF64:$d, (opnode2 (opnode1 RegF64:$a,
+                                                          RegF64:$b),
                                                  fpimm:$c))]>;
 }
 
 multiclass INT3<string opcstr, SDNode opnode> {
-  def rr16 : InstPTX<(outs RRegu16:$d),
-                     (ins RRegu16:$a, RRegu16:$b),
+  def rr16 : InstPTX<(outs RegI16:$d),
+                     (ins RegI16:$a, RegI16:$b),
                      !strconcat(opcstr, ".u16\t$d, $a, $b"),
-                     [(set RRegu16:$d, (opnode RRegu16:$a, RRegu16:$b))]>;
-  def ri16 : InstPTX<(outs RRegu16:$d),
-                     (ins RRegu16:$a, i16imm:$b),
+                     [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>;
+  def ri16 : InstPTX<(outs RegI16:$d),
+                     (ins RegI16:$a, i16imm:$b),
                      !strconcat(opcstr, ".u16\t$d, $a, $b"),
-                     [(set RRegu16:$d, (opnode RRegu16:$a, imm:$b))]>;
-  def rr32 : InstPTX<(outs RRegu32:$d),
-                     (ins RRegu32:$a, RRegu32:$b),
+                     [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>;
+  def rr32 : InstPTX<(outs RegI32:$d),
+                     (ins RegI32:$a, RegI32:$b),
                      !strconcat(opcstr, ".u32\t$d, $a, $b"),
-                     [(set RRegu32:$d, (opnode RRegu32:$a, RRegu32:$b))]>;
-  def ri32 : InstPTX<(outs RRegu32:$d),
-                     (ins RRegu32:$a, i32imm:$b),
+                     [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>;
+  def ri32 : InstPTX<(outs RegI32:$d),
+                     (ins RegI32:$a, i32imm:$b),
                      !strconcat(opcstr, ".u32\t$d, $a, $b"),
-                     [(set RRegu32:$d, (opnode RRegu32:$a, imm:$b))]>;
-  def rr64 : InstPTX<(outs RRegu64:$d),
-                     (ins RRegu64:$a, RRegu64:$b),
+                     [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>;
+  def rr64 : InstPTX<(outs RegI64:$d),
+                     (ins RegI64:$a, RegI64:$b),
                      !strconcat(opcstr, ".u64\t$d, $a, $b"),
-                     [(set RRegu64:$d, (opnode RRegu64:$a, RRegu64:$b))]>;
-  def ri64 : InstPTX<(outs RRegu64:$d),
-                     (ins RRegu64:$a, i64imm:$b),
+                     [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>;
+  def ri64 : InstPTX<(outs RegI64:$d),
+                     (ins RegI64:$a, i64imm:$b),
                      !strconcat(opcstr, ".u64\t$d, $a, $b"),
-                     [(set RRegu64:$d, (opnode RRegu64:$a, imm:$b))]>;
+                     [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>;
 }
 
 multiclass PTX_LOGIC<string opcstr, SDNode opnode> {
-  def rr16 : InstPTX<(outs RRegu16:$d),
-                     (ins RRegu16:$a, RRegu16:$b),
+  def ripreds : InstPTX<(outs RegPred:$d),
+                     (ins RegPred:$a, i1imm:$b),
+                     !strconcat(opcstr, ".pred\t$d, $a, $b"),
+                     [(set RegPred:$d, (opnode RegPred:$a, imm:$b))]>;
+  def rrpreds : InstPTX<(outs RegPred:$d),
+                     (ins RegPred:$a, RegPred:$b),
+                     !strconcat(opcstr, ".pred\t$d, $a, $b"),
+                     [(set RegPred:$d, (opnode RegPred:$a, RegPred:$b))]>;
+  def rr16 : InstPTX<(outs RegI16:$d),
+                     (ins RegI16:$a, RegI16:$b),
                      !strconcat(opcstr, ".b16\t$d, $a, $b"),
-                     [(set RRegu16:$d, (opnode RRegu16:$a, RRegu16:$b))]>;
-  def ri16 : InstPTX<(outs RRegu16:$d),
-                     (ins RRegu16:$a, i16imm:$b),
+                     [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>;
+  def ri16 : InstPTX<(outs RegI16:$d),
+                     (ins RegI16:$a, i16imm:$b),
                      !strconcat(opcstr, ".b16\t$d, $a, $b"),
-                     [(set RRegu16:$d, (opnode RRegu16:$a, imm:$b))]>;
-  def rr32 : InstPTX<(outs RRegu32:$d),
-                     (ins RRegu32:$a, RRegu32:$b),
+                     [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>;
+  def rr32 : InstPTX<(outs RegI32:$d),
+                     (ins RegI32:$a, RegI32:$b),
                      !strconcat(opcstr, ".b32\t$d, $a, $b"),
-                     [(set RRegu32:$d, (opnode RRegu32:$a, RRegu32:$b))]>;
-  def ri32 : InstPTX<(outs RRegu32:$d),
-                     (ins RRegu32:$a, i32imm:$b),
+                     [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>;
+  def ri32 : InstPTX<(outs RegI32:$d),
+                     (ins RegI32:$a, i32imm:$b),
                      !strconcat(opcstr, ".b32\t$d, $a, $b"),
-                     [(set RRegu32:$d, (opnode RRegu32:$a, imm:$b))]>;
-  def rr64 : InstPTX<(outs RRegu64:$d),
-                     (ins RRegu64:$a, RRegu64:$b),
+                     [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>;
+  def rr64 : InstPTX<(outs RegI64:$d),
+                     (ins RegI64:$a, RegI64:$b),
                      !strconcat(opcstr, ".b64\t$d, $a, $b"),
-                     [(set RRegu64:$d, (opnode RRegu64:$a, RRegu64:$b))]>;
-  def ri64 : InstPTX<(outs RRegu64:$d),
-                     (ins RRegu64:$a, i64imm:$b),
+                     [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>;
+  def ri64 : InstPTX<(outs RegI64:$d),
+                     (ins RegI64:$a, i64imm:$b),
                      !strconcat(opcstr, ".b64\t$d, $a, $b"),
-                     [(set RRegu64:$d, (opnode RRegu64:$a, imm:$b))]>;
+                     [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>;
 }
 
 multiclass INT3ntnc<string opcstr, SDNode opnode> {
-  def rr16 : InstPTX<(outs RRegu16:$d),
-                     (ins RRegu16:$a, RRegu16:$b),
+  def rr16 : InstPTX<(outs RegI16:$d),
+                     (ins RegI16:$a, RegI16:$b),
                      !strconcat(opcstr, "16\t$d, $a, $b"),
-                     [(set RRegu16:$d, (opnode RRegu16:$a, RRegu16:$b))]>;
-  def rr32 : InstPTX<(outs RRegu32:$d),
-                     (ins RRegu32:$a, RRegu32:$b),
+                     [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>;
+  def rr32 : InstPTX<(outs RegI32:$d),
+                     (ins RegI32:$a, RegI32:$b),
                      !strconcat(opcstr, "32\t$d, $a, $b"),
-                     [(set RRegu32:$d, (opnode RRegu32:$a, RRegu32:$b))]>;
-  def rr64 : InstPTX<(outs RRegu64:$d),
-                     (ins RRegu64:$a, RRegu64:$b),
+                     [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>;
+  def rr64 : InstPTX<(outs RegI64:$d),
+                     (ins RegI64:$a, RegI64:$b),
                      !strconcat(opcstr, "64\t$d, $a, $b"),
-                     [(set RRegu64:$d, (opnode RRegu64:$a, RRegu64:$b))]>;
-  def ri16 : InstPTX<(outs RRegu16:$d),
-                     (ins RRegu16:$a, i16imm:$b),
+                     [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>;
+  def ri16 : InstPTX<(outs RegI16:$d),
+                     (ins RegI16:$a, i16imm:$b),
                      !strconcat(opcstr, "16\t$d, $a, $b"),
-                     [(set RRegu16:$d, (opnode RRegu16:$a, imm:$b))]>;
-  def ri32 : InstPTX<(outs RRegu32:$d),
-                     (ins RRegu32:$a, i32imm:$b),
+                     [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>;
+  def ri32 : InstPTX<(outs RegI32:$d),
+                     (ins RegI32:$a, i32imm:$b),
                      !strconcat(opcstr, "32\t$d, $a, $b"),
-                     [(set RRegu32:$d, (opnode RRegu32:$a, imm:$b))]>;
-  def ri64 : InstPTX<(outs RRegu64:$d),
-                     (ins RRegu64:$a, i64imm:$b),
+                     [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>;
+  def ri64 : InstPTX<(outs RegI64:$d),
+                     (ins RegI64:$a, i64imm:$b),
                      !strconcat(opcstr, "64\t$d, $a, $b"),
-                     [(set RRegu64:$d, (opnode RRegu64:$a, imm:$b))]>;
-  def ir16 : InstPTX<(outs RRegu16:$d),
-                     (ins i16imm:$a, RRegu16:$b),
+                     [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>;
+  def ir16 : InstPTX<(outs RegI16:$d),
+                     (ins i16imm:$a, RegI16:$b),
                      !strconcat(opcstr, "16\t$d, $a, $b"),
-                     [(set RRegu16:$d, (opnode imm:$a, RRegu16:$b))]>;
-  def ir32 : InstPTX<(outs RRegu32:$d),
-                     (ins i32imm:$a, RRegu32:$b),
+                     [(set RegI16:$d, (opnode imm:$a, RegI16:$b))]>;
+  def ir32 : InstPTX<(outs RegI32:$d),
+                     (ins i32imm:$a, RegI32:$b),
                      !strconcat(opcstr, "32\t$d, $a, $b"),
-                     [(set RRegu32:$d, (opnode imm:$a, RRegu32:$b))]>;
-  def ir64 : InstPTX<(outs RRegu64:$d),
-                     (ins i64imm:$a, RRegu64:$b),
+                     [(set RegI32:$d, (opnode imm:$a, RegI32:$b))]>;
+  def ir64 : InstPTX<(outs RegI64:$d),
+                     (ins i64imm:$a, RegI64:$b),
                      !strconcat(opcstr, "64\t$d, $a, $b"),
-                     [(set RRegu64:$d, (opnode imm:$a, RRegu64:$b))]>;
+                     [(set RegI64:$d, (opnode imm:$a, RegI64:$b))]>;
 }
 
-multiclass PTX_SETP<RegisterClass RC, string regclsname, Operand immcls,
+multiclass PTX_SETP_I<RegisterClass RC, string regclsname, Operand immcls,
                         CondCode cmp, string cmpstr> {
-  // TODO 1. support floating-point 2. support 5-operand format: p|q, a, b, c
+  // TODO support 5-operand format: p|q, a, b, c
 
   def rr
-    : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b),
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b),
               !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"),
-              [(set Preds:$p, (setcc RC:$a, RC:$b, cmp))]>;
+              [(set RegPred:$p, (setcc RC:$a, RC:$b, cmp))]>;
   def ri
-    : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b),
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b),
               !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"),
-              [(set Preds:$p, (setcc RC:$a, imm:$b, cmp))]>;
+              [(set RegPred:$p, (setcc RC:$a, imm:$b, cmp))]>;
 
   def rr_and_r
-    : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c),
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
               !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, $c"),
-              [(set Preds:$p, (and (setcc RC:$a, RC:$b, cmp), Preds:$c))]>;
+              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>;
   def ri_and_r
-    : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c),
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
               !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, $c"),
-              [(set Preds:$p, (and (setcc RC:$a, imm:$b, cmp), Preds:$c))]>;
+              [(set RegPred:$p, (and (setcc RC:$a, imm:$b, cmp), RegPred:$c))]>;
   def rr_or_r
-    : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c),
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
               !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, $c"),
-              [(set Preds:$p, (or (setcc RC:$a, RC:$b, cmp), Preds:$c))]>;
+              [(set RegPred:$p, (or (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>;
   def ri_or_r
-    : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c),
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
               !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, $c"),
-              [(set Preds:$p, (or (setcc RC:$a, imm:$b, cmp), Preds:$c))]>;
+              [(set RegPred:$p, (or (setcc RC:$a, imm:$b, cmp), RegPred:$c))]>;
   def rr_xor_r
-    : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c),
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
               !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, $c"),
-              [(set Preds:$p, (xor (setcc RC:$a, RC:$b, cmp), Preds:$c))]>;
+              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>;
   def ri_xor_r
-    : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c),
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
               !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, $c"),
-              [(set Preds:$p, (xor (setcc RC:$a, imm:$b, cmp), Preds:$c))]>;
+              [(set RegPred:$p, (xor (setcc RC:$a, imm:$b, cmp), RegPred:$c))]>;
 
   def rr_and_not_r
-    : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c),
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
               !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, !$c"),
-              [(set Preds:$p, (and (setcc RC:$a, RC:$b, cmp), (not Preds:$c)))]>;
+              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, cmp), (not RegPred:$c)))]>;
   def ri_and_not_r
-    : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c),
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
               !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, !$c"),
-              [(set Preds:$p, (and (setcc RC:$a, imm:$b, cmp), (not Preds:$c)))]>;
+              [(set RegPred:$p, (and (setcc RC:$a, imm:$b, cmp), (not RegPred:$c)))]>;
   def rr_or_not_r
-    : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c),
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
               !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, !$c"),
-              [(set Preds:$p, (or (setcc RC:$a, RC:$b, cmp), (not Preds:$c)))]>;
+              [(set RegPred:$p, (or (setcc RC:$a, RC:$b, cmp), (not RegPred:$c)))]>;
   def ri_or_not_r
-    : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c),
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
               !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, !$c"),
-              [(set Preds:$p, (or (setcc RC:$a, imm:$b, cmp), (not Preds:$c)))]>;
+              [(set RegPred:$p, (or (setcc RC:$a, imm:$b, cmp), (not RegPred:$c)))]>;
   def rr_xor_not_r
-    : InstPTX<(outs Preds:$p), (ins RC:$a, RC:$b, Preds:$c),
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
               !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, !$c"),
-              [(set Preds:$p, (xor (setcc RC:$a, RC:$b, cmp), (not Preds:$c)))]>;
+              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, cmp), (not RegPred:$c)))]>;
   def ri_xor_not_r
-    : InstPTX<(outs Preds:$p), (ins RC:$a, immcls:$b, Preds:$c),
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (xor (setcc RC:$a, imm:$b, cmp), (not RegPred:$c)))]>;
+}
+
+multiclass PTX_SETP_FP<RegisterClass RC, string regclsname,
+                        CondCode ucmp, CondCode ocmp, string cmpstr> {
+  // TODO support 5-operand format: p|q, a, b, c
+
+  def rr_u
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b),
+              !strconcat("setp.", cmpstr, "u.", regclsname, "\t$p, $a, $b"),
+              [(set RegPred:$p, (setcc RC:$a, RC:$b, ucmp))]>;
+  def rr_o
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b),
+              !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"),
+              [(set RegPred:$p, (setcc RC:$a, RC:$b, ocmp))]>;
+
+  def rr_and_r_u
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, "u.and.", regclsname, "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ucmp), RegPred:$c))]>;
+  def rr_and_r_o
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ocmp), RegPred:$c))]>;
+
+  def rr_or_r_u
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, "u.or.", regclsname, "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ucmp), RegPred:$c))]>;
+  def rr_or_r_o
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ocmp), RegPred:$c))]>;
+
+  def rr_xor_r_u
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, "u.xor.", regclsname, "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ucmp), RegPred:$c))]>;
+  def rr_xor_r_o
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, $c"),
+              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ocmp), RegPred:$c))]>;
+
+  def rr_and_not_r_u
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, "u.and.", regclsname, "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ucmp), (not RegPred:$c)))]>;
+  def rr_and_not_r_o
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ocmp), (not RegPred:$c)))]>;
+
+  def rr_or_not_r_u
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, "u.or.", regclsname, "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ucmp), (not RegPred:$c)))]>;
+  def rr_or_not_r_o
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ocmp), (not RegPred:$c)))]>;
+
+  def rr_xor_not_r_u
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
+              !strconcat("setp.", cmpstr, "u.xor.", regclsname, "\t$p, $a, $b, !$c"),
+              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ucmp), (not RegPred:$c)))]>;
+  def rr_xor_not_r_o
+    : InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
               !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, !$c"),
-              [(set Preds:$p, (xor (setcc RC:$a, imm:$b, cmp), (not Preds:$c)))]>;
+              [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ocmp), (not RegPred:$c)))]>;
+}
+
+multiclass PTX_SELP<RegisterClass RC, string regclsname> {
+  def rr
+    : InstPTX<(outs RC:$r), (ins RegPred:$a, RC:$b, RC:$c),
+              !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"),
+              [(set RC:$r, (select RegPred:$a, RC:$b, RC:$c))]>;
 }
 
 multiclass PTX_LD<string opstr, string typestr, RegisterClass RC, PatFrag pat_load> {
@@ -415,11 +524,11 @@ multiclass PTX_LD<string opstr, string typestr, RegisterClass RC, PatFrag pat_lo
 }
 
 multiclass PTX_LD_ALL<string opstr, PatFrag pat_load> {
-  defm u16 : PTX_LD<opstr, ".u16", RRegu16, pat_load>;
-  defm u32 : PTX_LD<opstr, ".u32", RRegu32, pat_load>;
-  defm u64 : PTX_LD<opstr, ".u64", RRegu64, pat_load>;
-  defm f32 : PTX_LD<opstr, ".f32", RRegf32, pat_load>;
-  defm f64 : PTX_LD<opstr, ".f64", RRegf64, pat_load>;
+  defm u16 : PTX_LD<opstr, ".u16", RegI16, pat_load>;
+  defm u32 : PTX_LD<opstr, ".u32", RegI32, pat_load>;
+  defm u64 : PTX_LD<opstr, ".u64", RegI64, pat_load>;
+  defm f32 : PTX_LD<opstr, ".f32", RegF32, pat_load>;
+  defm f64 : PTX_LD<opstr, ".f64", RegF64, pat_load>;
 }
 
 multiclass PTX_ST<string opstr, string typestr, RegisterClass RC, PatFrag pat_store> {
@@ -450,11 +559,11 @@ multiclass PTX_ST<string opstr, string typestr, RegisterClass RC, PatFrag pat_st
 }
 
 multiclass PTX_ST_ALL<string opstr, PatFrag pat_store> {
-  defm u16 : PTX_ST<opstr, ".u16", RRegu16, pat_store>;
-  defm u32 : PTX_ST<opstr, ".u32", RRegu32, pat_store>;
-  defm u64 : PTX_ST<opstr, ".u64", RRegu64, pat_store>;
-  defm f32 : PTX_ST<opstr, ".f32", RRegf32, pat_store>;
-  defm f64 : PTX_ST<opstr, ".f64", RRegf64, pat_store>;
+  defm u16 : PTX_ST<opstr, ".u16", RegI16, pat_store>;
+  defm u32 : PTX_ST<opstr, ".u32", RegI32, pat_store>;
+  defm u64 : PTX_ST<opstr, ".u64", RegI64, pat_store>;
+  defm f32 : PTX_ST<opstr, ".f32", RegF32, pat_store>;
+  defm f64 : PTX_ST<opstr, ".f64", RegF64, pat_store>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -466,9 +575,14 @@ multiclass PTX_ST_ALL<string opstr, PatFrag pat_store> {
 defm ADD : INT3<"add", add>;
 defm SUB : INT3<"sub", sub>;
 defm MUL : INT3<"mul.lo", mul>; // FIXME: Allow 32x32 -> 64 multiplies
+defm DIV : INT3<"div", udiv>;
+defm REM : INT3<"rem", urem>;
 
 ///===- Floating-Point Arithmetic Instructions ----------------------------===//
 
+// Standard Unary Operations
+defm FNEG : PTX_FLOAT_2OP<"neg", fneg>;
+
 // Standard Binary Operations
 defm FADD : PTX_FLOAT_3OP<"add", fadd>;
 defm FSUB : PTX_FLOAT_3OP<"sub", fsub>;
@@ -478,35 +592,35 @@ defm FMUL : PTX_FLOAT_3OP<"mul", fmul>;
 // For division, we need to have f32 and f64 differently.
 // For f32, we just always use .approx since it is supported on all hardware
 // for PTX 1.4+, which is our minimum target.
-def FDIVrr32 : InstPTX<(outs RRegf32:$d),
-                       (ins RRegf32:$a, RRegf32:$b),
+def FDIVrr32 : InstPTX<(outs RegF32:$d),
+                       (ins RegF32:$a, RegF32:$b),
                        "div.approx.f32\t$d, $a, $b",
-                       [(set RRegf32:$d, (fdiv RRegf32:$a, RRegf32:$b))]>;
-def FDIVri32 : InstPTX<(outs RRegf32:$d),
-                       (ins RRegf32:$a, f32imm:$b),
+                       [(set RegF32:$d, (fdiv RegF32:$a, RegF32:$b))]>;
+def FDIVri32 : InstPTX<(outs RegF32:$d),
+                       (ins RegF32:$a, f32imm:$b),
                        "div.approx.f32\t$d, $a, $b",
-                       [(set RRegf32:$d, (fdiv RRegf32:$a, fpimm:$b))]>;
+                       [(set RegF32:$d, (fdiv RegF32:$a, fpimm:$b))]>;
 
 // For f64, we must specify a rounding for sm 1.3+ but *not* for sm 1.0.
-def FDIVrr64SM13 : InstPTX<(outs RRegf64:$d),
-                           (ins RRegf64:$a, RRegf64:$b),
+def FDIVrr64SM13 : InstPTX<(outs RegF64:$d),
+                           (ins RegF64:$a, RegF64:$b),
                            "div.rn.f64\t$d, $a, $b",
-                           [(set RRegf64:$d, (fdiv RRegf64:$a, RRegf64:$b))]>,
+                           [(set RegF64:$d, (fdiv RegF64:$a, RegF64:$b))]>,
                    Requires<[SupportsSM13]>;
-def FDIVri64SM13 : InstPTX<(outs RRegf64:$d),
-                           (ins RRegf64:$a, f64imm:$b),
+def FDIVri64SM13 : InstPTX<(outs RegF64:$d),
+                           (ins RegF64:$a, f64imm:$b),
                            "div.rn.f64\t$d, $a, $b",
-                           [(set RRegf64:$d, (fdiv RRegf64:$a, fpimm:$b))]>,
+                           [(set RegF64:$d, (fdiv RegF64:$a, fpimm:$b))]>,
                    Requires<[SupportsSM13]>;
-def FDIVrr64SM10 : InstPTX<(outs RRegf64:$d),
-                           (ins RRegf64:$a, RRegf64:$b),
+def FDIVrr64SM10 : InstPTX<(outs RegF64:$d),
+                           (ins RegF64:$a, RegF64:$b),
                            "div.f64\t$d, $a, $b",
-                           [(set RRegf64:$d, (fdiv RRegf64:$a, RRegf64:$b))]>,
+                           [(set RegF64:$d, (fdiv RegF64:$a, RegF64:$b))]>,
                    Requires<[DoesNotSupportSM13]>;
-def FDIVri64SM10 : InstPTX<(outs RRegf64:$d),
-                           (ins RRegf64:$a, f64imm:$b),
+def FDIVri64SM10 : InstPTX<(outs RegF64:$d),
+                           (ins RegF64:$a, f64imm:$b),
                            "div.f64\t$d, $a, $b",
-                           [(set RRegf64:$d, (fdiv RRegf64:$a, fpimm:$b))]>,
+                           [(set RegF64:$d, (fdiv RegF64:$a, fpimm:$b))]>,
                    Requires<[DoesNotSupportSM13]>;
 
 
@@ -519,56 +633,98 @@ def FDIVri64SM10 : InstPTX<(outs RRegf64:$d),
 // In the short term, mad is supported on all PTX versions and we use a
 // default rounding mode no matter what shader model or PTX version.
 // TODO: Allow the rounding mode to be selectable through llc.
-defm FMADSM13 : PTX_FLOAT_4OP<"mad.rn", fmul, fadd>, Requires<[SupportsSM13]>;
-defm FMAD : PTX_FLOAT_4OP<"mad", fmul, fadd>, Requires<[DoesNotSupportSM13]>;
+defm FMADSM13 : PTX_FLOAT_4OP<"mad.rn", fmul, fadd>, Requires<[SupportsSM13, SupportsFMA]>;
+defm FMAD : PTX_FLOAT_4OP<"mad", fmul, fadd>, Requires<[DoesNotSupportSM13, SupportsFMA]>;
 
 ///===- Floating-Point Intrinsic Instructions -----------------------------===//
 
-def FSQRT32 : InstPTX<(outs RRegf32:$d),
-                      (ins RRegf32:$a),
+def FSQRT32 : InstPTX<(outs RegF32:$d),
+                      (ins RegF32:$a),
                       "sqrt.rn.f32\t$d, $a",
-                      [(set RRegf32:$d, (fsqrt RRegf32:$a))]>;
+                      [(set RegF32:$d, (fsqrt RegF32:$a))]>;
 
-def FSQRT64 : InstPTX<(outs RRegf64:$d),
-                      (ins RRegf64:$a),
+def FSQRT64 : InstPTX<(outs RegF64:$d),
+                      (ins RegF64:$a),
                       "sqrt.rn.f64\t$d, $a",
-                      [(set RRegf64:$d, (fsqrt RRegf64:$a))]>;
+                      [(set RegF64:$d, (fsqrt RegF64:$a))]>;
 
-def FSIN32 : InstPTX<(outs RRegf32:$d),
-                     (ins RRegf32:$a),
+def FSIN32 : InstPTX<(outs RegF32:$d),
+                     (ins RegF32:$a),
                      "sin.approx.f32\t$d, $a",
-                     [(set RRegf32:$d, (fsin RRegf32:$a))]>;
+                     [(set RegF32:$d, (fsin RegF32:$a))]>;
 
-def FSIN64 : InstPTX<(outs RRegf64:$d),
-                     (ins RRegf64:$a),
+def FSIN64 : InstPTX<(outs RegF64:$d),
+                     (ins RegF64:$a),
                      "sin.approx.f64\t$d, $a",
-                     [(set RRegf64:$d, (fsin RRegf64:$a))]>;
+                     [(set RegF64:$d, (fsin RegF64:$a))]>;
 
-def FCOS32 : InstPTX<(outs RRegf32:$d),
-                     (ins RRegf32:$a),
+def FCOS32 : InstPTX<(outs RegF32:$d),
+                     (ins RegF32:$a),
                      "cos.approx.f32\t$d, $a",
-                     [(set RRegf32:$d, (fcos RRegf32:$a))]>;
+                     [(set RegF32:$d, (fcos RegF32:$a))]>;
 
-def FCOS64 : InstPTX<(outs RRegf64:$d),
-                     (ins RRegf64:$a),
+def FCOS64 : InstPTX<(outs RegF64:$d),
+                     (ins RegF64:$a),
                      "cos.approx.f64\t$d, $a",
-                     [(set RRegf64:$d, (fcos RRegf64:$a))]>;
+                     [(set RegF64:$d, (fcos RegF64:$a))]>;
 
 
 ///===- Comparison and Selection Instructions -----------------------------===//
 
-defm SETPEQu32 : PTX_SETP<RRegu32, "u32", i32imm, SETEQ,  "eq">;
-defm SETPNEu32 : PTX_SETP<RRegu32, "u32", i32imm, SETNE,  "ne">;
-defm SETPLTu32 : PTX_SETP<RRegu32, "u32", i32imm, SETULT, "lt">;
-defm SETPLEu32 : PTX_SETP<RRegu32, "u32", i32imm, SETULE, "le">;
-defm SETPGTu32 : PTX_SETP<RRegu32, "u32", i32imm, SETUGT, "gt">;
-defm SETPGEu32 : PTX_SETP<RRegu32, "u32", i32imm, SETUGE, "ge">;
-defm SETPEQu64 : PTX_SETP<RRegu64, "u64", i64imm, SETEQ,  "eq">;
-defm SETPNEu64 : PTX_SETP<RRegu64, "u64", i64imm, SETNE,  "ne">;
-defm SETPLTu64 : PTX_SETP<RRegu64, "u64", i64imm, SETULT, "lt">;
-defm SETPLEu64 : PTX_SETP<RRegu64, "u64", i64imm, SETULE, "le">;
-defm SETPGTu64 : PTX_SETP<RRegu64, "u64", i64imm, SETUGT, "gt">;
-defm SETPGEu64 : PTX_SETP<RRegu64, "u64", i64imm, SETUGE, "ge">;
+// .setp
+
+// Compare u16
+
+defm SETPEQu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETEQ,  "eq">;
+defm SETPNEu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETNE,  "ne">;
+defm SETPLTu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETULT, "lt">;
+defm SETPLEu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETULE, "le">;
+defm SETPGTu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETUGT, "gt">;
+defm SETPGEu16 : PTX_SETP_I<RegI16, "u16", i16imm, SETUGE, "ge">;
+
+// Compare u32
+
+defm SETPEQu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETEQ,  "eq">;
+defm SETPNEu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETNE,  "ne">;
+defm SETPLTu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETULT, "lt">;
+defm SETPLEu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETULE, "le">;
+defm SETPGTu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETUGT, "gt">;
+defm SETPGEu32 : PTX_SETP_I<RegI32, "u32", i32imm, SETUGE, "ge">;
+
+// Compare u64
+
+defm SETPEQu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETEQ,  "eq">;
+defm SETPNEu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETNE,  "ne">;
+defm SETPLTu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETULT, "lt">;
+defm SETPLEu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETULE, "le">;
+defm SETPGTu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETUGT, "gt">;
+defm SETPGEu64 : PTX_SETP_I<RegI64, "u64", i64imm, SETUGE, "ge">;
+
+// Compare f32
+
+defm SETPEQf32 : PTX_SETP_FP<RegF32, "f32", SETUEQ, SETOEQ, "eq">;
+defm SETPNEf32 : PTX_SETP_FP<RegF32, "f32", SETUNE, SETONE, "ne">;
+defm SETPLTf32 : PTX_SETP_FP<RegF32, "f32", SETULT, SETOLT, "lt">;
+defm SETPLEf32 : PTX_SETP_FP<RegF32, "f32", SETULE, SETOLE, "le">;
+defm SETPGTf32 : PTX_SETP_FP<RegF32, "f32", SETUGT, SETOGT, "gt">;
+defm SETPGEf32 : PTX_SETP_FP<RegF32, "f32", SETUGE, SETOGE, "ge">;
+
+// Compare f64
+
+defm SETPEQf64 : PTX_SETP_FP<RegF64, "f64", SETUEQ, SETOEQ, "eq">;
+defm SETPNEf64 : PTX_SETP_FP<RegF64, "f64", SETUNE, SETONE, "ne">;
+defm SETPLTf64 : PTX_SETP_FP<RegF64, "f64", SETULT, SETOLT, "lt">;
+defm SETPLEf64 : PTX_SETP_FP<RegF64, "f64", SETULE, SETOLE, "le">;
+defm SETPGTf64 : PTX_SETP_FP<RegF64, "f64", SETUGT, SETOGT, "gt">;
+defm SETPGEf64 : PTX_SETP_FP<RegF64, "f64", SETUGE, SETOGE, "ge">;
+
+// .selp
+
+defm PTX_SELPu16 : PTX_SELP<RegI16, "u16">;
+defm PTX_SELPu32 : PTX_SELP<RegI32, "u32">;
+defm PTX_SELPu64 : PTX_SELP<RegI64, "u64">;
+defm PTX_SELPf32 : PTX_SELP<RegF32, "f32">;
+defm PTX_SELPf64 : PTX_SELP<RegF64, "f64">;
 
 ///===- Logic and Shift Instructions --------------------------------------===//
 
@@ -584,47 +740,47 @@ defm XOR : PTX_LOGIC<"xor", xor>;
 
 let neverHasSideEffects = 1 in {
   def MOVPREDrr
-    : InstPTX<(outs Preds:$d), (ins Preds:$a), "mov.pred\t$d, $a", []>;
+    : InstPTX<(outs RegPred:$d), (ins RegPred:$a), "mov.pred\t$d, $a", []>;
   def MOVU16rr
-    : InstPTX<(outs RRegu16:$d), (ins RRegu16:$a), "mov.u16\t$d, $a", []>;
+    : InstPTX<(outs RegI16:$d), (ins RegI16:$a), "mov.u16\t$d, $a", []>;
   def MOVU32rr
-    : InstPTX<(outs RRegu32:$d), (ins RRegu32:$a), "mov.u32\t$d, $a", []>;
+    : InstPTX<(outs RegI32:$d), (ins RegI32:$a), "mov.u32\t$d, $a", []>;
   def MOVU64rr
-    : InstPTX<(outs RRegu64:$d), (ins RRegu64:$a), "mov.u64\t$d, $a", []>;
+    : InstPTX<(outs RegI64:$d), (ins RegI64:$a), "mov.u64\t$d, $a", []>;
   def MOVF32rr
-    : InstPTX<(outs RRegf32:$d), (ins RRegf32:$a), "mov.f32\t$d, $a", []>;
+    : InstPTX<(outs RegF32:$d), (ins RegF32:$a), "mov.f32\t$d, $a", []>;
   def MOVF64rr
-    : InstPTX<(outs RRegf64:$d), (ins RRegf64:$a), "mov.f64\t$d, $a", []>;
+    : InstPTX<(outs RegF64:$d), (ins RegF64:$a), "mov.f64\t$d, $a", []>;
 }
 
 let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
   def MOVPREDri
-    : InstPTX<(outs Preds:$d), (ins i1imm:$a), "mov.pred\t$d, $a",
-              [(set Preds:$d, imm:$a)]>;
+    : InstPTX<(outs RegPred:$d), (ins i1imm:$a), "mov.pred\t$d, $a",
+              [(set RegPred:$d, imm:$a)]>;
   def MOVU16ri
-    : InstPTX<(outs RRegu16:$d), (ins i16imm:$a), "mov.u16\t$d, $a",
-              [(set RRegu16:$d, imm:$a)]>;
+    : InstPTX<(outs RegI16:$d), (ins i16imm:$a), "mov.u16\t$d, $a",
+              [(set RegI16:$d, imm:$a)]>;
   def MOVU32ri
-    : InstPTX<(outs RRegu32:$d), (ins i32imm:$a), "mov.u32\t$d, $a",
-              [(set RRegu32:$d, imm:$a)]>;
-  def MOVU164ri
-    : InstPTX<(outs RRegu64:$d), (ins i64imm:$a), "mov.u64\t$d, $a",
-              [(set RRegu64:$d, imm:$a)]>;
+    : InstPTX<(outs RegI32:$d), (ins i32imm:$a), "mov.u32\t$d, $a",
+              [(set RegI32:$d, imm:$a)]>;
+  def MOVU64ri
+    : InstPTX<(outs RegI64:$d), (ins i64imm:$a), "mov.u64\t$d, $a",
+              [(set RegI64:$d, imm:$a)]>;
   def MOVF32ri
-    : InstPTX<(outs RRegf32:$d), (ins f32imm:$a), "mov.f32\t$d, $a",
-              [(set RRegf32:$d, fpimm:$a)]>;
+    : InstPTX<(outs RegF32:$d), (ins f32imm:$a), "mov.f32\t$d, $a",
+              [(set RegF32:$d, fpimm:$a)]>;
   def MOVF64ri
-    : InstPTX<(outs RRegf64:$d), (ins f64imm:$a), "mov.f64\t$d, $a",
-              [(set RRegf64:$d, fpimm:$a)]>;
+    : InstPTX<(outs RegF64:$d), (ins f64imm:$a), "mov.f64\t$d, $a",
+              [(set RegF64:$d, fpimm:$a)]>;
 }
 
 let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
   def MOVaddr32
-    : InstPTX<(outs RRegu32:$d), (ins i32imm:$a), "mov.u32\t$d, $a",
-              [(set RRegu32:$d, (PTXcopyaddress tglobaladdr:$a))]>;
+    : InstPTX<(outs RegI32:$d), (ins i32imm:$a), "mov.u32\t$d, $a",
+              [(set RegI32:$d, (PTXcopyaddress tglobaladdr:$a))]>;
   def MOVaddr64
-    : InstPTX<(outs RRegu64:$d), (ins i64imm:$a), "mov.u64\t$d, $a",
-              [(set RRegu64:$d, (PTXcopyaddress tglobaladdr:$a))]>;
+    : InstPTX<(outs RegI64:$d), (ins i64imm:$a), "mov.u64\t$d, $a",
+              [(set RegI64:$d, (PTXcopyaddress tglobaladdr:$a))]>;
 }
 
 // Loads
@@ -634,15 +790,15 @@ defm LDl : PTX_LD_ALL<"ld.local",  load_local>;
 defm LDs : PTX_LD_ALL<"ld.shared", load_shared>;
 
 // This is a special instruction that is manually inserted for kernel parameters
-def LDpiU16 : InstPTX<(outs RRegu16:$d), (ins MEMpi:$a),
+def LDpiU16 : InstPTX<(outs RegI16:$d), (ins MEMpi:$a),
                       "ld.param.u16\t$d, [$a]", []>;
-def LDpiU32 : InstPTX<(outs RRegu32:$d), (ins MEMpi:$a),
+def LDpiU32 : InstPTX<(outs RegI32:$d), (ins MEMpi:$a),
                       "ld.param.u32\t$d, [$a]", []>;
-def LDpiU64 : InstPTX<(outs RRegu64:$d), (ins MEMpi:$a),
+def LDpiU64 : InstPTX<(outs RegI64:$d), (ins MEMpi:$a),
                       "ld.param.u64\t$d, [$a]", []>;
-def LDpiF32 : InstPTX<(outs RRegf32:$d), (ins MEMpi:$a),
+def LDpiF32 : InstPTX<(outs RegF32:$d), (ins MEMpi:$a),
                       "ld.param.f32\t$d, [$a]", []>;
-def LDpiF64 : InstPTX<(outs RRegf64:$d), (ins MEMpi:$a),
+def LDpiF64 : InstPTX<(outs RegF64:$d), (ins MEMpi:$a),
                       "ld.param.f64\t$d, [$a]", []>;
 
 // Stores
@@ -654,17 +810,137 @@ defm STs : PTX_ST_ALL<"st.shared", store_shared>;
 // defm LDp : PTX_LD_ALL<"ld.param",  load_parameter>;
 // TODO: Do something with st.param if/when it is needed.
 
+// Conversion to pred
+
+def CVT_pred_u16
+  : InstPTX<(outs RegPred:$d), (ins RegI16:$a), "cvt.pred.u16\t$d, $a",
+            [(set RegPred:$d, (trunc RegI16:$a))]>;
+
 def CVT_pred_u32
-  : InstPTX<(outs Preds:$d), (ins RRegu32:$a), "cvt.pred.u32\t$d, $a",
-            [(set Preds:$d, (trunc RRegu32:$a))]>;
+  : InstPTX<(outs RegPred:$d), (ins RegI32:$a), "cvt.pred.u32\t$d, $a",
+            [(set RegPred:$d, (trunc RegI32:$a))]>;
+
+def CVT_pred_u64
+  : InstPTX<(outs RegPred:$d), (ins RegI64:$a), "cvt.pred.u64\t$d, $a",
+            [(set RegPred:$d, (trunc RegI64:$a))]>;
+
+def CVT_pred_f32
+  : InstPTX<(outs RegPred:$d), (ins RegF32:$a), "cvt.rni.pred.f32\t$d, $a",
+            [(set RegPred:$d, (fp_to_uint RegF32:$a))]>;
+
+def CVT_pred_f64
+  : InstPTX<(outs RegPred:$d), (ins RegF64:$a), "cvt.rni.pred.f64\t$d, $a",
+            [(set RegPred:$d, (fp_to_uint RegF64:$a))]>;
+
+// Conversion to u16
+
+def CVT_u16_pred
+  : InstPTX<(outs RegI16:$d), (ins RegPred:$a), "cvt.u16.pred\t$d, $a",
+            [(set RegI16:$d, (zext RegPred:$a))]>;
+
+def CVT_u16_u32
+  : InstPTX<(outs RegI16:$d), (ins RegI32:$a), "cvt.u16.u32\t$d, $a",
+            [(set RegI16:$d, (trunc RegI32:$a))]>;
+
+def CVT_u16_u64
+  : InstPTX<(outs RegI16:$d), (ins RegI64:$a), "cvt.u16.u64\t$d, $a",
+            [(set RegI16:$d, (trunc RegI64:$a))]>;
+
+def CVT_u16_f32
+  : InstPTX<(outs RegI16:$d), (ins RegF32:$a), "cvt.rni.u16.f32\t$d, $a",
+            [(set RegI16:$d, (fp_to_uint RegF32:$a))]>;
+
+def CVT_u16_f64
+  : InstPTX<(outs RegI16:$d), (ins RegF64:$a), "cvt.rni.u16.f64\t$d, $a",
+            [(set RegI16:$d, (fp_to_uint RegF64:$a))]>;
+
+// Conversion to u32
 
 def CVT_u32_pred
-  : InstPTX<(outs RRegu32:$d), (ins Preds:$a), "cvt.u32.pred\t$d, $a",
-            [(set RRegu32:$d, (zext Preds:$a))]>;
+  : InstPTX<(outs RegI32:$d), (ins RegPred:$a), "cvt.u32.pred\t$d, $a",
+            [(set RegI32:$d, (zext RegPred:$a))]>;
+
+def CVT_u32_u16
+  : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.u32.u16\t$d, $a",
+            [(set RegI32:$d, (zext RegI16:$a))]>;
+
+def CVT_u32_u64
+  : InstPTX<(outs RegI32:$d), (ins RegI64:$a), "cvt.u32.u64\t$d, $a",
+            [(set RegI32:$d, (trunc RegI64:$a))]>;
+
+def CVT_u32_f32
+  : InstPTX<(outs RegI32:$d), (ins RegF32:$a), "cvt.rni.u32.f32\t$d, $a",
+            [(set RegI32:$d, (fp_to_uint RegF32:$a))]>;
+
+def CVT_u32_f64
+  : InstPTX<(outs RegI32:$d), (ins RegF64:$a), "cvt.rni.u32.f64\t$d, $a",
+            [(set RegI32:$d, (fp_to_uint RegF64:$a))]>;
+
+// Conversion to u64
+
+def CVT_u64_pred
+  : InstPTX<(outs RegI64:$d), (ins RegPred:$a), "cvt.u64.pred\t$d, $a",
+            [(set RegI64:$d, (zext RegPred:$a))]>;
+
+def CVT_u64_u16
+  : InstPTX<(outs RegI64:$d), (ins RegI16:$a), "cvt.u64.u16\t$d, $a",
+            [(set RegI64:$d, (zext RegI16:$a))]>;
 
 def CVT_u64_u32
-  : InstPTX<(outs RRegu64:$d), (ins RRegu32:$a), "cvt.u64.u32\t$d, $a",
-            [(set RRegu64:$d, (zext RRegu32:$a))]>;
+  : InstPTX<(outs RegI64:$d), (ins RegI32:$a), "cvt.u64.u32\t$d, $a",
+            [(set RegI64:$d, (zext RegI32:$a))]>;
+
+def CVT_u64_f32
+  : InstPTX<(outs RegI64:$d), (ins RegF32:$a), "cvt.rni.u64.f32\t$d, $a",
+            [(set RegI64:$d, (fp_to_uint RegF32:$a))]>;
+
+def CVT_u64_f64
+  : InstPTX<(outs RegI64:$d), (ins RegF64:$a), "cvt.rni.u64.f64\t$d, $a",
+            [(set RegI64:$d, (fp_to_uint RegF64:$a))]>;
+
+// Conversion to f32
+
+def CVT_f32_pred
+  : InstPTX<(outs RegF32:$d), (ins RegPred:$a), "cvt.rn.f32.pred\t$d, $a",
+            [(set RegF32:$d, (uint_to_fp RegPred:$a))]>;
+
+def CVT_f32_u16
+  : InstPTX<(outs RegF32:$d), (ins RegI16:$a), "cvt.rn.f32.u16\t$d, $a",
+            [(set RegF32:$d, (uint_to_fp RegI16:$a))]>;
+
+def CVT_f32_u32
+  : InstPTX<(outs RegF32:$d), (ins RegI32:$a), "cvt.rn.f32.u32\t$d, $a",
+            [(set RegF32:$d, (uint_to_fp RegI32:$a))]>;
+
+def CVT_f32_u64
+  : InstPTX<(outs RegF32:$d), (ins RegI64:$a), "cvt.rn.f32.u64\t$d, $a",
+            [(set RegF32:$d, (uint_to_fp RegI64:$a))]>;
+
+def CVT_f32_f64
+  : InstPTX<(outs RegF32:$d), (ins RegF64:$a), "cvt.rn.f32.f64\t$d, $a",
+            [(set RegF32:$d, (fround RegF64:$a))]>;
+
+// Conversion to f64
+
+def CVT_f64_pred
+  : InstPTX<(outs RegF64:$d), (ins RegPred:$a), "cvt.rn.f64.pred\t$d, $a",
+            [(set RegF64:$d, (uint_to_fp RegPred:$a))]>;
+
+def CVT_f64_u16
+  : InstPTX<(outs RegF64:$d), (ins RegI16:$a), "cvt.rn.f64.u16\t$d, $a",
+            [(set RegF64:$d, (uint_to_fp RegI16:$a))]>;
+
+def CVT_f64_u32
+  : InstPTX<(outs RegF64:$d), (ins RegI32:$a), "cvt.rn.f64.u32\t$d, $a",
+            [(set RegF64:$d, (uint_to_fp RegI32:$a))]>;
+
+def CVT_f64_u64
+  : InstPTX<(outs RegF64:$d), (ins RegI64:$a), "cvt.rn.f64.u64\t$d, $a",
+            [(set RegF64:$d, (uint_to_fp RegI64:$a))]>;
+
+def CVT_f64_f32
+  : InstPTX<(outs RegF64:$d), (ins RegF32:$a), "cvt.f64.f32\t$d, $a",
+            [(set RegF64:$d, (fextend RegF32:$a))]>;
 
 ///===- Control Flow Instructions -----------------------------------------===//
 
@@ -675,7 +951,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
 
 let isBranch = 1, isTerminator = 1 in {
   // FIXME: The pattern part is blank because I cannot (or do not yet know
-  // how to) use the first operand of PredicateOperand (a Preds register) here
+  // how to) use the first operand of PredicateOperand (a RegPred register) here
   def BRAdp
     : InstPTX<(outs), (ins brtarget:$d), "bra\t$d",
               [/*(brcond pred:$_p, bb:$d)*/]>;
diff --git a/lib/Target/PTX/PTXIntrinsicInstrInfo.td b/lib/Target/PTX/PTXIntrinsicInstrInfo.td
index 320934a..8d97909 100644
--- a/lib/Target/PTX/PTXIntrinsicInstrInfo.td
+++ b/lib/Target/PTX/PTXIntrinsicInstrInfo.td
@@ -14,14 +14,14 @@
 // PTX Special Purpose Register Accessor Intrinsics
 
 class PTX_READ_SPECIAL_REGISTER_R64<string regname, Intrinsic intop>
-  : InstPTX<(outs RRegu64:$d), (ins),
+  : InstPTX<(outs RegI64:$d), (ins),
             !strconcat("mov.u64\t$d, %", regname),
-            [(set RRegu64:$d, (intop))]>;
+            [(set RegI64:$d, (intop))]>;
 
 class PTX_READ_SPECIAL_REGISTER_R32<string regname, Intrinsic intop>
-  : InstPTX<(outs RRegu32:$d), (ins),
+  : InstPTX<(outs RegI32:$d), (ins),
             !strconcat("mov.u32\t$d, %", regname),
-            [(set RRegu32:$d, (intop))]>;
+            [(set RegI32:$d, (intop))]>;
 
 // TODO Add read vector-version of special registers
 
diff --git a/lib/Target/PTX/PTXMCAsmStreamer.cpp b/lib/Target/PTX/PTXMCAsmStreamer.cpp
index cf743e4..1574670 100644
--- a/lib/Target/PTX/PTXMCAsmStreamer.cpp
+++ b/lib/Target/PTX/PTXMCAsmStreamer.cpp
@@ -143,9 +143,9 @@ public:
   virtual void EmitBytes(StringRef Data, unsigned AddrSpace);
 
   virtual void EmitValueImpl(const MCExpr *Value, unsigned Size,
-                             bool isPCRel, unsigned AddrSpace);
-  virtual void EmitULEB128Value(const MCExpr *Value, unsigned AddrSpace = 0);
-  virtual void EmitSLEB128Value(const MCExpr *Value, unsigned AddrSpace = 0);
+                             unsigned AddrSpace);
+  virtual void EmitULEB128Value(const MCExpr *Value);
+  virtual void EmitSLEB128Value(const MCExpr *Value);
   virtual void EmitGPRel32Value(const MCExpr *Value);
 
 
@@ -352,9 +352,8 @@ void PTXMCAsmStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) {
 }
 
 void PTXMCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
-                                     bool isPCRel, unsigned AddrSpace) {
+                                     unsigned AddrSpace) {
   assert(getCurrentSection() && "Cannot emit contents before setting section!");
-  assert(!isPCRel && "Cannot emit pc relative relocations!");
   const char *Directive = 0;
   switch (Size) {
   default: break;
@@ -383,15 +382,13 @@ void PTXMCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
   EmitEOL();
 }
 
-void PTXMCAsmStreamer::EmitULEB128Value(const MCExpr *Value,
-                                        unsigned AddrSpace) {
+void PTXMCAsmStreamer::EmitULEB128Value(const MCExpr *Value) {
   assert(MAI.hasLEB128() && "Cannot print a .uleb");
   OS << ".uleb128 " << *Value;
   EmitEOL();
 }
 
-void PTXMCAsmStreamer::EmitSLEB128Value(const MCExpr *Value,
-                                        unsigned AddrSpace) {
+void PTXMCAsmStreamer::EmitSLEB128Value(const MCExpr *Value) {
   assert(MAI.hasLEB128() && "Cannot print a .sleb");
   OS << ".sleb128 " << *Value;
   EmitEOL();
@@ -533,7 +530,7 @@ void PTXMCAsmStreamer::Finish() {}
 namespace llvm {
   MCStreamer *createPTXAsmStreamer(MCContext &Context,
                                    formatted_raw_ostream &OS,
-                                   bool isVerboseAsm, bool useLoc,
+                                   bool isVerboseAsm, bool useLoc, bool useCFI,
                                    MCInstPrinter *IP,
                                    MCCodeEmitter *CE, TargetAsmBackend *TAB,
                                    bool ShowInst) {
diff --git a/lib/Target/PTX/PTXMFInfoExtract.cpp b/lib/Target/PTX/PTXMFInfoExtract.cpp
index c5e1910..6fe9e6c 100644
--- a/lib/Target/PTX/PTXMFInfoExtract.cpp
+++ b/lib/Target/PTX/PTXMFInfoExtract.cpp
@@ -54,8 +54,6 @@ bool PTXMFInfoExtract::runOnMachineFunction(MachineFunction &MF) {
 
   DEBUG(dbgs() << "******** PTX FUNCTION LOCAL VAR REG DEF ********\n");
 
-  unsigned retreg = MFI->retReg();
-
   DEBUG(dbgs()
         << "PTX::NoRegister == " << PTX::NoRegister << "\n"
         << "PTX::NUM_TARGET_REGS == " << PTX::NUM_TARGET_REGS << "\n");
@@ -68,15 +66,13 @@ bool PTXMFInfoExtract::runOnMachineFunction(MachineFunction &MF) {
   // FIXME: This is a slow linear scanning
   for (unsigned reg = PTX::NoRegister + 1; reg < PTX::NUM_TARGET_REGS; ++reg)
     if (MRI.isPhysRegUsed(reg) &&
-        reg != retreg &&
+        !MFI->isRetReg(reg) &&
         (MFI->isKernel() || !MFI->isArgReg(reg)))
       MFI->addLocalVarReg(reg);
 
   // Notify MachineFunctionInfo that I've done adding local var reg
   MFI->doneAddLocalVar();
 
-  DEBUG(dbgs() << "Return Reg: " << retreg << "\n");
-
   DEBUG(for (PTXMachineFunctionInfo::reg_iterator
              i = MFI->argRegBegin(), e = MFI->argRegEnd();
              i != e; ++i)
diff --git a/lib/Target/PTX/PTXMachineFunctionInfo.h b/lib/Target/PTX/PTXMachineFunctionInfo.h
index b5b3c3b..1da4b5d 100644
--- a/lib/Target/PTX/PTXMachineFunctionInfo.h
+++ b/lib/Target/PTX/PTXMachineFunctionInfo.h
@@ -15,6 +15,7 @@
 #define PTX_MACHINE_FUNCTION_INFO_H
 
 #include "PTX.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/CodeGen/MachineFunction.h"
 
 namespace llvm {
@@ -25,7 +26,7 @@ class PTXMachineFunctionInfo : public MachineFunctionInfo {
 private:
   bool is_kernel;
   std::vector<unsigned> reg_arg, reg_local_var;
-  unsigned reg_ret;
+  DenseSet<unsigned> reg_ret;
   bool _isDoneAddArg;
 
 public:
@@ -39,22 +40,18 @@ public:
 
   void addArgReg(unsigned reg) { reg_arg.push_back(reg); }
   void addLocalVarReg(unsigned reg) { reg_local_var.push_back(reg); }
-  void setRetReg(unsigned reg) { reg_ret = reg; }
+  void addRetReg(unsigned reg) { reg_ret.insert(reg); }
 
   void doneAddArg(void) {
-    std::sort(reg_arg.begin(), reg_arg.end());
     _isDoneAddArg = true;
   }
-  void doneAddLocalVar(void) {
-    std::sort(reg_local_var.begin(), reg_local_var.end());
-  }
-
-  bool isDoneAddArg(void) { return _isDoneAddArg; }
+  void doneAddLocalVar(void) {}
 
   bool isKernel() const { return is_kernel; }
 
   typedef std::vector<unsigned>::const_iterator         reg_iterator;
   typedef std::vector<unsigned>::const_reverse_iterator reg_reverse_iterator;
+  typedef DenseSet<unsigned>::const_iterator            ret_iterator;
 
   bool         argRegEmpty() const { return reg_arg.empty(); }
   int          getNumArg() const { return reg_arg.size(); }
@@ -67,14 +64,22 @@ public:
   reg_iterator localVarRegBegin() const { return reg_local_var.begin(); }
   reg_iterator localVarRegEnd()   const { return reg_local_var.end(); }
 
-  unsigned retReg() const { return reg_ret; }
+  bool         retRegEmpty() const { return reg_ret.empty(); }
+  int          getNumRet() const { return reg_ret.size(); }
+  ret_iterator retRegBegin() const { return reg_ret.begin(); }
+  ret_iterator retRegEnd()   const { return reg_ret.end(); }
 
   bool isArgReg(unsigned reg) const {
-    return std::binary_search(reg_arg.begin(), reg_arg.end(), reg);
+    return std::find(reg_arg.begin(), reg_arg.end(), reg) != reg_arg.end();
+  }
+
+  bool isRetReg(unsigned reg) const {
+    return std::find(reg_ret.begin(), reg_ret.end(), reg) != reg_ret.end();
   }
 
   bool isLocalVarReg(unsigned reg) const {
-    return std::binary_search(reg_local_var.begin(), reg_local_var.end(), reg);
+    return std::find(reg_local_var.begin(), reg_local_var.end(), reg)
+      != reg_local_var.end();
   }
 }; // class PTXMachineFunctionInfo
 } // namespace llvm
diff --git a/lib/Target/PTX/PTXRegisterInfo.h b/lib/Target/PTX/PTXRegisterInfo.h
index 67e130f..dc56352 100644
--- a/lib/Target/PTX/PTXRegisterInfo.h
+++ b/lib/Target/PTX/PTXRegisterInfo.h
@@ -57,6 +57,9 @@ struct PTXRegisterInfo : public PTXGenRegisterInfo {
   virtual int getDwarfRegNum(unsigned RegNum, bool isEH) const {
     return PTXGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
   }
+  virtual int getLLVMRegNum(unsigned RegNum, bool isEH) const {
+    return PTXGenRegisterInfo::getLLVMRegNumFull(RegNum, 0);
+  }
 }; // struct PTXRegisterInfo
 } // namespace llvm
 
diff --git a/lib/Target/PTX/PTXRegisterInfo.td b/lib/Target/PTX/PTXRegisterInfo.td
index 548e3bb..08a39a8 100644
--- a/lib/Target/PTX/PTXRegisterInfo.td
+++ b/lib/Target/PTX/PTXRegisterInfo.td
@@ -29,30 +29,6 @@ def P4  : PTXReg<"p4">;
 def P5  : PTXReg<"p5">;
 def P6  : PTXReg<"p6">;
 def P7  : PTXReg<"p7">;
-def P8  : PTXReg<"p8">;
-def P9  : PTXReg<"p9">;
-def P10 : PTXReg<"p10">;
-def P11 : PTXReg<"p11">;
-def P12 : PTXReg<"p12">;
-def P13 : PTXReg<"p13">;
-def P14 : PTXReg<"p14">;
-def P15 : PTXReg<"p15">;
-def P16 : PTXReg<"p16">;
-def P17 : PTXReg<"p17">;
-def P18 : PTXReg<"p18">;
-def P19 : PTXReg<"p19">;
-def P20 : PTXReg<"p20">;
-def P21 : PTXReg<"p21">;
-def P22 : PTXReg<"p22">;
-def P23 : PTXReg<"p23">;
-def P24 : PTXReg<"p24">;
-def P25 : PTXReg<"p25">;
-def P26 : PTXReg<"p26">;
-def P27 : PTXReg<"p27">;
-def P28 : PTXReg<"p28">;
-def P29 : PTXReg<"p29">;
-def P30 : PTXReg<"p30">;
-def P31 : PTXReg<"p31">;
 
 ///===- 16-bit Integer Registers ------------------------------------------===//
 
@@ -64,30 +40,6 @@ def RH4  : PTXReg<"rh4">;
 def RH5  : PTXReg<"rh5">;
 def RH6  : PTXReg<"rh6">;
 def RH7  : PTXReg<"rh7">;
-def RH8  : PTXReg<"rh8">;
-def RH9  : PTXReg<"rh9">;
-def RH10 : PTXReg<"rh10">;
-def RH11 : PTXReg<"rh11">;
-def RH12 : PTXReg<"rh12">;
-def RH13 : PTXReg<"rh13">;
-def RH14 : PTXReg<"rh14">;
-def RH15 : PTXReg<"rh15">;
-def RH16 : PTXReg<"rh16">;
-def RH17 : PTXReg<"rh17">;
-def RH18 : PTXReg<"rh18">;
-def RH19 : PTXReg<"rh19">;
-def RH20 : PTXReg<"rh20">;
-def RH21 : PTXReg<"rh21">;
-def RH22 : PTXReg<"rh22">;
-def RH23 : PTXReg<"rh23">;
-def RH24 : PTXReg<"rh24">;
-def RH25 : PTXReg<"rh25">;
-def RH26 : PTXReg<"rh26">;
-def RH27 : PTXReg<"rh27">;
-def RH28 : PTXReg<"rh28">;
-def RH29 : PTXReg<"rh29">;
-def RH30 : PTXReg<"rh30">;
-def RH31 : PTXReg<"rh31">;
 
 ///===- 32-bit Integer Registers ------------------------------------------===//
 
@@ -99,30 +51,6 @@ def R4  : PTXReg<"r4">;
 def R5  : PTXReg<"r5">;
 def R6  : PTXReg<"r6">;
 def R7  : PTXReg<"r7">;
-def R8  : PTXReg<"r8">;
-def R9  : PTXReg<"r9">;
-def R10 : PTXReg<"r10">;
-def R11 : PTXReg<"r11">;
-def R12 : PTXReg<"r12">;
-def R13 : PTXReg<"r13">;
-def R14 : PTXReg<"r14">;
-def R15 : PTXReg<"r15">;
-def R16 : PTXReg<"r16">;
-def R17 : PTXReg<"r17">;
-def R18 : PTXReg<"r18">;
-def R19 : PTXReg<"r19">;
-def R20 : PTXReg<"r20">;
-def R21 : PTXReg<"r21">;
-def R22 : PTXReg<"r22">;
-def R23 : PTXReg<"r23">;
-def R24 : PTXReg<"r24">;
-def R25 : PTXReg<"r25">;
-def R26 : PTXReg<"r26">;
-def R27 : PTXReg<"r27">;
-def R28 : PTXReg<"r28">;
-def R29 : PTXReg<"r29">;
-def R30 : PTXReg<"r30">;
-def R31 : PTXReg<"r31">;
 
 ///===- 64-bit Integer Registers ------------------------------------------===//
 
@@ -134,138 +62,14 @@ def RD4  : PTXReg<"rd4">;
 def RD5  : PTXReg<"rd5">;
 def RD6  : PTXReg<"rd6">;
 def RD7  : PTXReg<"rd7">;
-def RD8  : PTXReg<"rd8">;
-def RD9  : PTXReg<"rd9">;
-def RD10 : PTXReg<"rd10">;
-def RD11 : PTXReg<"rd11">;
-def RD12 : PTXReg<"rd12">;
-def RD13 : PTXReg<"rd13">;
-def RD14 : PTXReg<"rd14">;
-def RD15 : PTXReg<"rd15">;
-def RD16 : PTXReg<"rd16">;
-def RD17 : PTXReg<"rd17">;
-def RD18 : PTXReg<"rd18">;
-def RD19 : PTXReg<"rd19">;
-def RD20 : PTXReg<"rd20">;
-def RD21 : PTXReg<"rd21">;
-def RD22 : PTXReg<"rd22">;
-def RD23 : PTXReg<"rd23">;
-def RD24 : PTXReg<"rd24">;
-def RD25 : PTXReg<"rd25">;
-def RD26 : PTXReg<"rd26">;
-def RD27 : PTXReg<"rd27">;
-def RD28 : PTXReg<"rd28">;
-def RD29 : PTXReg<"rd29">;
-def RD30 : PTXReg<"rd30">;
-def RD31 : PTXReg<"rd31">;
-
-///===- 32-bit Floating-Point Registers -----------------------------------===//
-
-def F0  : PTXReg<"f0">;
-def F1  : PTXReg<"f1">;
-def F2  : PTXReg<"f2">;
-def F3  : PTXReg<"f3">;
-def F4  : PTXReg<"f4">;
-def F5  : PTXReg<"f5">;
-def F6  : PTXReg<"f6">;
-def F7  : PTXReg<"f7">;
-def F8  : PTXReg<"f8">;
-def F9  : PTXReg<"f9">;
-def F10 : PTXReg<"f10">;
-def F11 : PTXReg<"f11">;
-def F12 : PTXReg<"f12">;
-def F13 : PTXReg<"f13">;
-def F14 : PTXReg<"f14">;
-def F15 : PTXReg<"f15">;
-def F16 : PTXReg<"f16">;
-def F17 : PTXReg<"f17">;
-def F18 : PTXReg<"f18">;
-def F19 : PTXReg<"f19">;
-def F20 : PTXReg<"f20">;
-def F21 : PTXReg<"f21">;
-def F22 : PTXReg<"f22">;
-def F23 : PTXReg<"f23">;
-def F24 : PTXReg<"f24">;
-def F25 : PTXReg<"f25">;
-def F26 : PTXReg<"f26">;
-def F27 : PTXReg<"f27">;
-def F28 : PTXReg<"f28">;
-def F29 : PTXReg<"f29">;
-def F30 : PTXReg<"f30">;
-def F31 : PTXReg<"f31">;
-
-///===- 64-bit Floating-Point Registers -----------------------------------===//
-
-def FD0  : PTXReg<"fd0">;
-def FD1  : PTXReg<"fd1">;
-def FD2  : PTXReg<"fd2">;
-def FD3  : PTXReg<"fd3">;
-def FD4  : PTXReg<"fd4">;
-def FD5  : PTXReg<"fd5">;
-def FD6  : PTXReg<"fd6">;
-def FD7  : PTXReg<"fd7">;
-def FD8  : PTXReg<"fd8">;
-def FD9  : PTXReg<"fd9">;
-def FD10 : PTXReg<"fd10">;
-def FD11 : PTXReg<"fd11">;
-def FD12 : PTXReg<"fd12">;
-def FD13 : PTXReg<"fd13">;
-def FD14 : PTXReg<"fd14">;
-def FD15 : PTXReg<"fd15">;
-def FD16 : PTXReg<"fd16">;
-def FD17 : PTXReg<"fd17">;
-def FD18 : PTXReg<"fd18">;
-def FD19 : PTXReg<"fd19">;
-def FD20 : PTXReg<"fd20">;
-def FD21 : PTXReg<"fd21">;
-def FD22 : PTXReg<"fd22">;
-def FD23 : PTXReg<"fd23">;
-def FD24 : PTXReg<"fd24">;
-def FD25 : PTXReg<"fd25">;
-def FD26 : PTXReg<"fd26">;
-def FD27 : PTXReg<"fd27">;
-def FD28 : PTXReg<"fd28">;
-def FD29 : PTXReg<"fd29">;
-def FD30 : PTXReg<"fd30">;
-def FD31 : PTXReg<"fd31">;
-
 
 //===----------------------------------------------------------------------===//
 //  Register classes
 //===----------------------------------------------------------------------===//
 
-def Preds : RegisterClass<"PTX", [i1], 8,
-                          [P0, P1, P2, P3, P4, P5, P6, P7,
-                           P8, P9, P10, P11, P12, P13, P14, P15,
-                           P16, P17, P18, P19, P20, P21, P22, P23,
-                           P24, P25, P26, P27, P28, P29, P30, P31]>;
-
-def RRegu16 : RegisterClass<"PTX", [i16], 16,
-                            [RH0, RH1, RH2, RH3, RH4, RH5, RH6, RH7,
-                             RH8, RH9, RH10, RH11, RH12, RH13, RH14, RH15,
-                             RH16, RH17, RH18, RH19, RH20, RH21, RH22, RH23,
-                             RH24, RH25, RH26, RH27, RH28, RH29, RH30, RH31]>;
-
-def RRegu32 : RegisterClass<"PTX", [i32], 32,
-                            [R0, R1, R2, R3, R4, R5, R6, R7,
-                             R8, R9, R10, R11, R12, R13, R14, R15,
-                             R16, R17, R18, R19, R20, R21, R22, R23,
-                             R24, R25, R26, R27, R28, R29, R30, R31]>;
-
-def RRegu64 : RegisterClass<"PTX", [i64], 64,
-                            [RD0, RD1, RD2, RD3, RD4, RD5, RD6, RD7,
-                             RD8, RD9, RD10, RD11, RD12, RD13, RD14, RD15,
-                             RD16, RD17, RD18, RD19, RD20, RD21, RD22, RD23,
-                             RD24, RD25, RD26, RD27, RD28, RD29, RD30, RD31]>;
-
-def RRegf32 : RegisterClass<"PTX", [f32], 32,
-                            [F0, F1, F2, F3, F4, F5, F6, F7,
-                             F8, F9, F10, F11, F12, F13, F14, F15,
-                             F16, F17, F18, F19, F20, F21, F22, F23,
-                             F24, F25, F26, F27, F28, F29, F30, F31]>;
-
-def RRegf64 : RegisterClass<"PTX", [f64], 64,
-                            [FD0, FD1, FD2, FD3, FD4, FD5, FD6, FD7,
-                             FD8, FD9, FD10, FD11, FD12, FD13, FD14, FD15,
-                             FD16, FD17, FD18, FD19, FD20, FD21, FD22, FD23,
-                             FD24, FD25, FD26, FD27, FD28, FD29, FD30, FD31]>;
+def RegPred : RegisterClass<"PTX", [i1], 8, (sequence "P%u", 0, 7)>;
+def RegI16  : RegisterClass<"PTX", [i16], 16, (sequence "RH%u", 0, 7)>;
+def RegI32  : RegisterClass<"PTX", [i32], 32, (sequence "R%u",  0, 7)>;
+def RegI64  : RegisterClass<"PTX", [i64], 64, (sequence "RD%u", 0, 7)>;
+def RegF32  : RegisterClass<"PTX", [f32], 32, (sequence "R%u",  0, 7)>;
+def RegF64  : RegisterClass<"PTX", [f64], 64, (sequence "RD%u", 0, 7)>;
diff --git a/lib/Target/PTX/PTXSubtarget.cpp b/lib/Target/PTX/PTXSubtarget.cpp
index 527622d..e8a1dfe 100644
--- a/lib/Target/PTX/PTXSubtarget.cpp
+++ b/lib/Target/PTX/PTXSubtarget.cpp
@@ -16,11 +16,13 @@
 
 using namespace llvm;
 
-PTXSubtarget::PTXSubtarget(const std::string &TT, const std::string &FS)
+PTXSubtarget::PTXSubtarget(const std::string &TT, const std::string &FS,
+                           bool is64Bit)
   : PTXShaderModel(PTX_SM_1_0),
     PTXVersion(PTX_VERSION_2_0),
     SupportsDouble(false),
-    Use64BitAddresses(false) {
+    SupportsFMA(true),
+    Is64Bit(is64Bit) {	
   std::string TARGET = "generic";
   ParseSubtargetFeatures(FS, TARGET);
 }
@@ -40,6 +42,7 @@ std::string PTXSubtarget::getPTXVersionString() const {
     case PTX_VERSION_2_0: return "2.0";
     case PTX_VERSION_2_1: return "2.1";
     case PTX_VERSION_2_2: return "2.2";
+    case PTX_VERSION_2_3: return "2.3";
   }
 }
 
diff --git a/lib/Target/PTX/PTXSubtarget.h b/lib/Target/PTX/PTXSubtarget.h
index 57cd43d..c8f8c3b 100644
--- a/lib/Target/PTX/PTXSubtarget.h
+++ b/lib/Target/PTX/PTXSubtarget.h
@@ -37,7 +37,8 @@ namespace llvm {
       enum PTXVersionEnum {
         PTX_VERSION_2_0,  /*< PTX Version 2.0 */
         PTX_VERSION_2_1,  /*< PTX Version 2.1 */
-        PTX_VERSION_2_2   /*< PTX Version 2.2 */
+        PTX_VERSION_2_2,  /*< PTX Version 2.2 */
+        PTX_VERSION_2_3   /*< PTX Version 2.3 */
       };
 
       /// Shader Model supported on the target GPU.
@@ -49,11 +50,15 @@ namespace llvm {
       // The native .f64 type is supported on the hardware.
       bool SupportsDouble;
 
+      // Support the fused-multiply add (FMA) and multiply-add (MAD)
+      // instructions
+      bool SupportsFMA;
+
       // Use .u64 instead of .u32 for addresses.
-      bool Use64BitAddresses;
+      bool Is64Bit;
 
     public:
-      PTXSubtarget(const std::string &TT, const std::string &FS);
+      PTXSubtarget(const std::string &TT, const std::string &FS, bool is64Bit);
 
       std::string getTargetString() const;
 
@@ -61,7 +66,9 @@ namespace llvm {
 
       bool supportsDouble() const { return SupportsDouble; }
 
-      bool use64BitAddresses() const { return Use64BitAddresses; }
+      bool is64Bit() const { return Is64Bit; }
+
+      bool supportsFMA() const { return SupportsFMA; }
 
       bool supportsSM13() const { return PTXShaderModel >= PTX_SM_1_3; }
 
@@ -71,6 +78,8 @@ namespace llvm {
 
       bool supportsPTX22() const { return PTXVersion >= PTX_VERSION_2_2; }
 
+      bool supportsPTX23() const { return PTXVersion >= PTX_VERSION_2_3; }
+
       std::string ParseSubtargetFeatures(const std::string &FS,
                                          const std::string &CPU);
   }; // class PTXSubtarget
diff --git a/lib/Target/PTX/PTXTargetMachine.cpp b/lib/Target/PTX/PTXTargetMachine.cpp
index 4701a94..1b737c9 100644
--- a/lib/Target/PTX/PTXTargetMachine.cpp
+++ b/lib/Target/PTX/PTXTargetMachine.cpp
@@ -23,6 +23,7 @@ using namespace llvm;
 namespace llvm {
   MCStreamer *createPTXAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
                                    bool isVerboseAsm, bool useLoc,
+                                   bool useCFI,
                                    MCInstPrinter *InstPrint,
                                    MCCodeEmitter *CE,
                                    TargetAsmBackend *TAB,
@@ -30,9 +31,15 @@ namespace llvm {
 }
 
 extern "C" void LLVMInitializePTXTarget() {
-  RegisterTargetMachine<PTXTargetMachine> X(ThePTXTarget);
-  RegisterAsmInfo<PTXMCAsmInfo> Y(ThePTXTarget);
-  TargetRegistry::RegisterAsmStreamer(ThePTXTarget, createPTXAsmStreamer);
+
+  RegisterTargetMachine<PTX32TargetMachine> X(ThePTX32Target);
+  RegisterTargetMachine<PTX64TargetMachine> Y(ThePTX64Target);
+
+  RegisterAsmInfo<PTXMCAsmInfo> Z(ThePTX32Target);
+  RegisterAsmInfo<PTXMCAsmInfo> W(ThePTX64Target);
+
+  TargetRegistry::RegisterAsmStreamer(ThePTX32Target, createPTXAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(ThePTX64Target, createPTXAsmStreamer);
 }
 
 namespace {
@@ -45,18 +52,28 @@ namespace {
 // DataLayout and FrameLowering are filled with dummy data
 PTXTargetMachine::PTXTargetMachine(const Target &T,
                                    const std::string &TT,
-                                   const std::string &FS)
+                                   const std::string &FS,
+                                   bool is64Bit)
   : LLVMTargetMachine(T, TT),
-    // FIXME: This feels like a dirty hack, but Subtarget does not appear to be
-    //        initialized at this point, and we need to finish initialization of
-    //        DataLayout.
-    DataLayout((FS.find("64bit") != FS.npos) ? DataLayout64 : DataLayout32),
-    Subtarget(TT, FS),
+    DataLayout(is64Bit ? DataLayout64 : DataLayout32),
+    Subtarget(TT, FS, is64Bit),
     FrameLowering(Subtarget),
     InstrInfo(*this),
     TLInfo(*this) {
 }
 
+PTX32TargetMachine::PTX32TargetMachine(const Target &T,
+                                       const std::string& TT,
+                                       const std::string& FS)
+  : PTXTargetMachine(T, TT, FS, false) {
+}
+
+PTX64TargetMachine::PTX64TargetMachine(const Target &T,
+                                       const std::string& TT,
+                                       const std::string& FS)
+  : PTXTargetMachine(T, TT, FS, true) {
+}
+
 bool PTXTargetMachine::addInstSelector(PassManagerBase &PM,
                                        CodeGenOpt::Level OptLevel) {
   PM.add(createPTXISelDag(*this, OptLevel));
diff --git a/lib/Target/PTX/PTXTargetMachine.h b/lib/Target/PTX/PTXTargetMachine.h
index a5dba53..149be8e 100644
--- a/lib/Target/PTX/PTXTargetMachine.h
+++ b/lib/Target/PTX/PTXTargetMachine.h
@@ -33,7 +33,7 @@ class PTXTargetMachine : public LLVMTargetMachine {
 
   public:
     PTXTargetMachine(const Target &T, const std::string &TT,
-                     const std::string &FS);
+                     const std::string &FS, bool is64Bit);
 
     virtual const TargetData *getTargetData() const { return &DataLayout; }
 
@@ -55,6 +55,22 @@ class PTXTargetMachine : public LLVMTargetMachine {
     virtual bool addPostRegAlloc(PassManagerBase &PM,
                                  CodeGenOpt::Level OptLevel);
 }; // class PTXTargetMachine
+
+
+class PTX32TargetMachine : public PTXTargetMachine {
+public:
+
+  PTX32TargetMachine(const Target &T, const std::string &TT,
+                     const std::string& FS);
+}; // class PTX32TargetMachine
+
+class PTX64TargetMachine : public PTXTargetMachine {
+public:
+
+  PTX64TargetMachine(const Target &T, const std::string &TT,
+                     const std::string& FS);
+}; // class PTX32TargetMachine
+
 } // namespace llvm
 
 #endif // PTX_TARGET_MACHINE_H
diff --git a/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp b/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp
index a577d77..9df6c75 100644
--- a/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp
+++ b/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp
@@ -13,9 +13,13 @@
 
 using namespace llvm;
 
-Target llvm::ThePTXTarget;
+Target llvm::ThePTX32Target;
+Target llvm::ThePTX64Target;
 
 extern "C" void LLVMInitializePTXTargetInfo() {
   // see llvm/ADT/Triple.h
-  RegisterTarget<Triple::ptx> X(ThePTXTarget, "ptx", "PTX");
+  RegisterTarget<Triple::ptx32> X32(ThePTX32Target, "ptx32",
+                                    "PTX (32-bit) [Experimental]");
+  RegisterTarget<Triple::ptx64> X64(ThePTX64Target, "ptx64",
+                                    "PTX (64-bit) [Experimental]");
 }
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index c8db0c4..1a9bd76 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -26,6 +26,9 @@ StringRef PPCInstPrinter::getOpcodeName(unsigned Opcode) const {
   return getInstructionName(Opcode);
 }
 
+void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << getRegisterName(RegNo);
+}
 
 void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
   // Check for slwi/srwi mnemonics.
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
index 9cf9db9..adfa0aa 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
@@ -33,6 +33,7 @@ public:
     return SyntaxVariant == 1;
   }
   
+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
   virtual void printInst(const MCInst *MI, raw_ostream &O);
   virtual StringRef getOpcodeName(unsigned Opcode) const;
   
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
index 7242f3a..92672b5 100644
--- a/lib/Target/PowerPC/PPC.h
+++ b/lib/Target/PowerPC/PPC.h
@@ -43,7 +43,7 @@ namespace llvm {
   TargetAsmBackend *createPPCAsmBackend(const Target &, const std::string &);
   
   void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
-                                    AsmPrinter &AP);
+                                    AsmPrinter &AP, bool isDarwin);
   
   extern Target ThePPC32Target;
   extern Target ThePPC64Target;
diff --git a/lib/Target/PowerPC/PPCAsmBackend.cpp b/lib/Target/PowerPC/PPCAsmBackend.cpp
index c4d4ac9..f562a3f 100644
--- a/lib/Target/PowerPC/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/PPCAsmBackend.cpp
@@ -110,10 +110,8 @@ namespace {
 
 TargetAsmBackend *llvm::createPPCAsmBackend(const Target &T,
                                             const std::string &TT) {
-  switch (Triple(TT).getOS()) {
-  case Triple::Darwin:
+  if (Triple(TT).isOSDarwin())
     return new DarwinPPCAsmBackend(T);
-  default:
-    return 0;
-  }
+
+  return 0;
 }
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 09a9be9..b795db9 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -344,7 +344,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   }
   case PPC::LDtoc: {
     // Transform %X3 = LDtoc <ga:@min1>, %X2
-    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
       
     // Change the opcode to LD, and the global address operand to be a
     // reference to the TOC entry we will synthesize later.
@@ -376,7 +376,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     return;
   }
 
-  LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
+  LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
   OutStreamer.EmitInstruction(TmpInst);
 }
 
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index 6aca6b0..375e000 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -259,8 +259,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineModuleInfo &MMI = MF.getMMI();
   DebugLoc dl;
   bool needsFrameMoves = MMI.hasDebugInfo() ||
-       !MF.getFunction()->doesNotThrow() ||
-       UnwindTablesMandatory;
+    MF.getFunction()->needsUnwindTableEntry();
 
   // Prepare for frame info.
   MCSymbol *FrameLabel = 0;
@@ -488,6 +487,14 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
       int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
       unsigned Reg = CSI[I].getReg();
       if (Reg == PPC::LR || Reg == PPC::LR8 || Reg == PPC::RM) continue;
+
+      // This is a bit of a hack: CR2LT, CR2GT, CR2EQ and CR2UN are just
+      // subregisters of CR2. We just need to emit a move of CR2.
+      if (Reg == PPC::CR2LT || Reg == PPC::CR2GT || Reg == PPC::CR2EQ)
+        continue;
+      if (Reg == PPC::CR2UN)
+        Reg = PPC::CR2;
+
       MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
       MachineLocation CSSrc(Reg);
       Moves.push_back(MachineMove(Label, CSDst, CSSrc));
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
index 0de5844..74ecff5 100644
--- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -233,7 +233,7 @@ void PPCHazardRecognizer970::EmitInstruction(SUnit *SU) {
   unsigned Opcode = Node->getMachineOpcode();
 
   // Update structural hazard information.
-  if (Opcode == PPC::MTCTR) HasCTRSet = true;
+  if (Opcode == PPC::MTCTR || Opcode == PPC::MTCTR8) HasCTRSet = true;
 
   // Track the address stored to.
   if (isStore) {
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index faae9b2..511bb22 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -240,11 +240,11 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
 
     if (PPCLowering.getPointerTy() == MVT::i32) {
       GlobalBaseReg = RegInfo->createVirtualRegister(PPC::GPRCRegisterClass);
-      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR), PPC::LR);
+      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR));
       BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
     } else {
       GlobalBaseReg = RegInfo->createVirtualRegister(PPC::G8RCRegisterClass);
-      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8), PPC::LR8);
+      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8));
       BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR8), GlobalBaseReg);
     }
   }
@@ -1057,9 +1057,10 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
     SDValue Chain = N->getOperand(0);
     SDValue Target = N->getOperand(1);
     unsigned Opc = Target.getValueType() == MVT::i32 ? PPC::MTCTR : PPC::MTCTR8;
+    unsigned Reg = Target.getValueType() == MVT::i32 ? PPC::BCTR : PPC::BCTR8;
     Chain = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Target,
                                            Chain), 0);
-    return CurDAG->SelectNodeTo(N, PPC::BCTR, MVT::Other, Chain);
+    return CurDAG->SelectNodeTo(N, Reg, MVT::Other, Chain);
   }
   }
 
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 46b97e1..55c15ec 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -394,6 +394,10 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
     setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
   }
 
+  setMinFunctionAlignment(2);
+  if (PPCSubTarget.isDarwin())
+    setPrefFunctionAlignment(4);
+
   computeRegisterProperties();
 }
 
@@ -460,14 +464,6 @@ MVT::SimpleValueType PPCTargetLowering::getSetCCResultType(EVT VT) const {
   return MVT::i32;
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned PPCTargetLowering::getFunctionAlignment(const Function *F) const {
-  if (getTargetMachine().getSubtarget<PPCSubtarget>().isDarwin())
-    return F->hasFnAttr(Attribute::OptimizeForSize) ? 2 : 4;
-  else
-    return 2;
-}
-
 //===----------------------------------------------------------------------===//
 // Node matching predicates, for use by the tblgen matching code.
 //===----------------------------------------------------------------------===//
@@ -1014,7 +1010,8 @@ bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp,
       short Imm;
       if (isIntS16Immediate(CN, Imm)) {
         Disp = DAG.getTargetConstant((unsigned short)Imm >> 2, getPointerTy());
-        Base = DAG.getRegister(PPC::R0, CN->getValueType(0));
+        Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::X0 : PPC::R0,
+                               CN->getValueType(0));
         return true;
       }
 
@@ -1561,8 +1558,8 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
-                 *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
 
   // Reserve space for the linkage area on the stack.
   CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize);
@@ -1622,8 +1619,8 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
   // Aggregates passed by value are stored in the local variable space of the
   // caller's stack frame, right above the parameter list area.
   SmallVector<CCValAssign, 16> ByValArgLocs;
-  CCState CCByValInfo(CallConv, isVarArg, getTargetMachine(),
-                      ByValArgLocs, *DAG.getContext());
+  CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		      getTargetMachine(), ByValArgLocs, *DAG.getContext());
 
   // Reserve stack space for the allocations in CCInfo.
   CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
@@ -2155,7 +2152,7 @@ CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG,
 }
 
 /// CalculateTailCallSPDiff - Get the amount the stack pointer has to be
-/// adjusted to accomodate the arguments for the tailcall.
+/// adjusted to accommodate the arguments for the tailcall.
 static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
                                    unsigned ParamSize) {
 
@@ -2396,7 +2393,7 @@ void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
   // Emit a sequence of copyto/copyfrom virtual registers for arguments that
   // might overwrite each other in case of tail call optimization.
   SmallVector<SDValue, 8> MemOpChains2;
-  // Do not flag preceeding copytoreg stuff together with the following stuff.
+  // Do not flag preceding copytoreg stuff together with the following stuff.
   InFlag = SDValue();
   StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments,
                                     MemOpChains2, dl);
@@ -2444,7 +2441,8 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
     if (!DAG.getTarget().getSubtarget<PPCSubtarget>().isJITCodeModel()) {
       unsigned OpFlags = 0;
       if (DAG.getTarget().getRelocationModel() != Reloc::Static &&
-          PPCSubTarget.getDarwinVers() < 9 &&
+          (!PPCSubTarget.getTargetTriple().isMacOSX() ||
+           PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5)) &&
           (G->getGlobal()->isDeclaration() ||
            G->getGlobal()->isWeakForLinker())) {
         // PC-relative references to external symbols should go through $stub,
@@ -2467,7 +2465,8 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
     unsigned char OpFlags = 0;
 
     if (DAG.getTarget().getRelocationModel() != Reloc::Static &&
-        PPCSubTarget.getDarwinVers() < 9) {
+        (!PPCSubTarget.getTargetTriple().isMacOSX() ||
+         PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5))) {
       // PC-relative references to external symbols should go through $stub,
       // unless we're building with the leopard linker or later, which
       // automatically synthesizes these stubs.
@@ -2563,7 +2562,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
     Callee.setNode(0);
     // Add CTR register as callee so a bctr can be emitted later.
     if (isTailCall)
-      Ops.push_back(DAG.getRegister(PPC::CTR, PtrVT));
+      Ops.push_back(DAG.getRegister(isPPC64 ? PPC::CTR8 : PPC::CTR, PtrVT));
   }
 
   // If this is a direct call, pass the chain and the callee.
@@ -2592,8 +2591,8 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                    SmallVectorImpl<SDValue> &InVals) const {
 
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCRetInfo(CallConv, isVarArg, getTargetMachine(),
-                    RVLocs, *DAG.getContext());
+  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		    getTargetMachine(), RVLocs, *DAG.getContext());
   CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC);
 
   // Copy all of the result registers out of their specified physreg.
@@ -2642,8 +2641,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
     // to the liveout set for the function.
     if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
       SmallVector<CCValAssign, 16> RVLocs;
-      CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
-                     *DAG.getContext());
+      CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		     getTargetMachine(), RVLocs, *DAG.getContext());
       CCInfo.AnalyzeCallResult(Ins, RetCC_PPC);
       for (unsigned i = 0; i != RVLocs.size(); ++i)
         DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
@@ -2756,8 +2755,8 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
 
   // Assign locations to all of the outgoing arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
 
   // Reserve space for the linkage area on the stack.
   CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize);
@@ -2796,8 +2795,8 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
 
   // Assign locations to all of the outgoing aggregate by value arguments.
   SmallVector<CCValAssign, 16> ByValArgLocs;
-  CCState CCByValInfo(CallConv, isVarArg, getTargetMachine(), ByValArgLocs,
-                      *DAG.getContext());
+  CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		      getTargetMachine(), ByValArgLocs, *DAG.getContext());
 
   // Reserve stack space for the allocations in CCInfo.
   CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
@@ -2903,6 +2902,12 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                         &MemOpChains[0], MemOpChains.size());
 
+  // Set CR6 to true if this is a vararg call.
+  if (isVarArg) {
+    SDValue SetCR(DAG.getMachineNode(PPC::CRSET, dl, MVT::i32), 0);
+    RegsToPass.push_back(std::make_pair(unsigned(PPC::CR1EQ), SetCR));
+  }
+
   // Build a sequence of copy-to-reg nodes chained together with token chain
   // and flag operands which copy the outgoing args into the appropriate regs.
   SDValue InFlag;
@@ -2912,13 +2917,6 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
     InFlag = Chain.getValue(1);
   }
 
-  // Set CR6 to true if this is a vararg call.
-  if (isVarArg) {
-    SDValue SetCR(DAG.getMachineNode(PPC::CRSET, dl, MVT::i32), 0);
-    Chain = DAG.getCopyToReg(Chain, dl, PPC::CR1EQ, SetCR, InFlag);
-    InFlag = Chain.getValue(1);
-  }
-
   if (isTailCall)
     PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp,
                     false, TailCallArguments);
@@ -3304,8 +3302,8 @@ PPCTargetLowering::LowerReturn(SDValue Chain,
                                DebugLoc dl, SelectionDAG &DAG) const {
 
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
   CCInfo.AnalyzeReturn(Outs, RetCC_PPC);
 
   // If this is the first return lowered for this function, add the regs to the
@@ -5440,10 +5438,16 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
 
 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
 /// vector.  If it is invalid, don't add anything to Ops.
-void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, char Letter,
+void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                     std::string &Constraint,
                                                      std::vector<SDValue>&Ops,
                                                      SelectionDAG &DAG) const {
   SDValue Result(0,0);
+
+  // Only support length 1 constraints.
+  if (Constraint.length() > 1) return;
+
+  char Letter = Constraint[0];
   switch (Letter) {
   default: break;
   case 'I':
@@ -5499,7 +5503,7 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, char Letter,
   }
 
   // Handle standard constraint letters.
-  TargetLowering::LowerAsmOperandForConstraint(Op, Letter, Ops, DAG);
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
 }
 
 // isLegalAddressingMode - Return true if the addressing mode represented
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 33daae9..986b4e7 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -328,7 +328,7 @@ namespace llvm {
     /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
     /// vector.  If it is invalid, don't add anything to Ops.
     virtual void LowerAsmOperandForConstraint(SDValue Op,
-                                              char ConstraintLetter,
+                                              std::string &Constraint,
                                               std::vector<SDValue> &Ops,
                                               SelectionDAG &DAG) const;
 
@@ -364,9 +364,6 @@ namespace llvm {
                         bool NonScalarIntSafe, bool MemcpyStrSrc,
                         MachineFunction &MF) const;
 
-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const;
-
   private:
     SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
     SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index 9f0fae5..e88ad37 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -60,7 +60,7 @@ def HI48_64 : SDNodeXForm<imm, [{
 //
 
 let Defs = [LR8] in
-  def MovePCtoLR8 : Pseudo<(outs), (ins piclabel:$label), "", []>,
+  def MovePCtoLR8 : Pseudo<(outs), (ins), "", []>,
                     PPC970_Unit_BRU;
 
 // Darwin ABI Calls.
@@ -190,10 +190,15 @@ def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset, variable_ops)
 
 
 let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
-    isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR, RM] in
-def TAILBCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>,
-    Requires<[In64BitMode]>;
+    isIndirectBranch = 1, isCall = 1, Uses = [CTR8, RM] in {
+  let isReturn = 1 in {
+    def TAILBCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>,
+        Requires<[In64BitMode]>;
+  }
 
+  def BCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>,
+      Requires<[In64BitMode]>;
+}
 
 
 let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 24071b7..773578c 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -300,7 +300,6 @@ def calltarget : Operand<iPTR> {
 def aaddr : Operand<iPTR> {
   let PrintMethod = "printAbsAddrOperand";
 }
-def piclabel: Operand<iPTR> {}
 def symbolHi: Operand<i32> {
   let PrintMethod = "printSymbolHi";
   let EncoderMethod = "getHA16Encoding";
@@ -413,7 +412,7 @@ let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
 }
 
 let Defs = [LR] in
-  def MovePCtoLR : Pseudo<(outs), (ins piclabel:$label), "", []>,
+  def MovePCtoLR : Pseudo<(outs), (ins), "", []>,
                    PPC970_Unit_BRU;
 
 let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
diff --git a/lib/Target/PowerPC/PPCJITInfo.cpp b/lib/Target/PowerPC/PPCJITInfo.cpp
index 78383e0..4590f00 100644
--- a/lib/Target/PowerPC/PPCJITInfo.cpp
+++ b/lib/Target/PowerPC/PPCJITInfo.cpp
@@ -87,7 +87,7 @@ asm(
     // FIXME: could shrink frame
     // Set up a proper stack frame
     // FIXME Layout
-    //   PowerPC64 ABI linkage    -  24 bytes
+    //   PowerPC32 ABI linkage    -  24 bytes
     //                 parameters -  32 bytes
     //   13 double registers      - 104 bytes
     //   8 int registers          -  32 bytes
@@ -205,11 +205,27 @@ void PPC32CompilationCallback() {
 
 #if (defined(__POWERPC__) || defined (__ppc__) || defined(_POWER)) && \
     defined(__ppc64__)
+#ifdef __ELF__
+asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl PPC64CompilationCallback\n"
+    ".section \".opd\",\"aw\"\n"
+    ".align 3\n"
+"PPC64CompilationCallback:\n"
+    ".quad .L.PPC64CompilationCallback,.TOC.@tocbase,0\n"
+    ".size PPC64CompilationCallback,24\n"
+    ".previous\n"
+    ".align 4\n"
+    ".type PPC64CompilationCallback,@function\n"
+".L.PPC64CompilationCallback:\n"
+#else
 asm(
     ".text\n"
     ".align 2\n"
     ".globl _PPC64CompilationCallback\n"
 "_PPC64CompilationCallback:\n"
+#endif
     // Make space for 8 ints r[3-10] and 13 doubles f[1-13] and the 
     // FIXME: need to save v[0-19] for altivec?
     // Set up a proper stack frame
@@ -218,49 +234,55 @@ asm(
     //                 parameters -  64 bytes
     //   13 double registers      - 104 bytes
     //   8 int registers          -  64 bytes
-    "mflr r0\n"
-    "std r0,  16(r1)\n"
-    "stdu r1, -280(r1)\n"
+    "mflr 0\n"
+    "std  0,  16(1)\n"
+    "stdu 1, -280(1)\n"
     // Save all int arg registers
-    "std r10, 272(r1)\n"    "std r9,  264(r1)\n"
-    "std r8,  256(r1)\n"    "std r7,  248(r1)\n"
-    "std r6,  240(r1)\n"    "std r5,  232(r1)\n"
-    "std r4,  224(r1)\n"    "std r3,  216(r1)\n"
+    "std 10, 272(1)\n"    "std 9,  264(1)\n"
+    "std 8,  256(1)\n"    "std 7,  248(1)\n"
+    "std 6,  240(1)\n"    "std 5,  232(1)\n"
+    "std 4,  224(1)\n"    "std 3,  216(1)\n"
     // Save all call-clobbered FP regs.
-    "stfd f13, 208(r1)\n"    "stfd f12, 200(r1)\n"
-    "stfd f11, 192(r1)\n"    "stfd f10, 184(r1)\n"
-    "stfd f9,  176(r1)\n"    "stfd f8,  168(r1)\n"
-    "stfd f7,  160(r1)\n"    "stfd f6,  152(r1)\n"
-    "stfd f5,  144(r1)\n"    "stfd f4,  136(r1)\n"
-    "stfd f3,  128(r1)\n"    "stfd f2,  120(r1)\n"
-    "stfd f1,  112(r1)\n"
+    "stfd 13, 208(1)\n"    "stfd 12, 200(1)\n"
+    "stfd 11, 192(1)\n"    "stfd 10, 184(1)\n"
+    "stfd 9,  176(1)\n"    "stfd 8,  168(1)\n"
+    "stfd 7,  160(1)\n"    "stfd 6,  152(1)\n"
+    "stfd 5,  144(1)\n"    "stfd 4,  136(1)\n"
+    "stfd 3,  128(1)\n"    "stfd 2,  120(1)\n"
+    "stfd 1,  112(1)\n"
     // Arguments to Compilation Callback:
     // r3 - our lr (address of the call instruction in stub plus 4)
     // r4 - stub's lr (address of instruction that called the stub plus 4)
     // r5 - is64Bit - always 1.
-    "mr   r3, r0\n"
-    "ld   r2, 280(r1)\n" // stub's frame
-    "ld   r4, 16(r2)\n"  // stub's lr
-    "li   r5, 1\n"       // 1 == 64 bit
+    "mr   3, 0\n"      // return address (still in r0)
+    "ld   5, 280(1)\n" // stub's frame
+    "ld   4, 16(5)\n"  // stub's lr
+    "li   5, 1\n"      // 1 == 64 bit
+#ifdef __ELF__
+    "bl PPCCompilationCallbackC\n"
+    "nop\n"
+#else
     "bl _PPCCompilationCallbackC\n"
-    "mtctr r3\n"
+#endif
+    "mtctr 3\n"
     // Restore all int arg registers
-    "ld r10, 272(r1)\n"    "ld r9,  264(r1)\n"
-    "ld r8,  256(r1)\n"    "ld r7,  248(r1)\n"
-    "ld r6,  240(r1)\n"    "ld r5,  232(r1)\n"
-    "ld r4,  224(r1)\n"    "ld r3,  216(r1)\n"
+    "ld 10, 272(1)\n"    "ld 9,  264(1)\n"
+    "ld 8,  256(1)\n"    "ld 7,  248(1)\n"
+    "ld 6,  240(1)\n"    "ld 5,  232(1)\n"
+    "ld 4,  224(1)\n"    "ld 3,  216(1)\n"
     // Restore all FP arg registers
-    "lfd f13, 208(r1)\n"    "lfd f12, 200(r1)\n"
-    "lfd f11, 192(r1)\n"    "lfd f10, 184(r1)\n"
-    "lfd f9,  176(r1)\n"    "lfd f8,  168(r1)\n"
-    "lfd f7,  160(r1)\n"    "lfd f6,  152(r1)\n"
-    "lfd f5,  144(r1)\n"    "lfd f4,  136(r1)\n"
-    "lfd f3,  128(r1)\n"    "lfd f2,  120(r1)\n"
-    "lfd f1,  112(r1)\n"
+    "lfd 13, 208(1)\n"    "lfd 12, 200(1)\n"
+    "lfd 11, 192(1)\n"    "lfd 10, 184(1)\n"
+    "lfd 9,  176(1)\n"    "lfd 8,  168(1)\n"
+    "lfd 7,  160(1)\n"    "lfd 6,  152(1)\n"
+    "lfd 5,  144(1)\n"    "lfd 4,  136(1)\n"
+    "lfd 3,  128(1)\n"    "lfd 2,  120(1)\n"
+    "lfd 1,  112(1)\n"
     // Pop 3 frames off the stack and branch to target
-    "ld  r1, 280(r1)\n"
-    "ld  r2, 16(r1)\n"
-    "mtlr r2\n"
+    "ld  1, 280(1)\n"
+    "ld  0, 16(1)\n"
+    "mtlr 0\n"
+    // XXX: any special TOC handling in the ELF case for JIT?
     "bctr\n"
     );
 #else
diff --git a/lib/Target/PowerPC/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/PPCMCAsmInfo.cpp
index d1178dd..2d5c880 100644
--- a/lib/Target/PowerPC/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/PPCMCAsmInfo.cpp
@@ -17,7 +17,7 @@ using namespace llvm;
 PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit) {
   PCSymbol = ".";
   CommentString = ";";
-  ExceptionsType = ExceptionHandling::DwarfTable;
+  ExceptionsType = ExceptionHandling::DwarfCFI;
 
   if (!is64Bit)
     Data64bitsDirective = 0;      // We can't emit a 64-bit unit in PPC32 mode.
@@ -48,7 +48,7 @@ PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) {
 
   // Exceptions handling
   if (!is64Bit)
-    ExceptionsType = ExceptionHandling::DwarfTable;
+    ExceptionsType = ExceptionHandling::DwarfCFI;
     
   ZeroDirective = "\t.space\t";
   Data64bitsDirective = is64Bit ? "\t.quad\t" : 0;
diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
index 6082587..33af426 100644
--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -95,14 +95,14 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
 }
 
 static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
-                              AsmPrinter &Printer) {
+                              AsmPrinter &Printer, bool isDarwin) {
   MCContext &Ctx = Printer.OutContext;
   MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
 
   if (MO.getTargetFlags() & PPCII::MO_LO16)
-    RefKind = MCSymbolRefExpr::VK_PPC_LO16;
+    RefKind = isDarwin ? MCSymbolRefExpr::VK_PPC_DARWIN_LO16 : MCSymbolRefExpr::VK_PPC_GAS_LO16;
   else if (MO.getTargetFlags() & PPCII::MO_HA16)
-    RefKind = MCSymbolRefExpr::VK_PPC_HA16;
+    RefKind = isDarwin ? MCSymbolRefExpr::VK_PPC_DARWIN_HA16 : MCSymbolRefExpr::VK_PPC_GAS_HA16;
 
   // FIXME: This isn't right, but we don't have a good way to express this in
   // the MC Level, see below.
@@ -130,7 +130,7 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
 }
 
 void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
-                                        AsmPrinter &AP) {
+                                        AsmPrinter &AP, bool isDarwin) {
   OutMI.setOpcode(MI->getOpcode());
   
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
@@ -154,16 +154,17 @@ void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
       break;
     case MachineOperand::MO_GlobalAddress:
     case MachineOperand::MO_ExternalSymbol:
-      MCOp = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP);
+      MCOp = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, isDarwin);
       break;
     case MachineOperand::MO_JumpTableIndex:
-      MCOp = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP);
+      MCOp = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, isDarwin);
       break;
     case MachineOperand::MO_ConstantPoolIndex:
-      MCOp = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP);
+      MCOp = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, isDarwin);
       break;
     case MachineOperand::MO_BlockAddress:
-      MCOp = GetSymbolRef(MO,AP.GetBlockAddressSymbol(MO.getBlockAddress()),AP);
+      MCOp = GetSymbolRef(MO,AP.GetBlockAddressSymbol(MO.getBlockAddress()),AP,
+                          isDarwin);
       break;
     }
     
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 45d8b6b..3374e9b 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -686,9 +686,28 @@ unsigned PPCRegisterInfo::getEHHandlerRegister() const {
   return !Subtarget.isPPC64() ? PPC::R4 : PPC::X4;
 }
 
+/// DWARFFlavour - Flavour of dwarf regnumbers
+///
+namespace DWARFFlavour {
+  enum {
+    PPC64 = 0, PPC32 = 1
+  };
+}
+
 int PPCRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   // FIXME: Most probably dwarf numbers differs for Linux and Darwin
-  return PPCGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
+  unsigned Flavour = Subtarget.isPPC64() ?
+    DWARFFlavour::PPC64 : DWARFFlavour::PPC32;
+
+  return PPCGenRegisterInfo::getDwarfRegNumFull(RegNum, Flavour);
+}
+
+int PPCRegisterInfo::getLLVMRegNum(unsigned RegNum, bool isEH) const {
+  // FIXME: Most probably dwarf numbers differs for Linux and Darwin
+  unsigned Flavour = Subtarget.isPPC64() ?
+    DWARFFlavour::PPC64 : DWARFFlavour::PPC32;
+
+  return PPCGenRegisterInfo::getLLVMRegNumFull(RegNum, Flavour);
 }
 
 #include "PPCGenRegisterInfo.inc"
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index aa29ffe..48c2562 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -68,6 +68,7 @@ public:
   unsigned getEHHandlerRegister() const;
 
   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td
index 2639165..1acdf4e 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -65,203 +65,203 @@ class CRBIT<bits<5> num, string n> : PPCReg<n> {
 
 
 // General-purpose registers
-def R0  : GPR< 0,  "r0">, DwarfRegNum<[0]>;
-def R1  : GPR< 1,  "r1">, DwarfRegNum<[1]>;
-def R2  : GPR< 2,  "r2">, DwarfRegNum<[2]>;
-def R3  : GPR< 3,  "r3">, DwarfRegNum<[3]>;
-def R4  : GPR< 4,  "r4">, DwarfRegNum<[4]>;
-def R5  : GPR< 5,  "r5">, DwarfRegNum<[5]>;
-def R6  : GPR< 6,  "r6">, DwarfRegNum<[6]>;
-def R7  : GPR< 7,  "r7">, DwarfRegNum<[7]>;
-def R8  : GPR< 8,  "r8">, DwarfRegNum<[8]>;
-def R9  : GPR< 9,  "r9">, DwarfRegNum<[9]>;
-def R10 : GPR<10, "r10">, DwarfRegNum<[10]>;
-def R11 : GPR<11, "r11">, DwarfRegNum<[11]>;
-def R12 : GPR<12, "r12">, DwarfRegNum<[12]>;
-def R13 : GPR<13, "r13">, DwarfRegNum<[13]>;
-def R14 : GPR<14, "r14">, DwarfRegNum<[14]>;
-def R15 : GPR<15, "r15">, DwarfRegNum<[15]>;
-def R16 : GPR<16, "r16">, DwarfRegNum<[16]>;
-def R17 : GPR<17, "r17">, DwarfRegNum<[17]>;
-def R18 : GPR<18, "r18">, DwarfRegNum<[18]>;
-def R19 : GPR<19, "r19">, DwarfRegNum<[19]>;
-def R20 : GPR<20, "r20">, DwarfRegNum<[20]>;
-def R21 : GPR<21, "r21">, DwarfRegNum<[21]>;
-def R22 : GPR<22, "r22">, DwarfRegNum<[22]>;
-def R23 : GPR<23, "r23">, DwarfRegNum<[23]>;
-def R24 : GPR<24, "r24">, DwarfRegNum<[24]>;
-def R25 : GPR<25, "r25">, DwarfRegNum<[25]>;
-def R26 : GPR<26, "r26">, DwarfRegNum<[26]>;
-def R27 : GPR<27, "r27">, DwarfRegNum<[27]>;
-def R28 : GPR<28, "r28">, DwarfRegNum<[28]>;
-def R29 : GPR<29, "r29">, DwarfRegNum<[29]>;
-def R30 : GPR<30, "r30">, DwarfRegNum<[30]>;
-def R31 : GPR<31, "r31">, DwarfRegNum<[31]>;
+def R0  : GPR< 0,  "r0">, DwarfRegNum<[-2, 0]>;
+def R1  : GPR< 1,  "r1">, DwarfRegNum<[-2, 1]>;
+def R2  : GPR< 2,  "r2">, DwarfRegNum<[-2, 2]>;
+def R3  : GPR< 3,  "r3">, DwarfRegNum<[-2, 3]>;
+def R4  : GPR< 4,  "r4">, DwarfRegNum<[-2, 4]>;
+def R5  : GPR< 5,  "r5">, DwarfRegNum<[-2, 5]>;
+def R6  : GPR< 6,  "r6">, DwarfRegNum<[-2, 6]>;
+def R7  : GPR< 7,  "r7">, DwarfRegNum<[-2, 7]>;
+def R8  : GPR< 8,  "r8">, DwarfRegNum<[-2, 8]>;
+def R9  : GPR< 9,  "r9">, DwarfRegNum<[-2, 9]>;
+def R10 : GPR<10, "r10">, DwarfRegNum<[-2, 10]>;
+def R11 : GPR<11, "r11">, DwarfRegNum<[-2, 11]>;
+def R12 : GPR<12, "r12">, DwarfRegNum<[-2, 12]>;
+def R13 : GPR<13, "r13">, DwarfRegNum<[-2, 13]>;
+def R14 : GPR<14, "r14">, DwarfRegNum<[-2, 14]>;
+def R15 : GPR<15, "r15">, DwarfRegNum<[-2, 15]>;
+def R16 : GPR<16, "r16">, DwarfRegNum<[-2, 16]>;
+def R17 : GPR<17, "r17">, DwarfRegNum<[-2, 17]>;
+def R18 : GPR<18, "r18">, DwarfRegNum<[-2, 18]>;
+def R19 : GPR<19, "r19">, DwarfRegNum<[-2, 19]>;
+def R20 : GPR<20, "r20">, DwarfRegNum<[-2, 20]>;
+def R21 : GPR<21, "r21">, DwarfRegNum<[-2, 21]>;
+def R22 : GPR<22, "r22">, DwarfRegNum<[-2, 22]>;
+def R23 : GPR<23, "r23">, DwarfRegNum<[-2, 23]>;
+def R24 : GPR<24, "r24">, DwarfRegNum<[-2, 24]>;
+def R25 : GPR<25, "r25">, DwarfRegNum<[-2, 25]>;
+def R26 : GPR<26, "r26">, DwarfRegNum<[-2, 26]>;
+def R27 : GPR<27, "r27">, DwarfRegNum<[-2, 27]>;
+def R28 : GPR<28, "r28">, DwarfRegNum<[-2, 28]>;
+def R29 : GPR<29, "r29">, DwarfRegNum<[-2, 29]>;
+def R30 : GPR<30, "r30">, DwarfRegNum<[-2, 30]>;
+def R31 : GPR<31, "r31">, DwarfRegNum<[-2, 31]>;
 
 // 64-bit General-purpose registers
-def X0  : GP8< R0,  "r0">, DwarfRegNum<[0]>;
-def X1  : GP8< R1,  "r1">, DwarfRegNum<[1]>;
-def X2  : GP8< R2,  "r2">, DwarfRegNum<[2]>;
-def X3  : GP8< R3,  "r3">, DwarfRegNum<[3]>;
-def X4  : GP8< R4,  "r4">, DwarfRegNum<[4]>;
-def X5  : GP8< R5,  "r5">, DwarfRegNum<[5]>;
-def X6  : GP8< R6,  "r6">, DwarfRegNum<[6]>;
-def X7  : GP8< R7,  "r7">, DwarfRegNum<[7]>;
-def X8  : GP8< R8,  "r8">, DwarfRegNum<[8]>;
-def X9  : GP8< R9,  "r9">, DwarfRegNum<[9]>;
-def X10 : GP8<R10, "r10">, DwarfRegNum<[10]>;
-def X11 : GP8<R11, "r11">, DwarfRegNum<[11]>;
-def X12 : GP8<R12, "r12">, DwarfRegNum<[12]>;
-def X13 : GP8<R13, "r13">, DwarfRegNum<[13]>;
-def X14 : GP8<R14, "r14">, DwarfRegNum<[14]>;
-def X15 : GP8<R15, "r15">, DwarfRegNum<[15]>;
-def X16 : GP8<R16, "r16">, DwarfRegNum<[16]>;
-def X17 : GP8<R17, "r17">, DwarfRegNum<[17]>;
-def X18 : GP8<R18, "r18">, DwarfRegNum<[18]>;
-def X19 : GP8<R19, "r19">, DwarfRegNum<[19]>;
-def X20 : GP8<R20, "r20">, DwarfRegNum<[20]>;
-def X21 : GP8<R21, "r21">, DwarfRegNum<[21]>;
-def X22 : GP8<R22, "r22">, DwarfRegNum<[22]>;
-def X23 : GP8<R23, "r23">, DwarfRegNum<[23]>;
-def X24 : GP8<R24, "r24">, DwarfRegNum<[24]>;
-def X25 : GP8<R25, "r25">, DwarfRegNum<[25]>;
-def X26 : GP8<R26, "r26">, DwarfRegNum<[26]>;
-def X27 : GP8<R27, "r27">, DwarfRegNum<[27]>;
-def X28 : GP8<R28, "r28">, DwarfRegNum<[28]>;
-def X29 : GP8<R29, "r29">, DwarfRegNum<[29]>;
-def X30 : GP8<R30, "r30">, DwarfRegNum<[30]>;
-def X31 : GP8<R31, "r31">, DwarfRegNum<[31]>;
+def X0  : GP8< R0,  "r0">, DwarfRegNum<[0, -2]>;
+def X1  : GP8< R1,  "r1">, DwarfRegNum<[1, -2]>;
+def X2  : GP8< R2,  "r2">, DwarfRegNum<[2, -2]>;
+def X3  : GP8< R3,  "r3">, DwarfRegNum<[3, -2]>;
+def X4  : GP8< R4,  "r4">, DwarfRegNum<[4, -2]>;
+def X5  : GP8< R5,  "r5">, DwarfRegNum<[5, -2]>;
+def X6  : GP8< R6,  "r6">, DwarfRegNum<[6, -2]>;
+def X7  : GP8< R7,  "r7">, DwarfRegNum<[7, -2]>;
+def X8  : GP8< R8,  "r8">, DwarfRegNum<[8, -2]>;
+def X9  : GP8< R9,  "r9">, DwarfRegNum<[9, -2]>;
+def X10 : GP8<R10, "r10">, DwarfRegNum<[10, -2]>;
+def X11 : GP8<R11, "r11">, DwarfRegNum<[11, -2]>;
+def X12 : GP8<R12, "r12">, DwarfRegNum<[12, -2]>;
+def X13 : GP8<R13, "r13">, DwarfRegNum<[13, -2]>;
+def X14 : GP8<R14, "r14">, DwarfRegNum<[14, -2]>;
+def X15 : GP8<R15, "r15">, DwarfRegNum<[15, -2]>;
+def X16 : GP8<R16, "r16">, DwarfRegNum<[16, -2]>;
+def X17 : GP8<R17, "r17">, DwarfRegNum<[17, -2]>;
+def X18 : GP8<R18, "r18">, DwarfRegNum<[18, -2]>;
+def X19 : GP8<R19, "r19">, DwarfRegNum<[19, -2]>;
+def X20 : GP8<R20, "r20">, DwarfRegNum<[20, -2]>;
+def X21 : GP8<R21, "r21">, DwarfRegNum<[21, -2]>;
+def X22 : GP8<R22, "r22">, DwarfRegNum<[22, -2]>;
+def X23 : GP8<R23, "r23">, DwarfRegNum<[23, -2]>;
+def X24 : GP8<R24, "r24">, DwarfRegNum<[24, -2]>;
+def X25 : GP8<R25, "r25">, DwarfRegNum<[25, -2]>;
+def X26 : GP8<R26, "r26">, DwarfRegNum<[26, -2]>;
+def X27 : GP8<R27, "r27">, DwarfRegNum<[27, -2]>;
+def X28 : GP8<R28, "r28">, DwarfRegNum<[28, -2]>;
+def X29 : GP8<R29, "r29">, DwarfRegNum<[29, -2]>;
+def X30 : GP8<R30, "r30">, DwarfRegNum<[30, -2]>;
+def X31 : GP8<R31, "r31">, DwarfRegNum<[31, -2]>;
 
 // Floating-point registers
-def F0  : FPR< 0,  "f0">, DwarfRegNum<[32]>;
-def F1  : FPR< 1,  "f1">, DwarfRegNum<[33]>;
-def F2  : FPR< 2,  "f2">, DwarfRegNum<[34]>;
-def F3  : FPR< 3,  "f3">, DwarfRegNum<[35]>;
-def F4  : FPR< 4,  "f4">, DwarfRegNum<[36]>;
-def F5  : FPR< 5,  "f5">, DwarfRegNum<[37]>;
-def F6  : FPR< 6,  "f6">, DwarfRegNum<[38]>;
-def F7  : FPR< 7,  "f7">, DwarfRegNum<[39]>;
-def F8  : FPR< 8,  "f8">, DwarfRegNum<[40]>;
-def F9  : FPR< 9,  "f9">, DwarfRegNum<[41]>;
-def F10 : FPR<10, "f10">, DwarfRegNum<[42]>;
-def F11 : FPR<11, "f11">, DwarfRegNum<[43]>;
-def F12 : FPR<12, "f12">, DwarfRegNum<[44]>;
-def F13 : FPR<13, "f13">, DwarfRegNum<[45]>;
-def F14 : FPR<14, "f14">, DwarfRegNum<[46]>;
-def F15 : FPR<15, "f15">, DwarfRegNum<[47]>;
-def F16 : FPR<16, "f16">, DwarfRegNum<[48]>;
-def F17 : FPR<17, "f17">, DwarfRegNum<[49]>;
-def F18 : FPR<18, "f18">, DwarfRegNum<[50]>;
-def F19 : FPR<19, "f19">, DwarfRegNum<[51]>;
-def F20 : FPR<20, "f20">, DwarfRegNum<[52]>;
-def F21 : FPR<21, "f21">, DwarfRegNum<[53]>;
-def F22 : FPR<22, "f22">, DwarfRegNum<[54]>;
-def F23 : FPR<23, "f23">, DwarfRegNum<[55]>;
-def F24 : FPR<24, "f24">, DwarfRegNum<[56]>;
-def F25 : FPR<25, "f25">, DwarfRegNum<[57]>;
-def F26 : FPR<26, "f26">, DwarfRegNum<[58]>;
-def F27 : FPR<27, "f27">, DwarfRegNum<[59]>;
-def F28 : FPR<28, "f28">, DwarfRegNum<[60]>;
-def F29 : FPR<29, "f29">, DwarfRegNum<[61]>;
-def F30 : FPR<30, "f30">, DwarfRegNum<[62]>;
-def F31 : FPR<31, "f31">, DwarfRegNum<[63]>;
+def F0  : FPR< 0,  "f0">, DwarfRegNum<[32, 32]>;
+def F1  : FPR< 1,  "f1">, DwarfRegNum<[33, 33]>;
+def F2  : FPR< 2,  "f2">, DwarfRegNum<[34, 34]>;
+def F3  : FPR< 3,  "f3">, DwarfRegNum<[35, 35]>;
+def F4  : FPR< 4,  "f4">, DwarfRegNum<[36, 36]>;
+def F5  : FPR< 5,  "f5">, DwarfRegNum<[37, 37]>;
+def F6  : FPR< 6,  "f6">, DwarfRegNum<[38, 38]>;
+def F7  : FPR< 7,  "f7">, DwarfRegNum<[39, 39]>;
+def F8  : FPR< 8,  "f8">, DwarfRegNum<[40, 40]>;
+def F9  : FPR< 9,  "f9">, DwarfRegNum<[41, 41]>;
+def F10 : FPR<10, "f10">, DwarfRegNum<[42, 42]>;
+def F11 : FPR<11, "f11">, DwarfRegNum<[43, 43]>;
+def F12 : FPR<12, "f12">, DwarfRegNum<[44, 44]>;
+def F13 : FPR<13, "f13">, DwarfRegNum<[45, 45]>;
+def F14 : FPR<14, "f14">, DwarfRegNum<[46, 46]>;
+def F15 : FPR<15, "f15">, DwarfRegNum<[47, 47]>;
+def F16 : FPR<16, "f16">, DwarfRegNum<[48, 48]>;
+def F17 : FPR<17, "f17">, DwarfRegNum<[49, 49]>;
+def F18 : FPR<18, "f18">, DwarfRegNum<[50, 50]>;
+def F19 : FPR<19, "f19">, DwarfRegNum<[51, 51]>;
+def F20 : FPR<20, "f20">, DwarfRegNum<[52, 52]>;
+def F21 : FPR<21, "f21">, DwarfRegNum<[53, 53]>;
+def F22 : FPR<22, "f22">, DwarfRegNum<[54, 54]>;
+def F23 : FPR<23, "f23">, DwarfRegNum<[55, 55]>;
+def F24 : FPR<24, "f24">, DwarfRegNum<[56, 56]>;
+def F25 : FPR<25, "f25">, DwarfRegNum<[57, 57]>;
+def F26 : FPR<26, "f26">, DwarfRegNum<[58, 58]>;
+def F27 : FPR<27, "f27">, DwarfRegNum<[59, 59]>;
+def F28 : FPR<28, "f28">, DwarfRegNum<[60, 60]>;
+def F29 : FPR<29, "f29">, DwarfRegNum<[61, 61]>;
+def F30 : FPR<30, "f30">, DwarfRegNum<[62, 62]>;
+def F31 : FPR<31, "f31">, DwarfRegNum<[63, 63]>;
 
 // Vector registers
-def V0  : VR< 0,  "v0">, DwarfRegNum<[77]>;
-def V1  : VR< 1,  "v1">, DwarfRegNum<[78]>;
-def V2  : VR< 2,  "v2">, DwarfRegNum<[79]>;
-def V3  : VR< 3,  "v3">, DwarfRegNum<[80]>;
-def V4  : VR< 4,  "v4">, DwarfRegNum<[81]>;
-def V5  : VR< 5,  "v5">, DwarfRegNum<[82]>;
-def V6  : VR< 6,  "v6">, DwarfRegNum<[83]>;
-def V7  : VR< 7,  "v7">, DwarfRegNum<[84]>;
-def V8  : VR< 8,  "v8">, DwarfRegNum<[85]>;
-def V9  : VR< 9,  "v9">, DwarfRegNum<[86]>;
-def V10 : VR<10, "v10">, DwarfRegNum<[87]>;
-def V11 : VR<11, "v11">, DwarfRegNum<[88]>;
-def V12 : VR<12, "v12">, DwarfRegNum<[89]>;
-def V13 : VR<13, "v13">, DwarfRegNum<[90]>;
-def V14 : VR<14, "v14">, DwarfRegNum<[91]>;
-def V15 : VR<15, "v15">, DwarfRegNum<[92]>;
-def V16 : VR<16, "v16">, DwarfRegNum<[93]>;
-def V17 : VR<17, "v17">, DwarfRegNum<[94]>;
-def V18 : VR<18, "v18">, DwarfRegNum<[95]>;
-def V19 : VR<19, "v19">, DwarfRegNum<[96]>;
-def V20 : VR<20, "v20">, DwarfRegNum<[97]>;
-def V21 : VR<21, "v21">, DwarfRegNum<[98]>;
-def V22 : VR<22, "v22">, DwarfRegNum<[99]>;
-def V23 : VR<23, "v23">, DwarfRegNum<[100]>;
-def V24 : VR<24, "v24">, DwarfRegNum<[101]>;
-def V25 : VR<25, "v25">, DwarfRegNum<[102]>;
-def V26 : VR<26, "v26">, DwarfRegNum<[103]>;
-def V27 : VR<27, "v27">, DwarfRegNum<[104]>;
-def V28 : VR<28, "v28">, DwarfRegNum<[105]>;
-def V29 : VR<29, "v29">, DwarfRegNum<[106]>;
-def V30 : VR<30, "v30">, DwarfRegNum<[107]>;
-def V31 : VR<31, "v31">, DwarfRegNum<[108]>;
+def V0  : VR< 0,  "v0">, DwarfRegNum<[77, 77]>;
+def V1  : VR< 1,  "v1">, DwarfRegNum<[78, 78]>;
+def V2  : VR< 2,  "v2">, DwarfRegNum<[79, 79]>;
+def V3  : VR< 3,  "v3">, DwarfRegNum<[80, 80]>;
+def V4  : VR< 4,  "v4">, DwarfRegNum<[81, 81]>;
+def V5  : VR< 5,  "v5">, DwarfRegNum<[82, 82]>;
+def V6  : VR< 6,  "v6">, DwarfRegNum<[83, 83]>;
+def V7  : VR< 7,  "v7">, DwarfRegNum<[84, 84]>;
+def V8  : VR< 8,  "v8">, DwarfRegNum<[85, 85]>;
+def V9  : VR< 9,  "v9">, DwarfRegNum<[86, 86]>;
+def V10 : VR<10, "v10">, DwarfRegNum<[87, 87]>;
+def V11 : VR<11, "v11">, DwarfRegNum<[88, 88]>;
+def V12 : VR<12, "v12">, DwarfRegNum<[89, 89]>;
+def V13 : VR<13, "v13">, DwarfRegNum<[90, 90]>;
+def V14 : VR<14, "v14">, DwarfRegNum<[91, 91]>;
+def V15 : VR<15, "v15">, DwarfRegNum<[92, 92]>;
+def V16 : VR<16, "v16">, DwarfRegNum<[93, 93]>;
+def V17 : VR<17, "v17">, DwarfRegNum<[94, 94]>;
+def V18 : VR<18, "v18">, DwarfRegNum<[95, 95]>;
+def V19 : VR<19, "v19">, DwarfRegNum<[96, 96]>;
+def V20 : VR<20, "v20">, DwarfRegNum<[97, 97]>;
+def V21 : VR<21, "v21">, DwarfRegNum<[98, 98]>;
+def V22 : VR<22, "v22">, DwarfRegNum<[99, 99]>;
+def V23 : VR<23, "v23">, DwarfRegNum<[100, 100]>;
+def V24 : VR<24, "v24">, DwarfRegNum<[101, 101]>;
+def V25 : VR<25, "v25">, DwarfRegNum<[102, 102]>;
+def V26 : VR<26, "v26">, DwarfRegNum<[103, 103]>;
+def V27 : VR<27, "v27">, DwarfRegNum<[104, 104]>;
+def V28 : VR<28, "v28">, DwarfRegNum<[105, 105]>;
+def V29 : VR<29, "v29">, DwarfRegNum<[106, 106]>;
+def V30 : VR<30, "v30">, DwarfRegNum<[107, 107]>;
+def V31 : VR<31, "v31">, DwarfRegNum<[108, 108]>;
 
 // Condition register bits
-def CR0LT : CRBIT< 0, "0">, DwarfRegNum<[0]>;
-def CR0GT : CRBIT< 1, "1">, DwarfRegNum<[0]>;
-def CR0EQ : CRBIT< 2, "2">, DwarfRegNum<[0]>;
-def CR0UN : CRBIT< 3, "3">, DwarfRegNum<[0]>;
-def CR1LT : CRBIT< 4, "4">, DwarfRegNum<[0]>;
-def CR1GT : CRBIT< 5, "5">, DwarfRegNum<[0]>;
-def CR1EQ : CRBIT< 6, "6">, DwarfRegNum<[0]>;
-def CR1UN : CRBIT< 7, "7">, DwarfRegNum<[0]>;
-def CR2LT : CRBIT< 8, "8">, DwarfRegNum<[0]>;
-def CR2GT : CRBIT< 9, "9">, DwarfRegNum<[0]>;
-def CR2EQ : CRBIT<10, "10">, DwarfRegNum<[0]>;
-def CR2UN : CRBIT<11, "11">, DwarfRegNum<[0]>;
-def CR3LT : CRBIT<12, "12">, DwarfRegNum<[0]>;
-def CR3GT : CRBIT<13, "13">, DwarfRegNum<[0]>;
-def CR3EQ : CRBIT<14, "14">, DwarfRegNum<[0]>;
-def CR3UN : CRBIT<15, "15">, DwarfRegNum<[0]>;
-def CR4LT : CRBIT<16, "16">, DwarfRegNum<[0]>;
-def CR4GT : CRBIT<17, "17">, DwarfRegNum<[0]>;
-def CR4EQ : CRBIT<18, "18">, DwarfRegNum<[0]>;
-def CR4UN : CRBIT<19, "19">, DwarfRegNum<[0]>;
-def CR5LT : CRBIT<20, "20">, DwarfRegNum<[0]>;
-def CR5GT : CRBIT<21, "21">, DwarfRegNum<[0]>;
-def CR5EQ : CRBIT<22, "22">, DwarfRegNum<[0]>;
-def CR5UN : CRBIT<23, "23">, DwarfRegNum<[0]>;
-def CR6LT : CRBIT<24, "24">, DwarfRegNum<[0]>;
-def CR6GT : CRBIT<25, "25">, DwarfRegNum<[0]>;
-def CR6EQ : CRBIT<26, "26">, DwarfRegNum<[0]>;
-def CR6UN : CRBIT<27, "27">, DwarfRegNum<[0]>;
-def CR7LT : CRBIT<28, "28">, DwarfRegNum<[0]>;
-def CR7GT : CRBIT<29, "29">, DwarfRegNum<[0]>;
-def CR7EQ : CRBIT<30, "30">, DwarfRegNum<[0]>;
-def CR7UN : CRBIT<31, "31">, DwarfRegNum<[0]>;
+def CR0LT : CRBIT< 0, "0">;
+def CR0GT : CRBIT< 1, "1">;
+def CR0EQ : CRBIT< 2, "2">;
+def CR0UN : CRBIT< 3, "3">;
+def CR1LT : CRBIT< 4, "4">;
+def CR1GT : CRBIT< 5, "5">;
+def CR1EQ : CRBIT< 6, "6">;
+def CR1UN : CRBIT< 7, "7">;
+def CR2LT : CRBIT< 8, "8">;
+def CR2GT : CRBIT< 9, "9">;
+def CR2EQ : CRBIT<10, "10">;
+def CR2UN : CRBIT<11, "11">;
+def CR3LT : CRBIT<12, "12">;
+def CR3GT : CRBIT<13, "13">;
+def CR3EQ : CRBIT<14, "14">;
+def CR3UN : CRBIT<15, "15">;
+def CR4LT : CRBIT<16, "16">;
+def CR4GT : CRBIT<17, "17">;
+def CR4EQ : CRBIT<18, "18">;
+def CR4UN : CRBIT<19, "19">;
+def CR5LT : CRBIT<20, "20">;
+def CR5GT : CRBIT<21, "21">;
+def CR5EQ : CRBIT<22, "22">;
+def CR5UN : CRBIT<23, "23">;
+def CR6LT : CRBIT<24, "24">;
+def CR6GT : CRBIT<25, "25">;
+def CR6EQ : CRBIT<26, "26">;
+def CR6UN : CRBIT<27, "27">;
+def CR7LT : CRBIT<28, "28">;
+def CR7GT : CRBIT<29, "29">;
+def CR7EQ : CRBIT<30, "30">;
+def CR7UN : CRBIT<31, "31">;
 
 // Condition registers
 let SubRegIndices = [sub_lt, sub_gt, sub_eq, sub_un] in {
-def CR0 : CR<0, "cr0", [CR0LT, CR0GT, CR0EQ, CR0UN]>, DwarfRegNum<[68]>;
-def CR1 : CR<1, "cr1", [CR1LT, CR1GT, CR1EQ, CR1UN]>, DwarfRegNum<[69]>;
-def CR2 : CR<2, "cr2", [CR2LT, CR2GT, CR2EQ, CR2UN]>, DwarfRegNum<[70]>;
-def CR3 : CR<3, "cr3", [CR3LT, CR3GT, CR3EQ, CR3UN]>, DwarfRegNum<[71]>;
-def CR4 : CR<4, "cr4", [CR4LT, CR4GT, CR4EQ, CR4UN]>, DwarfRegNum<[72]>;
-def CR5 : CR<5, "cr5", [CR5LT, CR5GT, CR5EQ, CR5UN]>, DwarfRegNum<[73]>;
-def CR6 : CR<6, "cr6", [CR6LT, CR6GT, CR6EQ, CR6UN]>, DwarfRegNum<[74]>;
-def CR7 : CR<7, "cr7", [CR7LT, CR7GT, CR7EQ, CR7UN]>, DwarfRegNum<[75]>;
+def CR0 : CR<0, "cr0", [CR0LT, CR0GT, CR0EQ, CR0UN]>, DwarfRegNum<[68, 68]>;
+def CR1 : CR<1, "cr1", [CR1LT, CR1GT, CR1EQ, CR1UN]>, DwarfRegNum<[69, 69]>;
+def CR2 : CR<2, "cr2", [CR2LT, CR2GT, CR2EQ, CR2UN]>, DwarfRegNum<[70, 70]>;
+def CR3 : CR<3, "cr3", [CR3LT, CR3GT, CR3EQ, CR3UN]>, DwarfRegNum<[71, 71]>;
+def CR4 : CR<4, "cr4", [CR4LT, CR4GT, CR4EQ, CR4UN]>, DwarfRegNum<[72, 72]>;
+def CR5 : CR<5, "cr5", [CR5LT, CR5GT, CR5EQ, CR5UN]>, DwarfRegNum<[73, 73]>;
+def CR6 : CR<6, "cr6", [CR6LT, CR6GT, CR6EQ, CR6UN]>, DwarfRegNum<[74, 74]>;
+def CR7 : CR<7, "cr7", [CR7LT, CR7GT, CR7EQ, CR7UN]>, DwarfRegNum<[75, 75]>;
 }
 
 // Link register
-def LR  : SPR<8, "lr">, DwarfRegNum<[65]>;
+def LR  : SPR<8, "lr">, DwarfRegNum<[-2, 65]>;
 //let Aliases = [LR] in
-def LR8 : SPR<8, "lr">, DwarfRegNum<[65]>;
+def LR8 : SPR<8, "lr">, DwarfRegNum<[65, -2]>;
 
 // Count register
-def CTR  : SPR<9, "ctr">, DwarfRegNum<[66]>;
-def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66]>;
+def CTR  : SPR<9, "ctr">, DwarfRegNum<[-2, 66]>;
+def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66, -2]>;
 
 // VRsave register
-def VRSAVE: SPR<256, "VRsave">, DwarfRegNum<[107]>;
+def VRSAVE: SPR<256, "VRsave">, DwarfRegNum<[109]>;
 
 // Carry bit.  In the architecture this is really bit 0 of the XER register
 // (which really is SPR register 1);  this is the only bit interesting to a
 // compiler.
-def CARRY: SPR<1, "ca">, DwarfRegNum<[0]>;
+def CARRY: SPR<1, "ca">;
 
 // FP rounding mode:  bits 30 and 31 of the FP status and control register
 // This is not allocated as a normal register; it appears only in
@@ -271,76 +271,18 @@ def CARRY: SPR<1, "ca">, DwarfRegNum<[0]>;
 // return and call instructions are described as Uses of RM, so instructions
 // that do nothing but change RM will not get deleted.
 // Also, in the architecture it is not really a SPR; 512 is arbitrary.
-def RM: SPR<512, "**ROUNDING MODE**">, DwarfRegNum<[0]>;
+def RM: SPR<512, "**ROUNDING MODE**">;
 
 /// Register classes
 // Allocate volatiles first
 // then nonvolatiles in reverse order since stmw/lmw save from rN to r31
-def GPRC : RegisterClass<"PPC", [i32], 32,
-     [R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12,
-      R30, R29, R28, R27, R26, R25, R24, R23, R22, R21, R20, R19, R18, R17,
-      R16, R15, R14, R13, R31, R0, R1, LR]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GPRCClass::iterator
-    GPRCClass::allocation_order_begin(const MachineFunction &MF) const {
-      // 32-bit SVR4 ABI: r2 is reserved for the OS.
-      // 64-bit SVR4 ABI: r2 is reserved for the TOC pointer.
-      // Darwin: R2 is reserved for CR save/restore sequence.
-      return begin()+1;
-    }
-    GPRCClass::iterator
-    GPRCClass::allocation_order_end(const MachineFunction &MF) const {
-      // On PPC64, r13 is the thread pointer.  Never allocate this register.
-      // Note that this is overconservative, as it also prevents allocation of
-      // R31 when the FP is not needed.
-      // When using the 32-bit SVR4 ABI, r13 is reserved for the Small Data Area
-      // pointer.
-      const PPCSubtarget &Subtarget = MF.getTarget().getSubtarget<PPCSubtarget>();
-      const PPCFrameLowering *PPCFI =
-        static_cast<const PPCFrameLowering*>(MF.getTarget().getFrameLowering());
-   
-      if (Subtarget.isPPC64() || Subtarget.isSVR4ABI())
-        return end()-5;  // don't allocate R13, R31, R0, R1, LR
-        
-      if (PPCFI->needsFP(MF))
-        return end()-4;  // don't allocate R31, R0, R1, LR
-      else
-        return end()-3;  // don't allocate R0, R1, LR
-    }
-  }];
-}
-def G8RC : RegisterClass<"PPC", [i64], 64,
-     [X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12,
-      X30, X29, X28, X27, X26, X25, X24, X23, X22, X21, X20, X19, X18, X17,
-      X16, X15, X14, X31, X13, X0, X1, LR8]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    G8RCClass::iterator
-    G8RCClass::allocation_order_begin(const MachineFunction &MF) const {
-      // 64-bit SVR4 ABI: r2 is reserved for the TOC pointer.
-      // Darwin: r2 is reserved for CR save/restore sequence.
-      return begin()+1;
-    }
-    G8RCClass::iterator
-    G8RCClass::allocation_order_end(const MachineFunction &MF) const {
-      const PPCFrameLowering *PPCFI =
-        static_cast<const PPCFrameLowering*>(MF.getTarget().getFrameLowering());
-      if (PPCFI->needsFP(MF))
-        return end()-5;
-      else
-        return end()-4;
-    }
-  }];
-}
+def GPRC : RegisterClass<"PPC", [i32], 32, (add (sequence "R%u", 2, 12),
+                                                (sequence "R%u", 30, 13),
+                                                R31, R0, R1, LR)>;
+
+def G8RC : RegisterClass<"PPC", [i64], 64, (add (sequence "X%u", 2, 12),
+                                                (sequence "X%u", 30, 14),
+                                                X31, X13, X0, X1, LR8)>;
 
 // Allocate volatiles first, then non-volatiles in reverse order. With the SVR4
 // ABI the size of the Floating-point register save area is determined by the
@@ -349,41 +291,36 @@ def G8RC : RegisterClass<"PPC", [i64], 64,
 // previous stack frame. By allocating non-volatiles in reverse order we make
 // sure that the Floating-point register save area is always as small as
 // possible because there aren't any unused spill slots.
-def F8RC : RegisterClass<"PPC", [f64], 64, [F0, F1, F2, F3, F4, F5, F6, F7,
-  F8, F9, F10, F11, F12, F13, F31, F30, F29, F28, F27, F26, F25, F24, F23,
-  F22, F21, F20, F19, F18, F17, F16, F15, F14]>;
-def F4RC : RegisterClass<"PPC", [f32], 32, [F0, F1, F2, F3, F4, F5, F6, F7,
-  F8, F9, F10, F11, F12, F13, F31, F30, F29, F28, F27, F26, F25, F24, F23,
-  F22, F21, F20, F19, F18, F17, F16, F15, F14]>;
+def F8RC : RegisterClass<"PPC", [f64], 64, (add (sequence "F%u", 0, 13),
+                                                (sequence "F%u", 31, 14))>;
+def F4RC : RegisterClass<"PPC", [f32], 32, (add F8RC)>;
 
 def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v4f32], 128,
- [V2, V3, V4, V5, V0, V1, 
-  V6, V7, V8, V9, V10, V11, V12, V13, V14, V15, V16, V17, V18, V19, V31, V30,
-  V29, V28, V27, V26, V25, V24, V23, V22, V21, V20]>;
+                         (add V2, V3, V4, V5, V0, V1, V6, V7, V8, V9, V10, V11,
+                             V12, V13, V14, V15, V16, V17, V18, V19, V31, V30,
+                             V29, V28, V27, V26, V25, V24, V23, V22, V21, V20)>;
 
 def CRBITRC : RegisterClass<"PPC", [i32], 32,
-  [CR0LT, CR0GT, CR0EQ, CR0UN,
-   CR1LT, CR1GT, CR1EQ, CR1UN,
-   CR2LT, CR2GT, CR2EQ, CR2UN,
-   CR3LT, CR3GT, CR3EQ, CR3UN,
-   CR4LT, CR4GT, CR4EQ, CR4UN,
-   CR5LT, CR5GT, CR5EQ, CR5UN,
-   CR6LT, CR6GT, CR6EQ, CR6UN,
-   CR7LT, CR7GT, CR7EQ, CR7UN
-  ]>
+  (add CR0LT, CR0GT, CR0EQ, CR0UN,
+       CR1LT, CR1GT, CR1EQ, CR1UN,
+       CR2LT, CR2GT, CR2EQ, CR2UN,
+       CR3LT, CR3GT, CR3EQ, CR3UN,
+       CR4LT, CR4GT, CR4EQ, CR4UN,
+       CR5LT, CR5GT, CR5EQ, CR5UN,
+       CR6LT, CR6GT, CR6EQ, CR6UN,
+       CR7LT, CR7GT, CR7EQ, CR7UN)>
 {
   let CopyCost = -1;
 }
 
-def CRRC : RegisterClass<"PPC", [i32], 32, [CR0, CR1, CR5, CR6, CR7, CR2, 
-  CR3, CR4]>
-{
+def CRRC : RegisterClass<"PPC", [i32], 32, (add CR0, CR1, CR5, CR6,
+                                                CR7, CR2, CR3, CR4)> {
   let SubRegClasses = [(CRBITRC sub_lt, sub_gt, sub_eq, sub_un)];
 }
 
-def CTRRC : RegisterClass<"PPC", [i32], 32, [CTR]>;
-def CTRRC8 : RegisterClass<"PPC", [i64], 64, [CTR8]>;
-def VRSAVERC : RegisterClass<"PPC", [i32], 32, [VRSAVE]>;
-def CARRYRC : RegisterClass<"PPC", [i32], 32, [CARRY]> {
+def CTRRC : RegisterClass<"PPC", [i32], 32, (add CTR)>;
+def CTRRC8 : RegisterClass<"PPC", [i64], 64, (add CTR8)>;
+def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>;
+def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY)> {
   let CopyCost = -1;
 }
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index 72a1dee..5f3aa23 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -70,7 +70,7 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &FS,
   , HasSTFIWX(false)
   , HasLazyResolverStubs(false)
   , IsJITCodeModel(false)
-  , DarwinVers(0) {
+  , TargetTriple(TT) {
 
   // Determine default and user specified characteristics
   std::string CPU = "generic";
@@ -92,19 +92,6 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &FS,
   // support it, ignore.
   if (use64BitRegs() && !has64BitSupport())
     Use64BitRegs = false;
-  
-  // Set the boolean corresponding to the current target triple, or the default
-  // if one cannot be determined, to true.
-  if (TT.length() > 7) {
-    // Determine which version of darwin this is.
-    size_t DarwinPos = TT.find("-darwin");
-    if (DarwinPos != std::string::npos) {
-      if (isdigit(TT[DarwinPos+7]))
-        DarwinVers = atoi(&TT[DarwinPos+7]);
-      else
-        DarwinVers = 8;  // Minimum supported darwin is Tiger.
-    }
-  }
 
   // Set up darwin-specific properties.
   if (isDarwin())
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index 00ec747..8fd1a44 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -14,6 +14,7 @@
 #ifndef POWERPCSUBTARGET_H
 #define POWERPCSUBTARGET_H
 
+#include "llvm/ADT/Triple.h"
 #include "llvm/Target/TargetInstrItineraries.h"
 #include "llvm/Target/TargetSubtarget.h"
 
@@ -65,9 +66,9 @@ protected:
   bool HasLazyResolverStubs;
   bool IsJITCodeModel;
   
-  /// DarwinVers - Nonzero if this is a darwin platform.  Otherwise, the numeric
-  /// version of the platform, e.g. 8 = 10.4 (Tiger), 9 = 10.5 (Leopard), etc.
-  unsigned char DarwinVers; // Is any darwin-ppc platform.
+  /// TargetTriple - What processor and OS we're targeting.
+  Triple TargetTriple;
+
 public:
   /// This constructor initializes the data members to match that
   /// of the specified triple.
@@ -134,13 +135,10 @@ public:
   bool hasAltivec() const { return HasAltivec; }
   bool isGigaProcessor() const { return IsGigaProcessor; }
 
-  /// isDarwin - True if this is any darwin platform.
-  bool isDarwin() const { return DarwinVers != 0; }
-  /// isDarwin - True if this is darwin9 (leopard, 10.5) or above.
-  bool isDarwin9() const { return DarwinVers >= 9; }
+  const Triple &getTargetTriple() const { return TargetTriple; }
 
-  /// getDarwinVers - Return the darwin version number, 8 = tiger, 9 = leopard.
-  unsigned getDarwinVers() const { return DarwinVers; }
+  /// isDarwin - True if this is any darwin platform.
+  bool isDarwin() const { return TargetTriple.isMacOSX(); }
 
   bool isDarwinABI() const { return isDarwin(); }
   bool isSVR4ABI() const { return !isDarwin(); }
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index 212b450..d27e54e 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -24,7 +24,7 @@ using namespace llvm;
 static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
   Triple TheTriple(TT);
   bool isPPC64 = TheTriple.getArch() == Triple::ppc64;
-  if (TheTriple.getOS() == Triple::Darwin)
+  if (TheTriple.isOSDarwin())
     return new PPCMCAsmInfoDarwin(isPPC64);
   return new PPCLinuxMCAsmInfo(isPPC64);
   
@@ -37,12 +37,10 @@ static MCStreamer *createMCStreamer(const Target &T, const std::string &TT,
                                     MCCodeEmitter *Emitter,
                                     bool RelaxAll,
                                     bool NoExecStack) {
-  switch (Triple(TT).getOS()) {
-  case Triple::Darwin:
+  if (Triple(TT).isOSDarwin())
     return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll);
-  default:
-    return NULL;
-  }
+
+  return NULL;
 }
 
 extern "C" void LLVMInitializePowerPCTarget() {
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 99616b4..4e382e8 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -870,11 +870,6 @@ rshift_gt (unsigned int a)
    bar ();
 }
 
-void neg_eq_cst(unsigned int a) {
-if (-a == 123)
-bar();
-}
-
 All should simplify to a single comparison.  All of these are
 currently not optimized with "clang -emit-llvm-bc | opt
 -std-compile-opts".
@@ -2258,3 +2253,103 @@ SimplifyDemandedBits shrinks the "and" constant to 2 but instcombine misses the
 icmp transform.
 
 //===---------------------------------------------------------------------===//
+
+This code:
+
+typedef struct {
+int f1:1;
+int f2:1;
+int f3:1;
+int f4:29;
+} t1;
+
+typedef struct {
+int f1:1;
+int f2:1;
+int f3:30;
+} t2;
+
+t1 s1;
+t2 s2;
+
+void func1(void)
+{
+s1.f1 = s2.f1;
+s1.f2 = s2.f2;
+}
+
+Compiles into this IR (on x86-64 at least):
+
+%struct.t1 = type { i8, [3 x i8] }
+@s2 = global %struct.t1 zeroinitializer, align 4
+@s1 = global %struct.t1 zeroinitializer, align 4
+define void @func1() nounwind ssp noredzone {
+entry:
+  %0 = load i32* bitcast (%struct.t1* @s2 to i32*), align 4
+  %bf.val.sext5 = and i32 %0, 1
+  %1 = load i32* bitcast (%struct.t1* @s1 to i32*), align 4
+  %2 = and i32 %1, -4
+  %3 = or i32 %2, %bf.val.sext5
+  %bf.val.sext26 = and i32 %0, 2
+  %4 = or i32 %3, %bf.val.sext26
+  store i32 %4, i32* bitcast (%struct.t1* @s1 to i32*), align 4
+  ret void
+}
+
+The two or/and's should be merged into one each.
+
+//===---------------------------------------------------------------------===//
+
+Machine level code hoisting can be useful in some cases.  For example, PR9408
+is about:
+
+typedef union {
+ void (*f1)(int);
+ void (*f2)(long);
+} funcs;
+
+void foo(funcs f, int which) {
+ int a = 5;
+ if (which) {
+   f.f1(a);
+ } else {
+   f.f2(a);
+ }
+}
+
+which we compile to:
+
+foo:                                    # @foo
+# BB#0:                                 # %entry
+       pushq   %rbp
+       movq    %rsp, %rbp
+       testl   %esi, %esi
+       movq    %rdi, %rax
+       je      .LBB0_2
+# BB#1:                                 # %if.then
+       movl    $5, %edi
+       callq   *%rax
+       popq    %rbp
+       ret
+.LBB0_2:                                # %if.else
+       movl    $5, %edi
+       callq   *%rax
+       popq    %rbp
+       ret
+
+Note that bb1 and bb2 are the same.  This doesn't happen at the IR level
+because one call is passing an i32 and the other is passing an i64.
+
+//===---------------------------------------------------------------------===//
+
+I see this sort of pattern in 176.gcc in a few places (e.g. the start of
+store_bit_field).  The rem should be replaced with a multiply and subtract:
+
+  %3 = sdiv i32 %A, %B
+  %4 = srem i32 %A, %B
+
+Similarly for udiv/urem.  Note that this shouldn't be done on X86 or ARM,
+which can do this in a single operation (instruction or libcall).  It is
+probably best to do this in the code generator.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 70574c3..0b4612d 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -91,8 +91,8 @@ SparcTargetLowering::LowerReturn(SDValue Chain,
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, DAG.getTarget(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 DAG.getTarget(), RVLocs, *DAG.getContext());
 
   // Analize return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_Sparc32);
@@ -139,7 +139,7 @@ SparcTargetLowering::LowerReturn(SDValue Chain,
   if (Flag.getNode())
     return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other, Chain,
                        RetAddrOffsetNode, Flag);
-  return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other, Chain, 
+  return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other, Chain,
                      RetAddrOffsetNode);
 }
 
@@ -161,8 +161,8 @@ SparcTargetLowering::LowerFormalArguments(SDValue Chain,
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
   CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc32);
 
   const unsigned StackOffset = 92;
@@ -182,8 +182,6 @@ SparcTargetLowering::LowerFormalArguments(SDValue Chain,
     }
 
     if (VA.isRegLoc()) {
-      EVT RegVT = VA.getLocVT();
-
       if (VA.needsCustom()) {
         assert(VA.getLocVT() == MVT::f64);
         unsigned VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
@@ -362,8 +360,8 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getTarget(), ArgLocs,
-                 *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 DAG.getTarget(), ArgLocs, *DAG.getContext());
   CCInfo.AnalyzeCallOperands(Outs, CC_Sparc32);
 
   // Get the size of the outgoing arguments stack space requirement.
@@ -544,7 +542,7 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Build a sequence of copy-to-reg nodes chained together with token
   // chain and flag operands which copy the outgoing args into registers.
-  // The InFlag in necessary since all emited instructions must be
+  // The InFlag in necessary since all emitted instructions must be
   // stuck together.
   SDValue InFlag;
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
@@ -593,8 +591,8 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState RVInfo(CallConv, isVarArg, DAG.getTarget(),
-                 RVLocs, *DAG.getContext());
+  CCState RVInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 DAG.getTarget(), RVLocs, *DAG.getContext());
 
   RVInfo.AnalyzeCallResult(Ins, RetCC_Sparc32);
 
@@ -801,6 +799,8 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
   if (TM.getSubtarget<SparcSubtarget>().isV9())
     setOperationAction(ISD::CTPOP, MVT::i32, Legal);
 
+  setMinFunctionAlignment(2);
+
   computeRegisterProperties();
 }
 
@@ -1290,8 +1290,3 @@ SparcTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   // The Sparc target isn't yet aware of offsets.
   return false;
 }
-
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned SparcTargetLowering::getFunctionAlignment(const Function *) const {
-  return 2;
-}
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index 7d02df8..9ea6e16 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -71,9 +71,6 @@ namespace llvm {
 
     virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
 
-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const;
-
     virtual SDValue
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv,
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index b010d04..9fcf028 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -39,6 +39,8 @@ const unsigned* SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
 
 BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
+  // FIXME: G1 reserved for now for large imm generation by frame code.
+  Reserved.set(SP::G1);
   Reserved.set(SP::G2);
   Reserved.set(SP::G3);
   Reserved.set(SP::G4);
@@ -130,5 +132,9 @@ int SparcRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   return SparcGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
 }
 
+int SparcRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
+  return SparcGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
+}
+
 #include "SparcGenRegisterInfo.inc"
 
diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h
index d930b53..56c8068 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.h
+++ b/lib/Target/Sparc/SparcRegisterInfo.h
@@ -52,6 +52,7 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo {
   unsigned getEHHandlerRegister() const;
 
   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td
index 5ef4dae..cf92829 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.td
+++ b/lib/Target/Sparc/SparcRegisterInfo.td
@@ -117,59 +117,43 @@ def F30 : Rf<30, "F30">, DwarfRegNum<[62]>;
 def F31 : Rf<31, "F31">, DwarfRegNum<[63]>;
 
 // Aliases of the F* registers used to hold 64-bit fp values (doubles)
-def D0  : Rd< 0,  "F0", [F0,   F1]>, DwarfRegNum<[32]>;
-def D1  : Rd< 2,  "F2", [F2,   F3]>, DwarfRegNum<[34]>; 
-def D2  : Rd< 4,  "F4", [F4,   F5]>, DwarfRegNum<[36]>;
-def D3  : Rd< 6,  "F6", [F6,   F7]>, DwarfRegNum<[38]>; 
-def D4  : Rd< 8,  "F8", [F8,   F9]>, DwarfRegNum<[40]>;
-def D5  : Rd<10, "F10", [F10, F11]>, DwarfRegNum<[42]>;
-def D6  : Rd<12, "F12", [F12, F13]>, DwarfRegNum<[44]>;
-def D7  : Rd<14, "F14", [F14, F15]>, DwarfRegNum<[46]>; 
-def D8  : Rd<16, "F16", [F16, F17]>, DwarfRegNum<[48]>;
-def D9  : Rd<18, "F18", [F18, F19]>, DwarfRegNum<[50]>; 
-def D10 : Rd<20, "F20", [F20, F21]>, DwarfRegNum<[52]>;
-def D11 : Rd<22, "F22", [F22, F23]>, DwarfRegNum<[54]>;
-def D12 : Rd<24, "F24", [F24, F25]>, DwarfRegNum<[56]>;
-def D13 : Rd<26, "F26", [F26, F27]>, DwarfRegNum<[58]>; 
-def D14 : Rd<28, "F28", [F28, F29]>, DwarfRegNum<[60]>;
-def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<[62]>;
+def D0  : Rd< 0,  "F0", [F0,   F1]>, DwarfRegNum<[72]>;
+def D1  : Rd< 2,  "F2", [F2,   F3]>, DwarfRegNum<[73]>;
+def D2  : Rd< 4,  "F4", [F4,   F5]>, DwarfRegNum<[74]>;
+def D3  : Rd< 6,  "F6", [F6,   F7]>, DwarfRegNum<[75]>;
+def D4  : Rd< 8,  "F8", [F8,   F9]>, DwarfRegNum<[76]>;
+def D5  : Rd<10, "F10", [F10, F11]>, DwarfRegNum<[77]>;
+def D6  : Rd<12, "F12", [F12, F13]>, DwarfRegNum<[78]>;
+def D7  : Rd<14, "F14", [F14, F15]>, DwarfRegNum<[79]>;
+def D8  : Rd<16, "F16", [F16, F17]>, DwarfRegNum<[80]>;
+def D9  : Rd<18, "F18", [F18, F19]>, DwarfRegNum<[81]>;
+def D10 : Rd<20, "F20", [F20, F21]>, DwarfRegNum<[82]>;
+def D11 : Rd<22, "F22", [F22, F23]>, DwarfRegNum<[83]>;
+def D12 : Rd<24, "F24", [F24, F25]>, DwarfRegNum<[84]>;
+def D13 : Rd<26, "F26", [F26, F27]>, DwarfRegNum<[85]>;
+def D14 : Rd<28, "F28", [F28, F29]>, DwarfRegNum<[86]>;
+def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<[87]>;
 
 // Register classes.
 //
 // FIXME: the register order should be defined in terms of the preferred
 // allocation order...
 //
-def IntRegs : RegisterClass<"SP", [i32], 32, [L0, L1, L2, L3, L4, L5, L6, L7,
-                                     I0, I1, I2, I3, I4, I5,
-                                     O0, O1, O2, O3, O4, O5, O7,
+def IntRegs : RegisterClass<"SP", [i32], 32,
+                            (add L0, L1, L2, L3, L4, L5, L6,
+                                 L7, I0, I1, I2, I3, I4, I5,
+                                 O0, O1, O2, O3, O4, O5, O7,
+                                 G1,
+                                 // Non-allocatable regs:
+                                 G2, G3, G4, // FIXME: OK for use only in
+                                             // applications, not libraries.
+                                 O6, // stack ptr
+                                 I6, // frame ptr
+                                 I7, // return address
+                                 G0, // constant zero
+                                 G5, G6, G7 // reserved for kernel
+                                 )>;
 
-   // FIXME: G1 reserved for now for large imm generation by frame code.
-                                     G1,
-                                     // Non-allocatable regs:
-                                     G2, G3, G4, // FIXME: OK for use only in
-                                                 // applications, not libraries.
-                                     O6, // stack ptr
-                                     I6, // frame ptr
-                                     I7, // return address
-                                     G0, // constant zero
-                                     G5, G6, G7 // reserved for kernel
-                                     ]> {
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    IntRegsClass::iterator
-    IntRegsClass::allocation_order_end(const MachineFunction &MF) const {
-      // FIXME: These special regs should be taken out of the regclass!
-      return end()-10  // Don't allocate special registers
-         -1;  // FIXME: G1 reserved for large imm generation by frame code.
-    }
-  }];
-}
-
-def FPRegs : RegisterClass<"SP", [f32], 32, [F0, F1, F2, F3, F4, F5, F6, F7, F8,
-  F9, F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, F20, F21, F22,
-  F23, F24, F25, F26, F27, F28, F29, F30, F31]>;
+def FPRegs : RegisterClass<"SP", [f32], 32, (sequence "F%u", 0, 31)>;
 
-def DFPRegs : RegisterClass<"SP", [f64], 64, [D0, D1, D2, D3, D4, D5, D6, D7,
-  D8, D9, D10, D11, D12, D13, D14, D15]>;
+def DFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 15)>;
diff --git a/lib/Target/SubtargetFeature.cpp b/lib/Target/SubtargetFeature.cpp
index 3cf95b5..e0a9de8 100644
--- a/lib/Target/SubtargetFeature.cpp
+++ b/lib/Target/SubtargetFeature.cpp
@@ -211,7 +211,7 @@ const std::string & SubtargetFeatures::getCPU() const {
 /// feature, set it.
 ///
 static
-void SetImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry,
+void SetImpliedBits(uint64_t &Bits, const SubtargetFeatureKV *FeatureEntry,
                     const SubtargetFeatureKV *FeatureTable,
                     size_t FeatureTableSize) {
   for (size_t i = 0; i < FeatureTableSize; ++i) {
@@ -230,7 +230,7 @@ void SetImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry,
 /// feature, clear it.
 /// 
 static
-void ClearImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry,
+void ClearImpliedBits(uint64_t &Bits, const SubtargetFeatureKV *FeatureEntry,
                       const SubtargetFeatureKV *FeatureTable,
                       size_t FeatureTableSize) {
   for (size_t i = 0; i < FeatureTableSize; ++i) {
@@ -247,7 +247,7 @@ void ClearImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry,
 
 /// getBits - Get feature bits.
 ///
-uint32_t SubtargetFeatures::getBits(const SubtargetFeatureKV *CPUTable,
+uint64_t SubtargetFeatures::getBits(const SubtargetFeatureKV *CPUTable,
                                           size_t CPUTableSize,
                                     const SubtargetFeatureKV *FeatureTable,
                                           size_t FeatureTableSize) {
@@ -263,7 +263,7 @@ uint32_t SubtargetFeatures::getBits(const SubtargetFeatureKV *CPUTable,
           "CPU features table is not sorted");
   }
 #endif
-  uint32_t Bits = 0;                    // Resulting bits
+  uint64_t Bits = 0;                    // Resulting bits
 
   // Check if help is needed
   if (Features[0] == "help")
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 90939c3..af85df5 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -153,6 +153,8 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm) :
   setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Expand);
 
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  setMinFunctionAlignment(1);
 }
 
 SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
@@ -289,8 +291,8 @@ SystemZTargetLowering::LowerCCCArguments(SDValue Chain,
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
   CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ);
 
   if (isVarArg)
@@ -382,8 +384,8 @@ SystemZTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
 
   CCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
 
@@ -451,7 +453,7 @@ SystemZTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
 
   // Build a sequence of copy-to-reg nodes chained together with token chain and
   // flag operands which copy the outgoing args into registers.  The InFlag in
-  // necessary since all emited instructions must be stuck together.
+  // necessary since all emitted instructions must be stuck together.
   SDValue InFlag;
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
@@ -511,8 +513,8 @@ SystemZTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
-                 *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
   CCInfo.AnalyzeCallResult(Ins, RetCC_SystemZ);
 
@@ -556,8 +558,8 @@ SystemZTargetLowering::LowerReturn(SDValue Chain,
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
   // Analize return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_SystemZ);
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 3019242..bab3dc2 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -66,11 +66,6 @@ namespace llvm {
     /// DAG node.
     virtual const char *getTargetNodeName(unsigned Opcode) const;
 
-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const {
-      return 1;
-    }
-
     std::pair<unsigned, const TargetRegisterClass*>
     getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
     TargetLowering::ConstraintType
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 28f94f4..d5c165f 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -51,13 +51,37 @@ BitVector SystemZRegisterInfo::getReservedRegs(const MachineFunction &MF) const
   BitVector Reserved(getNumRegs());
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
 
-  if (TFI->hasFP(MF))
+  if (TFI->hasFP(MF)) {
+    // R11D is the frame pointer. Reserve all aliases.
     Reserved.set(SystemZ::R11D);
+    Reserved.set(SystemZ::R11W);
+    Reserved.set(SystemZ::R10P);
+    Reserved.set(SystemZ::R10Q);
+  }
+
   Reserved.set(SystemZ::R14D);
   Reserved.set(SystemZ::R15D);
+  Reserved.set(SystemZ::R14W);
+  Reserved.set(SystemZ::R15W);
+  Reserved.set(SystemZ::R14P);
+  Reserved.set(SystemZ::R14Q);
   return Reserved;
 }
 
+const TargetRegisterClass*
+SystemZRegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
+                                              const TargetRegisterClass *B,
+                                              unsigned Idx) const {
+  switch(Idx) {
+  // Exact sub-classes don't exist for the other sub-register indexes.
+  default: return 0;
+  case SystemZ::subreg_32bit:
+    if (B == SystemZ::ADDR32RegisterClass)
+      return A->getSize() == 8 ? SystemZ::ADDR64RegisterClass : 0;
+    return A;
+  }
+}
+
 void SystemZRegisterInfo::
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
@@ -125,4 +149,10 @@ int SystemZRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   return -1;
 }
 
+int SystemZRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
+  assert(0 && "What is the dwarf register number");
+  return -1;
+}
+
+
 #include "SystemZGenRegisterInfo.inc"
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
index b450798..cd8f20f 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -34,6 +34,10 @@ struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
 
   BitVector getReservedRegs(const MachineFunction &MF) const;
 
+  const TargetRegisterClass*
+  getMatchingSuperRegClass(const TargetRegisterClass *A,
+                           const TargetRegisterClass *B, unsigned Idx) const;
+
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
                                      MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator I) const;
@@ -50,6 +54,7 @@ struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
   unsigned getEHHandlerRegister() const;
 
   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td
index 0028c85..a24cbcf 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -61,22 +61,22 @@ def subreg_odd    : SubRegIndex;
 }
 
 // General-purpose registers
-def R0W  : GPR32< 0,  "r0">, DwarfRegNum<[0]>;
-def R1W  : GPR32< 1,  "r1">, DwarfRegNum<[1]>;
-def R2W  : GPR32< 2,  "r2">, DwarfRegNum<[2]>;
-def R3W  : GPR32< 3,  "r3">, DwarfRegNum<[3]>;
-def R4W  : GPR32< 4,  "r4">, DwarfRegNum<[4]>;
-def R5W  : GPR32< 5,  "r5">, DwarfRegNum<[5]>;
-def R6W  : GPR32< 6,  "r6">, DwarfRegNum<[6]>;
-def R7W  : GPR32< 7,  "r7">, DwarfRegNum<[7]>;
-def R8W  : GPR32< 8,  "r8">, DwarfRegNum<[8]>;
-def R9W  : GPR32< 9,  "r9">, DwarfRegNum<[9]>;
-def R10W : GPR32<10, "r10">, DwarfRegNum<[10]>;
-def R11W : GPR32<11, "r11">, DwarfRegNum<[11]>;
-def R12W : GPR32<12, "r12">, DwarfRegNum<[12]>;
-def R13W : GPR32<13, "r13">, DwarfRegNum<[13]>;
-def R14W : GPR32<14, "r14">, DwarfRegNum<[14]>;
-def R15W : GPR32<15, "r15">, DwarfRegNum<[15]>;
+def R0W  : GPR32< 0,  "r0">;
+def R1W  : GPR32< 1,  "r1">;
+def R2W  : GPR32< 2,  "r2">;
+def R3W  : GPR32< 3,  "r3">;
+def R4W  : GPR32< 4,  "r4">;
+def R5W  : GPR32< 5,  "r5">;
+def R6W  : GPR32< 6,  "r6">;
+def R7W  : GPR32< 7,  "r7">;
+def R8W  : GPR32< 8,  "r8">;
+def R9W  : GPR32< 9,  "r9">;
+def R10W : GPR32<10, "r10">;
+def R11W : GPR32<11, "r11">;
+def R12W : GPR32<12, "r12">;
+def R13W : GPR32<13, "r13">;
+def R14W : GPR32<14, "r14">;
+def R15W : GPR32<15, "r15">;
 
 let SubRegIndices = [subreg_32bit] in {
 def R0D  : GPR64< 0,  "r0", [R0W]>,  DwarfRegNum<[0]>;
@@ -99,26 +99,26 @@ def R15D : GPR64<15, "r15", [R15W]>, DwarfRegNum<[15]>;
 
 // Register pairs
 let SubRegIndices = [subreg_32bit, subreg_odd32] in {
-def R0P  : GPR64< 0,  "r0", [R0W,  R1W],  [R0D,  R1D]>,  DwarfRegNum<[0]>;
-def R2P  : GPR64< 2,  "r2", [R2W,  R3W],  [R2D,  R3D]>,  DwarfRegNum<[2]>;
-def R4P  : GPR64< 4,  "r4", [R4W,  R5W],  [R4D,  R5D]>,  DwarfRegNum<[4]>;
-def R6P  : GPR64< 6,  "r6", [R6W,  R7W],  [R6D,  R7D]>,  DwarfRegNum<[6]>;
-def R8P  : GPR64< 8,  "r8", [R8W,  R9W],  [R8D,  R9D]>,  DwarfRegNum<[8]>;
-def R10P : GPR64<10, "r10", [R10W, R11W], [R10D, R11D]>, DwarfRegNum<[10]>;
-def R12P : GPR64<12, "r12", [R12W, R13W], [R12D, R13D]>, DwarfRegNum<[12]>;
-def R14P : GPR64<14, "r14", [R14W, R15W], [R14D, R15D]>, DwarfRegNum<[14]>;
+def R0P  : GPR64< 0,  "r0", [R0W,  R1W],  [R0D,  R1D]>;
+def R2P  : GPR64< 2,  "r2", [R2W,  R3W],  [R2D,  R3D]>;
+def R4P  : GPR64< 4,  "r4", [R4W,  R5W],  [R4D,  R5D]>;
+def R6P  : GPR64< 6,  "r6", [R6W,  R7W],  [R6D,  R7D]>;
+def R8P  : GPR64< 8,  "r8", [R8W,  R9W],  [R8D,  R9D]>;
+def R10P : GPR64<10, "r10", [R10W, R11W], [R10D, R11D]>;
+def R12P : GPR64<12, "r12", [R12W, R13W], [R12D, R13D]>;
+def R14P : GPR64<14, "r14", [R14W, R15W], [R14D, R15D]>;
 }
 
 let SubRegIndices = [subreg_even, subreg_odd],
  CompositeIndices = [(subreg_odd32  subreg_odd,  subreg_32bit)] in {
-def R0Q  : GPR128< 0,  "r0", [R0D,  R1D],  [R0P]>,  DwarfRegNum<[0]>;
-def R2Q  : GPR128< 2,  "r2", [R2D,  R3D],  [R2P]>,  DwarfRegNum<[2]>;
-def R4Q  : GPR128< 4,  "r4", [R4D,  R5D],  [R4P]>,  DwarfRegNum<[4]>;
-def R6Q  : GPR128< 6,  "r6", [R6D,  R7D],  [R6P]>,  DwarfRegNum<[6]>;
-def R8Q  : GPR128< 8,  "r8", [R8D,  R9D],  [R8P]>,  DwarfRegNum<[8]>;
-def R10Q : GPR128<10, "r10", [R10D, R11D], [R10P]>, DwarfRegNum<[10]>;
-def R12Q : GPR128<12, "r12", [R12D, R13D], [R12P]>, DwarfRegNum<[12]>;
-def R14Q : GPR128<14, "r14", [R14D, R15D], [R14P]>, DwarfRegNum<[14]>;
+def R0Q  : GPR128< 0,  "r0", [R0D,  R1D],  [R0P]>;
+def R2Q  : GPR128< 2,  "r2", [R2D,  R3D],  [R2P]>;
+def R4Q  : GPR128< 4,  "r4", [R4D,  R5D],  [R4P]>;
+def R6Q  : GPR128< 6,  "r6", [R6D,  R7D],  [R6P]>;
+def R8Q  : GPR128< 8,  "r8", [R8D,  R9D],  [R8P]>;
+def R10Q : GPR128<10, "r10", [R10D, R11D], [R10P]>;
+def R12Q : GPR128<12, "r12", [R12D, R13D], [R12P]>;
+def R14Q : GPR128<14, "r14", [R14D, R15D], [R14P]>;
 }
 
 // Floating-point registers
@@ -140,339 +140,66 @@ def F14S : FPRS<14, "f14">, DwarfRegNum<[30]>;
 def F15S : FPRS<15, "f15">, DwarfRegNum<[31]>;
 
 let SubRegIndices = [subreg_32bit] in {
-def F0L  : FPRL< 0,  "f0", [F0S]>,  DwarfRegNum<[16]>;
-def F1L  : FPRL< 1,  "f1", [F1S]>,  DwarfRegNum<[17]>;
-def F2L  : FPRL< 2,  "f2", [F2S]>,  DwarfRegNum<[18]>;
-def F3L  : FPRL< 3,  "f3", [F3S]>,  DwarfRegNum<[19]>;
-def F4L  : FPRL< 4,  "f4", [F4S]>,  DwarfRegNum<[20]>;
-def F5L  : FPRL< 5,  "f5", [F5S]>,  DwarfRegNum<[21]>;
-def F6L  : FPRL< 6,  "f6", [F6S]>,  DwarfRegNum<[22]>;
-def F7L  : FPRL< 7,  "f7", [F7S]>,  DwarfRegNum<[23]>;
-def F8L  : FPRL< 8,  "f8", [F8S]>,  DwarfRegNum<[24]>;
-def F9L  : FPRL< 9,  "f9", [F9S]>,  DwarfRegNum<[25]>;
-def F10L : FPRL<10, "f10", [F10S]>, DwarfRegNum<[26]>;
-def F11L : FPRL<11, "f11", [F11S]>, DwarfRegNum<[27]>;
-def F12L : FPRL<12, "f12", [F12S]>, DwarfRegNum<[28]>;
-def F13L : FPRL<13, "f13", [F13S]>, DwarfRegNum<[29]>;
-def F14L : FPRL<14, "f14", [F14S]>, DwarfRegNum<[30]>;
-def F15L : FPRL<15, "f15", [F15S]>, DwarfRegNum<[31]>;
+def F0L  : FPRL< 0,  "f0", [F0S]>;
+def F1L  : FPRL< 1,  "f1", [F1S]>;
+def F2L  : FPRL< 2,  "f2", [F2S]>;
+def F3L  : FPRL< 3,  "f3", [F3S]>;
+def F4L  : FPRL< 4,  "f4", [F4S]>;
+def F5L  : FPRL< 5,  "f5", [F5S]>;
+def F6L  : FPRL< 6,  "f6", [F6S]>;
+def F7L  : FPRL< 7,  "f7", [F7S]>;
+def F8L  : FPRL< 8,  "f8", [F8S]>;
+def F9L  : FPRL< 9,  "f9", [F9S]>;
+def F10L : FPRL<10, "f10", [F10S]>;
+def F11L : FPRL<11, "f11", [F11S]>;
+def F12L : FPRL<12, "f12", [F12S]>;
+def F13L : FPRL<13, "f13", [F13S]>;
+def F14L : FPRL<14, "f14", [F14S]>;
+def F15L : FPRL<15, "f15", [F15S]>;
 }
 
 // Status register
 def PSW : SystemZReg<"psw">;
 
-/// Register classes
-def GR32 : RegisterClass<"SystemZ", [i32], 32,
-   // Volatile registers
-  [R0W, R1W, R2W, R3W, R4W, R5W, R6W, R7W, R8W, R9W, R10W, R12W, R13W,
-   // Frame pointer, sometimes allocable
-   R11W,
-   // Volatile, but not allocable
-   R14W, R15W]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned SystemZ_REG32[] = {
-      SystemZ::R1W,  SystemZ::R2W,  SystemZ::R3W,  SystemZ::R4W,
-      SystemZ::R5W,  SystemZ::R0W,  SystemZ::R12W, SystemZ::R11W,
-      SystemZ::R10W, SystemZ::R9W,  SystemZ::R8W,  SystemZ::R7W,
-      SystemZ::R6W,  SystemZ::R14W, SystemZ::R13W
-    };
-    static const unsigned SystemZ_REG32_nofp[] = {
-      SystemZ::R1W,  SystemZ::R2W,  SystemZ::R3W,  SystemZ::R4W,
-      SystemZ::R5W,  SystemZ::R0W,  SystemZ::R12W, /* No R11W */
-      SystemZ::R10W, SystemZ::R9W,  SystemZ::R8W,  SystemZ::R7W,
-      SystemZ::R6W,  SystemZ::R14W, SystemZ::R13W
-    };
-    GR32Class::iterator
-    GR32Class::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_REG32_nofp;
-      else
-        return SystemZ_REG32;
-    }
-    GR32Class::iterator
-    GR32Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_REG32_nofp + (sizeof(SystemZ_REG32_nofp) / sizeof(unsigned));
-      else
-        return SystemZ_REG32 + (sizeof(SystemZ_REG32) / sizeof(unsigned));
-    }
-  }];
-}
+/// Register classes.
+/// Allocate the callee-saved R6-R12 backwards. That way they can be saved
+/// together with R14 and R15 in one prolog instruction.
+def GR32 : RegisterClass<"SystemZ", [i32], 32, (add (sequence "R%uW",  0, 5),
+                                                    (sequence "R%uW", 15, 6))>;
 
 /// Registers used to generate address. Everything except R0.
-def ADDR32 : RegisterClass<"SystemZ", [i32], 32,
-   // Volatile registers
-  [R1W, R2W, R3W, R4W, R5W, R6W, R7W, R8W, R9W, R10W, R12W, R13W,
-   // Frame pointer, sometimes allocable
-   R11W,
-   // Volatile, but not allocable
-   R14W, R15W]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned SystemZ_ADDR32[] = {
-      SystemZ::R1W,  SystemZ::R2W,  SystemZ::R3W,  SystemZ::R4W,
-      SystemZ::R5W,  /* No R0W */   SystemZ::R12W, SystemZ::R11W,
-      SystemZ::R10W, SystemZ::R9W,  SystemZ::R8W,  SystemZ::R7W,
-      SystemZ::R6W,  SystemZ::R14W, SystemZ::R13W
-    };
-    static const unsigned SystemZ_ADDR32_nofp[] = {
-      SystemZ::R1W,  SystemZ::R2W,  SystemZ::R3W,  SystemZ::R4W,
-      SystemZ::R5W,  /* No R0W */   SystemZ::R12W, /* No R11W */
-      SystemZ::R10W, SystemZ::R9W,  SystemZ::R8W,  SystemZ::R7W,
-      SystemZ::R6W,  SystemZ::R14W, SystemZ::R13W
-    };
-    ADDR32Class::iterator
-    ADDR32Class::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_ADDR32_nofp;
-      else
-        return SystemZ_ADDR32;
-    }
-    ADDR32Class::iterator
-    ADDR32Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_ADDR32_nofp + (sizeof(SystemZ_ADDR32_nofp) / sizeof(unsigned));
-      else
-        return SystemZ_ADDR32 + (sizeof(SystemZ_ADDR32) / sizeof(unsigned));
-    }
-  }];
-}
+def ADDR32 : RegisterClass<"SystemZ", [i32], 32, (sub GR32, R0W)>;
 
-def GR64 : RegisterClass<"SystemZ", [i64], 64,
-   // Volatile registers
-  [R0D, R1D, R2D, R3D, R4D, R5D, R6D, R7D, R8D, R9D, R10D, R12D, R13D,
-   // Frame pointer, sometimes allocable
-   R11D,
-   // Volatile, but not allocable
-   R14D, R15D]>
-{
+def GR64 : RegisterClass<"SystemZ", [i64], 64, (add (sequence "R%uD",  0, 5),
+                                                    (sequence "R%uD", 15, 6))> {
   let SubRegClasses = [(GR32 subreg_32bit)];
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned SystemZ_REG64[] = {
-      SystemZ::R1D,  SystemZ::R2D,  SystemZ::R3D,  SystemZ::R4D,
-      SystemZ::R5D,  SystemZ::R0D,  SystemZ::R12D, SystemZ::R11D,
-      SystemZ::R10D, SystemZ::R9D,  SystemZ::R8D,  SystemZ::R7D,
-      SystemZ::R6D,  SystemZ::R14D, SystemZ::R13D
-    };
-    static const unsigned SystemZ_REG64_nofp[] = {
-      SystemZ::R1D,  SystemZ::R2D,  SystemZ::R3D,  SystemZ::R4D,
-      SystemZ::R5D,  SystemZ::R0D,  SystemZ::R12D, /* No R11D */
-      SystemZ::R10D, SystemZ::R9D,  SystemZ::R8D,  SystemZ::R7D,
-      SystemZ::R6D,  SystemZ::R14D, SystemZ::R13D
-    };
-    GR64Class::iterator
-    GR64Class::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_REG64_nofp;
-      else
-        return SystemZ_REG64;
-    }
-    GR64Class::iterator
-    GR64Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_REG64_nofp + (sizeof(SystemZ_REG64_nofp) / sizeof(unsigned));
-      else
-        return SystemZ_REG64 + (sizeof(SystemZ_REG64) / sizeof(unsigned));
-    }
-  }];
 }
 
-def ADDR64 : RegisterClass<"SystemZ", [i64], 64,
-   // Volatile registers
-  [R1D, R2D, R3D, R4D, R5D, R6D, R7D, R8D, R9D, R10D, R12D, R13D,
-   // Frame pointer, sometimes allocable
-   R11D,
-   // Volatile, but not allocable
-   R14D, R15D]>
-{
+def ADDR64 : RegisterClass<"SystemZ", [i64], 64, (sub GR64, R0D)> {
   let SubRegClasses = [(ADDR32 subreg_32bit)];
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned SystemZ_ADDR64[] = {
-      SystemZ::R1D,  SystemZ::R2D,  SystemZ::R3D,  SystemZ::R4D,
-      SystemZ::R5D,  /* No R0D */   SystemZ::R12D, SystemZ::R11D,
-      SystemZ::R10D, SystemZ::R9D,  SystemZ::R8D,  SystemZ::R7D,
-      SystemZ::R6D,  SystemZ::R14D, SystemZ::R13D
-    };
-    static const unsigned SystemZ_ADDR64_nofp[] = {
-      SystemZ::R1D,  SystemZ::R2D,  SystemZ::R3D,  SystemZ::R4D,
-      SystemZ::R5D,  /* No R0D */   SystemZ::R12D, /* No R11D */
-      SystemZ::R10D, SystemZ::R9D,  SystemZ::R8D,  SystemZ::R7D,
-      SystemZ::R6D,  SystemZ::R14D, SystemZ::R13D
-    };
-    ADDR64Class::iterator
-    ADDR64Class::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_ADDR64_nofp;
-      else
-        return SystemZ_ADDR64;
-    }
-    ADDR64Class::iterator
-    ADDR64Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_ADDR64_nofp + (sizeof(SystemZ_ADDR64_nofp) / sizeof(unsigned));
-      else
-        return SystemZ_ADDR64 + (sizeof(SystemZ_ADDR64) / sizeof(unsigned));
-    }
-  }];
 }
 
 // Even-odd register pairs
-def GR64P : RegisterClass<"SystemZ", [v2i32], 64,
-  [R0P, R2P, R4P, R6P, R8P, R10P, R12P, R14P]>
-{
+def GR64P : RegisterClass<"SystemZ", [v2i32], 64, (add R0P, R2P, R4P,
+                                                       R12P, R10P, R8P, R6P,
+                                                       R14P)> {
   let SubRegClasses = [(GR32 subreg_32bit, subreg_odd32)];
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned SystemZ_REG64P[] = {
-      SystemZ::R0P,  SystemZ::R2P,  SystemZ::R4P, SystemZ::R10P,
-      SystemZ::R8P,  SystemZ::R6P };
-    static const unsigned SystemZ_REG64P_nofp[] = {
-      SystemZ::R0P,  SystemZ::R2P,  SystemZ::R4P, /* NO R10P */
-      SystemZ::R8P,  SystemZ::R6P };
-    GR64PClass::iterator
-    GR64PClass::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_REG64P_nofp;
-      else
-        return SystemZ_REG64P;
-    }
-    GR64PClass::iterator
-    GR64PClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_REG64P_nofp + (sizeof(SystemZ_REG64P_nofp) / sizeof(unsigned));
-      else
-        return SystemZ_REG64P + (sizeof(SystemZ_REG64P) / sizeof(unsigned));
-    }
-  }];
 }
 
-def GR128 : RegisterClass<"SystemZ", [v2i64], 128,
-  [R0Q, R2Q, R4Q, R6Q, R8Q, R10Q, R12Q, R14Q]>
-{
+def GR128 : RegisterClass<"SystemZ", [v2i64], 128, (add R0Q, R2Q, R4Q,
+                                                        R12Q, R10Q, R8Q, R6Q,
+                                                        R14Q)> {
   let SubRegClasses = [(GR32 subreg_32bit, subreg_odd32),
-                         (GR64 subreg_even, subreg_odd)];
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned SystemZ_REG128[] = {
-      SystemZ::R0Q,  SystemZ::R2Q,  SystemZ::R4Q,  SystemZ::R10Q,
-      SystemZ::R8Q,  SystemZ::R6Q };
-    static const unsigned SystemZ_REG128_nofp[] = {
-      SystemZ::R0Q,  SystemZ::R2Q,  SystemZ::R4Q, /* NO R10Q */
-      SystemZ::R8Q,  SystemZ::R6Q };
-    GR128Class::iterator
-    GR128Class::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_REG128_nofp;
-      else
-        return SystemZ_REG128;
-    }
-    GR128Class::iterator
-    GR128Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return SystemZ_REG128_nofp + (sizeof(SystemZ_REG128_nofp) / sizeof(unsigned));
-      else
-        return SystemZ_REG128 + (sizeof(SystemZ_REG128) / sizeof(unsigned));
-    }
-  }];
+                       (GR64 subreg_even, subreg_odd)];
 }
 
-def FP32 : RegisterClass<"SystemZ", [f32], 32,
- [F0S, F1S,  F2S,  F3S,  F4S,  F5S,  F6S,  F7S,
-  F8S, F9S, F10S, F11S, F12S, F13S, F14S, F15S]> {
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned SystemZ_REGFP32[] = {
-      SystemZ::F0S,  SystemZ::F2S,  SystemZ::F4S,  SystemZ::F6S,
-      SystemZ::F1S,  SystemZ::F3S,  SystemZ::F5S,  SystemZ::F7S,
-      SystemZ::F8S,  SystemZ::F9S,  SystemZ::F10S, SystemZ::F11S,
-      SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S };
-    FP32Class::iterator
-    FP32Class::allocation_order_begin(const MachineFunction &MF) const {
-      return SystemZ_REGFP32;
-    }
-    FP32Class::iterator
-    FP32Class::allocation_order_end(const MachineFunction &MF) const {
-      return SystemZ_REGFP32 + (sizeof(SystemZ_REGFP32) / sizeof(unsigned));
-    }
-  }];
-}
+def FP32 : RegisterClass<"SystemZ", [f32], 32, (sequence "F%uS", 0, 15)>;
 
-def FP64 : RegisterClass<"SystemZ", [f64], 64,
- [F0L, F1L,  F2L,  F3L,  F4L,  F5L,  F6L,  F7L, 
-  F8L, F9L, F10L, F11L, F12L, F13L, F14L, F15L]> {
+def FP64 : RegisterClass<"SystemZ", [f64], 64, (sequence "F%uL", 0, 15)> {
   let SubRegClasses = [(FP32 subreg_32bit)];
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned SystemZ_REGFP64[] = {
-      SystemZ::F0L,  SystemZ::F2L,  SystemZ::F4L,  SystemZ::F6L,
-      SystemZ::F1L,  SystemZ::F3L,  SystemZ::F5L,  SystemZ::F7L,
-      SystemZ::F8L,  SystemZ::F9L,  SystemZ::F10L, SystemZ::F11L,
-      SystemZ::F12L, SystemZ::F13L, SystemZ::F14L, SystemZ::F15L };
-    FP64Class::iterator
-    FP64Class::allocation_order_begin(const MachineFunction &MF) const {
-      return SystemZ_REGFP64;
-    }
-    FP64Class::iterator
-    FP64Class::allocation_order_end(const MachineFunction &MF) const {
-      return SystemZ_REGFP64 + (sizeof(SystemZ_REGFP64) / sizeof(unsigned));
-    }
-  }];
 }
 
 // Status flags registers.
-def CCR : RegisterClass<"SystemZ", [i64], 64, [PSW]> {
+def CCR : RegisterClass<"SystemZ", [i64], 64, (add PSW)> {
   let CopyCost = -1;  // Don't allow copying of status registers.
 }
diff --git a/lib/Target/TargetData.cpp b/lib/Target/TargetData.cpp
index c628df0..1990bc7 100644
--- a/lib/Target/TargetData.cpp
+++ b/lib/Target/TargetData.cpp
@@ -617,10 +617,14 @@ uint64_t TargetData::getIndexedOffset(const Type *ptrTy, Value* const* Indices,
 unsigned TargetData::getPreferredAlignment(const GlobalVariable *GV) const {
   const Type *ElemType = GV->getType()->getElementType();
   unsigned Alignment = getPrefTypeAlignment(ElemType);
-  if (GV->getAlignment() > Alignment)
-    Alignment = GV->getAlignment();
+  unsigned GVAlignment = GV->getAlignment();
+  if (GVAlignment >= Alignment) {
+    Alignment = GVAlignment;
+  } else if (GVAlignment != 0) {
+    Alignment = std::max(GVAlignment, getABITypeAlignment(ElemType));
+  }
 
-  if (GV->hasInitializer()) {
+  if (GV->hasInitializer() && GVAlignment == 0) {
     if (Alignment < 16) {
       // If the global is not external, see if it is large.  If so, give it a
       // larger alignment.
diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp
index 90ea343..709dfd2 100644
--- a/lib/Target/TargetLibraryInfo.cpp
+++ b/lib/Target/TargetLibraryInfo.cpp
@@ -28,11 +28,18 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T) {
 
   
   // memset_pattern16 is only available on iOS 3.0 and Mac OS/X 10.5 and later.
-  if (T.getOS() != Triple::Darwin || T.getDarwinMajorNumber() < 9)
+  if (T.isMacOSX()) {
+    if (T.isMacOSXVersionLT(10, 5))
+      TLI.setUnavailable(LibFunc::memset_pattern16);
+  } else if (T.getOS() == Triple::IOS) {
+    if (T.isOSVersionLT(3, 0))
+      TLI.setUnavailable(LibFunc::memset_pattern16);
+  } else {
     TLI.setUnavailable(LibFunc::memset_pattern16);
+  }
 
-  // iprintf and friends are only available on XCore.
-  if (T.getArch() != Triple::xcore) {
+  // iprintf and friends are only available on XCore and TCE.
+  if (T.getArch() != Triple::xcore && T.getArch() != Triple::tce) {
     TLI.setUnavailable(LibFunc::iprintf);
     TLI.setUnavailable(LibFunc::siprintf);
     TLI.setUnavailable(LibFunc::fiprintf);
@@ -54,6 +61,12 @@ TargetLibraryInfo::TargetLibraryInfo(const Triple &T) : ImmutablePass(ID) {
   initialize(*this, T);
 }
 
+TargetLibraryInfo::TargetLibraryInfo(const TargetLibraryInfo &TLI)
+  : ImmutablePass(ID) {
+  memcpy(AvailableArray, TLI.AvailableArray, sizeof(AvailableArray));
+}
+
+
 /// disableAllFunctions - This disables all builtins, which is used for options
 /// like -fno-builtin.
 void TargetLibraryInfo::disableAllFunctions() {
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index 5d34c7d..3343384 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -58,7 +58,6 @@ TargetLoweringObjectFile::TargetLoweringObjectFile() : Ctx(0) {
   DwarfRangesSection = 0;
   DwarfMacroInfoSection = 0;
   
-  IsFunctionEHSymbolGlobal = false;
   IsFunctionEHFrameSymbolPrivate = true;
   SupportsWeakOmittedEHFrame = true;
 }
@@ -120,6 +119,18 @@ static bool IsNullTerminatedString(const Constant *C) {
   return false;
 }
 
+MCSymbol *TargetLoweringObjectFile::
+getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
+                        MachineModuleInfo *MMI) const {
+  return Mang->getSymbol(GV);
+}
+
+void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer,
+                                                    const TargetMachine &TM,
+                                                    const MCSymbol *Sym) const {
+}
+
+
 /// getKindForGlobal - This is a top-level target-independent classifier for
 /// a global variable.  Given an global variable and information from TM, it
 /// classifies the global in a variety of ways that make various target
@@ -305,16 +316,15 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
                                MachineModuleInfo *MMI, unsigned Encoding,
                                MCStreamer &Streamer) const {
   const MCSymbol *Sym = Mang->getSymbol(GV);
-  return getExprForDwarfReference(Sym, Mang, MMI, Encoding, Streamer);
+  return getExprForDwarfReference(Sym, Encoding, Streamer);
 }
 
 const MCExpr *TargetLoweringObjectFile::
-getExprForDwarfReference(const MCSymbol *Sym, Mangler *Mang,
-                         MachineModuleInfo *MMI, unsigned Encoding,
+getExprForDwarfReference(const MCSymbol *Sym, unsigned Encoding,
                          MCStreamer &Streamer) const {
   const MCExpr *Res = MCSymbolRefExpr::Create(Sym, getContext());
 
-  switch (Encoding & 0xF0) {
+  switch (Encoding & 0x70) {
   default:
     report_fatal_error("We do not support this DWARF encoding yet!");
   case dwarf::DW_EH_PE_absptr:
@@ -339,7 +349,7 @@ unsigned TargetLoweringObjectFile::getLSDAEncoding() const {
   return dwarf::DW_EH_PE_absptr;
 }
 
-unsigned TargetLoweringObjectFile::getFDEEncoding() const {
+unsigned TargetLoweringObjectFile::getFDEEncoding(bool CFI) const {
   return dwarf::DW_EH_PE_absptr;
 }
 
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 8c7330a..863b811 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -40,7 +40,6 @@ namespace llvm {
   bool JITExceptionHandling;
   bool JITEmitDebugInfo;
   bool JITEmitDebugInfoToDisk;
-  bool UnwindTablesMandatory;
   Reloc::Model RelocationModel;
   CodeModel::Model CMModel;
   bool GuaranteedTailCallOpt;
@@ -143,11 +142,6 @@ EmitJitDebugInfoToDisk("jit-emit-debug-to-disk",
   cl::desc("Emit debug info objfiles to disk"),
   cl::location(JITEmitDebugInfoToDisk),
   cl::init(false));
-static cl::opt<bool, true>
-EnableUnwindTables("unwind-tables",
-  cl::desc("Generate unwinding tables for all functions"),
-  cl::location(UnwindTablesMandatory),
-  cl::init(false));
 
 static cl::opt<llvm::Reloc::Model, true>
 DefRelocationModel("relocation-model",
@@ -206,11 +200,10 @@ EnableStrongPHIElim(cl::Hidden, "strong-phi-elim",
   cl::desc("Use strong PHI elimination."),
   cl::location(StrongPHIElim),
   cl::init(false));
-static cl::opt<bool, true>
-UseDivMod("use-divmod-libcall",
-  cl::desc("Use __{u}divmod libcalls for div / rem pairs"),
-  cl::location(HasDivModLibcall),
-  cl::init(false));
+static cl::opt<std::string>
+TrapFuncName("trap-func", cl::Hidden,
+  cl::desc("Emit a call to trap function rather than a trap instruction"),
+  cl::init(""));
 static cl::opt<bool>
 DataSections("fdata-sections",
   cl::desc("Emit data into separate sections"),
@@ -228,7 +221,8 @@ TargetMachine::TargetMachine(const Target &T)
     MCRelaxAll(false),
     MCNoExecStack(false),
     MCSaveTempLabels(false),
-    MCUseLoc(true) {
+    MCUseLoc(true),
+    MCUseCFI(true) {
   // Typically it will be subtargets that will adjust FloatABIType from Default
   // to Soft or Hard.
   if (UseSoftFloat)
@@ -310,4 +304,11 @@ namespace llvm {
   bool HonorSignDependentRoundingFPMath() {
     return !UnsafeFPMath && HonorSignDependentRoundingFPMathOption;
   }
+
+  /// getTrapFunctionName - If this returns a non-empty string, this means isel
+  /// should lower Intrinsic::trap to a call to the specified function name
+  /// instead of an ISD::TRAP node.
+  StringRef getTrapFunctionName() {
+    return TrapFuncName;
+  }
 }
diff --git a/lib/Target/TargetRegisterInfo.cpp b/lib/Target/TargetRegisterInfo.cpp
index 4811ba5..e36e136 100644
--- a/lib/Target/TargetRegisterInfo.cpp
+++ b/lib/Target/TargetRegisterInfo.cpp
@@ -23,12 +23,8 @@ using namespace llvm;
 TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterDesc *D, unsigned NR,
                              regclass_iterator RCB, regclass_iterator RCE,
                              const char *const *subregindexnames,
-                             int CFSO, int CFDO,
-                             const unsigned* subregs, const unsigned subregsize,
-                         const unsigned* aliases, const unsigned aliasessize)
-  : SubregHash(subregs), SubregHashSize(subregsize),
-    AliasesHash(aliases), AliasesHashSize(aliasessize),
-    Desc(D), SubRegIndexNames(subregindexnames), NumRegs(NR),
+                             int CFSO, int CFDO)
+  : Desc(D), SubRegIndexNames(subregindexnames), NumRegs(NR),
     RegClassBegin(RCB), RegClassEnd(RCE) {
   assert(isPhysicalRegister(NumRegs) &&
          "Target has too many physical registers!");
@@ -96,7 +92,8 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF,
   } else {
     for (TargetRegisterInfo::regclass_iterator I = regclass_begin(),
          E = regclass_end(); I != E; ++I)
-      getAllocatableSetForRC(MF, *I, Allocatable);
+      if ((*I)->isAllocatable())
+        getAllocatableSetForRC(MF, *I, Allocatable);
   }
 
   // Mask out the reserved registers
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index e0989b0..c352bfc 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -928,6 +928,18 @@ ParseInstruction(StringRef Name, SMLoc NameLoc,
       Operands.erase(Operands.begin() + 1);
     }
   }
+  
+  // Transforms "int $3" into "int3" as a size optimization.  We can't write an
+  // instalias with an immediate operand yet.
+  if (Name == "int" && Operands.size() == 2) {
+    X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
+    if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
+        cast<MCConstantExpr>(Op1->getImm())->getValue() == 3) {
+      delete Operands[1];
+      Operands.erase(Operands.begin() + 1);
+      static_cast<X86Operand*>(Operands[0])->setTokenValue("int3");
+    }
+  }
 
   return false;
 }
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index d4a88d7..a9c90f8 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -485,7 +485,7 @@ struct InternalInstruction {
      consumed___ indicates that the byte was already consumed and does not
      need to be consumed again */
 
-  /* The VEX.vvvv field, which contains a thrid register operand for some AVX
+  /* The VEX.vvvv field, which contains a third register operand for some AVX
      instructions */
   Reg                           vvvv;
   
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index d006eca..68247d2 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -41,8 +41,15 @@ X86ATTInstPrinter::X86ATTInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI)
             &TM.getSubtarget<X86Subtarget>()));
 }
 
+void X86ATTInstPrinter::printRegName(raw_ostream &OS,
+                                     unsigned RegNo) const {
+  OS << '%' << getRegisterName(RegNo);
+}
+
 void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS) {
-  printInstruction(MI, OS);
+  // Try to print any aliases first.
+  if (!printAliasInstr(MI, OS))
+    printInstruction(MI, OS);
   
   // If verbose assembly is enabled, we can print some informative comments.
   if (CommentStream)
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index f24674f..5f939b6 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -26,11 +26,14 @@ class X86ATTInstPrinter : public MCInstPrinter {
 public:
   X86ATTInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI);
   
+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
   virtual void printInst(const MCInst *MI, raw_ostream &OS);
   virtual StringRef getOpcodeName(unsigned Opcode) const;
 
   // Methods used to print the alias of an instruction.
   unsigned ComputeAvailableFeatures(const X86Subtarget *Subtarget) const;
+  // Autogenerated by tblgen, returns true if we successfully printed an
+  // alias.
   bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
 
   // Autogenerated by tblgen.
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 47253eb..5f581ba 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -29,6 +29,10 @@ using namespace llvm;
 #define GET_INSTRUCTION_NAME
 #include "X86GenAsmWriter1.inc"
 
+void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << getRegisterName(RegNo);
+}
+
 void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS) {
   printInstruction(MI, OS);
   
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index ca99dc0..c8030c3 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -27,6 +27,7 @@ public:
   X86IntelInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI)
     : MCInstPrinter(MAI) {}
 
+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
   virtual void printInst(const MCInst *MI, raw_ostream &OS);
   virtual StringRef getOpcodeName(unsigned Opcode) const;
   
diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt
index e21d69a..bcfdf0b 100644
--- a/lib/Target/X86/README-X86-64.txt
+++ b/lib/Target/X86/README-X86-64.txt
@@ -36,7 +36,7 @@ _conv:
 	cmovb %rcx, %rax
 	ret
 
-Seems like the jb branch has high likelyhood of being taken. It would have
+Seems like the jb branch has high likelihood of being taken. It would have
 saved a few instructions.
 
 //===---------------------------------------------------------------------===//
@@ -124,51 +124,6 @@ if we have whole-function selectiondags.
 
 //===---------------------------------------------------------------------===//
 
-Take the following C code
-(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43640):
-
-struct u1
-{
-        float x;
-        float y;
-};
-
-float foo(struct u1 u)
-{
-        return u.x + u.y;
-}
-
-Optimizes to the following IR:
-define float @foo(double %u.0) nounwind readnone {
-entry:
-  %tmp8 = bitcast double %u.0 to i64              ; <i64> [#uses=2]
-  %tmp6 = trunc i64 %tmp8 to i32                  ; <i32> [#uses=1]
-  %tmp7 = bitcast i32 %tmp6 to float              ; <float> [#uses=1]
-  %tmp2 = lshr i64 %tmp8, 32                      ; <i64> [#uses=1]
-  %tmp3 = trunc i64 %tmp2 to i32                  ; <i32> [#uses=1]
-  %tmp4 = bitcast i32 %tmp3 to float              ; <float> [#uses=1]
-  %0 = fadd float %tmp7, %tmp4                    ; <float> [#uses=1]
-  ret float %0
-}
-
-And current llvm-gcc/clang output:
-	movd	%xmm0, %rax
-	movd	%eax, %xmm1
-	shrq	$32, %rax
-	movd	%eax, %xmm0
-	addss	%xmm1, %xmm0
-	ret
-
-We really shouldn't move the floats to RAX, only to immediately move them
-straight back to the XMM registers.
-
-There really isn't any good way to handle this purely in IR optimizers; it
-could possibly be handled by changing the output of the fronted, though.  It
-would also be feasible to add a x86-specific DAGCombine to optimize the
-bitcast+trunc+(lshr+)bitcast combination.
-
-//===---------------------------------------------------------------------===//
-
 Take the following code
 (from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653):
 extern unsigned long table[];
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 1ac2305..560947a 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -7,14 +7,6 @@ copy (3-addr bswap + memory support?)  This is available on Atom processors.
 
 //===---------------------------------------------------------------------===//
 
-CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
-backend knows how to three-addressify this shift, but it appears the register
-allocator isn't even asking it to do so in this case.  We should investigate
-why this isn't happening, it could have significant impact on other important
-cases for X86 as well.
-
-//===---------------------------------------------------------------------===//
-
 This should be one DIV/IDIV instruction, not a libcall:
 
 unsigned test(unsigned long long X, unsigned Y) {
@@ -1572,7 +1564,7 @@ Implement processor-specific optimizations for parity with GCC on these
 processors.  GCC does two optimizations:
 
 1. ix86_pad_returns inserts a noop before ret instructions if immediately
-   preceeded by a conditional branch or is the target of a jump.
+   preceded by a conditional branch or is the target of a jump.
 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
    code contains more than 3 branches.
    
@@ -1736,26 +1728,6 @@ are functionally identical.
 //===---------------------------------------------------------------------===//
 
 Take the following C code:
-int x(int y) { return (y & 63) << 14; }
-
-Code produced by gcc:
-	andl	$63, %edi
-	sall	$14, %edi
-	movl	%edi, %eax
-	ret
-
-Code produced by clang:
-	shll	$14, %edi
-	movl	%edi, %eax
-	andl	$1032192, %eax
-	ret
-
-The code produced by gcc is 3 bytes shorter.  This sort of construct often
-shows up with bitfields.
-
-//===---------------------------------------------------------------------===//
-
-Take the following C code:
 int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
 
 We generate the following IR with clang:
@@ -2016,3 +1988,81 @@ We currently generate:
 We could save an instruction here by commuting the addss.
 
 //===---------------------------------------------------------------------===//
+
+This (from PR9661):
+
+float clamp_float(float a) {
+        if (a > 1.0f)
+                return 1.0f;
+        else if (a < 0.0f)
+                return 0.0f;
+        else
+                return a;
+}
+
+Could compile to:
+
+clamp_float:                            # @clamp_float
+        movss   .LCPI0_0(%rip), %xmm1
+        minss   %xmm1, %xmm0
+        pxor    %xmm1, %xmm1
+        maxss   %xmm1, %xmm0
+        ret
+
+with -ffast-math.
+
+//===---------------------------------------------------------------------===//
+
+This function (from PR9803):
+
+int clamp2(int a) {
+        if (a > 5)
+                a = 5;
+        if (a < 0) 
+                return 0;
+        return a;
+}
+
+Compiles to:
+
+_clamp2:                                ## @clamp2
+        pushq   %rbp
+        movq    %rsp, %rbp
+        cmpl    $5, %edi
+        movl    $5, %ecx
+        cmovlel %edi, %ecx
+        testl   %ecx, %ecx
+        movl    $0, %eax
+        cmovnsl %ecx, %eax
+        popq    %rbp
+        ret
+
+The move of 0 could be scheduled above the test to make it is xor reg,reg.
+
+//===---------------------------------------------------------------------===//
+
+GCC PR48986.  We currently compile this:
+
+void bar(void);
+void yyy(int* p) {
+    if (__sync_fetch_and_add(p, -1) == 1)
+      bar();
+}
+
+into:
+	movl	$-1, %eax
+	lock
+	xaddl	%eax, (%rdi)
+	cmpl	$1, %eax
+	je	LBB0_2
+
+Instead we could generate:
+
+	lock
+	dec %rdi
+	je LBB0_2
+
+The trick is to match "fetch_and_add(X, -C) == C".
+
+//===---------------------------------------------------------------------===//
+
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index efb6c8c..7bb9676 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -1,13 +1,13 @@
 //===- X86.td - Target definition file for the Intel X86 ---*- tablegen -*-===//
-// 
+//
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
-// 
+//
 //===----------------------------------------------------------------------===//
 //
-// This is a target description file for the Intel i386 architecture, refered to
+// This is a target description file for the Intel i386 architecture, referred to
 // here as the "X86" architecture.
 //
 //===----------------------------------------------------------------------===//
@@ -32,7 +32,7 @@ def FeatureMMX     : SubtargetFeature<"mmx","X86SSELevel", "MMX",
 def FeatureSSE1    : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
                                       "Enable SSE instructions",
                                       // SSE codegen depends on cmovs, and all
-                                      // SSE1+ processors support them. 
+                                      // SSE1+ processors support them.
                                       [FeatureMMX, FeatureCMOV]>;
 def FeatureSSE2    : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
                                       "Enable SSE2 instructions",
@@ -50,7 +50,8 @@ def FeatureSSE42   : SubtargetFeature<"sse42", "X86SSELevel", "SSE42",
                                       "Enable SSE 4.2 instructions",
                                       [FeatureSSE41, FeaturePOPCNT]>;
 def Feature3DNow   : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
-                                      "Enable 3DNow! instructions">;
+                                      "Enable 3DNow! instructions",
+                                      [FeatureMMX]>;
 def Feature3DNowA  : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
                                       "Enable 3DNow! Athlon instructions",
                                       [Feature3DNow]>;
@@ -100,8 +101,10 @@ def : Proc<"i686",            []>;
 def : Proc<"pentiumpro",      [FeatureCMOV]>;
 def : Proc<"pentium2",        [FeatureMMX, FeatureCMOV]>;
 def : Proc<"pentium3",        [FeatureSSE1]>;
+def : Proc<"pentium3m",       [FeatureSSE1, FeatureSlowBTMem]>;
 def : Proc<"pentium-m",       [FeatureSSE2, FeatureSlowBTMem]>;
 def : Proc<"pentium4",        [FeatureSSE2]>;
+def : Proc<"pentium4m",       [FeatureSSE2, FeatureSlowBTMem]>;
 def : Proc<"x86-64",          [FeatureSSE2,   Feature64Bit, FeatureSlowBTMem]>;
 def : Proc<"yonah",           [FeatureSSE3, FeatureSlowBTMem]>;
 def : Proc<"prescott",        [FeatureSSE3, FeatureSlowBTMem]>;
@@ -121,14 +124,14 @@ def : Proc<"westmere",        [FeatureSSE42,  Feature64Bit, FeatureSlowBTMem,
 // SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
 // rather than a superset.
 // FIXME: Disabling AVX for now since it's not ready.
-def : Proc<"sandybridge",     [FeatureSSE42, Feature64Bit,
+def : Proc<"corei7-avx",      [FeatureSSE42, Feature64Bit,
                                FeatureAES, FeatureCLMUL]>;
 
 def : Proc<"k6",              [FeatureMMX]>;
-def : Proc<"k6-2",            [FeatureMMX,    Feature3DNow]>;
-def : Proc<"k6-3",            [FeatureMMX,    Feature3DNow]>;
-def : Proc<"athlon",          [FeatureMMX,    Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-tbird",    [FeatureMMX,    Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"k6-2",            [Feature3DNow]>;
+def : Proc<"k6-3",            [Feature3DNow]>;
+def : Proc<"athlon",          [Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-tbird",    [Feature3DNowA, FeatureSlowBTMem]>;
 def : Proc<"athlon-4",        [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem]>;
 def : Proc<"athlon-xp",       [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem]>;
 def : Proc<"athlon-mp",       [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem]>;
@@ -156,8 +159,8 @@ def : Proc<"shanghai",        [Feature3DNowA, Feature64Bit, FeatureSSE4A,
                                Feature3DNowA]>;
 
 def : Proc<"winchip-c6",      [FeatureMMX]>;
-def : Proc<"winchip2",        [FeatureMMX, Feature3DNow]>;
-def : Proc<"c3",              [FeatureMMX, Feature3DNow]>;
+def : Proc<"winchip2",        [Feature3DNow]>;
+def : Proc<"c3",              [Feature3DNow]>;
 def : Proc<"c3-2",            [FeatureSSE1]>;
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86AsmBackend.cpp b/lib/Target/X86/X86AsmBackend.cpp
index a7581eb..4d7d96d 100644
--- a/lib/Target/X86/X86AsmBackend.cpp
+++ b/lib/Target/X86/X86AsmBackend.cpp
@@ -21,6 +21,7 @@
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/Object/MachOFormat.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
@@ -28,6 +29,13 @@
 #include "llvm/Target/TargetAsmBackend.h"
 using namespace llvm;
 
+// Option to allow disabling arithmetic relaxation to workaround PR9807, which
+// is useful when running bitwise comparison experiments on Darwin. We should be
+// able to remove this once PR9807 is resolved.
+static cl::opt<bool>
+MCDisableArithRelaxation("mc-x86-disable-arith-relaxation",
+         cl::desc("Disable relaxation of arithmetic instruction for X86"));
+
 static unsigned getFixupKindLog2Size(unsigned Kind) {
   switch (Kind) {
   default: assert(0 && "invalid fixup kind!");
@@ -201,6 +209,9 @@ bool X86AsmBackend::MayNeedRelaxation(const MCInst &Inst) const {
   if (getRelaxedOpcodeBranch(Inst.getOpcode()) != Inst.getOpcode())
     return true;
 
+  if (MCDisableArithRelaxation)
+    return false;
+
   // Check if this instruction is ever relaxable.
   if (getRelaxedOpcodeArith(Inst.getOpcode()) == Inst.getOpcode())
     return false;
@@ -414,34 +425,26 @@ public:
 
 TargetAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
                                                const std::string &TT) {
-  switch (Triple(TT).getOS()) {
-  case Triple::Darwin:
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
     return new DarwinX86_32AsmBackend(T);
-  case Triple::MinGW32:
-  case Triple::Cygwin:
-  case Triple::Win32:
-    if (Triple(TT).getEnvironment() == Triple::MachO)
-      return new DarwinX86_32AsmBackend(T);
-    else
-      return new WindowsX86AsmBackend(T, false);
-  default:
-    return new ELFX86_32AsmBackend(T, Triple(TT).getOS());
-  }
+
+  if (TheTriple.isOSWindows())
+    return new WindowsX86AsmBackend(T, false);
+
+  return new ELFX86_32AsmBackend(T, TheTriple.getOS());
 }
 
 TargetAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
                                                const std::string &TT) {
-  switch (Triple(TT).getOS()) {
-  case Triple::Darwin:
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
     return new DarwinX86_64AsmBackend(T);
-  case Triple::MinGW32:
-  case Triple::Cygwin:
-  case Triple::Win32:
-    if (Triple(TT).getEnvironment() == Triple::MachO)
-      return new DarwinX86_64AsmBackend(T);
-    else
-      return new WindowsX86AsmBackend(T, true);
-  default:
-    return new ELFX86_64AsmBackend(T, Triple(TT).getOS());
-  }
+
+  if (TheTriple.isOSWindows())
+    return new WindowsX86AsmBackend(T, true);
+
+  return new ELFX86_64AsmBackend(T, TheTriple.getOS());
 }
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 8874486..f1b9972 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -23,6 +23,7 @@
 #include "llvm/GlobalVariable.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
+#include "llvm/Operator.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FastISel.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
@@ -77,10 +78,8 @@ private:
 
   bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR);
 
-  bool X86FastEmitStore(EVT VT, const Value *Val,
-                        const X86AddressMode &AM);
-  bool X86FastEmitStore(EVT VT, unsigned Val,
-                        const X86AddressMode &AM);
+  bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM);
+  bool X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM);
 
   bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
                          unsigned &ResultReg);
@@ -109,11 +108,11 @@ private:
   bool X86SelectFPExt(const Instruction *I);
   bool X86SelectFPTrunc(const Instruction *I);
 
-  bool X86SelectExtractValue(const Instruction *I);
-
   bool X86VisitIntrinsicCall(const IntrinsicInst &I);
   bool X86SelectCall(const Instruction *I);
 
+  bool DoSelectCall(const Instruction *I, const char *MemIntName);
+
   const X86InstrInfo *getInstrInfo() const {
     return getTargetMachine()->getInstrInfo();
   }
@@ -125,6 +124,8 @@ private:
 
   unsigned TargetMaterializeAlloca(const AllocaInst *C);
 
+  unsigned TargetMaterializeFloatZero(const ConstantFP *CF);
+
   /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
   /// computed in an SSE register, not on the X87 floating point stack.
   bool isScalarFPTypeInSSEReg(EVT VT) const {
@@ -133,6 +134,11 @@ private:
   }
 
   bool isTypeLegal(const Type *Ty, MVT &VT, bool AllowI1 = false);
+
+  bool IsMemcpySmall(uint64_t Len);
+
+  bool TryEmitSmallMemcpy(X86AddressMode DestAM,
+                          X86AddressMode SrcAM, uint64_t Len);
 };
 
 } // end anonymous namespace.
@@ -224,8 +230,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
 /// and a displacement offset, or a GlobalAddress,
 /// i.e. V. Return true if it is possible.
 bool
-X86FastISel::X86FastEmitStore(EVT VT, unsigned Val,
-                              const X86AddressMode &AM) {
+X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) {
   // Get opcode and regclass of the output for the given store instruction.
   unsigned Opc = 0;
   switch (VT.getSimpleVT().SimpleTy) {
@@ -395,43 +400,45 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
       const Value *Op = *i;
       if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
         const StructLayout *SL = TD.getStructLayout(STy);
-        unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
-        Disp += SL->getElementOffset(Idx);
-      } else {
-        uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType());
-        for (;;) {
-          if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
-            // Constant-offset addressing.
-            Disp += CI->getSExtValue() * S;
-            break;
-          }
-          if (isa<AddOperator>(Op) &&
-              (!isa<Instruction>(Op) ||
-               FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()]
-                 == FuncInfo.MBB) &&
-              isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) {
-            // An add (in the same block) with a constant operand. Fold the
-            // constant.
-            ConstantInt *CI =
-              cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
-            Disp += CI->getSExtValue() * S;
-            // Iterate on the other operand.
-            Op = cast<AddOperator>(Op)->getOperand(0);
-            continue;
-          }
-          if (IndexReg == 0 &&
-              (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
-              (S == 1 || S == 2 || S == 4 || S == 8)) {
-            // Scaled-index addressing.
-            Scale = S;
-            IndexReg = getRegForGEPIndex(Op).first;
-            if (IndexReg == 0)
-              return false;
-            break;
-          }
-          // Unsupported.
-          goto unsupported_gep;
+        Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
+        continue;
+      }
+
+      // A array/variable index is always of the form i*S where S is the
+      // constant scale size.  See if we can push the scale into immediates.
+      uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType());
+      for (;;) {
+        if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+          // Constant-offset addressing.
+          Disp += CI->getSExtValue() * S;
+          break;
+        }
+        if (isa<AddOperator>(Op) &&
+            (!isa<Instruction>(Op) ||
+             FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()]
+               == FuncInfo.MBB) &&
+            isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) {
+          // An add (in the same block) with a constant operand. Fold the
+          // constant.
+          ConstantInt *CI =
+            cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+          Disp += CI->getSExtValue() * S;
+          // Iterate on the other operand.
+          Op = cast<AddOperator>(Op)->getOperand(0);
+          continue;
+        }
+        if (IndexReg == 0 &&
+            (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
+            (S == 1 || S == 2 || S == 4 || S == 8)) {
+          // Scaled-index addressing.
+          Scale = S;
+          IndexReg = getRegForGEPIndex(Op).first;
+          if (IndexReg == 0)
+            return false;
+          break;
         }
+        // Unsupported.
+        goto unsupported_gep;
       }
     }
     // Check for displacement overflow.
@@ -445,7 +452,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
     if (X86SelectAddress(U->getOperand(0), AM))
       return true;
 
-    // If we couldn't merge the sub value into this addr mode, revert back to
+    // If we couldn't merge the gep value into this addr mode, revert back to
     // our address and just match the value instead of completely failing.
     AM = SavedAM;
     break;
@@ -457,91 +464,91 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
 
   // Handle constant address.
   if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
-    // Can't handle alternate code models yet.
+    // Can't handle alternate code models or TLS yet.
     if (TM.getCodeModel() != CodeModel::Small)
       return false;
 
-    // RIP-relative addresses can't have additional register operands.
-    if (Subtarget->isPICStyleRIPRel() &&
-        (AM.Base.Reg != 0 || AM.IndexReg != 0))
-      return false;
-
-    // Can't handle TLS yet.
     if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
       if (GVar->isThreadLocal())
         return false;
 
-    // Okay, we've committed to selecting this global. Set up the basic address.
-    AM.GV = GV;
-
-    // Allow the subtarget to classify the global.
-    unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
-
-    // If this reference is relative to the pic base, set it now.
-    if (isGlobalRelativeToPICBase(GVFlags)) {
-      // FIXME: How do we know Base.Reg is free??
-      AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
-    }
+    // RIP-relative addresses can't have additional register operands, so if
+    // we've already folded stuff into the addressing mode, just force the
+    // global value into its own register, which we can use as the basereg.
+    if (!Subtarget->isPICStyleRIPRel() ||
+        (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
+      // Okay, we've committed to selecting this global. Set up the address.
+      AM.GV = GV;
+
+      // Allow the subtarget to classify the global.
+      unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
+
+      // If this reference is relative to the pic base, set it now.
+      if (isGlobalRelativeToPICBase(GVFlags)) {
+        // FIXME: How do we know Base.Reg is free??
+        AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+      }
 
-    // Unless the ABI requires an extra load, return a direct reference to
-    // the global.
-    if (!isGlobalStubReference(GVFlags)) {
-      if (Subtarget->isPICStyleRIPRel()) {
-        // Use rip-relative addressing if we can.  Above we verified that the
-        // base and index registers are unused.
-        assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
-        AM.Base.Reg = X86::RIP;
+      // Unless the ABI requires an extra load, return a direct reference to
+      // the global.
+      if (!isGlobalStubReference(GVFlags)) {
+        if (Subtarget->isPICStyleRIPRel()) {
+          // Use rip-relative addressing if we can.  Above we verified that the
+          // base and index registers are unused.
+          assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+          AM.Base.Reg = X86::RIP;
+        }
+        AM.GVOpFlags = GVFlags;
+        return true;
       }
-      AM.GVOpFlags = GVFlags;
-      return true;
-    }
 
-    // Ok, we need to do a load from a stub.  If we've already loaded from this
-    // stub, reuse the loaded pointer, otherwise emit the load now.
-    DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V);
-    unsigned LoadReg;
-    if (I != LocalValueMap.end() && I->second != 0) {
-      LoadReg = I->second;
-    } else {
-      // Issue load from stub.
-      unsigned Opc = 0;
-      const TargetRegisterClass *RC = NULL;
-      X86AddressMode StubAM;
-      StubAM.Base.Reg = AM.Base.Reg;
-      StubAM.GV = GV;
-      StubAM.GVOpFlags = GVFlags;
-
-      // Prepare for inserting code in the local-value area.
-      SavePoint SaveInsertPt = enterLocalValueArea();
-
-      if (TLI.getPointerTy() == MVT::i64) {
-        Opc = X86::MOV64rm;
-        RC  = X86::GR64RegisterClass;
-
-        if (Subtarget->isPICStyleRIPRel())
-          StubAM.Base.Reg = X86::RIP;
+      // Ok, we need to do a load from a stub.  If we've already loaded from
+      // this stub, reuse the loaded pointer, otherwise emit the load now.
+      DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V);
+      unsigned LoadReg;
+      if (I != LocalValueMap.end() && I->second != 0) {
+        LoadReg = I->second;
       } else {
-        Opc = X86::MOV32rm;
-        RC  = X86::GR32RegisterClass;
-      }
+        // Issue load from stub.
+        unsigned Opc = 0;
+        const TargetRegisterClass *RC = NULL;
+        X86AddressMode StubAM;
+        StubAM.Base.Reg = AM.Base.Reg;
+        StubAM.GV = GV;
+        StubAM.GVOpFlags = GVFlags;
+
+        // Prepare for inserting code in the local-value area.
+        SavePoint SaveInsertPt = enterLocalValueArea();
+
+        if (TLI.getPointerTy() == MVT::i64) {
+          Opc = X86::MOV64rm;
+          RC  = X86::GR64RegisterClass;
+
+          if (Subtarget->isPICStyleRIPRel())
+            StubAM.Base.Reg = X86::RIP;
+        } else {
+          Opc = X86::MOV32rm;
+          RC  = X86::GR32RegisterClass;
+        }
 
-      LoadReg = createResultReg(RC);
-      MachineInstrBuilder LoadMI =
-        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg);
-      addFullAddress(LoadMI, StubAM);
+        LoadReg = createResultReg(RC);
+        MachineInstrBuilder LoadMI =
+          BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg);
+        addFullAddress(LoadMI, StubAM);
 
-      // Ok, back to normal mode.
-      leaveLocalValueArea(SaveInsertPt);
+        // Ok, back to normal mode.
+        leaveLocalValueArea(SaveInsertPt);
 
-      // Prevent loading GV stub multiple times in same MBB.
-      LocalValueMap[V] = LoadReg;
-    }
+        // Prevent loading GV stub multiple times in same MBB.
+        LocalValueMap[V] = LoadReg;
+      }
 
-    // Now construct the final address. Note that the Disp, Scale,
-    // and Index values may already be set here.
-    AM.Base.Reg = LoadReg;
-    AM.GV = 0;
-    return true;
+      // Now construct the final address. Note that the Disp, Scale,
+      // and Index values may already be set here.
+      AM.Base.Reg = LoadReg;
+      AM.GV = 0;
+      return true;
+    }
   }
 
   // If all else fails, try to materialize the value in a register.
@@ -699,7 +706,8 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
 
     // Analyze operands of the call, assigning locations to each operand.
     SmallVector<CCValAssign, 16> ValLocs;
-    CCState CCInfo(CC, F.isVarArg(), TM, ValLocs, I->getContext());
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,
+		   I->getContext());
     CCInfo.AnalyzeReturn(Outs, RetCC_X86);
 
     const Value *RV = Ret->getOperand(0);
@@ -719,18 +727,38 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
     // Only handle register returns for now.
     if (!VA.isRegLoc())
       return false;
-    // TODO: For now, don't try to handle cases where getLocInfo()
-    // says Full but the types don't match.
-    if (TLI.getValueType(RV->getType()) != VA.getValVT())
-      return false;
 
     // The calling-convention tables for x87 returns don't tell
     // the whole story.
     if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
       return false;
 
-    // Make the copy.
     unsigned SrcReg = Reg + VA.getValNo();
+    EVT SrcVT = TLI.getValueType(RV->getType());
+    EVT DstVT = VA.getValVT();
+    // Special handling for extended integers.
+    if (SrcVT != DstVT) {
+      if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
+        return false;
+
+      if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
+        return false;
+
+      assert(DstVT == MVT::i32 && "X86 should always ext to i32");
+
+      if (SrcVT == MVT::i1) {
+        if (Outs[0].Flags.isSExt())
+          return false;
+        SrcReg = FastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
+        SrcVT = MVT::i8;
+      }
+      unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
+                                             ISD::SIGN_EXTEND;
+      SrcReg = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op,
+                          SrcReg, /*TODO: Kill=*/false);
+    }
+
+    // Make the copy.
     unsigned DstReg = VA.getLocReg();
     const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg);
     // Avoid a cross-class copy. This is very unlikely.
@@ -862,12 +890,9 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
 
     unsigned NEReg = createResultReg(&X86::GR8RegClass);
     unsigned PReg = createResultReg(&X86::GR8RegClass);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-            TII.get(X86::SETNEr), NEReg);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-            TII.get(X86::SETPr), PReg);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-            TII.get(X86::OR8rr), ResultReg)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETNEr), NEReg);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETPr), PReg);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::OR8rr),ResultReg)
       .addReg(PReg).addReg(NEReg);
     UpdateValueMap(I, ResultReg);
     return true;
@@ -914,18 +939,31 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
 
 bool X86FastISel::X86SelectZExt(const Instruction *I) {
   // Handle zero-extension from i1 to i8, which is common.
-  if (I->getType()->isIntegerTy(8) &&
-      I->getOperand(0)->getType()->isIntegerTy(1)) {
-    unsigned ResultReg = getRegForValue(I->getOperand(0));
-    if (ResultReg == 0) return false;
-    // Set the high bits to zero.
-    ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
-    if (ResultReg == 0) return false;
-    UpdateValueMap(I, ResultReg);
-    return true;
+  if (!I->getOperand(0)->getType()->isIntegerTy(1))
+    return false;
+
+  EVT DstVT = TLI.getValueType(I->getType());
+  if (!TLI.isTypeLegal(DstVT))
+    return false;
+
+  unsigned ResultReg = getRegForValue(I->getOperand(0));
+  if (ResultReg == 0)
+    return false;
+
+  // Set the high bits to zero.
+  ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
+  if (ResultReg == 0)
+    return false;
+
+  if (DstVT != MVT::i8) {
+    ResultReg = FastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
+                           ResultReg, /*Kill=*/true);
+    if (ResultReg == 0)
+      return false;
   }
 
-  return false;
+  UpdateValueMap(I, ResultReg);
+  return true;
 }
 
 
@@ -1008,71 +1046,49 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
       FuncInfo.MBB->addSuccessor(TrueMBB);
       return true;
     }
-  } else if (ExtractValueInst *EI =
-             dyn_cast<ExtractValueInst>(BI->getCondition())) {
-    // Check to see if the branch instruction is from an "arithmetic with
-    // overflow" intrinsic. The main way these intrinsics are used is:
-    //
-    //   %t = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
-    //   %sum = extractvalue { i32, i1 } %t, 0
-    //   %obit = extractvalue { i32, i1 } %t, 1
-    //   br i1 %obit, label %overflow, label %normal
-    //
-    // The %sum and %obit are converted in an ADD and a SETO/SETB before
-    // reaching the branch. Therefore, we search backwards through the MBB
-    // looking for the SETO/SETB instruction. If an instruction modifies the
-    // EFLAGS register before we reach the SETO/SETB instruction, then we can't
-    // convert the branch into a JO/JB instruction.
-    if (const IntrinsicInst *CI =
-          dyn_cast<IntrinsicInst>(EI->getAggregateOperand())){
-      if (CI->getIntrinsicID() == Intrinsic::sadd_with_overflow ||
-          CI->getIntrinsicID() == Intrinsic::uadd_with_overflow) {
-        const MachineInstr *SetMI = 0;
-        unsigned Reg = getRegForValue(EI);
-
-        for (MachineBasicBlock::const_reverse_iterator
-               RI = FuncInfo.MBB->rbegin(), RE = FuncInfo.MBB->rend();
-             RI != RE; ++RI) {
-          const MachineInstr &MI = *RI;
-
-          if (MI.definesRegister(Reg)) {
-            if (MI.isCopy()) {
-              Reg = MI.getOperand(1).getReg();
-              continue;
-            }
-
-            SetMI = &MI;
-            break;
-          }
-
-          const TargetInstrDesc &TID = MI.getDesc();
-          if (TID.hasImplicitDefOfPhysReg(X86::EFLAGS) ||
-              MI.hasUnmodeledSideEffects())
-            break;
-        }
+  } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
+    // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
+    // typically happen for _Bool and C++ bools.
+    MVT SourceVT;
+    if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
+        isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
+      unsigned TestOpc = 0;
+      switch (SourceVT.SimpleTy) {
+      default: break;
+      case MVT::i8:  TestOpc = X86::TEST8ri; break;
+      case MVT::i16: TestOpc = X86::TEST16ri; break;
+      case MVT::i32: TestOpc = X86::TEST32ri; break;
+      case MVT::i64: TestOpc = X86::TEST64ri32; break;
+      }
+      if (TestOpc) {
+        unsigned OpReg = getRegForValue(TI->getOperand(0));
+        if (OpReg == 0) return false;
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TestOpc))
+          .addReg(OpReg).addImm(1);
 
-        if (SetMI) {
-          unsigned OpCode = SetMI->getOpcode();
-
-          if (OpCode == X86::SETOr || OpCode == X86::SETBr) {
-            BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                    TII.get(OpCode == X86::SETOr ?  X86::JO_4 : X86::JB_4))
-              .addMBB(TrueMBB);
-            FastEmitBranch(FalseMBB, DL);
-            FuncInfo.MBB->addSuccessor(TrueMBB);
-            return true;
-          }
+        unsigned JmpOpc = X86::JNE_4;
+        if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
+          std::swap(TrueMBB, FalseMBB);
+          JmpOpc = X86::JE_4;
         }
+
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(JmpOpc))
+          .addMBB(TrueMBB);
+        FastEmitBranch(FalseMBB, DL);
+        FuncInfo.MBB->addSuccessor(TrueMBB);
+        return true;
       }
     }
   }
 
   // Otherwise do a clumsy setcc and re-test it.
+  // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
+  // in an explicit cast, so make sure to handle that correctly.
   unsigned OpReg = getRegForValue(BI->getCondition());
   if (OpReg == 0) return false;
 
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8rr))
-    .addReg(OpReg).addReg(OpReg);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8ri))
+    .addReg(OpReg).addImm(1);
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::JNE_4))
     .addMBB(TrueMBB);
   FastEmitBranch(FalseMBB, DL);
@@ -1081,42 +1097,42 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
 }
 
 bool X86FastISel::X86SelectShift(const Instruction *I) {
-  unsigned CReg = 0, OpReg = 0, OpImm = 0;
+  unsigned CReg = 0, OpReg = 0;
   const TargetRegisterClass *RC = NULL;
   if (I->getType()->isIntegerTy(8)) {
     CReg = X86::CL;
     RC = &X86::GR8RegClass;
     switch (I->getOpcode()) {
-    case Instruction::LShr: OpReg = X86::SHR8rCL; OpImm = X86::SHR8ri; break;
-    case Instruction::AShr: OpReg = X86::SAR8rCL; OpImm = X86::SAR8ri; break;
-    case Instruction::Shl:  OpReg = X86::SHL8rCL; OpImm = X86::SHL8ri; break;
+    case Instruction::LShr: OpReg = X86::SHR8rCL; break;
+    case Instruction::AShr: OpReg = X86::SAR8rCL; break;
+    case Instruction::Shl:  OpReg = X86::SHL8rCL; break;
     default: return false;
     }
   } else if (I->getType()->isIntegerTy(16)) {
     CReg = X86::CX;
     RC = &X86::GR16RegClass;
     switch (I->getOpcode()) {
-    case Instruction::LShr: OpReg = X86::SHR16rCL; OpImm = X86::SHR16ri; break;
-    case Instruction::AShr: OpReg = X86::SAR16rCL; OpImm = X86::SAR16ri; break;
-    case Instruction::Shl:  OpReg = X86::SHL16rCL; OpImm = X86::SHL16ri; break;
+    case Instruction::LShr: OpReg = X86::SHR16rCL; break;
+    case Instruction::AShr: OpReg = X86::SAR16rCL; break;
+    case Instruction::Shl:  OpReg = X86::SHL16rCL; break;
     default: return false;
     }
   } else if (I->getType()->isIntegerTy(32)) {
     CReg = X86::ECX;
     RC = &X86::GR32RegClass;
     switch (I->getOpcode()) {
-    case Instruction::LShr: OpReg = X86::SHR32rCL; OpImm = X86::SHR32ri; break;
-    case Instruction::AShr: OpReg = X86::SAR32rCL; OpImm = X86::SAR32ri; break;
-    case Instruction::Shl:  OpReg = X86::SHL32rCL; OpImm = X86::SHL32ri; break;
+    case Instruction::LShr: OpReg = X86::SHR32rCL; break;
+    case Instruction::AShr: OpReg = X86::SAR32rCL; break;
+    case Instruction::Shl:  OpReg = X86::SHL32rCL; break;
     default: return false;
     }
   } else if (I->getType()->isIntegerTy(64)) {
     CReg = X86::RCX;
     RC = &X86::GR64RegClass;
     switch (I->getOpcode()) {
-    case Instruction::LShr: OpReg = X86::SHR64rCL; OpImm = X86::SHR64ri; break;
-    case Instruction::AShr: OpReg = X86::SAR64rCL; OpImm = X86::SAR64ri; break;
-    case Instruction::Shl:  OpReg = X86::SHL64rCL; OpImm = X86::SHL64ri; break;
+    case Instruction::LShr: OpReg = X86::SHR64rCL; break;
+    case Instruction::AShr: OpReg = X86::SAR64rCL; break;
+    case Instruction::Shl:  OpReg = X86::SHL64rCL; break;
     default: return false;
     }
   } else {
@@ -1130,15 +1146,6 @@ bool X86FastISel::X86SelectShift(const Instruction *I) {
   unsigned Op0Reg = getRegForValue(I->getOperand(0));
   if (Op0Reg == 0) return false;
 
-  // Fold immediate in shl(x,3).
-  if (const ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
-    unsigned ResultReg = createResultReg(RC);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpImm),
-            ResultReg).addReg(Op0Reg).addImm(CI->getZExtValue() & 0xff);
-    UpdateValueMap(I, ResultReg);
-    return true;
-  }
-
   unsigned Op1Reg = getRegForValue(I->getOperand(1));
   if (Op1Reg == 0) return false;
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
@@ -1238,18 +1245,13 @@ bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
 }
 
 bool X86FastISel::X86SelectTrunc(const Instruction *I) {
-  if (Subtarget->is64Bit())
-    // All other cases should be handled by the tblgen generated code.
-    return false;
   EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
   EVT DstVT = TLI.getValueType(I->getType());
 
-  // This code only handles truncation to byte right now.
+  // This code only handles truncation to byte.
   if (DstVT != MVT::i8 && DstVT != MVT::i1)
-    // All other cases should be handled by the tblgen generated code.
     return false;
-  if (SrcVT != MVT::i16 && SrcVT != MVT::i32)
-    // All other cases should be handled by the tblgen generated code.
+  if (!TLI.isTypeLegal(SrcVT))
     return false;
 
   unsigned InputReg = getRegForValue(I->getOperand(0));
@@ -1257,16 +1259,26 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) {
     // Unhandled operand.  Halt "fast" selection and bail.
     return false;
 
-  // First issue a copy to GR16_ABCD or GR32_ABCD.
-  const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16)
-    ? X86::GR16_ABCDRegisterClass : X86::GR32_ABCDRegisterClass;
-  unsigned CopyReg = createResultReg(CopyRC);
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
-          CopyReg).addReg(InputReg);
+  if (SrcVT == MVT::i8) {
+    // Truncate from i8 to i1; no code needed.
+    UpdateValueMap(I, InputReg);
+    return true;
+  }
 
-  // Then issue an extract_subreg.
+  if (!Subtarget->is64Bit()) {
+    // If we're on x86-32; we can't extract an i8 from a general register.
+    // First issue a copy to GR16_ABCD or GR32_ABCD.
+    const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16)
+      ? X86::GR16_ABCDRegisterClass : X86::GR32_ABCDRegisterClass;
+    unsigned CopyReg = createResultReg(CopyRC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            CopyReg).addReg(InputReg);
+    InputReg = CopyReg;
+  }
+
+  // Issue an extract_subreg.
   unsigned ResultReg = FastEmitInst_extractsubreg(MVT::i8,
-                                                  CopyReg, /*Kill=*/true,
+                                                  InputReg, /*Kill=*/true,
                                                   X86::sub_8bit);
   if (!ResultReg)
     return false;
@@ -1275,35 +1287,92 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) {
   return true;
 }
 
-bool X86FastISel::X86SelectExtractValue(const Instruction *I) {
-  const ExtractValueInst *EI = cast<ExtractValueInst>(I);
-  const Value *Agg = EI->getAggregateOperand();
+bool X86FastISel::IsMemcpySmall(uint64_t Len) {
+  return Len <= (Subtarget->is64Bit() ? 32 : 16);
+}
 
-  if (const IntrinsicInst *CI = dyn_cast<IntrinsicInst>(Agg)) {
-    switch (CI->getIntrinsicID()) {
-    default: break;
-    case Intrinsic::sadd_with_overflow:
-    case Intrinsic::uadd_with_overflow: {
-      // Cheat a little. We know that the registers for "add" and "seto" are
-      // allocated sequentially. However, we only keep track of the register
-      // for "add" in the value map. Use extractvalue's index to get the
-      // correct register for "seto".
-      unsigned OpReg = getRegForValue(Agg);
-      if (OpReg == 0)
-        return false;
-      UpdateValueMap(I, OpReg + *EI->idx_begin());
-      return true;
-    }
+bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
+                                     X86AddressMode SrcAM, uint64_t Len) {
+
+  // Make sure we don't bloat code by inlining very large memcpy's.
+  if (!IsMemcpySmall(Len))
+    return false;
+
+  bool i64Legal = Subtarget->is64Bit();
+
+  // We don't care about alignment here since we just emit integer accesses.
+  while (Len) {
+    MVT VT;
+    if (Len >= 8 && i64Legal)
+      VT = MVT::i64;
+    else if (Len >= 4)
+      VT = MVT::i32;
+    else if (Len >= 2)
+      VT = MVT::i16;
+    else {
+      assert(Len == 1);
+      VT = MVT::i8;
     }
+
+    unsigned Reg;
+    bool RV = X86FastEmitLoad(VT, SrcAM, Reg);
+    RV &= X86FastEmitStore(VT, Reg, DestAM);
+    assert(RV && "Failed to emit load or store??");
+
+    unsigned Size = VT.getSizeInBits()/8;
+    Len -= Size;
+    DestAM.Disp += Size;
+    SrcAM.Disp += Size;
   }
 
-  return false;
+  return true;
 }
 
 bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
   // FIXME: Handle more intrinsics.
   switch (I.getIntrinsicID()) {
   default: return false;
+  case Intrinsic::memcpy: {
+    const MemCpyInst &MCI = cast<MemCpyInst>(I);
+    // Don't handle volatile or variable length memcpys.
+    if (MCI.isVolatile())
+      return false;
+
+    if (isa<ConstantInt>(MCI.getLength())) {
+      // Small memcpy's are common enough that we want to do them
+      // without a call if possible.
+      uint64_t Len = cast<ConstantInt>(MCI.getLength())->getZExtValue();
+      if (IsMemcpySmall(Len)) {
+        X86AddressMode DestAM, SrcAM;
+        if (!X86SelectAddress(MCI.getRawDest(), DestAM) ||
+            !X86SelectAddress(MCI.getRawSource(), SrcAM))
+          return false;
+        TryEmitSmallMemcpy(DestAM, SrcAM, Len);
+        return true;
+      }
+    }
+
+    unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
+    if (!MCI.getLength()->getType()->isIntegerTy(SizeWidth))
+      return false;
+
+    if (MCI.getSourceAddressSpace() > 255 || MCI.getDestAddressSpace() > 255)
+      return false;
+
+    return DoSelectCall(&I, "memcpy");
+  }
+  case Intrinsic::memset: {
+    const MemSetInst &MSI = cast<MemSetInst>(I);
+
+    unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
+    if (!MSI.getLength()->getType()->isIntegerTy(SizeWidth))
+      return false;
+
+    if (MSI.getDestAddressSpace() > 255)
+      return false;
+
+    return DoSelectCall(&I, "memset");
+  }
   case Intrinsic::stackprotector: {
     // Emit code inline code to store the stack guard onto the stack.
     EVT PtrTy = TLI.getPointerTy();
@@ -1314,33 +1383,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
     // Grab the frame index.
     X86AddressMode AM;
     if (!X86SelectAddress(Slot, AM)) return false;
-
     if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;
-
-    return true;
-  }
-  case Intrinsic::objectsize: {
-    ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(1));
-    const Type *Ty = I.getCalledFunction()->getReturnType();
-
-    assert(CI && "Non-constant type in Intrinsic::objectsize?");
-
-    MVT VT;
-    if (!isTypeLegal(Ty, VT))
-      return false;
-
-    unsigned OpC = 0;
-    if (VT == MVT::i32)
-      OpC = X86::MOV32ri;
-    else if (VT == MVT::i64)
-      OpC = X86::MOV64ri;
-    else
-      return false;
-
-    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg).
-                                  addImm(CI->isZero() ? -1ULL : 0);
-    UpdateValueMap(&I, ResultReg);
     return true;
   }
   case Intrinsic::dbg_declare: {
@@ -1362,11 +1405,10 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
   }
   case Intrinsic::sadd_with_overflow:
   case Intrinsic::uadd_with_overflow: {
+    // FIXME: Should fold immediates.
+
     // Replace "add with overflow" intrinsics with an "add" instruction followed
-    // by a seto/setc instruction. Later on, when the "extractvalue"
-    // instructions are encountered, we use the fact that two registers were
-    // created sequentially to get the correct registers for the "sum" and the
-    // "overflow bit".
+    // by a seto/setc instruction.
     const Function *Callee = I.getCalledFunction();
     const Type *RetTy =
       cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0));
@@ -1392,27 +1434,18 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
     else
       return false;
 
-    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+    // The call to CreateRegs builds two sequential registers, to store the
+    // both the the returned values.
+    unsigned ResultReg = FuncInfo.CreateRegs(I.getType());
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg)
       .addReg(Reg1).addReg(Reg2);
-    unsigned DestReg1 = UpdateValueMap(&I, ResultReg);
-
-    // If the add with overflow is an intra-block value then we just want to
-    // create temporaries for it like normal.  If it is a cross-block value then
-    // UpdateValueMap will return the cross-block register used.  Since we
-    // *really* want the value to be live in the register pair known by
-    // UpdateValueMap, we have to use DestReg1+1 as the destination register in
-    // the cross block case.  In the non-cross-block case, we should just make
-    // another register for the value.
-    if (DestReg1 != ResultReg)
-      ResultReg = DestReg1+1;
-    else
-      ResultReg = createResultReg(TLI.getRegClassFor(MVT::i8));
 
     unsigned Opc = X86::SETBr;
     if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow)
       Opc = X86::SETOr;
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg+1);
+
+    UpdateValueMap(&I, ResultReg, 2);
     return true;
   }
   }
@@ -1430,11 +1463,18 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
   if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI))
     return X86VisitIntrinsicCall(*II);
 
+  return DoSelectCall(I, 0);
+}
+
+// Select either a call, or an llvm.memcpy/memmove/memset intrinsic
+bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
+  const CallInst *CI = cast<CallInst>(I);
+  const Value *Callee = CI->getCalledValue();
+
   // Handle only C and fastcc calling conventions for now.
   ImmutableCallSite CS(CI);
   CallingConv::ID CC = CS.getCallingConv();
-  if (CC != CallingConv::C &&
-      CC != CallingConv::Fast &&
+  if (CC != CallingConv::C && CC != CallingConv::Fast &&
       CC != CallingConv::X86_FastCall)
     return false;
 
@@ -1443,22 +1483,28 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
   if (CC == CallingConv::Fast && GuaranteedTailCallOpt)
     return false;
 
-  // Let SDISel handle vararg functions.
   const PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
   const FunctionType *FTy = cast<FunctionType>(PT->getElementType());
-  if (FTy->isVarArg())
+  bool isVarArg = FTy->isVarArg();
+
+  // Don't know how to handle Win64 varargs yet.  Nothing special needed for
+  // x86-32.  Special handling for x86-64 is implemented.
+  if (isVarArg && Subtarget->isTargetWin64())
     return false;
 
   // Fast-isel doesn't know about callee-pop yet.
-  if (Subtarget->IsCalleePop(FTy->isVarArg(), CC))
+  if (Subtarget->IsCalleePop(isVarArg, CC))
     return false;
 
-  // Handle *simple* calls for now.
-  const Type *RetTy = CS.getType();
-  MVT RetVT;
-  if (RetTy->isVoidTy())
-    RetVT = MVT::isVoid;
-  else if (!isTypeLegal(RetTy, RetVT, true))
+  // Check whether the function can return without sret-demotion.
+  SmallVector<ISD::OutputArg, 4> Outs;
+  SmallVector<uint64_t, 4> Offsets;
+  GetReturnInfo(I->getType(), CS.getAttributes().getRetAttributes(),
+                Outs, TLI, &Offsets);
+  bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(),
+					   *FuncInfo.MF, FTy->isVarArg(),
+					   Outs, FTy->getContext());
+  if (!CanLowerReturn)
     return false;
 
   // Materialize callee address in a register. FIXME: GV address can be
@@ -1475,13 +1521,6 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
   } else
     return false;
 
-  // Allow calls which produce i1 results.
-  bool AndToI1 = false;
-  if (RetVT == MVT::i1) {
-    RetVT = MVT::i8;
-    AndToI1 = true;
-  }
-
   // Deal with call operands first.
   SmallVector<const Value *, 8> ArgVals;
   SmallVector<unsigned, 8> Args;
@@ -1493,9 +1532,11 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
   ArgFlags.reserve(CS.arg_size());
   for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
        i != e; ++i) {
-    unsigned Arg = getRegForValue(*i);
-    if (Arg == 0)
-      return false;
+    // If we're lowering a mem intrinsic instead of a regular call, skip the
+    // last two arguments, which should not passed to the underlying functions.
+    if (MemIntName && e-i <= 2)
+      break;
+    Value *ArgVal = *i;
     ISD::ArgFlagsTy Flags;
     unsigned AttrInd = i - CS.arg_begin() + 1;
     if (CS.paramHasAttr(AttrInd, Attribute::SExt))
@@ -1503,34 +1544,83 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
     if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
       Flags.setZExt();
 
-    // FIXME: Only handle *easy* calls for now.
-    if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
-        CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
-        CS.paramHasAttr(AttrInd, Attribute::Nest) ||
-        CS.paramHasAttr(AttrInd, Attribute::ByVal))
-      return false;
+    if (CS.paramHasAttr(AttrInd, Attribute::ByVal)) {
+      const PointerType *Ty = cast<PointerType>(ArgVal->getType());
+      const Type *ElementTy = Ty->getElementType();
+      unsigned FrameSize = TD.getTypeAllocSize(ElementTy);
+      unsigned FrameAlign = CS.getParamAlignment(AttrInd);
+      if (!FrameAlign)
+        FrameAlign = TLI.getByValTypeAlignment(ElementTy);
+      Flags.setByVal();
+      Flags.setByValSize(FrameSize);
+      Flags.setByValAlign(FrameAlign);
+      if (!IsMemcpySmall(FrameSize))
+        return false;
+    }
+
+    if (CS.paramHasAttr(AttrInd, Attribute::InReg))
+      Flags.setInReg();
+    if (CS.paramHasAttr(AttrInd, Attribute::Nest))
+      Flags.setNest();
+
+    // If this is an i1/i8/i16 argument, promote to i32 to avoid an extra
+    // instruction.  This is safe because it is common to all fastisel supported
+    // calling conventions on x86.
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(ArgVal)) {
+      if (CI->getBitWidth() == 1 || CI->getBitWidth() == 8 ||
+          CI->getBitWidth() == 16) {
+        if (Flags.isSExt())
+          ArgVal = ConstantExpr::getSExt(CI,Type::getInt32Ty(CI->getContext()));
+        else
+          ArgVal = ConstantExpr::getZExt(CI,Type::getInt32Ty(CI->getContext()));
+      }
+    }
+
+    unsigned ArgReg;
 
-    const Type *ArgTy = (*i)->getType();
+    // Passing bools around ends up doing a trunc to i1 and passing it.
+    // Codegen this as an argument + "and 1".
+    if (ArgVal->getType()->isIntegerTy(1) && isa<TruncInst>(ArgVal) &&
+        cast<TruncInst>(ArgVal)->getParent() == I->getParent() &&
+        ArgVal->hasOneUse()) {
+      ArgVal = cast<TruncInst>(ArgVal)->getOperand(0);
+      ArgReg = getRegForValue(ArgVal);
+      if (ArgReg == 0) return false;
+
+      MVT ArgVT;
+      if (!isTypeLegal(ArgVal->getType(), ArgVT)) return false;
+
+      ArgReg = FastEmit_ri(ArgVT, ArgVT, ISD::AND, ArgReg,
+                           ArgVal->hasOneUse(), 1);
+    } else {
+      ArgReg = getRegForValue(ArgVal);
+    }
+
+    if (ArgReg == 0) return false;
+
+    const Type *ArgTy = ArgVal->getType();
     MVT ArgVT;
     if (!isTypeLegal(ArgTy, ArgVT))
       return false;
+    if (ArgVT == MVT::x86mmx)
+      return false;
     unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
     Flags.setOrigAlign(OriginalAlignment);
 
-    Args.push_back(Arg);
-    ArgVals.push_back(*i);
+    Args.push_back(ArgReg);
+    ArgVals.push_back(ArgVal);
     ArgVTs.push_back(ArgVT);
     ArgFlags.push_back(Flags);
   }
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CC, false, TM, ArgLocs, I->getParent()->getContext());
+  CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs,
+		 I->getParent()->getContext());
 
   // Allocate shadow area for Win64
-  if (Subtarget->isTargetWin64()) {
+  if (Subtarget->isTargetWin64())
     CCInfo.AllocateStack(32, 8);
-  }
 
   CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_X86);
 
@@ -1555,6 +1645,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
     default: llvm_unreachable("Unknown loc info!");
     case CCValAssign::Full: break;
     case CCValAssign::SExt: {
+      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+             "Unexpected extend");
       bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
                                        Arg, ArgVT, Arg);
       assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
@@ -1562,6 +1654,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
       break;
     }
     case CCValAssign::ZExt: {
+      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+             "Unexpected extend");
       bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
                                        Arg, ArgVT, Arg);
       assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
@@ -1569,9 +1663,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
       break;
     }
     case CCValAssign::AExt: {
-      // We don't handle MMX parameters yet.
-      if (VA.getLocVT().isVector() && VA.getLocVT().getSizeInBits() == 128)
-        return false;
+      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+             "Unexpected extend");
       bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(),
                                        Arg, ArgVT, Arg);
       if (!Emitted)
@@ -1605,14 +1698,21 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
       AM.Base.Reg = StackPtr;
       AM.Disp = LocMemOffset;
       const Value *ArgVal = ArgVals[VA.getValNo()];
-
-      // If this is a really simple value, emit this with the Value* version of
-      // X86FastEmitStore.  If it isn't simple, we don't want to do this, as it
-      // can cause us to reevaluate the argument.
-      if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal))
+      ISD::ArgFlagsTy Flags = ArgFlags[VA.getValNo()];
+
+      if (Flags.isByVal()) {
+        X86AddressMode SrcAM;
+        SrcAM.Base.Reg = Arg;
+        bool Res = TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize());
+        assert(Res && "memcpy length already checked!"); (void)Res;
+      } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
+        // If this is a really simple value, emit this with the Value* version
+        //of X86FastEmitStore.  If it isn't simple, we don't want to do this,
+        // as it can cause us to reevaluate the argument.
         X86FastEmitStore(ArgVT, ArgVal, AM);
-      else
+      } else {
         X86FastEmitStore(ArgVT, Arg, AM);
+      }
     }
   }
 
@@ -1624,6 +1724,17 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
             X86::EBX).addReg(Base);
   }
 
+  if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) {
+    // Count the number of XMM registers allocated.
+    static const unsigned XMMArgRegs[] = {
+      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+    };
+    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::MOV8ri),
+            X86::AL).addImm(NumXMMRegs);
+  }
+
   // Issue the call.
   MachineInstrBuilder MIB;
   if (CalleeOp) {
@@ -1662,7 +1773,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
       OpFlags = X86II::MO_PLT;
     } else if (Subtarget->isPICStyleStubAny() &&
                (GV->isDeclaration() || GV->isWeakForLinker()) &&
-               Subtarget->getDarwinVers() < 9) {
+               (!Subtarget->getTargetTriple().isMacOSX() ||
+                Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
       // PC-relative references to external symbols should go through $stub,
       // unless we're building with the leopard linker or later, which
       // automatically synthesizes these stubs.
@@ -1670,80 +1782,100 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
     }
 
 
-    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc))
-      .addGlobalAddress(GV, 0, OpFlags);
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc));
+    if (MemIntName)
+      MIB.addExternalSymbol(MemIntName, OpFlags);
+    else
+      MIB.addGlobalAddress(GV, 0, OpFlags);
   }
 
   // Add an implicit use GOT pointer in EBX.
   if (Subtarget->isPICStyleGOT())
     MIB.addReg(X86::EBX);
 
+  if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64())
+    MIB.addReg(X86::AL);
+
   // Add implicit physical register uses to the call.
   for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
     MIB.addReg(RegArgs[i]);
 
   // Issue CALLSEQ_END
   unsigned AdjStackUp = TM.getRegisterInfo()->getCallFrameDestroyOpcode();
+  unsigned NumBytesCallee = 0;
+  if (!Subtarget->is64Bit() && CS.paramHasAttr(1, Attribute::StructRet))
+    NumBytesCallee = 4;
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackUp))
-    .addImm(NumBytes).addImm(0);
+    .addImm(NumBytes).addImm(NumBytesCallee);
+
+  // Build info for return calling conv lowering code.
+  // FIXME: This is practically a copy-paste from TargetLowering::LowerCallTo.
+  SmallVector<ISD::InputArg, 32> Ins;
+  SmallVector<EVT, 4> RetTys;
+  ComputeValueVTs(TLI, I->getType(), RetTys);
+  for (unsigned i = 0, e = RetTys.size(); i != e; ++i) {
+    EVT VT = RetTys[i];
+    EVT RegisterVT = TLI.getRegisterType(I->getParent()->getContext(), VT);
+    unsigned NumRegs = TLI.getNumRegisters(I->getParent()->getContext(), VT);
+    for (unsigned j = 0; j != NumRegs; ++j) {
+      ISD::InputArg MyFlags;
+      MyFlags.VT = RegisterVT.getSimpleVT();
+      MyFlags.Used = !CS.getInstruction()->use_empty();
+      if (CS.paramHasAttr(0, Attribute::SExt))
+        MyFlags.Flags.setSExt();
+      if (CS.paramHasAttr(0, Attribute::ZExt))
+        MyFlags.Flags.setZExt();
+      if (CS.paramHasAttr(0, Attribute::InReg))
+        MyFlags.Flags.setInReg();
+      Ins.push_back(MyFlags);
+    }
+  }
 
-  // Now handle call return value (if any).
+  // Now handle call return values.
   SmallVector<unsigned, 4> UsedRegs;
-  if (RetVT != MVT::isVoid) {
-    SmallVector<CCValAssign, 16> RVLocs;
-    CCState CCInfo(CC, false, TM, RVLocs, I->getParent()->getContext());
-    CCInfo.AnalyzeCallResult(RetVT, RetCC_X86);
-
-    // Copy all of the result registers out of their specified physreg.
-    assert(RVLocs.size() == 1 && "Can't handle multi-value calls!");
-    EVT CopyVT = RVLocs[0].getValVT();
-    TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT);
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCRetInfo(CC, false, *FuncInfo.MF, TM, RVLocs,
+		    I->getParent()->getContext());
+  unsigned ResultReg = FuncInfo.CreateRegs(I->getType());
+  CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    EVT CopyVT = RVLocs[i].getValVT();
+    unsigned CopyReg = ResultReg + i;
 
     // If this is a call to a function that returns an fp value on the x87 fp
     // stack, but where we prefer to use the value in xmm registers, copy it
     // out as F80 and use a truncate to move it from fp stack reg to xmm reg.
-    if ((RVLocs[0].getLocReg() == X86::ST0 ||
-         RVLocs[0].getLocReg() == X86::ST1) &&
+    if ((RVLocs[i].getLocReg() == X86::ST0 ||
+         RVLocs[i].getLocReg() == X86::ST1) &&
         isScalarFPTypeInSSEReg(RVLocs[0].getValVT())) {
       CopyVT = MVT::f80;
-      DstRC = X86::RFP80RegisterClass;
+      CopyReg = createResultReg(X86::RFP80RegisterClass);
     }
 
-    unsigned ResultReg = createResultReg(DstRC);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
-            ResultReg).addReg(RVLocs[0].getLocReg());
-    UsedRegs.push_back(RVLocs[0].getLocReg());
+            CopyReg).addReg(RVLocs[i].getLocReg());
+    UsedRegs.push_back(RVLocs[i].getLocReg());
 
-    if (CopyVT != RVLocs[0].getValVT()) {
+    if (CopyVT != RVLocs[i].getValVT()) {
       // Round the F80 the right size, which also moves to the appropriate xmm
       // register. This is accomplished by storing the F80 value in memory and
       // then loading it back. Ewww...
-      EVT ResVT = RVLocs[0].getValVT();
+      EVT ResVT = RVLocs[i].getValVT();
       unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
       unsigned MemSize = ResVT.getSizeInBits()/8;
       int FI = MFI.CreateStackObject(MemSize, MemSize, false);
       addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                                 TII.get(Opc)), FI)
-        .addReg(ResultReg);
-      DstRC = ResVT == MVT::f32
-        ? X86::FR32RegisterClass : X86::FR64RegisterClass;
+        .addReg(CopyReg);
       Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
-      ResultReg = createResultReg(DstRC);
       addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                                TII.get(Opc), ResultReg), FI);
-    }
-
-    if (AndToI1) {
-      // Mask out all but lowest bit for some call which produces an i1.
-      unsigned AndResult = createResultReg(X86::GR8RegisterClass);
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-              TII.get(X86::AND8ri), AndResult).addReg(ResultReg).addImm(1);
-      ResultReg = AndResult;
+                                TII.get(Opc), ResultReg + i), FI);
     }
-
-    UpdateValueMap(I, ResultReg);
   }
 
+  if (RVLocs.size())
+    UpdateValueMap(I, ResultReg, RVLocs.size());
+
   // Set all unused physreg defs as dead.
   static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
 
@@ -1782,8 +1914,6 @@ X86FastISel::TargetSelectInstruction(const Instruction *I)  {
     return X86SelectFPExt(I);
   case Instruction::FPTrunc:
     return X86SelectFPTrunc(I);
-  case Instruction::ExtractValue:
-    return X86SelectExtractValue(I);
   case Instruction::IntToPtr: // Deliberate fall-through.
   case Instruction::PtrToInt: {
     EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
@@ -1856,10 +1986,13 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
   if (isa<GlobalValue>(C)) {
     X86AddressMode AM;
     if (X86SelectAddress(C, AM)) {
-      if (TLI.getPointerTy() == MVT::i32)
-        Opc = X86::LEA32r;
-      else
-        Opc = X86::LEA64r;
+      // If the expression is just a basereg, then we're done, otherwise we need
+      // to emit an LEA.
+      if (AM.BaseType == X86AddressMode::RegBase &&
+          AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == 0)
+        return AM.Base.Reg;
+
+      Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r;
       unsigned ResultReg = createResultReg(RC);
       addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                              TII.get(Opc), ResultReg), AM);
@@ -1921,6 +2054,45 @@ unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) {
   return ResultReg;
 }
 
+unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) {
+  MVT VT;
+  if (!isTypeLegal(CF->getType(), VT))
+    return false;
+
+  // Get opcode and regclass for the given zero.
+  unsigned Opc = 0;
+  const TargetRegisterClass *RC = NULL;
+  switch (VT.SimpleTy) {
+    default: return false;
+    case MVT::f32:
+      if (Subtarget->hasSSE1()) {
+        Opc = X86::FsFLD0SS;
+        RC  = X86::FR32RegisterClass;
+      } else {
+        Opc = X86::LD_Fp032;
+        RC  = X86::RFP32RegisterClass;
+      }
+      break;
+    case MVT::f64:
+      if (Subtarget->hasSSE2()) {
+        Opc = X86::FsFLD0SD;
+        RC  = X86::FR64RegisterClass;
+      } else {
+        Opc = X86::LD_Fp064;
+        RC  = X86::RFP64RegisterClass;
+      }
+      break;
+    case MVT::f80:
+      // No f80 support yet.
+      return false;
+  }
+
+  unsigned ResultReg = createResultReg(RC);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg);
+  return ResultReg;
+}
+
+
 /// TryToFoldLoad - The specified machine instr operand is a vreg, and that
 /// vreg is being provided by the specified load instruction.  If possible,
 /// try to fold the load as an operand to the instruction, returning true if
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 3aaa693..325d061 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -1307,7 +1307,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
     // set up by FpSET_ST0, and our StackTop is off by one because of it.
     unsigned Op0 = getFPReg(MI->getOperand(0));
     // Restore the actual StackTop from before Fp_SET_ST0.
-    // Note we can't handle Fp_SET_ST1 without a preceeding Fp_SET_ST0, and we
+    // Note we can't handle Fp_SET_ST1 without a preceding Fp_SET_ST0, and we
     // are not enforcing the constraint.
     ++StackTop;
     unsigned RegOnTop = getStackEntry(0); // This reg must remain in st(0).
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 071fbe0..cd4e954 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -1,4 +1,4 @@
-//=======- X86FrameLowering.cpp - X86 Frame Information ------------*- C++ -*-====//
+//=======- X86FrameLowering.cpp - X86 Frame Information --------*- C++ -*-====//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Support/CommandLine.h"
@@ -159,8 +160,10 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
         Opc = isSub
           ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
           : (Is64Bit ? X86::POP64r  : X86::POP32r);
-        BuildMI(MBB, MBBI, DL, TII.get(Opc))
+        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc))
           .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub));
+        if (isSub)
+          MI->setFlag(MachineInstr::FrameSetup);
         Offset -= ThisVal;
         continue;
       }
@@ -170,6 +173,8 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
       BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
       .addReg(StackPtr)
       .addImm(ThisVal);
+    if (isSub)
+      MI->setFlag(MachineInstr::FrameSetup);
     MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
     Offset -= ThisVal;
   }
@@ -296,7 +301,7 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
   // FIXME: This is dirty hack. The code itself is pretty mess right now.
   // It should be rewritten from scratch and generalized sometimes.
 
-  // Determine maximum offset (minumum due to stack growth).
+  // Determine maximum offset (minimum due to stack growth).
   int64_t MaxOffset = 0;
   for (std::vector<CalleeSavedInfo>::const_iterator
          I = CSI.begin(), E = CSI.end(); I != E; ++I)
@@ -354,7 +359,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineModuleInfo &MMI = MF.getMMI();
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   bool needsFrameMoves = MMI.hasDebugInfo() ||
-                          !Fn->doesNotThrow() || UnwindTablesMandatory;
+    Fn->needsUnwindTableEntry();
   uint64_t MaxAlign  = MFI->getMaxAlignment(); // Desired stack alignment.
   uint64_t StackSize = MFI->getStackSize();    // Number of bytes to allocate.
   bool HasFP = hasFP(MF);
@@ -408,7 +413,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
               TII.get(getSUBriOpcode(Is64Bit, -TailCallReturnAddrDelta)),
               StackPtr)
         .addReg(StackPtr)
-        .addImm(-TailCallReturnAddrDelta);
+        .addImm(-TailCallReturnAddrDelta)
+        .setMIFlag(MachineInstr::FrameSetup);
     MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
   }
 
@@ -446,7 +452,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
 
     // Save EBP/RBP into the appropriate stack slot.
     BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
-      .addReg(FramePtr, RegState::Kill);
+      .addReg(FramePtr, RegState::Kill)
+      .setMIFlag(MachineInstr::FrameSetup);
 
     if (needsFrameMoves) {
       // Mark the place where EBP/RBP was saved.
@@ -473,7 +480,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
     // Update EBP with the new base value...
     BuildMI(MBB, MBBI, DL,
             TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr)
-        .addReg(StackPtr);
+        .addReg(StackPtr)
+        .setMIFlag(MachineInstr::FrameSetup);
 
     if (needsFrameMoves) {
       // Mark effective beginning of when frame pointer becomes valid.
@@ -615,7 +623,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
     emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit,
                  TII, *RegInfo);
 
-  if ((NumBytes || PushedRegs) && needsFrameMoves) {
+  if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) {
     // Mark end of stack pointer adjustment.
     MCSymbol *Label = MMI.getContext().CreateTempSymbol();
     BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label);
@@ -641,7 +649,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
 }
 
 void X86FrameLowering::emitEpilogue(MachineFunction &MF,
-                                MachineBasicBlock &MBB) const {
+                                    MachineBasicBlock &MBB) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
@@ -785,7 +793,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
     assert(Offset >= 0 && "Offset should never be negative");
 
     if (Offset) {
-      // Check for possible merge with preceeding ADD instruction.
+      // Check for possible merge with preceding ADD instruction.
       Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true);
       emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, TII, *RegInfo);
     }
@@ -829,7 +837,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
     int delta = -1*X86FI->getTCReturnAddrDelta();
     MBBI = MBB.getLastNonDebugInstr();
 
-    // Check for possible merge with preceeding ADD instruction.
+    // Check for possible merge with preceding ADD instruction.
     delta += mergeSPUpdates(MBB, MBBI, StackPtr, true);
     emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, TII, *RegInfo);
   }
@@ -918,7 +926,8 @@ bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
       // X86RegisterInfo::emitPrologue will handle spilling of frame register.
       continue;
     CalleeFrameSize += SlotSize;
-    BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill);
+    BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill)
+      .setMIFlag(MachineInstr::FrameSetup);
   }
 
   X86FI->setCalleeSavedFrameSize(CalleeFrameSize);
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 9b0ec6e..1fcc274 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -189,6 +189,7 @@ namespace {
     SDNode *Select(SDNode *N);
     SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
     SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT);
+    SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT);
 
     bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
     bool MatchWrapper(SDValue N, X86ISelAddressMode &AM);
@@ -1329,6 +1330,8 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
   return ResNode;
 }
 
+// FIXME: Figure out some way to unify this with the 'or' and other code
+// below.
 SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) {
   if (Node->hasAnyUseOfValue(0))
     return 0;
@@ -1479,6 +1482,158 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) {
   }
 }
 
+enum AtomicOpc {
+  OR,
+  AND,
+  XOR,
+  AtomicOpcEnd
+};
+
+enum AtomicSz {
+  ConstantI8,
+  I8,
+  SextConstantI16,
+  ConstantI16,
+  I16,
+  SextConstantI32,
+  ConstantI32,
+  I32,
+  SextConstantI64,
+  ConstantI64,
+  I64,
+  AtomicSzEnd
+};
+
+static const unsigned int AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
+  {
+    X86::LOCK_OR8mi,
+    X86::LOCK_OR8mr,
+    X86::LOCK_OR16mi8,
+    X86::LOCK_OR16mi,
+    X86::LOCK_OR16mr,
+    X86::LOCK_OR32mi8,
+    X86::LOCK_OR32mi,
+    X86::LOCK_OR32mr,
+    X86::LOCK_OR64mi8,
+    X86::LOCK_OR64mi32,
+    X86::LOCK_OR64mr
+  },
+  {
+    X86::LOCK_AND8mi,
+    X86::LOCK_AND8mr,
+    X86::LOCK_AND16mi8,
+    X86::LOCK_AND16mi,
+    X86::LOCK_AND16mr,
+    X86::LOCK_AND32mi8,
+    X86::LOCK_AND32mi,
+    X86::LOCK_AND32mr,
+    X86::LOCK_AND64mi8,
+    X86::LOCK_AND64mi32,
+    X86::LOCK_AND64mr
+  },
+  {
+    X86::LOCK_XOR8mi,
+    X86::LOCK_XOR8mr,
+    X86::LOCK_XOR16mi8,
+    X86::LOCK_XOR16mi,
+    X86::LOCK_XOR16mr,
+    X86::LOCK_XOR32mi8,
+    X86::LOCK_XOR32mi,
+    X86::LOCK_XOR32mr,
+    X86::LOCK_XOR64mi8,
+    X86::LOCK_XOR64mi32,
+    X86::LOCK_XOR64mr
+  }
+};
+
+SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
+  if (Node->hasAnyUseOfValue(0))
+    return 0;
+  
+  // Optimize common patterns for __sync_or_and_fetch and similar arith
+  // operations where the result is not used. This allows us to use the "lock"
+  // version of the arithmetic instruction.
+  // FIXME: Same as for 'add' and 'sub', try to merge those down here.
+  SDValue Chain = Node->getOperand(0);
+  SDValue Ptr = Node->getOperand(1);
+  SDValue Val = Node->getOperand(2);
+  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+  if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
+    return 0;
+
+  // Which index into the table.
+  enum AtomicOpc Op;
+  switch (Node->getOpcode()) {
+    case ISD::ATOMIC_LOAD_OR:
+      Op = OR;
+      break;
+    case ISD::ATOMIC_LOAD_AND:
+      Op = AND;
+      break;
+    case ISD::ATOMIC_LOAD_XOR:
+      Op = XOR;
+      break;
+    default:
+      return 0;
+  }
+  
+  bool isCN = false;
+  ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val);
+  if (CN) {
+    isCN = true;
+    Val = CurDAG->getTargetConstant(CN->getSExtValue(), NVT);
+  }
+  
+  unsigned Opc = 0;
+  switch (NVT.getSimpleVT().SimpleTy) {
+    default: return 0;
+    case MVT::i8:
+      if (isCN)
+        Opc = AtomicOpcTbl[Op][ConstantI8];
+      else
+        Opc = AtomicOpcTbl[Op][I8];
+      break;
+    case MVT::i16:
+      if (isCN) {
+        if (immSext8(Val.getNode()))
+          Opc = AtomicOpcTbl[Op][SextConstantI16];
+        else
+          Opc = AtomicOpcTbl[Op][ConstantI16];
+      } else
+        Opc = AtomicOpcTbl[Op][I16];
+      break;
+    case MVT::i32:
+      if (isCN) {
+        if (immSext8(Val.getNode()))
+          Opc = AtomicOpcTbl[Op][SextConstantI32];
+        else
+          Opc = AtomicOpcTbl[Op][ConstantI32];
+      } else
+        Opc = AtomicOpcTbl[Op][I32];
+      break;
+    case MVT::i64:
+      if (isCN) {
+        if (immSext8(Val.getNode()))
+          Opc = AtomicOpcTbl[Op][SextConstantI64];
+        else if (i64immSExt32(Val.getNode()))
+          Opc = AtomicOpcTbl[Op][ConstantI64];
+      } else
+        Opc = AtomicOpcTbl[Op][I64];
+      break;
+  }
+  
+  DebugLoc dl = Node->getDebugLoc();
+  SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                                 dl, NVT), 0);
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
+  SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain };
+  SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7), 0);
+  cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
+  SDValue RetVals[] = { Undef, Ret };
+  return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
+}
+
 /// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has
 /// any uses which require the SF or OF bits to be accurate.
 static bool HasNoSignedComparisonUses(SDNode *N) {
@@ -1580,6 +1735,89 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
       return RetVal;
     break;
   }
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_OR: {
+    SDNode *RetVal = SelectAtomicLoadArith(Node, NVT);
+    if (RetVal)
+      return RetVal;
+    break;
+  }
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR: {
+    // For operations of the form (x << C1) op C2, check if we can use a smaller
+    // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse())
+      break;
+
+    // i8 is unshrinkable, i16 should be promoted to i32.
+    if (NVT != MVT::i32 && NVT != MVT::i64)
+      break;
+
+    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
+    ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+    if (!Cst || !ShlCst)
+      break;
+
+    int64_t Val = Cst->getSExtValue();
+    uint64_t ShlVal = ShlCst->getZExtValue();
+
+    // Make sure that we don't change the operation by removing bits.
+    // This only matters for OR and XOR, AND is unaffected.
+    if (Opcode != ISD::AND && ((Val >> ShlVal) << ShlVal) != Val)
+      break;
+
+    unsigned ShlOp, Op = 0;
+    EVT CstVT = NVT;
+
+    // Check the minimum bitwidth for the new constant.
+    // TODO: AND32ri is the same as AND64ri32 with zext imm.
+    // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr
+    // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
+    if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal))
+      CstVT = MVT::i8;
+    else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal))
+      CstVT = MVT::i32;
+
+    // Bail if there is no smaller encoding.
+    if (NVT == CstVT)
+      break;
+
+    switch (NVT.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unsupported VT!");
+    case MVT::i32:
+      assert(CstVT == MVT::i8);
+      ShlOp = X86::SHL32ri;
+
+      switch (Opcode) {
+      case ISD::AND: Op = X86::AND32ri8; break;
+      case ISD::OR:  Op =  X86::OR32ri8; break;
+      case ISD::XOR: Op = X86::XOR32ri8; break;
+      }
+      break;
+    case MVT::i64:
+      assert(CstVT == MVT::i8 || CstVT == MVT::i32);
+      ShlOp = X86::SHL64ri;
+
+      switch (Opcode) {
+      case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break;
+      case ISD::OR:  Op = CstVT==MVT::i8?  X86::OR64ri8 :  X86::OR64ri32; break;
+      case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break;
+      }
+      break;
+    }
+
+    // Emit the smaller op and the shift.
+    SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, CstVT);
+    SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
+    return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
+                                getI8Imm(ShlVal));
+    break;
+  }
   case X86ISD::UMUL: {
     SDValue N0 = Node->getOperand(0);
     SDValue N1 = Node->getOperand(1);
@@ -1768,17 +2006,17 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
       if (TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
         SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
         Move =
-          SDValue(CurDAG->getMachineNode(X86::MOVZX16rm8, dl, MVT::i16,
+          SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
                                          MVT::Other, Ops,
                                          array_lengthof(Ops)), 0);
         Chain = Move.getValue(1);
         ReplaceUses(N0.getValue(1), Chain);
       } else {
         Move =
-          SDValue(CurDAG->getMachineNode(X86::MOVZX16rr8, dl, MVT::i16, N0),0);
+          SDValue(CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0),0);
         Chain = CurDAG->getEntryNode();
       }
-      Chain  = CurDAG->getCopyToReg(Chain, dl, X86::AX, Move, SDValue());
+      Chain  = CurDAG->getCopyToReg(Chain, dl, X86::EAX, Move, SDValue());
       InFlag = Chain.getValue(1);
     } else {
       InFlag =
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index cd1d201..1cdf2b6 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -222,7 +222,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
 
   // X86 is weird, it always uses i8 for shift amounts and setcc results.
   setBooleanContents(ZeroOrOneBooleanContent);
-    
+
   // For 64-bit since we have so many registers use the ILP scheduler, for
   // 32-bit code use the register pressure specific scheduling.
   if (Subtarget->is64Bit())
@@ -574,6 +574,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 
+    // Lower this to FGETSIGNx86 plus an AND.
+    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
+    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
+
     // We don't support sin/cos/fmod
     setOperationAction(ISD::FSIN , MVT::f64, Expand);
     setOperationAction(ISD::FCOS , MVT::f64, Expand);
@@ -927,7 +931,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     // Can turn SHL into an integer multiply.
     setOperationAction(ISD::SHL,                MVT::v4i32, Custom);
     setOperationAction(ISD::SHL,                MVT::v16i8, Custom);
-    setOperationAction(ISD::SRL,                MVT::v4i32, Legal);
 
     // i8 and i16 vectors are custom , because the source register and source
     // source memory operand types are not the same width.  f32 vectors are
@@ -949,6 +952,19 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     }
   }
 
+  if (Subtarget->hasSSE2()) {
+    setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
+    setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
+    setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
+
+    setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
+    setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
+    setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
+
+    setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
+    setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
+  }
+
   if (Subtarget->hasSSE42())
     setOperationAction(ISD::VSETCC,             MVT::v2i64, Custom);
 
@@ -1081,6 +1097,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setTargetDAGCombine(ISD::SUB);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::SINT_TO_FP);
   if (Subtarget->is64Bit())
     setTargetDAGCombine(ISD::MUL);
 
@@ -1096,6 +1113,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
   setPrefLoopAlignment(16);
   benefitFromCodePlacementOpt = true;
+
+  setPrefFunctionAlignment(4);
 }
 
 
@@ -1247,11 +1266,6 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
-  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
-}
-
 // FIXME: Why this routine is here? Move to RegInfo!
 std::pair<const TargetRegisterClass*, uint8_t>
 X86TargetLowering::findRepresentativeClass(EVT VT) const{
@@ -1306,11 +1320,12 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
 #include "X86GenCallingConv.inc"
 
 bool
-X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
+X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+				  MachineFunction &MF, bool isVarArg,
                         const SmallVectorImpl<ISD::OutputArg> &Outs,
                         LLVMContext &Context) const {
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
                  RVLocs, Context);
   return CCInfo.CheckReturn(Outs, RetCC_X86);
 }
@@ -1325,7 +1340,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
 
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
                  RVLocs, *DAG.getContext());
   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
 
@@ -1476,8 +1491,8 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
   bool Is64Bit = Subtarget->is64Bit();
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
 
   // Copy all of the result registers out of their specified physreg.
@@ -1518,20 +1533,6 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
         Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                           // This truncation won't change the value.
                           DAG.getIntPtrConstant(1));
-    } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
-      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
-      if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
-        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
-                                   MVT::v2i64, InFlag).getValue(1);
-        Val = Chain.getValue(0);
-        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
-                          Val, DAG.getConstant(0, MVT::i64));
-      } else {
-        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
-                                   MVT::i64, InFlag).getValue(1);
-        Val = Chain.getValue(0);
-      }
-      Val = DAG.getNode(ISD::BITCAST, dl, CopyVT, Val);
     } else {
       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                  CopyVT, InFlag).getValue(1);
@@ -1680,7 +1681,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
                  ArgLocs, *DAG.getContext());
 
   // Allocate shadow area for Win64
@@ -1952,7 +1953,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
   return SDValue(OutRetAddr.getNode(), 1);
 }
 
-/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call
+/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
 /// optimization is performed and it is required (FPDiff!=0).
 static SDValue
 EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
@@ -2007,7 +2008,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
                  ArgLocs, *DAG.getContext());
 
   // Allocate shadow area for Win64
@@ -2043,7 +2044,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
 
   SDValue RetAddrFrIdx;
-  // Load return adress for tail calls.
+  // Load return address for tail calls.
   if (isTailCall && FPDiff)
     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
                                     Is64Bit, FPDiff, dl);
@@ -2200,7 +2201,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     SmallVector<SDValue, 8> MemOpChains2;
     SDValue FIN;
     int FI = 0;
-    // Do not flag preceeding copytoreg stuff together with the following stuff.
+    // Do not flag preceding copytoreg stuff together with the following stuff.
     InFlag = SDValue();
     if (GuaranteedTailCallOpt) {
       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
@@ -2270,6 +2271,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     const GlobalValue *GV = G->getGlobal();
     if (!GV->hasDLLImportLinkage()) {
       unsigned char OpFlags = 0;
+      bool ExtraLoad = false;
+      unsigned WrapperKind = ISD::DELETED_NODE;
 
       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
       // external symbols most go through the PLT in PIC mode.  If the symbol
@@ -2281,15 +2284,34 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
         OpFlags = X86II::MO_PLT;
       } else if (Subtarget->isPICStyleStubAny() &&
                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
-                 Subtarget->getDarwinVers() < 9) {
+                 (!Subtarget->getTargetTriple().isMacOSX() ||
+                  Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
         // PC-relative references to external symbols should go through $stub,
         // unless we're building with the leopard linker or later, which
         // automatically synthesizes these stubs.
         OpFlags = X86II::MO_DARWIN_STUB;
+      } else if (Subtarget->isPICStyleRIPRel() &&
+                 isa<Function>(GV) &&
+                 cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) {
+        // If the function is marked as non-lazy, generate an indirect call
+        // which loads from the GOT directly. This avoids runtime overhead
+        // at the cost of eager binding (and one extra byte of encoding).
+        OpFlags = X86II::MO_GOTPCREL;
+        WrapperKind = X86ISD::WrapperRIP;
+        ExtraLoad = true;
       }
 
       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
                                           G->getOffset(), OpFlags);
+
+      // Add a wrapper if needed.
+      if (WrapperKind != ISD::DELETED_NODE)
+        Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
+      // Add extra indirection if needed.
+      if (ExtraLoad)
+        Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
+                             MachinePointerInfo::getGOT(),
+                             false, false, 0);
     }
   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
     unsigned char OpFlags = 0;
@@ -2300,7 +2322,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
         getTargetMachine().getRelocationModel() == Reloc::PIC_) {
       OpFlags = X86II::MO_PLT;
     } else if (Subtarget->isPICStyleStubAny() &&
-               Subtarget->getDarwinVers() < 9) {
+               (!Subtarget->getTargetTriple().isMacOSX() ||
+                Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
       // PC-relative references to external symbols should go through $stub,
       // unless we're building with the leopard linker or later, which
       // automatically synthesizes these stubs.
@@ -2528,16 +2551,30 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   if (RegInfo->needsStackRealignment(MF))
     return false;
 
-  // Do not sibcall optimize vararg calls unless the call site is not passing
-  // any arguments.
-  if (isVarArg && !Outs.empty())
-    return false;
-
   // Also avoid sibcall optimization if either caller or callee uses struct
   // return semantics.
   if (isCalleeStructRet || isCallerStructRet)
     return false;
 
+  // Do not sibcall optimize vararg calls unless all arguments are passed via
+  // registers.
+  if (isVarArg && !Outs.empty()) {
+
+    // Optimizing for varargs on Win64 is unlikely to be safe without
+    // additional testing.
+    if (Subtarget->isTargetWin64())
+      return false;
+
+    SmallVector<CCValAssign, 16> ArgLocs;
+    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+		   getTargetMachine(), ArgLocs, *DAG.getContext());
+
+    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
+      if (!ArgLocs[i].isRegLoc())
+        return false;
+  }
+
   // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack.
   // Therefore if it's not used by the call it is not safe to optimize this into
   // a sibcall.
@@ -2550,8 +2587,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   }
   if (Unused) {
     SmallVector<CCValAssign, 16> RVLocs;
-    CCState CCInfo(CalleeCC, false, getTargetMachine(),
-                   RVLocs, *DAG.getContext());
+    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
+		   getTargetMachine(), RVLocs, *DAG.getContext());
     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
       CCValAssign &VA = RVLocs[i];
@@ -2564,13 +2601,13 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   // results are returned in the same way as what the caller expects.
   if (!CCMatch) {
     SmallVector<CCValAssign, 16> RVLocs1;
-    CCState CCInfo1(CalleeCC, false, getTargetMachine(),
-                    RVLocs1, *DAG.getContext());
+    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
+		    getTargetMachine(), RVLocs1, *DAG.getContext());
     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
 
     SmallVector<CCValAssign, 16> RVLocs2;
-    CCState CCInfo2(CallerCC, false, getTargetMachine(),
-                    RVLocs2, *DAG.getContext());
+    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
+		    getTargetMachine(), RVLocs2, *DAG.getContext());
     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
 
     if (RVLocs1.size() != RVLocs2.size())
@@ -2596,8 +2633,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
     // Check if stack adjustment is needed. For now, do not do this if any
     // argument is passed on the stack.
     SmallVector<CCValAssign, 16> ArgLocs;
-    CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(),
-                   ArgLocs, *DAG.getContext());
+    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+		   getTargetMachine(), ArgLocs, *DAG.getContext());
 
     // Allocate shadow area for Win64
     if (Subtarget->isTargetWin64()) {
@@ -4018,7 +4055,7 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
 
 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
 /// shuffle operation which come from a consecutively from a zero. The
-/// search can start in two diferent directions, from left or right.
+/// search can start in two different directions, from left or right.
 static
 unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems,
                                   bool ZerosFromLeft, SelectionDAG &DAG) {
@@ -6617,9 +6654,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
 }
 
 
-/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
+/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and
 /// take a 2 x i32 value to shift plus a shift amount.
-SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const {
   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   EVT VT = Op.getValueType();
   unsigned VTBits = VT.getSizeInBits();
@@ -6708,12 +6745,18 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
 
   unsigned ByteSize = SrcVT.getSizeInBits()/8;
 
-  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
-  MachineMemOperand *MMO =
-    DAG.getMachineFunction()
-    .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
-                          MachineMemOperand::MOLoad, ByteSize, ByteSize);
-
+  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
+  MachineMemOperand *MMO;
+  if (FI) {
+    int SSFI = FI->getIndex();
+    MMO =
+      DAG.getMachineFunction()
+      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                            MachineMemOperand::MOLoad, ByteSize, ByteSize);
+  } else {
+    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
+    StackSlot = StackSlot.getOperand(1);
+  }
   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
                                            X86ISD::FILD, DL,
@@ -7204,6 +7247,17 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
 }
 
+SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const {
+  SDValue N0 = Op.getOperand(0);
+  DebugLoc dl = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+
+  // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
+  SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
+                                  DAG.getConstant(1, VT));
+  return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
+}
+
 /// Emit nodes that will be selected as "test Op0,Op0", or something
 /// equivalent.
 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
@@ -8779,16 +8833,71 @@ SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
   return Res;
 }
 
-SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
+
   EVT VT = Op.getValueType();
   DebugLoc dl = Op.getDebugLoc();
   SDValue R = Op.getOperand(0);
+  SDValue Amt = Op.getOperand(1);
 
   LLVMContext *Context = DAG.getContext();
 
-  assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later");
+  // Must have SSE2.
+  if (!Subtarget->hasSSE2()) return SDValue();
+
+  // Optimize shl/srl/sra with constant shift amount.
+  if (isSplatVector(Amt.getNode())) {
+    SDValue SclrAmt = Amt->getOperand(0);
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
+      uint64_t ShiftAmt = C->getZExtValue();
+
+      if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL)
+       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                     DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
+                     R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL)
+       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                     DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
+                     R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL)
+       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                     DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
+                     R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL)
+       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                     DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
+                     R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL)
+       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                     DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
+                     R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL)
+       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                     DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
+                     R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA)
+       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                     DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
+                     R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA)
+       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                     DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
+                     R, DAG.getConstant(ShiftAmt, MVT::i32));
+    }
+  }
+
+  // Lower SHL with variable shift amount.
+  // Cannot lower SHL without SSE4.1 or later.
+  if (!Subtarget->hasSSE41()) return SDValue();
 
-  if (VT == MVT::v4i32) {
+  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
     Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                      DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
                      Op.getOperand(1), DAG.getConstant(23, MVT::i32));
@@ -8807,7 +8916,7 @@ SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const {
     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
   }
-  if (VT == MVT::v16i8) {
+  if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
     // a = a << 5;
     Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                      DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
@@ -9112,7 +9221,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
   case ISD::SHL_PARTS:
   case ISD::SRA_PARTS:
-  case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
+  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
@@ -9120,6 +9229,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FABS:               return LowerFABS(Op, DAG);
   case ISD::FNEG:               return LowerFNEG(Op, DAG);
   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
+  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
   case ISD::SETCC:              return LowerSETCC(Op, DAG);
   case ISD::VSETCC:             return LowerVSETCC(Op, DAG);
   case ISD::SELECT:             return LowerSELECT(Op, DAG);
@@ -9140,7 +9250,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
   case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
-  case ISD::SHL:                return LowerSHL(Op, DAG);
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::SHL:                return LowerShift(Op, DAG);
   case ISD::SADDO:
   case ISD::UADDO:
   case ISD::SSUBO:
@@ -9307,6 +9419,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
   case X86ISD::SETCC:              return "X86ISD::SETCC";
   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
+  case X86ISD::FSETCCsd:           return "X86ISD::FSETCCsd";
+  case X86ISD::FSETCCss:           return "X86ISD::FSETCCss";
   case X86ISD::CMOV:               return "X86ISD::CMOV";
   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
@@ -10984,14 +11098,14 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
        UE = Uses.end(); UI != UE; ++UI) {
     SDNode *Extract = *UI;
 
-    // Compute the element's address.
+    // cOMpute the element's address.
     SDValue Idx = Extract->getOperand(1);
     unsigned EltSize =
         InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
     uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
     SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
 
-    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(),
+    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
                                      StackPtr, OffsetVal);
 
     // Load the scalar.
@@ -11264,15 +11378,28 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
     return SDValue();
 
+  SDValue FalseOp = N->getOperand(0);
+  SDValue TrueOp = N->getOperand(1);
+  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
+  SDValue Cond = N->getOperand(3);
+  if (CC == X86::COND_E || CC == X86::COND_NE) {
+    switch (Cond.getOpcode()) {
+    default: break;
+    case X86ISD::BSR:
+    case X86ISD::BSF:
+      // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
+      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
+        return (CC == X86::COND_E) ? FalseOp : TrueOp;
+    }
+  }
+
   // If this is a select between two integer constants, try to do some
   // optimizations.  Note that the operands are ordered the opposite of SELECT
   // operands.
-  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
-    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
+    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
       // larger than FalseC (the false value).
-      X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
-
       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
         CC = X86::GetOppositeBranchCondition(CC);
         std::swap(TrueC, FalseC);
@@ -11282,7 +11409,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
       // This is efficient for any integer data type (including i8/i16) and
       // shift amount.
       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
-        SDValue Cond = N->getOperand(3);
         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                            DAG.getConstant(CC, MVT::i8), Cond);
 
@@ -11300,7 +11426,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
       // for any integer data type, including i8/i16.
       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
-        SDValue Cond = N->getOperand(3);
         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                            DAG.getConstant(CC, MVT::i8), Cond);
 
@@ -11339,7 +11464,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
 
         if (isFastMultiplier) {
           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
-          SDValue Cond = N->getOperand(3);
           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                              DAG.getConstant(CC, MVT::i8), Cond);
           // Zero extend the condition if needed.
@@ -11574,12 +11698,94 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
 }
 
 
+// CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
+// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
+// and friends.  Likewise for OR -> CMPNEQSS.
+static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
+                            TargetLowering::DAGCombinerInfo &DCI,
+                            const X86Subtarget *Subtarget) {
+  unsigned opcode;
+
+  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
+  // we're requiring SSE2 for both.
+  if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue CMP0 = N0->getOperand(1);
+    SDValue CMP1 = N1->getOperand(1);
+    DebugLoc DL = N->getDebugLoc();
+
+    // The SETCCs should both refer to the same CMP.
+    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
+      return SDValue();
+
+    SDValue CMP00 = CMP0->getOperand(0);
+    SDValue CMP01 = CMP0->getOperand(1);
+    EVT     VT    = CMP00.getValueType();
+
+    if (VT == MVT::f32 || VT == MVT::f64) {
+      bool ExpectingFlags = false;
+      // Check for any users that want flags:
+      for (SDNode::use_iterator UI = N->use_begin(),
+             UE = N->use_end();
+           !ExpectingFlags && UI != UE; ++UI)
+        switch (UI->getOpcode()) {
+        default:
+        case ISD::BR_CC:
+        case ISD::BRCOND:
+        case ISD::SELECT:
+          ExpectingFlags = true;
+          break;
+        case ISD::CopyToReg:
+        case ISD::SIGN_EXTEND:
+        case ISD::ZERO_EXTEND:
+        case ISD::ANY_EXTEND:
+          break;
+        }
+
+      if (!ExpectingFlags) {
+        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
+        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
+
+        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
+          X86::CondCode tmp = cc0;
+          cc0 = cc1;
+          cc1 = tmp;
+        }
+
+        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
+            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
+          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
+          X86ISD::NodeType NTOperator = is64BitFP ?
+            X86ISD::FSETCCsd : X86ISD::FSETCCss;
+          // FIXME: need symbolic constants for these magic numbers.
+          // See X86ATTInstPrinter.cpp:printSSECC().
+          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
+          SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
+                                              DAG.getConstant(x86cc, MVT::i8));
+          SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
+                                              OnesOrZeroesF);
+          SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
+                                      DAG.getConstant(1, MVT::i32));
+          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
+          return OneBitOfTruth;
+        }
+      }
+    }
+  }
+  return SDValue();
+}
+
 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget *Subtarget) {
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
+  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
+  if (R.getNode())
+    return R;
+
   // Want to form PANDN nodes, in the hopes of then easily combining them with
   // OR and AND nodes to form PBLEND/PSIGN.
   EVT VT = N->getValueType(0);
@@ -11609,6 +11815,10 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
+  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
+  if (R.getNode())
+    return R;
+
   EVT VT = N->getValueType(0);
   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64)
     return SDValue();
@@ -11976,6 +12186,26 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
   return SDValue();
 }
 
+static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86TargetLowering *XTLI) {
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Op0 = N->getOperand(0);
+  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
+  // a 32-bit target where SSE doesn't support i64->FP operations.
+  if (Op0.getOpcode() == ISD::LOAD) {
+    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
+    EVT VT = Ld->getValueType(0);
+    if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
+        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
+        !XTLI->getSubtarget()->is64Bit() &&
+        !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+      SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG);
+      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
+      return FILDChain;
+    }
+  }
+  return SDValue();
+}
+
 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
                                  X86TargetLowering::DAGCombinerInfo &DCI) {
@@ -12060,6 +12290,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
+  case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
   case X86ISD::FXOR:
   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
@@ -12216,7 +12447,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
     AsmPieces.clear();
     SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.
 
-    // FIXME: this should verify that we are targetting a 486 or better.  If not,
+    // FIXME: this should verify that we are targeting a 486 or better.  If not,
     // we will turn this bswap into something that will be lowered to logical ops
     // instead of emitting the bswap asm.  For now, we don't support 486 or lower
     // so don't worry about this.
@@ -12489,12 +12720,16 @@ LowerXConstraint(EVT ConstraintVT) const {
 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
 /// vector.  If it is invalid, don't add anything to Ops.
 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
-                                                     char Constraint,
+                                                     std::string &Constraint,
                                                      std::vector<SDValue>&Ops,
                                                      SelectionDAG &DAG) const {
   SDValue Result(0, 0);
 
-  switch (Constraint) {
+  // Only support length 1 constraints for now.
+  if (Constraint.length() > 1) return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
   default: break;
   case 'I':
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
@@ -12686,7 +12921,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
         return std::make_pair(0U, X86::GR8RegisterClass);
       if (VT == MVT::i16)
         return std::make_pair(0U, X86::GR16RegisterClass);
-      if (VT == MVT::i32 || !Subtarget->is64Bit())
+      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
         return std::make_pair(0U, X86::GR32RegisterClass);
       return std::make_pair(0U, X86::GR64RegisterClass);
     case 'R':   // LEGACY_REGS
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 6301057..d61a125 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -94,6 +94,15 @@ namespace llvm {
       // one's or all zero's.
       SETCC_CARRY,  // R = carry_bit ? ~0 : 0
 
+      /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
+      /// Operands are two FP values to compare; result is a mask of
+      /// 0s or 1s.  Generally DTRT for C/C++ with NaNs.
+      FSETCCss, FSETCCsd,
+
+      /// X86 MOVMSK{pd|ps}, extracts sign bits of two or four FP values,
+      /// result in an integer GPR.  Needs masking for scalar result.
+      FGETSIGNx86,
+
       /// X86 conditional moves. Operand 0 and operand 1 are the two values
       /// to select from. Operand 2 is the condition code, and operand 3 is the
       /// flag operand produced by a CMP or TEST instruction. It also writes a
@@ -592,7 +601,7 @@ namespace llvm {
     /// true it means one of the asm constraint of the inline asm instruction
     /// being processed is 'm'.
     virtual void LowerAsmOperandForConstraint(SDValue Op,
-                                              char ConstraintLetter,
+                                              std::string &Constraint,
                                               std::vector<SDValue> &Ops,
                                               SelectionDAG &DAG) const;
 
@@ -674,15 +683,15 @@ namespace llvm {
     /// or null if the target does not support "fast" ISel.
     virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo) const;
 
-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const;
-
     /// getStackCookieLocation - Return true if the target stores stack
     /// protector cookies at a fixed offset in some non-standard address
     /// space, and populates the address space and offset as
     /// appropriate.
     virtual bool getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const;
 
+    SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
+                      SelectionDAG &DAG) const;
+
   protected:
     std::pair<const TargetRegisterClass*, uint8_t>
     findRepresentativeClass(EVT VT) const;
@@ -773,9 +782,7 @@ namespace llvm {
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
-    SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
-                      SelectionDAG &DAG) const;
+    SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBITCAST(SDValue op, SelectionDAG &DAG) const;
     SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
@@ -786,6 +793,7 @@ namespace llvm {
     SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerToBT(SDValue And, ISD::CondCode CC,
                       DebugLoc dl, SelectionDAG &DAG) const;
     SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
@@ -808,7 +816,7 @@ namespace llvm {
     SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSHL(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
@@ -850,9 +858,10 @@ namespace llvm {
                              ISD::NodeType ExtendKind) const;
 
     virtual bool
-      CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
-                     const SmallVectorImpl<ISD::OutputArg> &Outs,
-                     LLVMContext &Context) const;
+    CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+		   bool isVarArg,
+		   const SmallVectorImpl<ISD::OutputArg> &Outs,
+		   LLVMContext &Context) const;
 
     void ReplaceATOMIC_BINARY_64(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                  SelectionDAG &DAG, unsigned NewOp) const;
diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td
index 45d1c6b..dd4f6a5 100644
--- a/lib/Target/X86/X86Instr3DNow.td
+++ b/lib/Target/X86/X86Instr3DNow.td
@@ -12,66 +12,91 @@
 //
 //===----------------------------------------------------------------------===//
 
-// FIXME: We don't support any intrinsics for these instructions yet.
+class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat>
+      : I<o, F, outs, ins, asm, pat>, TB, Requires<[Has3DNow]> {
+}
 
-class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, 
-             list<dag> pattern>
-      : I<o, F, outs, ins, asm, pattern>, TB, Requires<[Has3DNow]> {
+class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+      : I3DNow<o, F, (outs VR64:$dst), ins,
+          !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>,
+        Has3DNow0F0FOpcode {
+  // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
+  let isAsmParserOnly = 1;
+  let Constraints = "$src1 = $dst";
 }
 
-class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic>
-      : I<o, F, (outs VR64:$dst), ins,
-          !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), []>,
-          TB, Requires<[Has3DNow]>, Has3DNow0F0FOpcode {
+class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+      : I3DNow<o, F, (outs VR64:$dst), ins,
+          !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>,
+        Has3DNow0F0FOpcode {
   // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
   let isAsmParserOnly = 1;
 }
 
+multiclass I3DNow_binop_rm<bits<8> opc, string Mn> {
+  def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, []>;
+  def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>;
+}
+
+multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+  def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>;
+  def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1,
+        (bitconvert (load_mmx addr:$src2))))]>;
+}
+
+multiclass I3DNow_conv_rm<bits<8> opc, string Mn> {
+  def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src1), Mn, []>;
+  def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src1), Mn, []>;
+}
 
-let Constraints = "$src1 = $dst" in {
-  // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic.
-  // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp.
-  multiclass I3DNow_binop_rm<bits<8> opc, string Mn> {
-    def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn>;
-    def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn>;
-  }
+multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+  def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>;
+  def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn))
+        (bitconvert (load_mmx addr:$src))))]>;
 }
 
-defm PAVGUSB  : I3DNow_binop_rm<0xBF, "pavgusb">;
-defm PF2ID    : I3DNow_binop_rm<0x1D, "pf2id">;
-defm PFACC    : I3DNow_binop_rm<0xAE, "pfacc">;
-defm PFADD    : I3DNow_binop_rm<0x9E, "pfadd">;
-defm PFCMPEQ  : I3DNow_binop_rm<0xB0, "pfcmpeq">;
-defm PFCMPGE  : I3DNow_binop_rm<0x90, "pfcmpge">;
-defm PFCMPGT  : I3DNow_binop_rm<0xA0, "pfcmpgt">;
-defm PFMAX    : I3DNow_binop_rm<0xA4, "pfmax">;
-defm PFMIN    : I3DNow_binop_rm<0x94, "pfmin">;
-defm PFMUL    : I3DNow_binop_rm<0xB4, "pfmul">;
-defm PFRCP    : I3DNow_binop_rm<0x96, "pfrcp">;
-defm PFRCPIT1 : I3DNow_binop_rm<0xA6, "pfrcpit1">;
-defm PFRCPIT2 : I3DNow_binop_rm<0xB6, "pfrcpit2">;
-defm PFRSQIT1 : I3DNow_binop_rm<0xA7, "pfrsqit1">;
-defm PFRSQRT  : I3DNow_binop_rm<0x97, "pfrsqrt">;
-defm PFSUB    : I3DNow_binop_rm<0x9A, "pfsub">;
-defm PFSUBR   : I3DNow_binop_rm<0xAA, "pfsubr">;
-defm PI2FD    : I3DNow_binop_rm<0x0D, "pi2fd">;
-defm PMULHRW  : I3DNow_binop_rm<0xB7, "pmulhrw">;
+defm PAVGUSB  : I3DNow_binop_rm_int<0xBF, "pavgusb">;
+defm PF2ID    : I3DNow_conv_rm_int<0x1D, "pf2id">;
+defm PFACC    : I3DNow_binop_rm_int<0xAE, "pfacc">;
+defm PFADD    : I3DNow_binop_rm_int<0x9E, "pfadd">;
+defm PFCMPEQ  : I3DNow_binop_rm_int<0xB0, "pfcmpeq">;
+defm PFCMPGE  : I3DNow_binop_rm_int<0x90, "pfcmpge">;
+defm PFCMPGT  : I3DNow_binop_rm_int<0xA0, "pfcmpgt">;
+defm PFMAX    : I3DNow_binop_rm_int<0xA4, "pfmax">;
+defm PFMIN    : I3DNow_binop_rm_int<0x94, "pfmin">;
+defm PFMUL    : I3DNow_binop_rm_int<0xB4, "pfmul">;
+defm PFRCP    : I3DNow_conv_rm_int<0x96, "pfrcp">;
+defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">;
+defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">;
+defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">;
+defm PFRSQRT  : I3DNow_conv_rm_int<0x97, "pfrsqrt">;
+defm PFSUB    : I3DNow_binop_rm_int<0x9A, "pfsub">;
+defm PFSUBR   : I3DNow_binop_rm_int<0xAA, "pfsubr">;
+defm PI2FD    : I3DNow_conv_rm_int<0x0D, "pi2fd">;
+defm PMULHRW  : I3DNow_binop_rm_int<0xB7, "pmulhrw">;
 
 
 def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms", [(int_x86_mmx_femms)]>;
 
 def PREFETCH  : I3DNow<0x0D, MRM0m, (outs), (ins i32mem:$addr),
                        "prefetch $addr", []>;
-                       
+
 // FIXME: Diassembler gets a bogus decode conflict.
-let isAsmParserOnly = 1 in {
+let isAsmParserOnly = 1 in
 def PREFETCHW : I3DNow<0x0D, MRM1m, (outs), (ins i16mem:$addr),
                        "prefetchw $addr", []>;
-}
 
 // "3DNowA" instructions
-defm PF2IW    : I3DNow_binop_rm<0x1C, "pf2iw">;
-defm PI2FW    : I3DNow_binop_rm<0x0C, "pi2fw">;
-defm PFNACC   : I3DNow_binop_rm<0x8A, "pfnacc">;
-defm PFPNACC  : I3DNow_binop_rm<0x8E, "pfpnacc">;
-defm PSWAPD   : I3DNow_binop_rm<0xBB, "pswapd">;
+defm PF2IW    : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">;
+defm PI2FW    : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">;
+defm PFNACC   : I3DNow_binop_rm_int<0x8A, "pfnacc", "a">;
+defm PFPNACC  : I3DNow_binop_rm_int<0x8E, "pfpnacc", "a">;
+defm PSWAPD   : I3DNow_conv_rm_int<0xBB, "pswapd", "a">;
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index f0ea068..9f7a4b0 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -163,7 +163,7 @@ def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst),
 
 } // Defs = [EFLAGS]
 
-// Suprisingly enough, these are not two address instructions!
+// Surprisingly enough, these are not two address instructions!
 let Defs = [EFLAGS] in {
 // Register-Integer Signed Integer Multiply
 def IMUL16rri  : Ii16<0x69, MRMSrcReg,                      // GR16 = GR16*I16
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 4c915d9..adcc747 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -214,6 +214,30 @@ def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
 def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1),
           (SETBr)>;
 
+// (add OP, SETB) -> (adc OP, 0)
+def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op),
+          (ADC8ri GR8:$op, 0)>;
+def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op),
+          (ADC32ri8 GR32:$op, 0)>;
+def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op),
+          (ADC64ri8 GR64:$op, 0)>;
+
+// (sub OP, SETB) -> (sbb OP, 0)
+def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+          (SBB8ri GR8:$op, 0)>;
+def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+          (SBB32ri8 GR32:$op, 0)>;
+def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+          (SBB64ri8 GR64:$op, 0)>;
+
+// (sub OP, SETCC_CARRY) -> (adc OP, 0)
+def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))),
+          (ADC8ri GR8:$op, 0)>;
+def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))),
+          (ADC32ri8 GR32:$op, 0)>;
+def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))),
+          (ADC64ri8 GR64:$op, 0)>;
+
 //===----------------------------------------------------------------------===//
 // String Pseudo Instructions
 //
@@ -519,85 +543,98 @@ def Int_MemBarrierNoSSE64  : RI<0x09, MRM1r, (outs), (ins GR64:$zero),
                            Requires<[In64BitMode]>, LOCK;
 
 
-// Optimized codegen when the non-memory output is not used.
+// RegOpc corresponds to the mr version of the instruction
+// ImmOpc corresponds to the mi version of the instruction
+// ImmOpc8 corresponds to the mi8 version of the instruction
+// ImmMod corresponds to the instruction format of the mi and mi8 versions
+multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8,
+                           Format ImmMod, string mnemonic> {
 let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in {
-def LOCK_ADD8mr  : I<0x00, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
-                    "lock\n\t"
-                    "add{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_ADD16mr  : I<0x01, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
-                    "lock\n\t"
-                    "add{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
-def LOCK_ADD32mr  : I<0x01, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
-                    "lock\n\t"
-                    "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_ADD64mr : RI<0x01, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
-                      "lock\n\t"
-                      "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
 
-def LOCK_ADD8mi   : Ii8<0x80, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src2),
-                    "lock\n\t"
-                    "add{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_ADD16mi  : Ii16<0x81, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src2),
-                    "lock\n\t"
-                     "add{w}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_ADD32mi  : Ii32<0x81, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src2),
-                    "lock\n\t"
-                    "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_ADD64mi32 : RIi32<0x81, MRM0m, (outs),
-                                        (ins i64mem:$dst, i64i32imm :$src2),
-                      "lock\n\t"
-                      "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-
-def LOCK_ADD16mi8 : Ii8<0x83, MRM0m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
-                    "lock\n\t"
-                    "add{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
-def LOCK_ADD32mi8 : Ii8<0x83, MRM0m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
-                    "lock\n\t"
-                    "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_ADD64mi8 : RIi8<0x83, MRM0m, (outs),
-                                      (ins i64mem:$dst, i64i8imm :$src2),
-                    "lock\n\t"
-                    "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-
-def LOCK_SUB8mr   : I<0x28, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src2),
-                    "lock\n\t"
-                    "sub{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_SUB16mr  : I<0x29, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
-                    "lock\n\t"
-                    "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
-def LOCK_SUB32mr  : I<0x29, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
-                    "lock\n\t"
-                    "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_SUB64mr : RI<0x29, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
-                      "lock\n\t"
-                      "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def #NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+                   RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
+                   MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+                   !strconcat("lock\n\t", mnemonic, "{b}\t",
+                              "{$src2, $dst|$dst, $src2}"),
+                   []>, LOCK;
+def #NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+                    RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+                    MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+                    !strconcat("lock\n\t", mnemonic, "{w}\t",
+                               "{$src2, $dst|$dst, $src2}"),
+                    []>, OpSize, LOCK;
+def #NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+                    RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+                    MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+                    !strconcat("lock\n\t", mnemonic, "{l}\t",
+                               "{$src2, $dst|$dst, $src2}"),
+                    []>, LOCK;
+def #NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+                     RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+                     MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+                     !strconcat("lock\n\t", mnemonic, "{q}\t",
+                                "{$src2, $dst|$dst, $src2}"),
+                     []>, LOCK;
+
+def #NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                     ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
+                     ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
+                     !strconcat("lock\n\t", mnemonic, "{b}\t",
+                                "{$src2, $dst|$dst, $src2}"),
+                     []>, LOCK;
+
+def #NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                       ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+                       ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
+                       !strconcat("lock\n\t", mnemonic, "{w}\t",
+                                  "{$src2, $dst|$dst, $src2}"),
+                       []>, LOCK;
+
+def #NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                       ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+                       ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
+                       !strconcat("lock\n\t", mnemonic, "{l}\t",
+                                  "{$src2, $dst|$dst, $src2}"),
+                       []>, LOCK;
+
+def #NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                          ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+                          ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
+                          !strconcat("lock\n\t", mnemonic, "{q}\t",
+                                     "{$src2, $dst|$dst, $src2}"),
+                          []>, LOCK;
+
+def #NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+                       ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+                       ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
+                       !strconcat("lock\n\t", mnemonic, "{w}\t",
+                                  "{$src2, $dst|$dst, $src2}"),
+                       []>, LOCK;
+def #NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+                       ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+                       ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
+                       !strconcat("lock\n\t", mnemonic, "{l}\t",
+                                  "{$src2, $dst|$dst, $src2}"),
+                       []>, LOCK;
+def #NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+                        ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+                        ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
+                        !strconcat("lock\n\t", mnemonic, "{q}\t",
+                                   "{$src2, $dst|$dst, $src2}"),
+                        []>, LOCK;
 
+}
 
-def LOCK_SUB8mi   : Ii8<0x80, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src2),
-                    "lock\n\t"
-                    "sub{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_SUB16mi  : Ii16<0x81, MRM5m, (outs), (ins i16mem:$dst, i16imm:$src2),
-                    "lock\n\t"
-                    "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
-def LOCK_SUB32mi  : Ii32<0x81, MRM5m, (outs), (ins i32mem:$dst, i32imm:$src2),
-                    "lock\n\t"
-                     "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_SUB64mi32 : RIi32<0x81, MRM5m, (outs),
-                                        (ins i64mem:$dst, i64i32imm:$src2),
-                      "lock\n\t"
-                      "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+}
 
+defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">;
+defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">;
+defm LOCK_OR  : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">;
+defm LOCK_AND : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM4m, "and">;
+defm LOCK_XOR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM6m, "xor">;
 
-def LOCK_SUB16mi8 : Ii8<0x83, MRM5m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
-                    "lock\n\t"
-                     "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
-def LOCK_SUB32mi8 : Ii8<0x83, MRM5m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
-                    "lock\n\t"
-                     "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_SUB64mi8 : RIi8<0x83, MRM5m, (outs),
-                                      (ins i64mem:$dst, i64i8imm :$src2),
-                      "lock\n\t"
-                      "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+// Optimized codegen when the non-memory output is not used.
+let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in {
 
 def LOCK_INC8m  : I<0xFE, MRM0m, (outs), (ins i8mem :$dst),
                     "lock\n\t"
@@ -960,7 +997,8 @@ def : Pat<(extloadi64i32 addr:$src),
 
 // anyext. Define these to do an explicit zero-extend to
 // avoid partial-register updates.
-def : Pat<(i16 (anyext GR8 :$src)), (MOVZX16rr8  GR8 :$src)>;
+def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG
+                                     (MOVZX32rr8 GR8 :$src), sub_16bit)>;
 def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8  GR8 :$src)>;
 
 // Except for i16 -> i32 since isel expect i16 ops to be promoted to i32.
@@ -1127,9 +1165,9 @@ def : Pat<(and GR32:$src1, 0xff),
       Requires<[In32BitMode]>;
 // r & (2^8-1) ==> movz
 def : Pat<(and GR16:$src1, 0xff),
-          (MOVZX16rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src1,
-                                                             GR16_ABCD)),
-                                      sub_8bit))>,
+           (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG
+            (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)),
+             sub_16bit)>,
       Requires<[In32BitMode]>;
 
 // r & (2^32-1) ==> movz
@@ -1147,7 +1185,8 @@ def : Pat<(and GR32:$src1, 0xff),
       Requires<[In64BitMode]>;
 // r & (2^8-1) ==> movz
 def : Pat<(and GR16:$src1, 0xff),
-           (MOVZX16rr8 (i8 (EXTRACT_SUBREG GR16:$src1, sub_8bit)))>,
+           (EXTRACT_SUBREG (MOVZX32rr8 (i8
+            (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>,
       Requires<[In64BitMode]>;
 
 
@@ -1159,10 +1198,11 @@ def : Pat<(sext_inreg GR32:$src, i8),
                                                              GR32_ABCD)),
                                       sub_8bit))>,
       Requires<[In32BitMode]>;
+
 def : Pat<(sext_inreg GR16:$src, i8),
-          (MOVSX16rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
-                                                             GR16_ABCD)),
-                                      sub_8bit))>,
+           (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG
+            (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))),
+             sub_16bit)>,
       Requires<[In32BitMode]>;
 
 def : Pat<(sext_inreg GR64:$src, i32),
@@ -1175,9 +1215,19 @@ def : Pat<(sext_inreg GR32:$src, i8),
           (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>,
       Requires<[In64BitMode]>;
 def : Pat<(sext_inreg GR16:$src, i8),
-          (MOVSX16rr8 (i8 (EXTRACT_SUBREG GR16:$src, sub_8bit)))>,
+           (EXTRACT_SUBREG (MOVSX32rr8
+            (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>,
       Requires<[In64BitMode]>;
 
+// sext, sext_load, zext, zext_load
+def: Pat<(i16 (sext GR8:$src)),
+          (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>;
+def: Pat<(sextloadi16i8 addr:$src),
+          (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>;
+def: Pat<(i16 (zext GR8:$src)),
+          (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>;
+def: Pat<(zextloadi16i8 addr:$src),
+          (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;
 
 // trunc patterns
 def : Pat<(i16 (trunc GR32:$src)),
@@ -1318,6 +1368,11 @@ def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
 
 
 // (shl x, 1) ==> (add x, x)
+// Note that if x is undef (immediate or otherwise), we could theoretically
+// end up with the two uses of x getting different values, producing a result
+// where the least significant bit is not 0. However, the probability of this
+// happening is considered low enough that this is officially not a
+// "real problem".
 def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr  GR8 :$src1, GR8 :$src1)>;
 def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
 def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
@@ -1474,12 +1529,6 @@ def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2),
 def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2),
           (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
 
-// Optimize multiply by 2 with EFLAGS result.
-let AddedComplexity = 2 in {
-def : Pat<(X86smul_flag GR16:$src1, 2), (ADD16rr GR16:$src1, GR16:$src1)>;
-def : Pat<(X86smul_flag GR32:$src1, 2), (ADD32rr GR32:$src1, GR32:$src1)>;
-}
-
 // Patterns for nodes that do not produce flags, for instructions that do.
 
 // addition
diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index 867c0f8..2e1d523 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td
@@ -38,22 +38,11 @@ let neverHasSideEffects = 1 in {
 
 
 // Sign/Zero extenders
-// Use movsbl intead of movsbw; we don't care about the high 16 bits
-// of the register here. This has a smaller encoding and avoids a
-// partial-register update.  Actual movsbw included for the disassembler.
-def MOVSX16rr8W : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
-                    "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
-def MOVSX16rm8W : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
-                    "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
-
-// FIXME: Use a pat pattern or define a syntax here.                    
-let isCodeGenOnly=1 in {
-def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8 :$src),
-                   "", [(set GR16:$dst, (sext GR8:$src))]>, TB;
-def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem :$src),
-                   "", [(set GR16:$dst, (sextloadi16i8 addr:$src))]>, TB;
-}
-def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
+def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+                   "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+                   "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src),
                    "movs{bl|x}\t{$src, $dst|$dst, $src}",
                    [(set GR32:$dst, (sext GR8:$src))]>, TB;
 def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
@@ -66,20 +55,10 @@ def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
                    "movs{wl|x}\t{$src, $dst|$dst, $src}",
                    [(set GR32:$dst, (sextloadi32i16 addr:$src))]>, TB;
 
-// Use movzbl intead of movzbw; we don't care about the high 16 bits
-// of the register here. This has a smaller encoding and avoids a
-// partial-register update.  Actual movzbw included for the disassembler.
-def MOVZX16rr8W : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
-                    "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
-def MOVZX16rm8W : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
-                    "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;  
-// FIXME: Use a pat pattern or define a syntax here.                    
-let isCodeGenOnly=1 in {
-def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8 :$src),
-                   "", [(set GR16:$dst, (zext GR8:$src))]>, TB;
-def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem :$src),
-                   "", [(set GR16:$dst, (zextloadi16i8 addr:$src))]>, TB;
-}
+def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+                   "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+                   "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
 def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
                    "movz{bl|x}\t{$src, $dst|$dst, $src}",
                    [(set GR32:$dst, (zext GR8:$src))]>, TB;
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 3cbfac1..7c9a9f7 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -38,8 +38,11 @@ def X86fxor    : SDNode<"X86ISD::FXOR",      SDTFPBinOp,
 def X86frsqrt  : SDNode<"X86ISD::FRSQRT",    SDTFPUnaryOp>;
 def X86frcp    : SDNode<"X86ISD::FRCP",      SDTFPUnaryOp>;
 def X86fsrl    : SDNode<"X86ISD::FSRL",      SDTX86FPShiftOp>;
+def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>;
 def X86comi    : SDNode<"X86ISD::COMI",      SDTX86CmpTest>;
 def X86ucomi   : SDNode<"X86ISD::UCOMI",     SDTX86CmpTest>;
+def X86cmpss   : SDNode<"X86ISD::FSETCCss",    SDTX86Cmpss>;
+def X86cmpsd   : SDNode<"X86ISD::FSETCCsd",    SDTX86Cmpsd>;
 def X86pshufb  : SDNode<"X86ISD::PSHUFB",
                  SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
                                       SDTCisSameAs<0,2>]>>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 21df57c..b3237d5 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -232,7 +232,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     assert(!RegOp2MemOpTable2Addr.count(RegOp) && "Duplicated entries?");
     RegOp2MemOpTable2Addr[RegOp] = std::make_pair(MemOp, 0U);
 
-    // If this is not a reversable operation (because there is a many->one)
+    // If this is not a reversible operation (because there is a many->one)
     // mapping, don't insert the reverse of the operation into MemOp2RegOpTable.
     if (OpTbl2Addr[i][1] & TB_NOT_REVERSABLE)
       continue;
@@ -335,7 +335,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     assert(!RegOp2MemOpTable0.count(RegOp) && "Duplicated entries?");
     RegOp2MemOpTable0[RegOp] = std::make_pair(MemOp, Align);
 
-    // If this is not a reversable operation (because there is a many->one)
+    // If this is not a reversible operation (because there is a many->one)
     // mapping, don't insert the reverse of the operation into MemOp2RegOpTable.
     if (OpTbl0[i][1] & TB_NOT_REVERSABLE)
       continue;
@@ -460,7 +460,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     assert(!RegOp2MemOpTable1.count(RegOp) && "Duplicate entries");
     RegOp2MemOpTable1[RegOp] = std::make_pair(MemOp, Align);
 
-    // If this is not a reversable operation (because there is a many->one)
+    // If this is not a reversible operation (because there is a many->one)
     // mapping, don't insert the reverse of the operation into MemOp2RegOpTable.
     if (OpTbl1[i][1] & TB_NOT_REVERSABLE)
       continue;
@@ -682,7 +682,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     assert(!RegOp2MemOpTable2.count(RegOp) && "Duplicate entry!");
     RegOp2MemOpTable2[RegOp] = std::make_pair(MemOp, Align);
 
-    // If this is not a reversable operation (because there is a many->one)
+    // If this is not a reversible operation (because there is a many->one)
     // mapping, don't insert the reverse of the operation into MemOp2RegOpTable.
     if (OpTbl2[i][1] & TB_NOT_REVERSABLE)
       continue;
@@ -916,7 +916,6 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
     case X86::MOVSDrm:
     case X86::MOVAPSrm:
     case X86::MOVUPSrm:
-    case X86::MOVUPSrm_Int:
     case X86::MOVAPDrm:
     case X86::MOVDQArm:
     case X86::MMX_MOVD64rm:
@@ -1790,7 +1789,6 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
           .addMBB(UnCondBrIter->getOperand(0).getMBB());
         BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_4))
           .addMBB(TargetBB);
-        MBB.addSuccessor(TargetBB);
 
         OldInst->eraseFromParent();
         UnCondBrIter->eraseFromParent();
@@ -2016,62 +2014,48 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
                                       bool isStackAligned,
                                       const TargetMachine &TM,
                                       bool load) {
-  switch (RC->getID()) {
+  switch (RC->getSize()) {
   default:
-    llvm_unreachable("Unknown regclass");
-  case X86::GR64RegClassID:
-  case X86::GR64_ABCDRegClassID:
-  case X86::GR64_NOREXRegClassID:
-  case X86::GR64_NOREX_NOSPRegClassID:
-  case X86::GR64_NOSPRegClassID:
-  case X86::GR64_TCRegClassID:
-  case X86::GR64_TCW64RegClassID:
-    return load ? X86::MOV64rm : X86::MOV64mr;
-  case X86::GR32RegClassID:
-  case X86::GR32_ABCDRegClassID:
-  case X86::GR32_ADRegClassID:
-  case X86::GR32_NOREXRegClassID:
-  case X86::GR32_NOSPRegClassID:
-  case X86::GR32_TCRegClassID:
-    return load ? X86::MOV32rm : X86::MOV32mr;
-  case X86::GR16RegClassID:
-  case X86::GR16_ABCDRegClassID:
-  case X86::GR16_NOREXRegClassID:
-    return load ? X86::MOV16rm : X86::MOV16mr;
-  case X86::GR8RegClassID:
-    // Copying to or from a physical H register on x86-64 requires a NOREX
-    // move.  Otherwise use a normal move.
-    if (isHReg(Reg) &&
-        TM.getSubtarget<X86Subtarget>().is64Bit())
-      return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
-    else
-      return load ? X86::MOV8rm : X86::MOV8mr;
-  case X86::GR8_ABCD_LRegClassID:
-  case X86::GR8_NOREXRegClassID:
-    return load ? X86::MOV8rm :X86::MOV8mr;
-  case X86::GR8_ABCD_HRegClassID:
+    llvm_unreachable("Unknown spill size");
+  case 1:
+    assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
     if (TM.getSubtarget<X86Subtarget>().is64Bit())
-      return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
-    else
-      return load ? X86::MOV8rm : X86::MOV8mr;
-  case X86::RFP80RegClassID:
+      // Copying to or from a physical H register on x86-64 requires a NOREX
+      // move.  Otherwise use a normal move.
+      if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
+        return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
+    return load ? X86::MOV8rm : X86::MOV8mr;
+  case 2:
+    assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
+    return load ? X86::MOV16rm : X86::MOV16mr;
+  case 4:
+    if (X86::GR32RegClass.hasSubClassEq(RC))
+      return load ? X86::MOV32rm : X86::MOV32mr;
+    if (X86::FR32RegClass.hasSubClassEq(RC))
+      return load ? X86::MOVSSrm : X86::MOVSSmr;
+    if (X86::RFP32RegClass.hasSubClassEq(RC))
+      return load ? X86::LD_Fp32m : X86::ST_Fp32m;
+    llvm_unreachable("Unknown 4-byte regclass");
+  case 8:
+    if (X86::GR64RegClass.hasSubClassEq(RC))
+      return load ? X86::MOV64rm : X86::MOV64mr;
+    if (X86::FR64RegClass.hasSubClassEq(RC))
+      return load ? X86::MOVSDrm : X86::MOVSDmr;
+    if (X86::VR64RegClass.hasSubClassEq(RC))
+      return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
+    if (X86::RFP64RegClass.hasSubClassEq(RC))
+      return load ? X86::LD_Fp64m : X86::ST_Fp64m;
+    llvm_unreachable("Unknown 8-byte regclass");
+  case 10:
+    assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
     return load ? X86::LD_Fp80m : X86::ST_FpP80m;
-  case X86::RFP64RegClassID:
-    return load ? X86::LD_Fp64m : X86::ST_Fp64m;
-  case X86::RFP32RegClassID:
-    return load ? X86::LD_Fp32m : X86::ST_Fp32m;
-  case X86::FR32RegClassID:
-    return load ? X86::MOVSSrm : X86::MOVSSmr;
-  case X86::FR64RegClassID:
-    return load ? X86::MOVSDrm : X86::MOVSDmr;
-  case X86::VR128RegClassID:
+  case 16:
+    assert(X86::VR128RegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass");
     // If stack is realigned we can use aligned stores.
     if (isStackAligned)
       return load ? X86::MOVAPSrm : X86::MOVAPSmr;
     else
       return load ? X86::MOVUPSrm : X86::MOVUPSmr;
-  case X86::VR64RegClassID:
-    return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
   }
 }
 
@@ -2241,6 +2225,12 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   bool isTwoAddr = NumOps > 1 &&
     MI->getDesc().getOperandConstraint(1, TOI::TIED_TO) != -1;
 
+  // FIXME: AsmPrinter doesn't know how to handle
+  // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
+  if (MI->getOpcode() == X86::ADD32ri &&
+      MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
+    return NULL;
+
   MachineInstr *NewMI = NULL;
   // Folding a memory location into the two-address part of a two-address
   // instruction is different than folding it other places.  It requires
@@ -2429,7 +2419,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
       Alignment = 4;
       break;
     default:
-      llvm_unreachable("Don't know how to fold this instruction!");
+      return 0;
     }
   if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
     unsigned NewOpc = 0;
@@ -2535,6 +2525,12 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
     case X86::TEST32rr:
     case X86::TEST64rr:
       return true;
+    case X86::ADD32ri:
+      // FIXME: AsmPrinter doesn't know how to handle
+      // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
+      if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
+        return false;
+      break;
     }
   }
 
@@ -2845,11 +2841,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
   case X86::FsMOVAPDrm:
   case X86::MOVAPSrm:
   case X86::MOVUPSrm:
-  case X86::MOVUPSrm_Int:
   case X86::MOVAPDrm:
   case X86::MOVDQArm:
   case X86::MOVDQUrm:
-  case X86::MOVDQUrm_Int:
     break;
   }
   switch (Opc2) {
@@ -2869,11 +2863,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
   case X86::FsMOVAPDrm:
   case X86::MOVAPSrm:
   case X86::MOVUPSrm:
-  case X86::MOVUPSrm_Int:
   case X86::MOVAPDrm:
   case X86::MOVDQArm:
   case X86::MOVDQUrm:
-  case X86::MOVDQUrm_Int:
     break;
   }
 
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 4625b4c..d895023 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -449,7 +449,6 @@ namespace X86II {
     SSEDomainShift = SegOvrShift + 2,
 
     OpcodeShift   = SSEDomainShift + 2,
-    OpcodeMask    = 0xFFULL << OpcodeShift,
 
     //===------------------------------------------------------------------===//
     /// VEX - The opcode prefix used by AVX instructions
@@ -807,7 +806,7 @@ public:
                                        int64_t &Offset1, int64_t &Offset2) const;
 
   /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to
-  /// determine (in conjuction with areLoadsFromSameBasePtr) if two loads should
+  /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
   /// be scheduled togther. On some targets if two loads are loading from
   /// addresses in the same cache line, it's better if they are scheduled
   /// together. This function takes two integers that represent the load offsets
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index f832a7c..8cab808 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -23,6 +23,9 @@ def SDTIntShiftDOp: SDTypeProfile<1, 3,
 
 def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>;
 
+def SDTX86Cmpsd : SDTypeProfile<1, 3, [SDTCisVT<0, f64>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+
 def SDTX86Cmov    : SDTypeProfile<1, 4,
                                   [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
                                    SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
@@ -459,7 +462,7 @@ def CallImmAddr  : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
 include "X86InstrFormats.td"
 
 //===----------------------------------------------------------------------===//
-// Pattern fragments...
+// Pattern fragments.
 //
 
 // X86 specific condition code. These correspond to CondCode in
@@ -481,21 +484,21 @@ def X86_COND_O   : PatLeaf<(i8 13)>;
 def X86_COND_P   : PatLeaf<(i8 14)>; // alt. COND_PE
 def X86_COND_S   : PatLeaf<(i8 15)>;
 
-def immSext8 : PatLeaf<(imm), [{ return immSext8(N); }]>;
+let FastIselShouldIgnore = 1 in { // FastIsel should ignore all simm8 instrs.
+  def i16immSExt8  : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>;
+  def i32immSExt8  : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>;
+  def i64immSExt8  : ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>;
+}
 
-def i16immSExt8  : PatLeaf<(i16 immSext8)>;
-def i32immSExt8  : PatLeaf<(i32 immSext8)>;
-def i64immSExt8  : PatLeaf<(i64 immSext8)>;
-def i64immSExt32  : PatLeaf<(i64 imm), [{ return i64immSExt32(N); }]>;
-def i64immZExt32  : PatLeaf<(i64 imm), [{
-  // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
-  // unsignedsign extended field.
-  return (uint64_t)N->getZExtValue() == (uint32_t)N->getZExtValue();
-}]>;
+def i64immSExt32 : ImmLeaf<i64, [{ return Imm == (int32_t)Imm; }]>;
+
+
+// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+// unsigned field.
+def i64immZExt32 : ImmLeaf<i64, [{ return (uint64_t)Imm == (uint32_t)Imm; }]>;
 
-def i64immZExt32SExt8 : PatLeaf<(i64 imm), [{
-    uint64_t v = N->getZExtValue();
-    return v == (uint32_t)v && (int32_t)v == (int8_t)v;
+def i64immZExt32SExt8 : ImmLeaf<i64, [{
+  return (uint64_t)Imm == (uint32_t)Imm && (int32_t)Imm == (int8_t)Imm;
 }]>;
 
 // Helper fragments for loads.
@@ -1437,7 +1440,7 @@ def : InstAlias<"idivq $src, %rax", (IDIV64m i64mem:$src)>;
 
 // Various unary fpstack operations default to operating on on ST1.
 // For example, "fxch" -> "fxch %st(1)"
-def : InstAlias<"faddp",        (ADD_FPrST0  ST1)>;
+def : InstAlias<"faddp",        (ADD_FPrST0  ST1), 0>;
 def : InstAlias<"fsubp",        (SUBR_FPrST0 ST1)>;
 def : InstAlias<"fsubrp",       (SUB_FPrST0  ST1)>;
 def : InstAlias<"fmulp",        (MUL_FPrST0  ST1)>;
@@ -1455,13 +1458,15 @@ def : InstAlias<"fucompi",      (UCOM_FIPr   ST1)>;
 // For example, "fadd %st(4), %st(0)" -> "fadd %st(4)".  We also disambiguate
 // instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with
 // gas.
-multiclass FpUnaryAlias<string Mnemonic, Instruction Inst> {
- def : InstAlias<!strconcat(Mnemonic, " $op, %st(0)"),    (Inst RST:$op)>;
- def : InstAlias<!strconcat(Mnemonic, " %st(0), %st(0)"), (Inst ST0)>;
+multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> {
+ def : InstAlias<!strconcat(Mnemonic, " $op, %st(0)"),
+                 (Inst RST:$op), EmitAlias>;
+ def : InstAlias<!strconcat(Mnemonic, " %st(0), %st(0)"),
+                 (Inst ST0), EmitAlias>;
 }
 
 defm : FpUnaryAlias<"fadd",   ADD_FST0r>;
-defm : FpUnaryAlias<"faddp",  ADD_FPrST0>;
+defm : FpUnaryAlias<"faddp",  ADD_FPrST0, 0>;
 defm : FpUnaryAlias<"fsub",   SUB_FST0r>;
 defm : FpUnaryAlias<"fsubp",  SUBR_FPrST0>;
 defm : FpUnaryAlias<"fsubr",  SUBR_FST0r>;
@@ -1472,8 +1477,8 @@ defm : FpUnaryAlias<"fdiv",   DIV_FST0r>;
 defm : FpUnaryAlias<"fdivp",  DIVR_FPrST0>;
 defm : FpUnaryAlias<"fdivr",  DIVR_FST0r>;
 defm : FpUnaryAlias<"fdivrp", DIV_FPrST0>;
-defm : FpUnaryAlias<"fcomi",   COM_FIr>;
-defm : FpUnaryAlias<"fucomi",  UCOM_FIr>;
+defm : FpUnaryAlias<"fcomi",   COM_FIr, 0>;
+defm : FpUnaryAlias<"fucomi",  UCOM_FIr, 0>;
 defm : FpUnaryAlias<"fcompi",   COM_FIPr>;
 defm : FpUnaryAlias<"fucompi",  UCOM_FIPr>;
 
@@ -1481,8 +1486,9 @@ defm : FpUnaryAlias<"fucompi",  UCOM_FIPr>;
 // Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they
 // commute.  We also allow fdiv[r]p/fsubrp even though they don't commute,
 // solely because gas supports it.
-def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op)>;
+def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op), 0>;
 def : InstAlias<"fmulp %st(0), $op", (MUL_FPrST0 RST:$op)>;
+def : InstAlias<"fsubp %st(0), $op", (SUBR_FPrST0 RST:$op)>;
 def : InstAlias<"fsubrp %st(0), $op", (SUB_FPrST0 RST:$op)>;
 def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>;
 def : InstAlias<"fdivrp %st(0), $op", (DIV_FPrST0 RST:$op)>;
@@ -1534,29 +1540,31 @@ def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg)>;
 def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm)>;
 
 // Match 'movq GR64, MMX' as an alias for movd.
-def : InstAlias<"movq $src, $dst", (MMX_MOVD64to64rr VR64:$dst, GR64:$src)>;
-def : InstAlias<"movq $src, $dst", (MMX_MOVD64from64rr GR64:$dst, VR64:$src)>;
+def : InstAlias<"movq $src, $dst",
+                (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>;
+def : InstAlias<"movq $src, $dst",
+                (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>;
 
 // movsd with no operands (as opposed to the SSE scalar move of a double) is an
 // alias for movsl. (as in rep; movsd)
 def : InstAlias<"movsd", (MOVSD)>;
 
 // movsx aliases
-def : InstAlias<"movsx $src, $dst", (MOVSX16rr8W GR16:$dst, GR8:$src)>;
-def : InstAlias<"movsx $src, $dst", (MOVSX16rm8W GR16:$dst, i8mem:$src)>;
-def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8:$src)>;
-def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16:$src)>;
-def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8:$src)>;
-def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16:$src)>;
-def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32:$src)>;
+def : InstAlias<"movsx $src, $dst", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>;
 
 // movzx aliases
-def : InstAlias<"movzx $src, $dst", (MOVZX16rr8W GR16:$dst, GR8:$src)>;
-def : InstAlias<"movzx $src, $dst", (MOVZX16rm8W GR16:$dst, i8mem:$src)>;
-def : InstAlias<"movzx $src, $dst", (MOVZX32rr8 GR32:$dst, GR8:$src)>;
-def : InstAlias<"movzx $src, $dst", (MOVZX32rr16 GR32:$dst, GR16:$src)>;
-def : InstAlias<"movzx $src, $dst", (MOVZX64rr8_Q GR64:$dst, GR8:$src)>;
-def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src)>;
+def : InstAlias<"movzx $src, $dst", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>;
 // Note: No GR32->GR64 movzx form.
 
 // outb %dx -> outb %al, %dx
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index bb2165a..b2d9fca 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -285,7 +285,7 @@ let Constraints = "$src1 = $dst" in
 defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, 1>;
 defm MMX_POR  : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por,  1>;
 defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, 1>;
-defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn, 1>;
+defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn>;
 
 // Shift Instructions
 defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 8f08e68..7774057 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -135,18 +135,16 @@ class sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
 // is used instead. Register-to-register movss/movsd is not modeled as an
 // INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable
 // in terms of a copy, and just mentioned, we don't use movss/movsd for copies.
-let isAsmParserOnly = 0 in {
-  def VMOVSSrr : sse12_move_rr<FR32, v4f32,
-                  "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V;
-  def VMOVSDrr : sse12_move_rr<FR64, v2f64,
-                  "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V;
+def VMOVSSrr : sse12_move_rr<FR32, v4f32,
+                "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V;
+def VMOVSDrr : sse12_move_rr<FR64, v2f64,
+                "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V;
 
-  let canFoldAsLoad = 1, isReMaterializable = 1 in {
-    def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX;
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+  def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX;
 
-    let AddedComplexity = 20 in
-      def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX;
-  }
+  let AddedComplexity = 20 in
+    def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX;
 }
 
 let Constraints = "$src1 = $dst" in {
@@ -218,14 +216,12 @@ def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
                   "movsd\t{$src, $dst|$dst, $src}",
                   [(store FR64:$src, addr:$dst)]>;
 
-let isAsmParserOnly = 0 in {
 def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
                   "movss\t{$src, $dst|$dst, $src}",
                   [(store FR32:$src, addr:$dst)]>, XS, VEX;
 def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
                   "movsd\t{$src, $dst|$dst, $src}",
                   [(store FR64:$src, addr:$dst)]>, XD, VEX;
-}
 
 // Extract and store.
 def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
@@ -251,7 +247,6 @@ let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
                    [(set RC:$dst, (ld_frag addr:$src))], d>;
 }
 
-let isAsmParserOnly = 0 in {
 defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                               "movaps", SSEPackedSingle>, VEX;
 defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
@@ -269,7 +264,6 @@ defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
                               "movups", SSEPackedSingle>, VEX;
 defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
                               "movupd", SSEPackedDouble, 0>, OpSize, VEX;
-}
 defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                               "movaps", SSEPackedSingle>, TB;
 defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
@@ -279,7 +273,6 @@ defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
 defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                               "movupd", SSEPackedDouble, 0>, TB, OpSize;
 
-let isAsmParserOnly = 0 in {
 def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movaps\t{$src, $dst|$dst, $src}",
                    [(alignedstore (v4f32 VR128:$src), addr:$dst)]>, VEX;
@@ -304,7 +297,6 @@ def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
 def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                    "movupd\t{$src, $dst|$dst, $src}",
                    [(store (v4f64 VR256:$src), addr:$dst)]>, VEX;
-}
 
 def : Pat<(int_x86_avx_loadu_ps_256 addr:$src), (VMOVUPSYrm addr:$src)>;
 def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
@@ -328,32 +320,14 @@ def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    [(store (v2f64 VR128:$src), addr:$dst)]>;
 
 // Intrinsic forms of MOVUPS/D load and store
-let isAsmParserOnly = 0 in {
-  let canFoldAsLoad = 1, isReMaterializable = 1 in
-  def VMOVUPSrm_Int : VPSI<0x10, MRMSrcMem, (outs VR128:$dst),
-             (ins f128mem:$src),
-             "movups\t{$src, $dst|$dst, $src}",
-             [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>, VEX;
-  def VMOVUPDrm_Int : VPDI<0x10, MRMSrcMem, (outs VR128:$dst),
-             (ins f128mem:$src),
-             "movupd\t{$src, $dst|$dst, $src}",
-             [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>, VEX;
-  def VMOVUPSmr_Int : VPSI<0x11, MRMDestMem, (outs),
-             (ins f128mem:$dst, VR128:$src),
-             "movups\t{$src, $dst|$dst, $src}",
-             [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>, VEX;
-  def VMOVUPDmr_Int : VPDI<0x11, MRMDestMem, (outs),
-             (ins f128mem:$dst, VR128:$src),
-             "movupd\t{$src, $dst|$dst, $src}",
-             [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>, VEX;
-}
-let canFoldAsLoad = 1, isReMaterializable = 1 in
-def MOVUPSrm_Int : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
-                       "movups\t{$src, $dst|$dst, $src}",
-                       [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>;
-def MOVUPDrm_Int : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
-                       "movupd\t{$src, $dst|$dst, $src}",
-                       [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>;
+def VMOVUPSmr_Int : VPSI<0x11, MRMDestMem, (outs),
+           (ins f128mem:$dst, VR128:$src),
+           "movups\t{$src, $dst|$dst, $src}",
+           [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>, VEX;
+def VMOVUPDmr_Int : VPDI<0x11, MRMDestMem, (outs),
+           (ins f128mem:$dst, VR128:$src),
+           "movupd\t{$src, $dst|$dst, $src}",
+           [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>, VEX;
 
 def MOVUPSmr_Int : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                        "movups\t{$src, $dst|$dst, $src}",
@@ -382,7 +356,7 @@ multiclass sse12_mov_hilo_packed<bits<8>opc, RegisterClass RC,
               SSEPackedDouble>, TB, OpSize;
 }
 
-let isAsmParserOnly = 0, AddedComplexity = 20 in {
+let AddedComplexity = 20 in {
   defm VMOVL : sse12_mov_hilo_packed<0x12, VR128, movlp, "movlp",
                      "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V;
   defm VMOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp",
@@ -395,7 +369,6 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
                                    "\t{$src2, $dst|$dst, $src2}">;
 }
 
-let isAsmParserOnly = 0 in {
 def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                    "movlps\t{$src, $dst|$dst, $src}",
                    [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
@@ -404,7 +377,6 @@ def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                    "movlpd\t{$src, $dst|$dst, $src}",
                    [(store (f64 (vector_extract (v2f64 VR128:$src),
                                  (iPTR 0))), addr:$dst)]>, VEX;
-}
 def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                    "movlps\t{$src, $dst|$dst, $src}",
                    [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
@@ -416,7 +388,6 @@ def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
 
 // v2f64 extract element 1 is always custom lowered to unpack high to low
 // and extract element 0 so the non-store version isn't too horrible.
-let isAsmParserOnly = 0 in {
 def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                    "movhps\t{$src, $dst|$dst, $src}",
                    [(store (f64 (vector_extract
@@ -429,7 +400,6 @@ def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                                  (v2f64 (unpckh VR128:$src, (undef))),
                                  (iPTR 0))), addr:$dst)]>,
                    VEX;
-}
 def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                    "movhps\t{$src, $dst|$dst, $src}",
                    [(store (f64 (vector_extract
@@ -441,7 +411,7 @@ def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                                  (v2f64 (unpckh VR128:$src, (undef))),
                                  (iPTR 0))), addr:$dst)]>;
 
-let isAsmParserOnly = 0, AddedComplexity = 20 in {
+let AddedComplexity = 20 in {
   def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                        (ins VR128:$src1, VR128:$src2),
                       "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -516,7 +486,6 @@ multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
               !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>;
 }
 
-let isAsmParserOnly = 0 in {
 defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                                 "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX;
 defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
@@ -542,7 +511,6 @@ defm VCVTSI2SDL  : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">, XD,
                                   VEX_4V;
 defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, XD,
                                   VEX_4V, VEX_W;
-}
 
 defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                       "cvttss2si\t{$src, $dst|$dst, $src}">, XS;
@@ -591,27 +559,25 @@ multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
               [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))]>;
 }
 
-let isAsmParserOnly = 0 in {
-  defm Int_VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
-                        f32mem, load, "cvtss2si">, XS, VEX;
-  defm Int_VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
-                          int_x86_sse_cvtss2si64, f32mem, load, "cvtss2si">,
-                          XS, VEX, VEX_W;
-  defm Int_VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
-                        f128mem, load, "cvtsd2si">, XD, VEX;
-  defm Int_VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
-                        int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si">,
-                        XD, VEX, VEX_W;
-
-  // FIXME: The asm matcher has a hack to ignore instructions with _Int and Int_
-  // Get rid of this hack or rename the intrinsics, there are several
-  // intructions that only match with the intrinsic form, why create duplicates
-  // to let them be recognized by the assembler?
-  defm VCVTSD2SI_alt : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem,
-                        "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX;
-  defm VCVTSD2SI64   : sse12_cvt_s_np<0x2D, FR64, GR64, f64mem,
-                        "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W;
-}
+defm Int_VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
+                      f32mem, load, "cvtss2si">, XS, VEX;
+defm Int_VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
+                        int_x86_sse_cvtss2si64, f32mem, load, "cvtss2si">,
+                        XS, VEX, VEX_W;
+defm Int_VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
+                      f128mem, load, "cvtsd2si">, XD, VEX;
+defm Int_VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
+                      int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si">,
+                      XD, VEX, VEX_W;
+
+// FIXME: The asm matcher has a hack to ignore instructions with _Int and Int_
+// Get rid of this hack or rename the intrinsics, there are several
+// intructions that only match with the intrinsic form, why create duplicates
+// to let them be recognized by the assembler?
+defm VCVTSD2SI_alt : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem,
+                      "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX;
+defm VCVTSD2SI64   : sse12_cvt_s_np<0x2D, FR64, GR64, f64mem,
+                      "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W;
 defm Int_CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                       f32mem, load, "cvtss2si">, XS;
 defm Int_CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
@@ -622,18 +588,16 @@ defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
                   f128mem, load, "cvtsd2si{q}">, XD, REX_W;
 
 
-let isAsmParserOnly = 0 in {
-  defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
-            int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss", 0>, XS, VEX_4V;
-  defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
-            int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss", 0>, XS, VEX_4V,
-            VEX_W;
-  defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
-            int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd", 0>, XD, VEX_4V;
-  defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
-            int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd", 0>, XD,
-            VEX_4V, VEX_W;
-}
+defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+          int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss", 0>, XS, VEX_4V;
+defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+          int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss", 0>, XS, VEX_4V,
+          VEX_W;
+defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+          int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd", 0>, XD, VEX_4V;
+defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+          int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd", 0>, XD,
+          VEX_4V, VEX_W;
 
 let Constraints = "$src1 = $dst" in {
   defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
@@ -653,7 +617,6 @@ let Constraints = "$src1 = $dst" in {
 /// SSE 1 Only
 
 // Aliases for intrinsics
-let isAsmParserOnly = 0 in {
 defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                     f32mem, load, "cvttss2si">, XS, VEX;
 defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
@@ -664,7 +627,6 @@ defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
 defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                     int_x86_sse2_cvttsd2si64, f128mem, load,
                                     "cvttsd2si">, XD, VEX, VEX_W;
-}
 defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                     f32mem, load, "cvttss2si">, XS;
 defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
@@ -676,7 +638,7 @@ defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                     int_x86_sse2_cvttsd2si64, f128mem, load,
                                     "cvttsd2si{q}">, XD, REX_W;
 
-let isAsmParserOnly = 0, Pattern = []<dag> in {
+let Pattern = []<dag> in {
 defm VCVTSS2SI   : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load,
                                "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS, VEX;
 defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load,
@@ -702,7 +664,6 @@ defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load /*dummy*/,
 /// SSE 2 Only
 
 // Convert scalar double to scalar single
-let isAsmParserOnly = 0 in {
 def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                        (ins FR64:$src1, FR64:$src2),
                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
@@ -711,7 +672,6 @@ def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                        (ins FR64:$src1, f64mem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       []>, XD, Requires<[HasAVX, OptForSize]>, VEX_4V;
-}
 def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
         Requires<[HasAVX]>;
 
@@ -723,7 +683,6 @@ def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                       [(set FR32:$dst, (fround (loadf64 addr:$src)))]>, XD,
                   Requires<[HasSSE2, OptForSize]>;
 
-let isAsmParserOnly = 0 in
 defm Int_VCVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128,
                       int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss", 0>,
                       XS, VEX_4V;
@@ -732,7 +691,7 @@ defm Int_CVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128,
                       int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss">, XS;
 
 // Convert scalar single to scalar double
-let isAsmParserOnly = 0 in { // SSE2 instructions with XS prefix
+// SSE2 instructions with XS prefix
 def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                     (ins FR32:$src1, FR32:$src2),
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -741,7 +700,6 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                     (ins FR32:$src1, f32mem:$src2),
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     []>, XS, VEX_4V, Requires<[HasAVX, OptForSize]>;
-}
 def : Pat<(f64 (fextend FR32:$src)), (VCVTSS2SDrr FR32:$src, FR32:$src)>,
         Requires<[HasAVX]>;
 
@@ -754,7 +712,6 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                    [(set FR64:$dst, (extloadf32 addr:$src))]>, XS,
                  Requires<[HasSSE2, OptForSize]>;
 
-let isAsmParserOnly = 0 in {
 def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -767,7 +724,6 @@ def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
                     [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
                                        (load addr:$src2)))]>, XS, VEX_4V,
                     Requires<[HasAVX]>;
-}
 let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
 def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
@@ -788,7 +744,7 @@ def : Pat<(extloadf32 addr:$src),
       Requires<[HasSSE2, OptForSpeed]>;
 
 // Convert doubleword to packed single/double fp
-let isAsmParserOnly = 0 in { // SSE2 instructions without OpSize prefix
+// SSE2 instructions without OpSize prefix
 def Int_VCVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>,
@@ -798,7 +754,6 @@ def Int_VCVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       [(set VR128:$dst, (int_x86_sse2_cvtdq2ps
                                         (bitconvert (memopv2i64 addr:$src))))]>,
                      TB, VEX, Requires<[HasAVX]>;
-}
 def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtdq2ps\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>,
@@ -810,7 +765,7 @@ def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      TB, Requires<[HasSSE2]>;
 
 // FIXME: why the non-intrinsic version is described as SSE3?
-let isAsmParserOnly = 0 in { // SSE2 instructions with XS prefix
+// SSE2 instructions with XS prefix
 def Int_VCVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>,
@@ -820,7 +775,6 @@ def Int_VCVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        [(set VR128:$dst, (int_x86_sse2_cvtdq2pd
                                         (bitconvert (memopv2i64 addr:$src))))]>,
                      XS, VEX, Requires<[HasAVX]>;
-}
 def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>,
@@ -833,7 +787,6 @@ def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
 
 
 // Convert packed single/double fp to doubleword
-let isAsmParserOnly = 0 in {
 def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
 def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
@@ -842,13 +795,11 @@ def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                         "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
 def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                         "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
-}
 def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtps2dq\t{$src, $dst|$dst, $src}", []>;
 def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvtps2dq\t{$src, $dst|$dst, $src}", []>;
 
-let isAsmParserOnly = 0 in {
 def Int_VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvtps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>,
@@ -858,7 +809,6 @@ def Int_VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst),
                          "cvtps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (int_x86_sse2_cvtps2dq
                                             (memop addr:$src)))]>, VEX;
-}
 def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvtps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>;
@@ -867,7 +817,7 @@ def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                          [(set VR128:$dst, (int_x86_sse2_cvtps2dq
                                             (memop addr:$src)))]>;
 
-let isAsmParserOnly = 0 in { // SSE2 packed instructions with XD prefix
+// SSE2 packed instructions with XD prefix
 def Int_VCVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
@@ -877,7 +827,6 @@ def Int_VCVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        [(set VR128:$dst, (int_x86_sse2_cvtpd2dq
                                           (memop addr:$src)))]>,
                      XD, VEX, Requires<[HasAVX]>;
-}
 def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
@@ -890,7 +839,7 @@ def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
 
 
 // Convert with truncation packed single/double fp to doubleword
-let isAsmParserOnly = 0 in { // SSE2 packed instructions with XS prefix
+// SSE2 packed instructions with XS prefix
 def VCVTTPS2DQrr : VSSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
 def VCVTTPS2DQrm : VSSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
@@ -899,7 +848,6 @@ def VCVTTPS2DQYrr : VSSI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
 def VCVTTPS2DQYrm : VSSI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
-}
 def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
@@ -910,7 +858,6 @@ def CVTTPS2DQrm : SSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                             (int_x86_sse2_cvttps2dq (memop addr:$src)))]>;
 
 
-let isAsmParserOnly = 0 in {
 def Int_VCVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "vcvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
@@ -921,9 +868,7 @@ def Int_VCVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         [(set VR128:$dst, (int_x86_sse2_cvttps2dq
                                            (memop addr:$src)))]>,
                       XS, VEX, Requires<[HasAVX]>;
-}
 
-let isAsmParserOnly = 0 in {
 def Int_VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst),
                             (ins VR128:$src),
                           "cvttpd2dq\t{$src, $dst|$dst, $src}",
@@ -934,7 +879,6 @@ def Int_VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst),
                           "cvttpd2dq\t{$src, $dst|$dst, $src}",
                           [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
                                              (memop addr:$src)))]>, VEX;
-}
 def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>;
@@ -943,7 +887,6 @@ def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
                       [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
                                         (memop addr:$src)))]>;
 
-let isAsmParserOnly = 0 in {
 // The assembler can recognize rr 256-bit instructions by seeing a ymm
 // register, but the same isn't true when using memory operands instead.
 // Provide other assembly rr and rm forms to address this explicitly.
@@ -963,10 +906,9 @@ def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                          "cvttpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX;
 def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                          "cvttpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;
-}
 
 // Convert packed single to packed double
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
                   // SSE2 instructions without OpSize prefix
 def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX;
@@ -982,7 +924,6 @@ def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                        "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB;
 
-let isAsmParserOnly = 0 in {
 def Int_VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vcvtps2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
@@ -992,7 +933,6 @@ def Int_VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                        [(set VR128:$dst, (int_x86_sse2_cvtps2pd
                                           (load addr:$src)))]>,
                      VEX, Requires<[HasAVX]>;
-}
 def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtps2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
@@ -1004,7 +944,6 @@ def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                      TB, Requires<[HasSSE2]>;
 
 // Convert packed double to packed single
-let isAsmParserOnly = 0 in {
 // The assembler can recognize rr 256-bit instructions by seeing a ymm
 // register, but the same isn't true when using memory operands instead.
 // Provide other assembly rr and rm forms to address this explicitly.
@@ -1024,14 +963,12 @@ def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                         "cvtpd2psy\t{$src, $dst|$dst, $src}", []>, VEX;
 def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvtpd2psy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;
-}
 def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtpd2ps\t{$src, $dst|$dst, $src}", []>;
 def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvtpd2ps\t{$src, $dst|$dst, $src}", []>;
 
 
-let isAsmParserOnly = 0 in {
 def Int_VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                          "cvtpd2ps\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>;
@@ -1040,7 +977,6 @@ def Int_VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst),
                          "cvtpd2ps\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (int_x86_sse2_cvtpd2ps
                                             (memop addr:$src)))]>;
-}
 def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                          "cvtpd2ps\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>;
@@ -1109,7 +1045,7 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                 asm_alt, []>;
 }
 
-let neverHasSideEffects = 1, isAsmParserOnly = 0 in {
+let neverHasSideEffects = 1 in {
   defm VCMPSS  : sse12_cmp_scalar<FR32, f32mem,
                   "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                   "cmpss\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}">,
@@ -1120,13 +1056,37 @@ let neverHasSideEffects = 1, isAsmParserOnly = 0 in {
                   XD, VEX_4V;
 }
 
+let Constraints = "$src1 = $dst" in {
+def CMPSSrr : SIi8<0xC2, MRMSrcReg,
+                  (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, SSECC:$cc),
+                  "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
+                  [(set FR32:$dst, (X86cmpss (f32 FR32:$src1), FR32:$src2, imm:$cc))]>, XS;
+def CMPSSrm : SIi8<0xC2, MRMSrcMem,
+                  (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, SSECC:$cc),
+                  "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
+                  [(set FR32:$dst, (X86cmpss (f32 FR32:$src1), (loadf32 addr:$src2), imm:$cc))]>, XS;
+def CMPSDrr : SIi8<0xC2, MRMSrcReg,
+                  (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, SSECC:$cc),
+                  "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
+                  [(set FR64:$dst, (X86cmpsd (f64 FR64:$src1), FR64:$src2, imm:$cc))]>, XD;
+def CMPSDrm : SIi8<0xC2, MRMSrcMem,
+                  (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, SSECC:$cc),
+                  "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
+                  [(set FR64:$dst, (X86cmpsd (f64 FR64:$src1), (loadf64 addr:$src2), imm:$cc))]>, XD;
+}
 let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
-  defm CMPSS  : sse12_cmp_scalar<FR32, f32mem,
-                    "cmp${cc}ss\t{$src, $dst|$dst, $src}",
-                    "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}">, XS;
-  defm CMPSD  : sse12_cmp_scalar<FR64, f64mem,
-                    "cmp${cc}sd\t{$src, $dst|$dst, $src}",
-                    "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}">, XD;
+def CMPSSrr_alt : SIi8<0xC2, MRMSrcReg,
+                  (outs FR32:$dst), (ins FR32:$src1, FR32:$src, i8imm:$src2),
+                  "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XS;
+def CMPSSrm_alt : SIi8<0xC2, MRMSrcMem,
+                  (outs FR32:$dst), (ins FR32:$src1, f32mem:$src, i8imm:$src2),
+                  "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XS;
+def CMPSDrr_alt : SIi8<0xC2, MRMSrcReg,
+                  (outs FR64:$dst), (ins FR64:$src1, FR64:$src, i8imm:$src2),
+                  "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XD;
+def CMPSDrm_alt : SIi8<0xC2, MRMSrcMem,
+                  (outs FR64:$dst), (ins FR64:$src1, f64mem:$src, i8imm:$src2),
+                  "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XD;
 }
 
 multiclass sse12_cmp_scalar_int<RegisterClass RC, X86MemOperand x86memop,
@@ -1142,14 +1102,12 @@ multiclass sse12_cmp_scalar_int<RegisterClass RC, X86MemOperand x86memop,
 }
 
 // Aliases to match intrinsics which expect XMM operand(s).
-let isAsmParserOnly = 0 in {
-  defm Int_VCMPSS  : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss,
-                       "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}">,
-                       XS, VEX_4V;
-  defm Int_VCMPSD  : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd,
-                       "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}">,
-                       XD, VEX_4V;
-}
+defm Int_VCMPSS  : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss,
+                     "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}">,
+                     XS, VEX_4V;
+defm Int_VCMPSD  : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd,
+                     "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}">,
+                     XD, VEX_4V;
 let Constraints = "$src1 = $dst" in {
   defm Int_CMPSS  : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss,
                        "cmp${cc}ss\t{$src, $dst|$dst, $src}">, XS;
@@ -1172,28 +1130,26 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
 }
 
 let Defs = [EFLAGS] in {
-  let isAsmParserOnly = 0 in {
-    defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
-                                    "ucomiss", SSEPackedSingle>, VEX;
-    defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
-                                    "ucomisd", SSEPackedDouble>, OpSize, VEX;
-    let Pattern = []<dag> in {
-      defm VCOMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
-                                      "comiss", SSEPackedSingle>, VEX;
-      defm VCOMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
-                                      "comisd", SSEPackedDouble>, OpSize, VEX;
-    }
-
-    defm Int_VUCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
-                              load, "ucomiss", SSEPackedSingle>, VEX;
-    defm Int_VUCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
-                              load, "ucomisd", SSEPackedDouble>, OpSize, VEX;
-
-    defm Int_VCOMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
-                              load, "comiss", SSEPackedSingle>, VEX;
-    defm Int_VCOMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
-                              load, "comisd", SSEPackedDouble>, OpSize, VEX;
+  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
+                                  "ucomiss", SSEPackedSingle>, VEX;
+  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
+                                  "ucomisd", SSEPackedDouble>, OpSize, VEX;
+  let Pattern = []<dag> in {
+    defm VCOMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
+                                    "comiss", SSEPackedSingle>, VEX;
+    defm VCOMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
+                                    "comisd", SSEPackedDouble>, OpSize, VEX;
   }
+
+  defm Int_VUCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
+                            load, "ucomiss", SSEPackedSingle>, VEX;
+  defm Int_VUCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
+                            load, "ucomisd", SSEPackedDouble>, OpSize, VEX;
+
+  defm Int_VCOMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
+                            load, "comiss", SSEPackedSingle>, VEX;
+  defm Int_VCOMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
+                            load, "comisd", SSEPackedDouble>, OpSize, VEX;
   defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                   "ucomiss", SSEPackedSingle>, TB;
   defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
@@ -1239,24 +1195,22 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
              asm_alt, [], d>;
 }
 
-let isAsmParserOnly = 0 in {
-  defm VCMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps,
-                 "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}",
-                 "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}",
-                 SSEPackedSingle>, VEX_4V;
-  defm VCMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd,
-                 "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}",
-                 "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}",
-                 SSEPackedDouble>, OpSize, VEX_4V;
-  defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_ps_256,
-                 "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}",
-                 "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}",
-                 SSEPackedSingle>, VEX_4V;
-  defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_pd_256,
-                 "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}",
-                 "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}",
-                 SSEPackedDouble>, OpSize, VEX_4V;
-}
+defm VCMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps,
+               "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}",
+               "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}",
+               SSEPackedSingle>, VEX_4V;
+defm VCMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd,
+               "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}",
+               "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}",
+               SSEPackedDouble>, OpSize, VEX_4V;
+defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_ps_256,
+               "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}",
+               "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}",
+               SSEPackedSingle>, VEX_4V;
+defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_pd_256,
+               "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}",
+               "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}",
+               SSEPackedDouble>, OpSize, VEX_4V;
 let Constraints = "$src1 = $dst" in {
   defm CMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps,
                  "cmp${cc}ps\t{$src, $dst|$dst, $src}",
@@ -1296,20 +1250,18 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
                             (vt (shufp:$src3 RC:$src1, RC:$src2)))], d>;
 }
 
-let isAsmParserOnly = 0 in {
-  defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
-             "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-             memopv4f32, SSEPackedSingle>, TB, VEX_4V;
-  defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
-             "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-             memopv8f32, SSEPackedSingle>, TB, VEX_4V;
-  defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
-             "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}",
-             memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
-  defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
-             "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}",
-             memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
-}
+defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
+           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           memopv4f32, SSEPackedSingle>, TB, VEX_4V;
+defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
+           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           memopv8f32, SSEPackedSingle>, TB, VEX_4V;
+defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
+           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}",
+           memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
+defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
+           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}",
+           memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
 
 let Constraints = "$src1 = $dst" in {
   defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
@@ -1342,33 +1294,31 @@ multiclass sse12_unpack_interleave<bits<8> opc, PatFrag OpNode, ValueType vt,
 }
 
 let AddedComplexity = 10 in {
-  let isAsmParserOnly = 0 in {
-    defm VUNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32,
-          VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                         SSEPackedSingle>, VEX_4V;
-    defm VUNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64,
-          VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                         SSEPackedDouble>, OpSize, VEX_4V;
-    defm VUNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32,
-          VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                         SSEPackedSingle>, VEX_4V;
-    defm VUNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64,
-          VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                         SSEPackedDouble>, OpSize, VEX_4V;
-
-    defm VUNPCKHPSY: sse12_unpack_interleave<0x15, unpckh, v8f32, memopv8f32,
-          VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                         SSEPackedSingle>, VEX_4V;
-    defm VUNPCKHPDY: sse12_unpack_interleave<0x15, unpckh, v4f64, memopv4f64,
-          VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                         SSEPackedDouble>, OpSize, VEX_4V;
-    defm VUNPCKLPSY: sse12_unpack_interleave<0x14, unpckl, v8f32, memopv8f32,
-          VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                         SSEPackedSingle>, VEX_4V;
-    defm VUNPCKLPDY: sse12_unpack_interleave<0x14, unpckl, v4f64, memopv4f64,
-          VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                         SSEPackedDouble>, OpSize, VEX_4V;
-  }
+  defm VUNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32,
+        VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       SSEPackedSingle>, VEX_4V;
+  defm VUNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64,
+        VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       SSEPackedDouble>, OpSize, VEX_4V;
+  defm VUNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32,
+        VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       SSEPackedSingle>, VEX_4V;
+  defm VUNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64,
+        VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       SSEPackedDouble>, OpSize, VEX_4V;
+
+  defm VUNPCKHPSY: sse12_unpack_interleave<0x15, unpckh, v8f32, memopv8f32,
+        VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       SSEPackedSingle>, VEX_4V;
+  defm VUNPCKHPDY: sse12_unpack_interleave<0x15, unpckh, v4f64, memopv4f64,
+        VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       SSEPackedDouble>, OpSize, VEX_4V;
+  defm VUNPCKLPSY: sse12_unpack_interleave<0x14, unpckl, v8f32, memopv8f32,
+        VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       SSEPackedSingle>, VEX_4V;
+  defm VUNPCKLPDY: sse12_unpack_interleave<0x14, unpckl, v4f64, memopv4f64,
+        VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       SSEPackedDouble>, OpSize, VEX_4V;
 
   let Constraints = "$src1 = $dst" in {
     defm UNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32,
@@ -1401,35 +1351,46 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
 }
 
 // Mask creation
+defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
+                                      "movmskps", SSEPackedSingle>, VEX;
+defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
+                                      "movmskpd", SSEPackedDouble>, OpSize,
+                                      VEX;
+defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
+                                      "movmskps", SSEPackedSingle>, VEX;
+defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
+                                      "movmskpd", SSEPackedDouble>, OpSize,
+                                      VEX;
 defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
                                      SSEPackedSingle>, TB;
 defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
                                      SSEPackedDouble>, TB, OpSize;
 
-let isAsmParserOnly = 0 in {
-  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
-                                        "movmskps", SSEPackedSingle>, VEX;
-  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
-                                        "movmskpd", SSEPackedDouble>, OpSize,
-                                        VEX;
-  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
-                                        "movmskps", SSEPackedSingle>, VEX;
-  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
-                                        "movmskpd", SSEPackedDouble>, OpSize,
-                                        VEX;
+// X86fgetsign
+def MOVMSKPDrr32_alt : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins FR64:$src),
+                    "movmskpd\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (X86fgetsign FR64:$src))], SSEPackedDouble>, TB, OpSize;
+def MOVMSKPDrr64_alt : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins FR64:$src),
+                    "movmskpd\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (X86fgetsign FR64:$src))], SSEPackedDouble>, TB, OpSize;
+def MOVMSKPSrr32_alt : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins FR32:$src),
+                    "movmskps\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (X86fgetsign FR32:$src))], SSEPackedSingle>, TB;
+def MOVMSKPSrr64_alt : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins FR32:$src),
+                    "movmskps\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (X86fgetsign FR32:$src))], SSEPackedSingle>, TB;
 
-  // Assembler Only
-  def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
-             "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX;
-  def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
-             "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize,
-             VEX;
-  def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
-             "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX;
-  def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
-             "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize,
-             VEX;
-}
+// Assembler Only
+def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+           "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX;
+def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+           "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize,
+           VEX;
+def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
+           "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX;
+def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
+           "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize,
+           VEX;
 
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Misc aliasing of packed SSE 1 & 2 instructions
@@ -1484,13 +1445,11 @@ def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
 ///
 multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr,
                                        SDNode OpNode> {
-  let isAsmParserOnly = 0 in {
-    defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
-                FR32, f32, f128mem, memopfsf32, SSEPackedSingle, 0>, VEX_4V;
+  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+              FR32, f32, f128mem, memopfsf32, SSEPackedSingle, 0>, VEX_4V;
 
-    defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
-          FR64, f64, f128mem, memopfsf64, SSEPackedDouble, 0>, OpSize, VEX_4V;
-  }
+  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+        FR64, f64, f128mem, memopfsf64, SSEPackedDouble, 0>, OpSize, VEX_4V;
 
   let Constraints = "$src1 = $dst" in {
     defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
@@ -1516,7 +1475,7 @@ let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in
 multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                  SDNode OpNode, int HasPat = 0,
                                  list<list<dag>> Pattern = []> {
-  let isAsmParserOnly = 0, Pattern = []<dag> in {
+  let Pattern = []<dag> in {
     defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
          !strconcat(OpcodeStr, "ps"), f128mem,
          !if(HasPat, Pattern[0], // rr
@@ -1563,7 +1522,6 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
 
 /// sse12_fp_packed_logical_y - AVX 256-bit SSE 1 & 2 logical ops forms
 ///
-let isAsmParserOnly = 0 in {
 multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr> {
     defm PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
           !strconcat(OpcodeStr, "ps"), f256mem, [], [], 0>, VEX_4V;
@@ -1571,7 +1529,6 @@ multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr> {
     defm PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
           !strconcat(OpcodeStr, "pd"), f256mem, [], [], 0>, OpSize, VEX_4V;
 }
-}
 
 // AVX 256-bit packed logical ops forms
 defm VAND : sse12_fp_packed_logical_y<0x54, "and">;
@@ -1669,38 +1626,36 @@ multiclass basic_sse12_fp_binop_p_y_int<bits<8> opc, string OpcodeStr> {
 }
 
 // Binary Arithmetic instructions
-let isAsmParserOnly = 0 in {
-  defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, 0>,
-              basic_sse12_fp_binop_s_int<0x58, "add", 0>,
-              basic_sse12_fp_binop_p<0x58, "add", fadd, 0>,
-              basic_sse12_fp_binop_p_y<0x58, "add", fadd>, VEX_4V;
-  defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, 0>,
-              basic_sse12_fp_binop_s_int<0x59, "mul", 0>,
-              basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>,
-              basic_sse12_fp_binop_p_y<0x59, "mul", fmul>, VEX_4V;
+defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, 0>,
+            basic_sse12_fp_binop_s_int<0x58, "add", 0>,
+            basic_sse12_fp_binop_p<0x58, "add", fadd, 0>,
+            basic_sse12_fp_binop_p_y<0x58, "add", fadd>, VEX_4V;
+defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, 0>,
+            basic_sse12_fp_binop_s_int<0x59, "mul", 0>,
+            basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>,
+            basic_sse12_fp_binop_p_y<0x59, "mul", fmul>, VEX_4V;
 
-  let isCommutable = 0 in {
-    defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, 0>,
-                basic_sse12_fp_binop_s_int<0x5C, "sub", 0>,
-                basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>,
-                basic_sse12_fp_binop_p_y<0x5C, "sub", fsub>, VEX_4V;
-    defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, 0>,
-                basic_sse12_fp_binop_s_int<0x5E, "div", 0>,
-                basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>,
-                basic_sse12_fp_binop_p_y<0x5E, "div", fdiv>, VEX_4V;
-    defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, 0>,
-                basic_sse12_fp_binop_s_int<0x5F, "max", 0>,
-                basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>,
-                basic_sse12_fp_binop_p_int<0x5F, "max", 0>,
-                basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax>,
-                basic_sse12_fp_binop_p_y_int<0x5F, "max">, VEX_4V;
-    defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, 0>,
-                basic_sse12_fp_binop_s_int<0x5D, "min", 0>,
-                basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>,
-                basic_sse12_fp_binop_p_int<0x5D, "min", 0>,
-                basic_sse12_fp_binop_p_y_int<0x5D, "min">,
-                basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin>, VEX_4V;
-  }
+let isCommutable = 0 in {
+  defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, 0>,
+              basic_sse12_fp_binop_s_int<0x5C, "sub", 0>,
+              basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>,
+              basic_sse12_fp_binop_p_y<0x5C, "sub", fsub>, VEX_4V;
+  defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, 0>,
+              basic_sse12_fp_binop_s_int<0x5E, "div", 0>,
+              basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>,
+              basic_sse12_fp_binop_p_y<0x5E, "div", fdiv>, VEX_4V;
+  defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, 0>,
+              basic_sse12_fp_binop_s_int<0x5F, "max", 0>,
+              basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>,
+              basic_sse12_fp_binop_p_int<0x5F, "max", 0>,
+              basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax>,
+              basic_sse12_fp_binop_p_y_int<0x5F, "max">, VEX_4V;
+  defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, 0>,
+              basic_sse12_fp_binop_s_int<0x5D, "min", 0>,
+              basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>,
+              basic_sse12_fp_binop_p_int<0x5D, "min", 0>,
+              basic_sse12_fp_binop_p_y_int<0x5D, "min">,
+              basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin>, VEX_4V;
 }
 
 let Constraints = "$src1 = $dst" in {
@@ -1901,7 +1856,7 @@ multiclass sse2_fp_unop_p_y_int<bits<8> opc, string OpcodeStr,
                     [(set VR256:$dst, (V2F64Int (memopv4f64 addr:$src)))]>;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
   // Square root.
   defm VSQRT  : sse1_fp_unop_s_avx<0x51, "vsqrt", fsqrt, int_x86_sse_sqrt_ss>,
                 sse2_fp_unop_s_avx<0x51, "vsqrt", fsqrt, int_x86_sse2_sqrt_sd>,
@@ -1957,67 +1912,54 @@ defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss>,
 // SSE 1 & 2 - Non-temporal stores
 //===----------------------------------------------------------------------===//
 
-let isAsmParserOnly = 0 in {
-  def VMOVNTPSmr_Int : VPSI<0x2B, MRMDestMem, (outs),
-                         (ins i128mem:$dst, VR128:$src),
-                         "movntps\t{$src, $dst|$dst, $src}",
-                         [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>, VEX;
-  def VMOVNTPDmr_Int : VPDI<0x2B, MRMDestMem, (outs),
-                         (ins i128mem:$dst, VR128:$src),
-                         "movntpd\t{$src, $dst|$dst, $src}",
-                         [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>, VEX;
-
-  let ExeDomain = SSEPackedInt in
-    def VMOVNTDQmr_Int : VPDI<0xE7, MRMDestMem, (outs),
+let AddedComplexity = 400 in { // Prefer non-temporal versions
+  def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                        (ins f128mem:$dst, VR128:$src),
-                       "movntdq\t{$src, $dst|$dst, $src}",
-                       [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>, VEX;
-
-  let AddedComplexity = 400 in { // Prefer non-temporal versions
-    def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
-                         (ins f128mem:$dst, VR128:$src),
-                         "movntps\t{$src, $dst|$dst, $src}",
-                         [(alignednontemporalstore (v4f32 VR128:$src),
-                                                   addr:$dst)]>, VEX;
-    def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
-                         (ins f128mem:$dst, VR128:$src),
-                         "movntpd\t{$src, $dst|$dst, $src}",
-                         [(alignednontemporalstore (v2f64 VR128:$src),
-                                                   addr:$dst)]>, VEX;
-    def VMOVNTDQ_64mr : VPDI<0xE7, MRMDestMem, (outs),
-                          (ins f128mem:$dst, VR128:$src),
-                          "movntdq\t{$src, $dst|$dst, $src}",
-                          [(alignednontemporalstore (v2f64 VR128:$src),
-                                                    addr:$dst)]>, VEX;
-    let ExeDomain = SSEPackedInt in
-    def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
+                       "movntps\t{$src, $dst|$dst, $src}",
+                       [(alignednontemporalstore (v4f32 VR128:$src),
+                                                 addr:$dst)]>, VEX;
+  def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
+                       (ins f128mem:$dst, VR128:$src),
+                       "movntpd\t{$src, $dst|$dst, $src}",
+                       [(alignednontemporalstore (v2f64 VR128:$src),
+                                                 addr:$dst)]>, VEX;
+  def VMOVNTDQ_64mr : VPDI<0xE7, MRMDestMem, (outs),
                         (ins f128mem:$dst, VR128:$src),
                         "movntdq\t{$src, $dst|$dst, $src}",
-                        [(alignednontemporalstore (v4f32 VR128:$src),
+                        [(alignednontemporalstore (v2f64 VR128:$src),
                                                   addr:$dst)]>, VEX;
 
-    def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
-                         (ins f256mem:$dst, VR256:$src),
-                         "movntps\t{$src, $dst|$dst, $src}",
-                         [(alignednontemporalstore (v8f32 VR256:$src),
-                                                   addr:$dst)]>, VEX;
-    def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
-                         (ins f256mem:$dst, VR256:$src),
-                         "movntpd\t{$src, $dst|$dst, $src}",
-                         [(alignednontemporalstore (v4f64 VR256:$src),
-                                                   addr:$dst)]>, VEX;
-    def VMOVNTDQY_64mr : VPDI<0xE7, MRMDestMem, (outs),
-                          (ins f256mem:$dst, VR256:$src),
-                          "movntdq\t{$src, $dst|$dst, $src}",
-                          [(alignednontemporalstore (v4f64 VR256:$src),
-                                                    addr:$dst)]>, VEX;
-    let ExeDomain = SSEPackedInt in
-    def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
+  let ExeDomain = SSEPackedInt in
+  def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
+                           (ins f128mem:$dst, VR128:$src),
+                           "movntdq\t{$src, $dst|$dst, $src}",
+                           [(alignednontemporalstore (v4f32 VR128:$src),
+                                                     addr:$dst)]>, VEX;
+
+  def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
+            (VMOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasAVX]>;
+
+  def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
+                       (ins f256mem:$dst, VR256:$src),
+                       "movntps\t{$src, $dst|$dst, $src}",
+                       [(alignednontemporalstore (v8f32 VR256:$src),
+                                                 addr:$dst)]>, VEX;
+  def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
+                       (ins f256mem:$dst, VR256:$src),
+                       "movntpd\t{$src, $dst|$dst, $src}",
+                       [(alignednontemporalstore (v4f64 VR256:$src),
+                                                 addr:$dst)]>, VEX;
+  def VMOVNTDQY_64mr : VPDI<0xE7, MRMDestMem, (outs),
                         (ins f256mem:$dst, VR256:$src),
                         "movntdq\t{$src, $dst|$dst, $src}",
-                        [(alignednontemporalstore (v8f32 VR256:$src),
+                        [(alignednontemporalstore (v4f64 VR256:$src),
                                                   addr:$dst)]>, VEX;
-  }
+  let ExeDomain = SSEPackedInt in
+  def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
+                      (ins f256mem:$dst, VR256:$src),
+                      "movntdq\t{$src, $dst|$dst, $src}",
+                      [(alignednontemporalstore (v8f32 VR256:$src),
+                                                addr:$dst)]>, VEX;
 }
 
 def : Pat<(int_x86_avx_movnt_dq_256 addr:$dst, VR256:$src),
@@ -2027,18 +1969,6 @@ def : Pat<(int_x86_avx_movnt_pd_256 addr:$dst, VR256:$src),
 def : Pat<(int_x86_avx_movnt_ps_256 addr:$dst, VR256:$src),
           (VMOVNTPSYmr addr:$dst, VR256:$src)>;
 
-def MOVNTPSmr_Int : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
-                    "movntps\t{$src, $dst|$dst, $src}",
-                    [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>;
-def MOVNTPDmr_Int : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
-                        "movntpd\t{$src, $dst|$dst, $src}",
-                        [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>;
-
-let ExeDomain = SSEPackedInt in
-def MOVNTDQmr_Int : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
-                        "movntdq\t{$src, $dst|$dst, $src}",
-                        [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>;
-
 let AddedComplexity = 400 in { // Prefer non-temporal versions
 def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
@@ -2056,22 +1986,19 @@ def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movntdq\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
 
+def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
+          (MOVNTDQmr addr:$dst, VR128:$src)>;
+
 // There is no AVX form for instructions below this point
 def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
                  "movnti\t{$src, $dst|$dst, $src}",
                  [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
                TB, Requires<[HasSSE2]>;
-
 def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
                      "movnti\t{$src, $dst|$dst, $src}",
                      [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
                   TB, Requires<[HasSSE2]>;
-
 }
-def MOVNTImr_Int  :   I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
-                    "movnti\t{$src, $dst|$dst, $src}",
-                    [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>,
-                  TB, Requires<[HasSSE2]>;
 
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Misc Instructions (No AVX form)
@@ -2079,13 +2006,13 @@ def MOVNTImr_Int  :   I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
 
 // Prefetch intrinsic.
 def PREFETCHT0   : PSI<0x18, MRM1m, (outs), (ins i8mem:$src),
-    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3))]>;
+    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>;
 def PREFETCHT1   : PSI<0x18, MRM2m, (outs), (ins i8mem:$src),
-    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2))]>;
+    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>;
 def PREFETCHT2   : PSI<0x18, MRM3m, (outs), (ins i8mem:$src),
-    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1))]>;
+    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>;
 def PREFETCHNTA  : PSI<0x18, MRM0m, (outs), (ins i8mem:$src),
-    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0))]>;
+    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>;
 
 // Load, store, and memory fence
 def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
@@ -2136,16 +2063,23 @@ def : Pat<(v16i8 immAllZerosV), (V_SET0PI)>;
 def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
           (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
 
+// FIXME: According to the intel manual, DEST[127:64] <- SRC1[127:64], while
+// in the non-AVX version bits 127:64 aren't touched. Find a better way to
+// represent this instead of always zeroing SRC1. One possible solution is
+// to represent the instruction w/ something similar as the "$src1 = $dst"
+// constraint but without the tied operands.
+def : Pat<(extloadf32 addr:$src),
+          (VCVTSS2SDrm (f32 (EXTRACT_SUBREG (AVX_SET0PS), sub_ss)), addr:$src)>,
+      Requires<[HasAVX, OptForSpeed]>;
+
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Load/Store XCSR register
 //===----------------------------------------------------------------------===//
 
-let isAsmParserOnly = 0 in {
-  def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
-                    "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, VEX;
-  def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
-                    "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, VEX;
-}
+def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
+                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, VEX;
+def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
+                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, VEX;
 
 def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
                   "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>;
@@ -2158,45 +2092,43 @@ def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
 
 let ExeDomain = SSEPackedInt in { // SSE integer instructions
 
-let isAsmParserOnly = 0 in {
-  let neverHasSideEffects = 1 in {
-  def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-                      "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
-  def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
-                      "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
-  }
-  def VMOVDQUrr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-                      "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX;
-  def VMOVDQUYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
-                      "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX;
-
-  let canFoldAsLoad = 1, mayLoad = 1 in {
-  def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
-                     "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
-  def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
-                     "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
-  let Predicates = [HasAVX] in {
-    def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
-                      "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX;
-    def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
-                      "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX;
-  }
-  }
+let neverHasSideEffects = 1 in {
+def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
+def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                    "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
+}
+def VMOVDQUrr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX;
+def VMOVDQUYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                    "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX;
 
-  let mayStore = 1 in {
-  def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
-                       (ins i128mem:$dst, VR128:$src),
-                       "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
-  def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
-                       (ins i256mem:$dst, VR256:$src),
-                       "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
-  let Predicates = [HasAVX] in {
-  def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+let canFoldAsLoad = 1, mayLoad = 1 in {
+def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                   "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
+def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+                   "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
+let Predicates = [HasAVX] in {
+  def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                     "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX;
-  def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
+  def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                     "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX;
-  }
-  }
+}
+}
+
+let mayStore = 1 in {
+def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
+                     (ins i128mem:$dst, VR128:$src),
+                     "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
+def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
+                     (ins i256mem:$dst, VR256:$src),
+                     "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
+let Predicates = [HasAVX] in {
+def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+                  "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX;
+def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
+                  "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX;
+}
 }
 
 let neverHasSideEffects = 1 in
@@ -2228,23 +2160,11 @@ def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
 }
 
 // Intrinsic forms of MOVDQU load and store
-let isAsmParserOnly = 0 in {
-let canFoldAsLoad = 1 in
-def VMOVDQUrm_Int : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
-                       "vmovdqu\t{$src, $dst|$dst, $src}",
-                       [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>,
-                     XS, VEX, Requires<[HasAVX]>;
 def VMOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                        "vmovdqu\t{$src, $dst|$dst, $src}",
                        [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>,
                      XS, VEX, Requires<[HasAVX]>;
-}
 
-let canFoldAsLoad = 1 in
-def MOVDQUrm_Int :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
-                       "movdqu\t{$src, $dst|$dst, $src}",
-                       [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>,
-                 XS, Requires<[HasSSE2]>;
 def MOVDQUmr_Int :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                        "movdqu\t{$src, $dst|$dst, $src}",
                        [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>,
@@ -2349,7 +2269,7 @@ multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode,
 
 // 128-bit Integer Arithmetic
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
 defm VPADDB  : PDI_binop_rm<0xFC, "vpaddb", add, v16i8, 1, 0 /*3addr*/>, VEX_4V;
 defm VPADDW  : PDI_binop_rm<0xFD, "vpaddw", add, v8i16, 1, 0>, VEX_4V;
 defm VPADDD  : PDI_binop_rm<0xFE, "vpaddd", add, v4i32, 1, 0>, VEX_4V;
@@ -2439,7 +2359,7 @@ defm PSADBW  : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>;
 // SSE2 - Packed Integer Logical Instructions
 //===---------------------------------------------------------------------===//
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
 defm VPSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "vpsllw",
                                 int_x86_sse2_psll_w, int_x86_sse2_pslli_w, 0>,
                                 VEX_4V;
@@ -2586,7 +2506,7 @@ let Predicates = [HasSSE2] in {
 // SSE2 - Packed Integer Comparison Instructions
 //===---------------------------------------------------------------------===//
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
   defm VPCMPEQB  : PDI_binop_rm_int<0x74, "vpcmpeqb", int_x86_sse2_pcmpeq_b, 1,
                                     0>, VEX_4V;
   defm VPCMPEQW  : PDI_binop_rm_int<0x75, "vpcmpeqw", int_x86_sse2_pcmpeq_w, 1,
@@ -2640,7 +2560,7 @@ def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))),
 // SSE2 - Packed Integer Pack Instructions
 //===---------------------------------------------------------------------===//
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
 defm VPACKSSWB : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_sse2_packsswb_128,
                                   0, 0>, VEX_4V;
 defm VPACKSSDW : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_sse2_packssdw_128,
@@ -2678,7 +2598,7 @@ def mi : Ii8<0x70, MRMSrcMem,
 }
 } // ExeDomain = SSEPackedInt
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
   let AddedComplexity = 5 in
   defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, pshufd, bc_v4i32>, OpSize,
                                VEX;
@@ -2726,7 +2646,7 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
                                                addr:$src2))))]>;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
   defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, unpckl, bc_v16i8,
                                  0>, VEX_4V;
   defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, unpckl, bc_v8i16,
@@ -2836,7 +2756,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> {
 }
 
 // Extract
-let isAsmParserOnly = 0, Predicates = [HasAVX] in
+let Predicates = [HasAVX] in
 def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
                     (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
                     "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -2849,7 +2769,7 @@ def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
                                                 imm:$src2))]>;
 
 // Insert
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
   defm VPINSRW : sse2_pinsrw<0>, OpSize, VEX_4V;
   def  VPINSRWrr64i : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst),
        (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
@@ -2868,13 +2788,11 @@ let Constraints = "$src1 = $dst" in
 
 let ExeDomain = SSEPackedInt in {
 
-let isAsmParserOnly = 0 in {
 def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
            "pmovmskb\t{$src, $dst|$dst, $src}",
            [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>, VEX;
 def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
            "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX;
-}
 def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
            "pmovmskb\t{$src, $dst|$dst, $src}",
            [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>;
@@ -2887,7 +2805,6 @@ def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
 
 let ExeDomain = SSEPackedInt in {
 
-let isAsmParserOnly = 0 in {
 let Uses = [EDI] in
 def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
            (ins VR128:$src, VR128:$mask),
@@ -2898,7 +2815,6 @@ def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
            (ins VR128:$src, VR128:$mask),
            "maskmovdqu\t{$mask, $src|$src, $mask}",
            [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, VEX;
-}
 
 let Uses = [EDI] in
 def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
@@ -2916,7 +2832,6 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
 //===---------------------------------------------------------------------===//
 
 // Move Int Doubleword to Packed Double Int
-let isAsmParserOnly = 0 in {
 def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
@@ -2926,7 +2841,6 @@ def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                       [(set VR128:$dst,
                         (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                       VEX;
-}
 def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
@@ -2945,7 +2859,6 @@ def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
 
 
 // Move Int Doubleword to Single Scalar
-let isAsmParserOnly = 0 in {
 def VMOVDI2SSrr  : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (bitconvert GR32:$src))]>, VEX;
@@ -2954,7 +2867,6 @@ def VMOVDI2SSrm  : VPDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>,
                       VEX;
-}
 def MOVDI2SSrr  : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (bitconvert GR32:$src))]>;
@@ -2964,7 +2876,6 @@ def MOVDI2SSrm  : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                       [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>;
 
 // Move Packed Doubleword Int to Packed Double Int
-let isAsmParserOnly = 0 in {
 def VMOVPDI2DIrr  : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
@@ -2974,7 +2885,6 @@ def VMOVPDI2DImr  : VPDI<0x7E, MRMDestMem, (outs),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (vector_extract (v4i32 VR128:$src),
                                      (iPTR 0))), addr:$dst)]>, VEX;
-}
 def MOVPDI2DIrr  : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
@@ -3000,14 +2910,12 @@ def MOVSDto64mr  : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                         [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
 
 // Move Scalar Single to Double Int
-let isAsmParserOnly = 0 in {
 def VMOVSS2DIrr  : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (bitconvert FR32:$src))]>, VEX;
 def VMOVSS2DImr  : VPDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>, VEX;
-}
 def MOVSS2DIrr  : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (bitconvert FR32:$src))]>;
@@ -3016,7 +2924,7 @@ def MOVSS2DImr  : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                       [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>;
 
 // movd / movq to XMM register zero-extends
-let AddedComplexity = 15, isAsmParserOnly = 0 in {
+let AddedComplexity = 15 in {
 def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (v4i32 (X86vzmovl
@@ -3040,7 +2948,6 @@ def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
 }
 
 let AddedComplexity = 20 in {
-let isAsmParserOnly = 0 in
 def VMOVZDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
@@ -3066,7 +2973,6 @@ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
 //===---------------------------------------------------------------------===//
 
 // Move Quadword Int to Packed Quadword Int
-let isAsmParserOnly = 0 in
 def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
@@ -3079,7 +2985,6 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     Requires<[HasSSE2]>; // SSE2 instruction with XS Prefix
 
 // Move Packed Quadword Int to Quadword Int
-let isAsmParserOnly = 0 in
 def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(store (i64 (vector_extract (v2i64 VR128:$src),
@@ -3093,7 +2998,6 @@ def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
           (f64 (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
 
 // Store / copy lower 64-bits of a XMM register.
-let isAsmParserOnly = 0 in
 def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX;
@@ -3101,7 +3005,7 @@ def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>;
 
-let AddedComplexity = 20, isAsmParserOnly = 0 in
+let AddedComplexity = 20 in
 def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                      "vmovq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
@@ -3126,7 +3030,7 @@ def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
 
 // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
 // IA32 document. movq xmm1, xmm2 does clear the high bits.
-let isAsmParserOnly = 0, AddedComplexity = 15 in
+let AddedComplexity = 15 in
 def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
@@ -3137,7 +3041,7 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                       XS, Requires<[HasSSE2]>;
 
-let AddedComplexity = 20, isAsmParserOnly = 0 in
+let AddedComplexity = 20 in
 def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                         "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v2i64 (X86vzmovl
@@ -3155,7 +3059,6 @@ def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))),
 }
 
 // Instructions to match in the assembler
-let isAsmParserOnly = 0 in {
 def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                       "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W;
 def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
@@ -3163,13 +3066,12 @@ def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
 // Recognize "movd" with GR64 destination, but encode as a "movq"
 def VMOVQd64rr_alt : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                           "movd\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W;
-}
 
 // Instructions for the disassembler
 // xr = XMM register
 // xm = mem64
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in
+let Predicates = [HasAVX] in
 def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                  "vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS;
 def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -3211,7 +3113,7 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
 //===---------------------------------------------------------------------===//
 
 // Convert Packed Double FP to Packed DW Integers
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
 // The assembler can recognize rr 256-bit instructions by seeing a ymm
 // register, but the same isn't true when using memory operands instead.
 // Provide other assembly rr and rm forms to address this explicitly.
@@ -3239,7 +3141,7 @@ def CVTPD2DQrr  : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtpd2dq\t{$src, $dst|$dst, $src}", []>;
 
 // Convert Packed DW Integers to Packed Double FP
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
 def VCVTDQ2PDrm  : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
 def VCVTDQ2PDrr  : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -3290,7 +3192,7 @@ def rm : S3SI<op, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
   // FIXME: Merge above classes when we have patterns for the ymm version
   defm VMOVSHDUP  : sse3_replicate_sfp<0x16, movshdup, "vmovshdup">, VEX;
   defm VMOVSLDUP  : sse3_replicate_sfp<0x12, movsldup, "vmovsldup">, VEX;
@@ -3321,7 +3223,7 @@ def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                     []>;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
   // FIXME: Merge above classes when we have patterns for the ymm version
   defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
   defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX;
@@ -3329,7 +3231,7 @@ let isAsmParserOnly = 0, Predicates = [HasAVX] in {
 defm MOVDDUP : sse3_replicate_dfp<"movddup">;
 
 // Move Unaligned Integer
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
   def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "vlddqu\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
@@ -3393,7 +3295,7 @@ multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
        [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))]>;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX],
+let Predicates = [HasAVX],
   ExeDomain = SSEPackedDouble in {
   defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
                                f128mem, 0>, TB, XD, VEX_4V;
@@ -3446,7 +3348,7 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
       [(set RC:$dst, (vt (IntId RC:$src1, (memop addr:$src2))))]>;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
   defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
                           int_x86_sse3_hadd_ps, 0>, VEX_4V;
   defm VHADDPD  : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
@@ -3498,7 +3400,7 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr,
                        (bitconvert (mem_frag128 addr:$src))))]>, OpSize;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
   defm VPABSB  : SS3I_unop_rm_int<0x1C, "vpabsb", memopv16i8,
                                   int_x86_ssse3_pabs_b_128>, VEX;
   defm VPABSW  : SS3I_unop_rm_int<0x1D, "vpabsw", memopv8i16,
@@ -3540,7 +3442,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
           (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
 let isCommutable = 0 in {
   defm VPHADDW    : SS3I_binop_rm_int<0x01, "vphaddw", memopv8i16,
                                       int_x86_ssse3_phadd_w_128, 0>, VEX_4V;
@@ -3632,7 +3534,7 @@ multiclass ssse3_palign<string asm, bit Is2Addr = 1> {
       []>, OpSize;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in
+let Predicates = [HasAVX] in
   defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V;
 let Constraints = "$src1 = $dst" in
   defm PALIGN : ssse3_palign<"palignr">;
@@ -3696,6 +3598,16 @@ let Predicates = [HasSSE2] in
  def : Pat<(fextend (loadf32 addr:$src)),
            (CVTSS2SDrm addr:$src)>;
 
+// FIXME: According to the intel manual, DEST[127:64] <- SRC1[127:64], while
+// in the non-AVX version bits 127:64 aren't touched. Find a better way to
+// represent this instead of always zeroing SRC1. One possible solution is
+// to represent the instruction w/ something similar as the "$src1 = $dst"
+// constraint but without the tied operands.
+let Predicates = [HasAVX] in
+ def : Pat<(fextend (loadf32 addr:$src)),
+           (VCVTSS2SDrm (f32 (EXTRACT_SUBREG (AVX_SET0PS), sub_ss)),
+                        addr:$src)>;
+
 // bit_convert
 let Predicates = [HasXMMInt] in {
   def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
@@ -3987,7 +3899,7 @@ multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
        OpSize;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
 defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", int_x86_sse41_pmovsxbw>,
                                      VEX;
 defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", int_x86_sse41_pmovsxwd>,
@@ -4053,7 +3965,7 @@ multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
           OpSize;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
 defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd>,
                                      VEX;
 defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq>,
@@ -4094,7 +4006,7 @@ multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
                  OpSize;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
 defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq>,
                                      VEX;
 defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>,
@@ -4136,7 +4048,7 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
 // (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
   defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
   def  VPEXTRBrr64 : SS4AIi8<0x14, MRMDestReg, (outs GR64:$dst),
          (ins VR128:$src1, i32i8imm:$src2),
@@ -4158,7 +4070,7 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
 // (store (i16 (trunc (X86pextrw (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in
+let Predicates = [HasAVX] in
   defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;
 
 defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
@@ -4180,7 +4092,7 @@ multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
                           addr:$dst)]>, OpSize;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in
+let Predicates = [HasAVX] in
   defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
 
 defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;
@@ -4201,7 +4113,7 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
                           addr:$dst)]>, OpSize, REX_W;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in
+let Predicates = [HasAVX] in
   defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
 
 defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">;
@@ -4224,7 +4136,7 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
                           addr:$dst)]>, OpSize;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
   defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
   def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst),
                   (ins VR128:$src1, i32i8imm:$src2),
@@ -4264,7 +4176,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
                    imm:$src3))]>, OpSize;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in
+let Predicates = [HasAVX] in
   defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
 let Constraints = "$src1 = $dst" in
   defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;
@@ -4290,7 +4202,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
                           imm:$src3)))]>, OpSize;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in
+let Predicates = [HasAVX] in
   defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
 let Constraints = "$src1 = $dst" in
   defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
@@ -4316,7 +4228,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
                           imm:$src3)))]>, OpSize;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in
+let Predicates = [HasAVX] in
   defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
 let Constraints = "$src1 = $dst" in
   defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
@@ -4349,7 +4261,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
 
 let Constraints = "$src1 = $dst" in
   defm INSERTPS : SS41I_insertf32<0x21, "insertps">;
-let isAsmParserOnly = 0, Predicates = [HasAVX] in
+let Predicates = [HasAVX] in
   defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
 
 def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3),
@@ -4519,7 +4431,7 @@ multiclass sse41_fp_binop_rm_avx_s<bits<8> opcss, bits<8> opcsd,
 }
 
 // FP round - roundss, roundps, roundsd, roundpd
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
   // Intrinsic form
   defm VROUND  : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
                                   memopv4f32, memopv2f64,
@@ -4554,7 +4466,7 @@ defm ROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "round",
 
 // ptest instruction we'll lower to this in X86ISelLowering primarily from
 // the intel intrinsic that corresponds to this.
-let Defs = [EFLAGS], isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Defs = [EFLAGS], Predicates = [HasAVX] in {
 def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                 "vptest\t{$src2, $src1|$src1, $src2}",
                 [(set EFLAGS, (X86ptest VR128:$src1, (v4f32 VR128:$src2)))]>,
@@ -4597,7 +4509,7 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
             OpSize, VEX;
 }
 
-let Defs = [EFLAGS], isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Defs = [EFLAGS], Predicates = [HasAVX] in {
 defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>;
 defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>;
 defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>;
@@ -4646,7 +4558,7 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                        (bitconvert (memopv8i16 addr:$src))))]>, OpSize;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in
+let Predicates = [HasAVX] in
 defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
                                          int_x86_sse41_phminposuw>, VEX;
 defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
@@ -4672,7 +4584,7 @@ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
           (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
   let isCommutable = 0 in
   defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
                                                          0>, VEX_4V;
@@ -4739,7 +4651,7 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
        OpSize;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in
+let Predicates = [HasAVX] in
   defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, 0>, VEX_4V;
 let Constraints = "$src1 = $dst" in
   defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32>;
@@ -4771,7 +4683,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
         OpSize;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
   let isCommutable = 0 in {
   defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
                                       VR128, memopv16i8, i128mem, 0>, VEX_4V;
@@ -4812,7 +4724,7 @@ let Constraints = "$src1 = $dst" in {
 }
 
 /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
 multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
                                     RegisterClass RC, X86MemOperand x86memop,
                                     PatFrag mem_frag, Intrinsic IntId> {
@@ -4851,14 +4763,14 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
     def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src1, VR128:$src2),
                     !strconcat(OpcodeStr,
-                     "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
+                     "\t{$src2, $dst|$dst, $src2}"),
                     [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
                     OpSize;
 
     def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins VR128:$src1, i128mem:$src2),
                     !strconcat(OpcodeStr,
-                     "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
+                     "\t{$src2, $dst|$dst, $src2}"),
                     [(set VR128:$dst,
                       (IntId VR128:$src1,
                        (bitconvert (memopv16i8 addr:$src2)), XMM0))]>, OpSize;
@@ -4872,7 +4784,7 @@ defm PBLENDVB     : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>;
 def : Pat<(X86pblendv VR128:$src1, VR128:$src2, XMM0),
           (PBLENDVBrr0 VR128:$src1, VR128:$src2)>;
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in
+let Predicates = [HasAVX] in
 def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "vmovntdqa\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
@@ -4906,7 +4818,7 @@ multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr,
           (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in
+let Predicates = [HasAVX] in
   defm VPCMPGTQ : SS42I_binop_rm_int<0x37, "vpcmpgtq", int_x86_sse42_pcmpgtq,
                                      0>, VEX_4V;
 let Constraints = "$src1 = $dst" in
@@ -4938,8 +4850,7 @@ let Defs = [EFLAGS], usesCustomInserter = 1 in {
   defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
 }
 
-let Defs = [XMM0, EFLAGS], isAsmParserOnly = 0,
-    Predicates = [HasAVX] in {
+let Defs = [XMM0, EFLAGS], Predicates = [HasAVX] in {
   def VPCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs),
       (ins VR128:$src1, VR128:$src2, i8imm:$src3),
       "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX;
@@ -4974,7 +4885,7 @@ let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
   defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX],
+let Predicates = [HasAVX],
     Defs = [XMM0, EFLAGS], Uses = [EAX, EDX] in {
   def VPCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs),
       (ins VR128:$src1, VR128:$src3, i8imm:$src5),
@@ -5009,7 +4920,7 @@ let Defs = [ECX, EFLAGS] in {
   }
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
 defm VPCMPISTRI  : SS42AI_pcmpistri<int_x86_sse42_pcmpistri128, "vpcmpistri">,
                                     VEX;
 defm VPCMPISTRIA : SS42AI_pcmpistri<int_x86_sse42_pcmpistria128, "vpcmpistri">,
@@ -5048,7 +4959,7 @@ let Defs = [ECX, EFLAGS], Uses = [EAX, EDX] in {
   }
 }
 
-let isAsmParserOnly = 0, Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in {
 defm VPCMPESTRI  : SS42AI_pcmpestri<int_x86_sse42_pcmpestri128, "vpcmpestri">,
                                     VEX;
 defm VPCMPESTRIA : SS42AI_pcmpestri<int_x86_sse42_pcmpestria128, "vpcmpestri">,
@@ -5080,66 +4991,66 @@ defm PCMPESTRIZ : SS42AI_pcmpestri<int_x86_sse42_pcmpestriz128>;
 // This set of instructions are only rm, the only difference is the size
 // of r and m.
 let Constraints = "$src1 = $dst" in {
-  def CRC32m8  : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst),
+  def CRC32r32m8  : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst),
                       (ins GR32:$src1, i8mem:$src2),
                       "crc32{b} \t{$src2, $src1|$src1, $src2}",
                        [(set GR32:$dst,
-                         (int_x86_sse42_crc32_8 GR32:$src1,
+                         (int_x86_sse42_crc32_32_8 GR32:$src1,
                          (load addr:$src2)))]>;
-  def CRC32r8  : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst),
+  def CRC32r32r8  : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst),
                       (ins GR32:$src1, GR8:$src2),
                       "crc32{b} \t{$src2, $src1|$src1, $src2}",
                        [(set GR32:$dst,
-                         (int_x86_sse42_crc32_8 GR32:$src1, GR8:$src2))]>;
-  def CRC32m16  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
+                         (int_x86_sse42_crc32_32_8 GR32:$src1, GR8:$src2))]>;
+  def CRC32r32m16  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
                       (ins GR32:$src1, i16mem:$src2),
                       "crc32{w} \t{$src2, $src1|$src1, $src2}",
                        [(set GR32:$dst,
-                         (int_x86_sse42_crc32_16 GR32:$src1,
+                         (int_x86_sse42_crc32_32_16 GR32:$src1,
                          (load addr:$src2)))]>,
                          OpSize;
-  def CRC32r16  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
+  def CRC32r32r16  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
                       (ins GR32:$src1, GR16:$src2),
                       "crc32{w} \t{$src2, $src1|$src1, $src2}",
                        [(set GR32:$dst,
-                         (int_x86_sse42_crc32_16 GR32:$src1, GR16:$src2))]>,
+                         (int_x86_sse42_crc32_32_16 GR32:$src1, GR16:$src2))]>,
                          OpSize;
-  def CRC32m32  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
+  def CRC32r32m32  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
                       (ins GR32:$src1, i32mem:$src2),
                       "crc32{l} \t{$src2, $src1|$src1, $src2}",
                        [(set GR32:$dst,
-                         (int_x86_sse42_crc32_32 GR32:$src1,
+                         (int_x86_sse42_crc32_32_32 GR32:$src1,
                          (load addr:$src2)))]>;
-  def CRC32r32  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
+  def CRC32r32r32  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
                       (ins GR32:$src1, GR32:$src2),
                       "crc32{l} \t{$src2, $src1|$src1, $src2}",
                        [(set GR32:$dst,
-                         (int_x86_sse42_crc32_32 GR32:$src1, GR32:$src2))]>;
-  def CRC64m8  : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst),
+                         (int_x86_sse42_crc32_32_32 GR32:$src1, GR32:$src2))]>;
+  def CRC32r64m8  : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst),
                       (ins GR64:$src1, i8mem:$src2),
                       "crc32{b} \t{$src2, $src1|$src1, $src2}",
                        [(set GR64:$dst,
-                         (int_x86_sse42_crc64_8 GR64:$src1,
+                         (int_x86_sse42_crc32_64_8 GR64:$src1,
                          (load addr:$src2)))]>,
                          REX_W;
-  def CRC64r8  : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst),
+  def CRC32r64r8  : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst),
                       (ins GR64:$src1, GR8:$src2),
                       "crc32{b} \t{$src2, $src1|$src1, $src2}",
                        [(set GR64:$dst,
-                         (int_x86_sse42_crc64_8 GR64:$src1, GR8:$src2))]>,
+                         (int_x86_sse42_crc32_64_8 GR64:$src1, GR8:$src2))]>,
                          REX_W;
-  def CRC64m64  : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst),
+  def CRC32r64m64  : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst),
                       (ins GR64:$src1, i64mem:$src2),
                       "crc32{q} \t{$src2, $src1|$src1, $src2}",
                        [(set GR64:$dst,
-                         (int_x86_sse42_crc64_64 GR64:$src1,
+                         (int_x86_sse42_crc32_64_64 GR64:$src1,
                          (load addr:$src2)))]>,
                          REX_W;
-  def CRC64r64  : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst),
+  def CRC32r64r64  : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst),
                       (ins GR64:$src1, GR64:$src2),
                       "crc32{q} \t{$src2, $src1|$src1, $src2}",
                        [(set GR64:$dst,
-                         (int_x86_sse42_crc64_64 GR64:$src1, GR64:$src2))]>,
+                         (int_x86_sse42_crc32_64_64 GR64:$src1, GR64:$src2))]>,
                          REX_W;
 }
 
@@ -5167,7 +5078,7 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
 }
 
 // Perform One Round of an AES Encryption/Decryption Flow
-let isAsmParserOnly = 0, Predicates = [HasAVX, HasAES] in {
+let Predicates = [HasAVX, HasAES] in {
   defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
                          int_x86_aesni_aesenc, 0>, VEX_4V;
   defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
@@ -5207,7 +5118,7 @@ def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))),
           (AESDECLASTrm VR128:$src1, addr:$src2)>;
 
 // Perform the AES InvMixColumn Transformation
-let isAsmParserOnly = 0, Predicates = [HasAVX, HasAES] in {
+let Predicates = [HasAVX, HasAES] in {
   def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1),
       "vaesimc\t{$src1, $dst|$dst, $src1}",
@@ -5235,7 +5146,7 @@ def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
   OpSize;
 
 // AES Round Key Generation Assist
-let isAsmParserOnly = 0, Predicates = [HasAVX, HasAES] in {
+let Predicates = [HasAVX, HasAES] in {
   def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, i8imm:$src2),
       "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -5271,7 +5182,6 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
 // Only the AVX version of CLMUL instructions are described here.
 
 // Carry-less Multiplication instructions
-let isAsmParserOnly = 0 in {
 def VPCLMULQDQrr : CLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
            (ins VR128:$src1, VR128:$src2, i8imm:$src3),
            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
@@ -5297,13 +5207,10 @@ defm VPCLMULHQLQDQ : avx_vpclmul<"vpclmulhqlqdq">;
 defm VPCLMULLQHQDQ : avx_vpclmul<"vpclmullqhqdq">;
 defm VPCLMULLQLQDQ : avx_vpclmul<"vpclmullqlqdq">;
 
-} // isAsmParserOnly
-
 //===----------------------------------------------------------------------===//
 // AVX Instructions
 //===----------------------------------------------------------------------===//
 
-let isAsmParserOnly = 0 in {
 
 // Load from memory and broadcast to all elements of the destination operand
 class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
@@ -5437,8 +5344,6 @@ def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
 def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
                    [(int_x86_avx_vzeroupper)]>, VEX, Requires<[HasAVX]>;
 
-} // isAsmParserOnly
-
 def : Pat<(int_x86_avx_vinsertf128_pd_256 VR256:$src1, VR128:$src2, imm:$src3),
           (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>;
 def : Pat<(int_x86_avx_vinsertf128_ps_256 VR256:$src1, VR128:$src2, imm:$src3),
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index 2710425..f73cff3 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -34,9 +34,16 @@ let Uses = [EFLAGS] in
   def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>;
 def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3",
               [(int_x86_int (i8 3))]>;
+
+// The long form of "int $3" turns into int3 as a size optimization.
+// FIXME: This doesn't work because InstAlias can't match immediate constants.
+//def : InstAlias<"int\t$3", (INT3)>;
+
+
 def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap",
               [(int_x86_int imm:$trap)]>;
 
+
 def SYSCALL  : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB;
 def SYSRETL  : I<0x07, RawFrm, (outs), (ins), "sysretl", []>, TB;
 def SYSRETQ  :RI<0x07, RawFrm, (outs), (ins), "sysretq", []>, TB,
diff --git a/lib/Target/X86/X86MCAsmInfo.cpp b/lib/Target/X86/X86MCAsmInfo.cpp
index 6686214..2e1ec63 100644
--- a/lib/Target/X86/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/X86MCAsmInfo.cpp
@@ -15,7 +15,9 @@
 #include "X86TargetMachine.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ELF.h"
 using namespace llvm;
@@ -69,7 +71,22 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &Triple) {
   DwarfUsesInlineInfoSection = true;
 
   // Exceptions handling
-  ExceptionsType = ExceptionHandling::DwarfTable;
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+}
+
+const MCExpr *
+X86_64MCAsmInfoDarwin::getExprForPersonalitySymbol(const MCSymbol *Sym,
+                                                   unsigned Encoding,
+                                                   MCStreamer &Streamer) const {
+  MCContext &Context = Streamer.getContext();
+  const MCExpr *Res =
+    MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Context);
+  const MCExpr *Four = MCConstantExpr::Create(4, Context);
+  return MCBinaryExpr::CreateAdd(Res, Four, Context);
+}
+
+X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple)
+  : X86MCAsmInfoDarwin(Triple) {
 }
 
 X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
@@ -89,7 +106,7 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
   SupportsDebugInformation = true;
 
   // Exceptions handling
-  ExceptionsType = ExceptionHandling::DwarfTable;
+  ExceptionsType = ExceptionHandling::DwarfCFI;
 
   // OpenBSD has buggy support for .quad in 32-bit mode, just split into two
   // .words.
diff --git a/lib/Target/X86/X86MCAsmInfo.h b/lib/Target/X86/X86MCAsmInfo.h
index 5815225..2cd4c8e 100644
--- a/lib/Target/X86/X86MCAsmInfo.h
+++ b/lib/Target/X86/X86MCAsmInfo.h
@@ -25,6 +25,14 @@ namespace llvm {
     explicit X86MCAsmInfoDarwin(const Triple &Triple);
   };
 
+  struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin {
+    explicit X86_64MCAsmInfoDarwin(const Triple &Triple);
+    virtual const MCExpr *
+    getExprForPersonalitySymbol(const MCSymbol *Sym,
+                                unsigned Encoding,
+                                MCStreamer &Streamer) const;
+  };
+
   struct X86ELFMCAsmInfo : public MCAsmInfo {
     explicit X86ELFMCAsmInfo(const Triple &Triple);
     virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const;
diff --git a/lib/Target/X86/X86MCCodeEmitter.cpp b/lib/Target/X86/X86MCCodeEmitter.cpp
index a2bd638..55aceba 100644
--- a/lib/Target/X86/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/X86MCCodeEmitter.cpp
@@ -514,7 +514,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
     }
 
     // To only check operands before the memory address ones, start
-    // the search from the begining
+    // the search from the beginning
     if (IsDestMem)
       CurOp = 0;
 
@@ -1015,7 +1015,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
     } else {
       unsigned FixupKind;
       // FIXME: Is there a better way to know that we need a signed relocation?
-      if (MI.getOpcode() == X86::MOV64ri32 ||
+      if (MI.getOpcode() == X86::ADD64ri32 ||
+          MI.getOpcode() == X86::MOV64ri32 ||
           MI.getOpcode() == X86::MOV64mi32 ||
           MI.getOpcode() == X86::PUSH64i32)
         FixupKind = X86::reloc_signed_4byte;
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index cbe6db2..793156f 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -355,10 +355,6 @@ ReSimplify:
     assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 &&
            "LEA has segment specified!");
     break;
-  case X86::MOVZX16rr8:   LowerSubReg32_Op0(OutMI, X86::MOVZX32rr8); break;
-  case X86::MOVZX16rm8:   LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break;
-  case X86::MOVSX16rr8:   LowerSubReg32_Op0(OutMI, X86::MOVSX32rr8); break;
-  case X86::MOVSX16rm8:   LowerSubReg32_Op0(OutMI, X86::MOVSX32rm8); break;
   case X86::MOVZX64rr32:  LowerSubReg32_Op0(OutMI, X86::MOV32rr); break;
   case X86::MOVZX64rm32:  LowerSubReg32_Op0(OutMI, X86::MOV32rm); break;
   case X86::MOV64ri64i32: LowerSubReg32_Op0(OutMI, X86::MOV32ri); break;
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 1f464f4..1ad6203 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -73,29 +73,61 @@ X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
   }
 }
 
-/// getDwarfRegNum - This function maps LLVM register identifiers to the DWARF
-/// specific numbering, used in debug info and exception tables.
-int X86RegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const {
-  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
-  unsigned Flavour = DWARFFlavour::X86_64;
-
+static unsigned getFlavour(const X86Subtarget *Subtarget, bool isEH) {
   if (!Subtarget->is64Bit()) {
     if (Subtarget->isTargetDarwin()) {
       if (isEH)
-        Flavour = DWARFFlavour::X86_32_DarwinEH;
+        return DWARFFlavour::X86_32_DarwinEH;
       else
-        Flavour = DWARFFlavour::X86_32_Generic;
+        return DWARFFlavour::X86_32_Generic;
     } else if (Subtarget->isTargetCygMing()) {
       // Unsupported by now, just quick fallback
-      Flavour = DWARFFlavour::X86_32_Generic;
+      return DWARFFlavour::X86_32_Generic;
     } else {
-      Flavour = DWARFFlavour::X86_32_Generic;
+      return DWARFFlavour::X86_32_Generic;
     }
   }
+  return DWARFFlavour::X86_64;
+}
+
+/// getDwarfRegNum - This function maps LLVM register identifiers to the DWARF
+/// specific numbering, used in debug info and exception tables.
+int X86RegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const {
+  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+  unsigned Flavour = getFlavour(Subtarget, isEH);
 
   return X86GenRegisterInfo::getDwarfRegNumFull(RegNo, Flavour);
 }
 
+/// getLLVMRegNum - This function maps DWARF register numbers to LLVM register.
+int X86RegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
+  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+  unsigned Flavour = getFlavour(Subtarget, isEH);
+
+  return X86GenRegisterInfo::getLLVMRegNumFull(DwarfRegNo, Flavour);
+}
+
+int
+X86RegisterInfo::getSEHRegNum(unsigned i) const {
+  int reg = getX86RegNum(i);
+  switch (i) {
+  case X86::R8:  case X86::R8D:  case X86::R8W:  case X86::R8B:
+  case X86::R9:  case X86::R9D:  case X86::R9W:  case X86::R9B:
+  case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
+  case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
+  case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
+  case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
+  case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
+  case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
+  case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11:
+  case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
+  case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11:
+  case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15:
+    reg += 8;
+  }
+  return reg;
+}
+
 /// getX86RegNum - This function maps LLVM register identifiers to their X86
 /// specific numbering, which is used in various places encoding instructions.
 unsigned X86RegisterInfo::getX86RegNum(unsigned RegNo) {
@@ -229,19 +261,13 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
     }
     break;
   case X86::sub_8bit_hi:
-    if (B == &X86::GR8_ABCD_HRegClass) {
-      if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass ||
-          A == &X86::GR64_NOREXRegClass ||
-          A == &X86::GR64_NOSPRegClass ||
-          A == &X86::GR64_NOREX_NOSPRegClass)
-        return &X86::GR64_ABCDRegClass;
-      else if (A == &X86::GR32RegClass || A == &X86::GR32_ABCDRegClass ||
-               A == &X86::GR32_NOREXRegClass || A == &X86::GR32_NOSPRegClass)
-        return &X86::GR32_ABCDRegClass;
-      else if (A == &X86::GR16RegClass || A == &X86::GR16_ABCDRegClass ||
-               A == &X86::GR16_NOREXRegClass)
-        return &X86::GR16_ABCDRegClass;
-    }
+    if (B->hasSubClassEq(&X86::GR8_ABCD_HRegClass))
+      switch (A->getSize()) {
+        case 2: return getCommonSubClass(A, &X86::GR16_ABCDRegClass);
+        case 4: return getCommonSubClass(A, &X86::GR32_ABCDRegClass);
+        case 8: return getCommonSubClass(A, &X86::GR64_ABCDRegClass);
+        default: return 0;
+      }
     break;
   case X86::sub_16bit:
     if (B == &X86::GR16RegClass) {
@@ -285,9 +311,16 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
           A == &X86::GR64_NOREX_NOSPRegClass)
         return &X86::GR64_ABCDRegClass;
     } else if (B == &X86::GR32_NOREXRegClass) {
+      if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass)
+        return &X86::GR64_NOREXRegClass;
+      else if (A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass)
+        return &X86::GR64_NOREX_NOSPRegClass;
+      else if (A == &X86::GR64_ABCDRegClass)
+        return &X86::GR64_ABCDRegClass;
+    } else if (B == &X86::GR32_NOREX_NOSPRegClass) {
       if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass ||
           A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass)
-        return &X86::GR64_NOREXRegClass;
+        return &X86::GR64_NOREX_NOSPRegClass;
       else if (A == &X86::GR64_ABCDRegClass)
         return &X86::GR64_ABCDRegClass;
     }
@@ -308,6 +341,33 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
   return 0;
 }
 
+const TargetRegisterClass*
+X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{
+  const TargetRegisterClass *Super = RC;
+  TargetRegisterClass::sc_iterator I = RC->superclasses_begin();
+  do {
+    switch (Super->getID()) {
+    case X86::GR8RegClassID:
+    case X86::GR16RegClassID:
+    case X86::GR32RegClassID:
+    case X86::GR64RegClassID:
+    case X86::FR32RegClassID:
+    case X86::FR64RegClassID:
+    case X86::RFP32RegClassID:
+    case X86::RFP64RegClassID:
+    case X86::RFP80RegClassID:
+    case X86::VR128RegClassID:
+    case X86::VR256RegClassID:
+      // Don't return a super-class that would shrink the spill size.
+      // That can happen with the vector and float classes.
+      if (Super->getSize() == RC->getSize())
+        return Super;
+    }
+    Super = *I++;
+  } while (Super);
+  return RC;
+}
+
 const TargetRegisterClass *
 X86RegisterInfo::getPointerRegClass(unsigned Kind) const {
   switch (Kind) {
@@ -446,6 +506,34 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   Reserved.set(X86::ST5);
   Reserved.set(X86::ST6);
   Reserved.set(X86::ST7);
+
+  // Mark the segment registers as reserved.
+  Reserved.set(X86::CS);
+  Reserved.set(X86::SS);
+  Reserved.set(X86::DS);
+  Reserved.set(X86::ES);
+  Reserved.set(X86::FS);
+  Reserved.set(X86::GS);
+
+  // Reserve the registers that only exist in 64-bit mode.
+  if (!Is64Bit) {
+    for (unsigned n = 0; n != 8; ++n) {
+      const unsigned GPR64[] = {
+        X86::R8,  X86::R9,  X86::R10, X86::R11,
+        X86::R12, X86::R13, X86::R14, X86::R15
+      };
+      for (const unsigned *AI = getOverlaps(GPR64[n]); unsigned Reg = *AI;
+           ++AI)
+        Reserved.set(Reg);
+
+      // XMM8, XMM9, ...
+      assert(X86::XMM15 == X86::XMM8+7);
+      for (const unsigned *AI = getOverlaps(X86::XMM8 + n); unsigned Reg = *AI;
+           ++AI)
+        Reserved.set(Reg);
+    }
+  }
+
   return Reserved;
 }
 
@@ -470,7 +558,7 @@ bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
   // FIXME: It's more complicated than this...
   if (0 && requiresRealignment && MFI->hasVarSizedObjects())
     report_fatal_error(
-      "Stack realignment in presense of dynamic allocas is not supported");
+      "Stack realignment in presence of dynamic allocas is not supported");
 
   // If we've requested that we force align the stack do so now.
   if (ForceStackAlign)
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index cccddfa..dd3d3dc 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -80,6 +80,10 @@ public:
   /// getDwarfRegNum - allows modification of X86GenRegisterInfo::getDwarfRegNum
   /// (created by TableGen) for target dependencies.
   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
+
+  // FIXME: This should be tablegen'd like getDwarfRegNum is
+  int getSEHRegNum(unsigned i) const;
 
   /// Code Generation virtual methods...
   /// 
@@ -91,6 +95,9 @@ public:
   getMatchingSuperRegClass(const TargetRegisterClass *A,
                            const TargetRegisterClass *B, unsigned Idx) const;
 
+  const TargetRegisterClass*
+  getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
+
   /// getPointerRegClass - Returns a TargetRegisterClass used for pointer
   /// values.
   const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const;
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 612fac2..590b38b 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -41,80 +41,83 @@ let Namespace = "X86" in {
 
   // 8-bit registers
   // Low registers
-  def AL : Register<"al">, DwarfRegNum<[0, 0, 0]>;
-  def DL : Register<"dl">, DwarfRegNum<[1, 2, 2]>;
-  def CL : Register<"cl">, DwarfRegNum<[2, 1, 1]>;
-  def BL : Register<"bl">, DwarfRegNum<[3, 3, 3]>;
-
-  // X86-64 only
-  def SIL : Register<"sil">, DwarfRegNum<[4, 6, 6]>;
-  def DIL : Register<"dil">, DwarfRegNum<[5, 7, 7]>;
-  def BPL : Register<"bpl">, DwarfRegNum<[6, 4, 5]>;
-  def SPL : Register<"spl">, DwarfRegNum<[7, 5, 4]>;
-  def R8B  : Register<"r8b">,  DwarfRegNum<[8, -2, -2]>;
-  def R9B  : Register<"r9b">,  DwarfRegNum<[9, -2, -2]>;
-  def R10B : Register<"r10b">, DwarfRegNum<[10, -2, -2]>;
-  def R11B : Register<"r11b">, DwarfRegNum<[11, -2, -2]>;
-  def R12B : Register<"r12b">, DwarfRegNum<[12, -2, -2]>;
-  def R13B : Register<"r13b">, DwarfRegNum<[13, -2, -2]>;
-  def R14B : Register<"r14b">, DwarfRegNum<[14, -2, -2]>;
-  def R15B : Register<"r15b">, DwarfRegNum<[15, -2, -2]>;
+  def AL : Register<"al">;
+  def DL : Register<"dl">;
+  def CL : Register<"cl">;
+  def BL : Register<"bl">;
+
+  // X86-64 only, requires REX.
+  let CostPerUse = 1 in {
+  def SIL : Register<"sil">;
+  def DIL : Register<"dil">;
+  def BPL : Register<"bpl">;
+  def SPL : Register<"spl">;
+  def R8B  : Register<"r8b">;
+  def R9B  : Register<"r9b">;
+  def R10B : Register<"r10b">;
+  def R11B : Register<"r11b">;
+  def R12B : Register<"r12b">;
+  def R13B : Register<"r13b">;
+  def R14B : Register<"r14b">;
+  def R15B : Register<"r15b">;
+  }
 
   // High registers. On x86-64, these cannot be used in any instruction
   // with a REX prefix.
-  def AH : Register<"ah">, DwarfRegNum<[0, 0, 0]>;
-  def DH : Register<"dh">, DwarfRegNum<[1, 2, 2]>;
-  def CH : Register<"ch">, DwarfRegNum<[2, 1, 1]>;
-  def BH : Register<"bh">, DwarfRegNum<[3, 3, 3]>;
+  def AH : Register<"ah">;
+  def DH : Register<"dh">;
+  def CH : Register<"ch">;
+  def BH : Register<"bh">;
 
   // 16-bit registers
   let SubRegIndices = [sub_8bit, sub_8bit_hi] in {
-  def AX : RegisterWithSubRegs<"ax", [AL,AH]>, DwarfRegNum<[0, 0, 0]>;
-  def DX : RegisterWithSubRegs<"dx", [DL,DH]>, DwarfRegNum<[1, 2, 2]>;
-  def CX : RegisterWithSubRegs<"cx", [CL,CH]>, DwarfRegNum<[2, 1, 1]>;
-  def BX : RegisterWithSubRegs<"bx", [BL,BH]>, DwarfRegNum<[3, 3, 3]>;
+  def AX : RegisterWithSubRegs<"ax", [AL,AH]>;
+  def DX : RegisterWithSubRegs<"dx", [DL,DH]>;
+  def CX : RegisterWithSubRegs<"cx", [CL,CH]>;
+  def BX : RegisterWithSubRegs<"bx", [BL,BH]>;
   }
   let SubRegIndices = [sub_8bit] in {
-  def SI : RegisterWithSubRegs<"si", [SIL]>, DwarfRegNum<[4, 6, 6]>;
-  def DI : RegisterWithSubRegs<"di", [DIL]>, DwarfRegNum<[5, 7, 7]>;
-  def BP : RegisterWithSubRegs<"bp", [BPL]>, DwarfRegNum<[6, 4, 5]>;
-  def SP : RegisterWithSubRegs<"sp", [SPL]>, DwarfRegNum<[7, 5, 4]>;
+  def SI : RegisterWithSubRegs<"si", [SIL]>;
+  def DI : RegisterWithSubRegs<"di", [DIL]>;
+  def BP : RegisterWithSubRegs<"bp", [BPL]>;
+  def SP : RegisterWithSubRegs<"sp", [SPL]>;
   }
-  def IP : Register<"ip">, DwarfRegNum<[16]>;
-
-  // X86-64 only
-  let SubRegIndices = [sub_8bit] in {
-  def R8W  : RegisterWithSubRegs<"r8w", [R8B]>, DwarfRegNum<[8, -2, -2]>;
-  def R9W  : RegisterWithSubRegs<"r9w", [R9B]>, DwarfRegNum<[9, -2, -2]>;
-  def R10W : RegisterWithSubRegs<"r10w", [R10B]>, DwarfRegNum<[10, -2, -2]>;
-  def R11W : RegisterWithSubRegs<"r11w", [R11B]>, DwarfRegNum<[11, -2, -2]>;
-  def R12W : RegisterWithSubRegs<"r12w", [R12B]>, DwarfRegNum<[12, -2, -2]>;
-  def R13W : RegisterWithSubRegs<"r13w", [R13B]>, DwarfRegNum<[13, -2, -2]>;
-  def R14W : RegisterWithSubRegs<"r14w", [R14B]>, DwarfRegNum<[14, -2, -2]>;
-  def R15W : RegisterWithSubRegs<"r15w", [R15B]>, DwarfRegNum<[15, -2, -2]>;
+  def IP : Register<"ip">;
+
+  // X86-64 only, requires REX.
+  let SubRegIndices = [sub_8bit], CostPerUse = 1 in {
+  def R8W  : RegisterWithSubRegs<"r8w", [R8B]>;
+  def R9W  : RegisterWithSubRegs<"r9w", [R9B]>;
+  def R10W : RegisterWithSubRegs<"r10w", [R10B]>;
+  def R11W : RegisterWithSubRegs<"r11w", [R11B]>;
+  def R12W : RegisterWithSubRegs<"r12w", [R12B]>;
+  def R13W : RegisterWithSubRegs<"r13w", [R13B]>;
+  def R14W : RegisterWithSubRegs<"r14w", [R14B]>;
+  def R15W : RegisterWithSubRegs<"r15w", [R15B]>;
   }
   // 32-bit registers
   let SubRegIndices = [sub_16bit] in {
-  def EAX : RegisterWithSubRegs<"eax", [AX]>, DwarfRegNum<[0, 0, 0]>;
-  def EDX : RegisterWithSubRegs<"edx", [DX]>, DwarfRegNum<[1, 2, 2]>;
-  def ECX : RegisterWithSubRegs<"ecx", [CX]>, DwarfRegNum<[2, 1, 1]>;
-  def EBX : RegisterWithSubRegs<"ebx", [BX]>, DwarfRegNum<[3, 3, 3]>;
-  def ESI : RegisterWithSubRegs<"esi", [SI]>, DwarfRegNum<[4, 6, 6]>;
-  def EDI : RegisterWithSubRegs<"edi", [DI]>, DwarfRegNum<[5, 7, 7]>;
-  def EBP : RegisterWithSubRegs<"ebp", [BP]>, DwarfRegNum<[6, 4, 5]>;
-  def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[7, 5, 4]>;
-  def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[16, 8, 8]>;
-
-  // X86-64 only
-  def R8D  : RegisterWithSubRegs<"r8d", [R8W]>, DwarfRegNum<[8, -2, -2]>;
-  def R9D  : RegisterWithSubRegs<"r9d", [R9W]>, DwarfRegNum<[9, -2, -2]>;
-  def R10D : RegisterWithSubRegs<"r10d", [R10W]>, DwarfRegNum<[10, -2, -2]>;
-  def R11D : RegisterWithSubRegs<"r11d", [R11W]>, DwarfRegNum<[11, -2, -2]>;
-  def R12D : RegisterWithSubRegs<"r12d", [R12W]>, DwarfRegNum<[12, -2, -2]>;
-  def R13D : RegisterWithSubRegs<"r13d", [R13W]>, DwarfRegNum<[13, -2, -2]>;
-  def R14D : RegisterWithSubRegs<"r14d", [R14W]>, DwarfRegNum<[14, -2, -2]>;
-  def R15D : RegisterWithSubRegs<"r15d", [R15W]>, DwarfRegNum<[15, -2, -2]>;
-  }
+  def EAX : RegisterWithSubRegs<"eax", [AX]>, DwarfRegNum<[-2, 0, 0]>;
+  def EDX : RegisterWithSubRegs<"edx", [DX]>, DwarfRegNum<[-2, 2, 2]>;
+  def ECX : RegisterWithSubRegs<"ecx", [CX]>, DwarfRegNum<[-2, 1, 1]>;
+  def EBX : RegisterWithSubRegs<"ebx", [BX]>, DwarfRegNum<[-2, 3, 3]>;
+  def ESI : RegisterWithSubRegs<"esi", [SI]>, DwarfRegNum<[-2, 6, 6]>;
+  def EDI : RegisterWithSubRegs<"edi", [DI]>, DwarfRegNum<[-2, 7, 7]>;
+  def EBP : RegisterWithSubRegs<"ebp", [BP]>, DwarfRegNum<[-2, 4, 5]>;
+  def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[-2, 5, 4]>;
+  def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[-2, 8, 8]>;
+
+  // X86-64 only, requires REX
+  let CostPerUse = 1 in {
+  def R8D  : RegisterWithSubRegs<"r8d", [R8W]>;
+  def R9D  : RegisterWithSubRegs<"r9d", [R9W]>;
+  def R10D : RegisterWithSubRegs<"r10d", [R10W]>;
+  def R11D : RegisterWithSubRegs<"r11d", [R11W]>;
+  def R12D : RegisterWithSubRegs<"r12d", [R12W]>;
+  def R13D : RegisterWithSubRegs<"r13d", [R13W]>;
+  def R14D : RegisterWithSubRegs<"r14d", [R14W]>;
+  def R15D : RegisterWithSubRegs<"r15d", [R15W]>;
+  }}
 
   // 64-bit registers, X86-64 only
   let SubRegIndices = [sub_32bit] in {
@@ -127,6 +130,8 @@ let Namespace = "X86" in {
   def RBP : RegisterWithSubRegs<"rbp", [EBP]>, DwarfRegNum<[6, -2, -2]>;
   def RSP : RegisterWithSubRegs<"rsp", [ESP]>, DwarfRegNum<[7, -2, -2]>;
 
+  // These also require REX.
+  let CostPerUse = 1 in {
   def R8  : RegisterWithSubRegs<"r8", [R8D]>, DwarfRegNum<[8, -2, -2]>;
   def R9  : RegisterWithSubRegs<"r9", [R9D]>, DwarfRegNum<[9, -2, -2]>;
   def R10 : RegisterWithSubRegs<"r10", [R10D]>, DwarfRegNum<[10, -2, -2]>;
@@ -136,7 +141,7 @@ let Namespace = "X86" in {
   def R14 : RegisterWithSubRegs<"r14", [R14D]>, DwarfRegNum<[14, -2, -2]>;
   def R15 : RegisterWithSubRegs<"r15", [R15D]>, DwarfRegNum<[15, -2, -2]>;
   def RIP : RegisterWithSubRegs<"rip", [EIP]>,  DwarfRegNum<[16, -2, -2]>;
-  }
+  }}
 
   // MMX Registers. These are actually aliased to ST0 .. ST7
   def MM0 : Register<"mm0">, DwarfRegNum<[41, 29, 29]>;
@@ -170,6 +175,7 @@ let Namespace = "X86" in {
   def XMM7: Register<"xmm7">, DwarfRegNum<[24, 28, 28]>;
 
   // X86-64 only
+  let CostPerUse = 1 in {
   def XMM8:  Register<"xmm8">,  DwarfRegNum<[25, -2, -2]>;
   def XMM9:  Register<"xmm9">,  DwarfRegNum<[26, -2, -2]>;
   def XMM10: Register<"xmm10">, DwarfRegNum<[27, -2, -2]>;
@@ -178,26 +184,26 @@ let Namespace = "X86" in {
   def XMM13: Register<"xmm13">, DwarfRegNum<[30, -2, -2]>;
   def XMM14: Register<"xmm14">, DwarfRegNum<[31, -2, -2]>;
   def XMM15: Register<"xmm15">, DwarfRegNum<[32, -2, -2]>;
-  }
+  }}
 
   // YMM Registers, used by AVX instructions
   let SubRegIndices = [sub_xmm] in {
-  def YMM0: RegisterWithSubRegs<"ymm0", [XMM0]>, DwarfRegNum<[17, 21, 21]>;
-  def YMM1: RegisterWithSubRegs<"ymm1", [XMM1]>, DwarfRegNum<[18, 22, 22]>;
-  def YMM2: RegisterWithSubRegs<"ymm2", [XMM2]>, DwarfRegNum<[19, 23, 23]>;
-  def YMM3: RegisterWithSubRegs<"ymm3", [XMM3]>, DwarfRegNum<[20, 24, 24]>;
-  def YMM4: RegisterWithSubRegs<"ymm4", [XMM4]>, DwarfRegNum<[21, 25, 25]>;
-  def YMM5: RegisterWithSubRegs<"ymm5", [XMM5]>, DwarfRegNum<[22, 26, 26]>;
-  def YMM6: RegisterWithSubRegs<"ymm6", [XMM6]>, DwarfRegNum<[23, 27, 27]>;
-  def YMM7: RegisterWithSubRegs<"ymm7", [XMM7]>, DwarfRegNum<[24, 28, 28]>;
-  def YMM8:  RegisterWithSubRegs<"ymm8", [XMM8]>,  DwarfRegNum<[25, -2, -2]>;
-  def YMM9:  RegisterWithSubRegs<"ymm9", [XMM9]>,  DwarfRegNum<[26, -2, -2]>;
-  def YMM10: RegisterWithSubRegs<"ymm10", [XMM10]>, DwarfRegNum<[27, -2, -2]>;
-  def YMM11: RegisterWithSubRegs<"ymm11", [XMM11]>, DwarfRegNum<[28, -2, -2]>;
-  def YMM12: RegisterWithSubRegs<"ymm12", [XMM12]>, DwarfRegNum<[29, -2, -2]>;
-  def YMM13: RegisterWithSubRegs<"ymm13", [XMM13]>, DwarfRegNum<[30, -2, -2]>;
-  def YMM14: RegisterWithSubRegs<"ymm14", [XMM14]>, DwarfRegNum<[31, -2, -2]>;
-  def YMM15: RegisterWithSubRegs<"ymm15", [XMM15]>, DwarfRegNum<[32, -2, -2]>;
+  def YMM0: RegisterWithSubRegs<"ymm0", [XMM0]>, DwarfRegAlias<XMM0>;
+  def YMM1: RegisterWithSubRegs<"ymm1", [XMM1]>, DwarfRegAlias<XMM1>;
+  def YMM2: RegisterWithSubRegs<"ymm2", [XMM2]>, DwarfRegAlias<XMM2>;
+  def YMM3: RegisterWithSubRegs<"ymm3", [XMM3]>, DwarfRegAlias<XMM3>;
+  def YMM4: RegisterWithSubRegs<"ymm4", [XMM4]>, DwarfRegAlias<XMM4>;
+  def YMM5: RegisterWithSubRegs<"ymm5", [XMM5]>, DwarfRegAlias<XMM5>;
+  def YMM6: RegisterWithSubRegs<"ymm6", [XMM6]>, DwarfRegAlias<XMM6>;
+  def YMM7: RegisterWithSubRegs<"ymm7", [XMM7]>, DwarfRegAlias<XMM7>;
+  def YMM8:  RegisterWithSubRegs<"ymm8", [XMM8]>, DwarfRegAlias<XMM8>;
+  def YMM9:  RegisterWithSubRegs<"ymm9", [XMM9]>, DwarfRegAlias<XMM9>;
+  def YMM10: RegisterWithSubRegs<"ymm10", [XMM10]>, DwarfRegAlias<XMM10>;
+  def YMM11: RegisterWithSubRegs<"ymm11", [XMM11]>, DwarfRegAlias<XMM11>;
+  def YMM12: RegisterWithSubRegs<"ymm12", [XMM12]>, DwarfRegAlias<XMM12>;
+  def YMM13: RegisterWithSubRegs<"ymm13", [XMM13]>, DwarfRegAlias<XMM13>;
+  def YMM14: RegisterWithSubRegs<"ymm14", [XMM14]>, DwarfRegAlias<XMM14>;
+  def YMM15: RegisterWithSubRegs<"ymm15", [XMM15]>, DwarfRegAlias<XMM15>;
   }
 
   // Floating point stack registers
@@ -273,8 +279,8 @@ let Namespace = "X86" in {
 // require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d"
 // cannot be encoded.
 def GR8 : RegisterClass<"X86", [i8],  8,
-                        [AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL,
-                         R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B]> {
+                        (add AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL,
+                             R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B)> {
   let MethodProtos = [{
     iterator allocation_order_begin(const MachineFunction &MF) const;
     iterator allocation_order_end(const MachineFunction &MF) const;
@@ -317,152 +323,38 @@ def GR8 : RegisterClass<"X86", [i8],  8,
 }
 
 def GR16 : RegisterClass<"X86", [i16], 16,
-                         [AX, CX, DX, SI, DI, BX, BP, SP,
-                          R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W]> {
+                         (add AX, CX, DX, SI, DI, BX, BP, SP,
+                              R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)> {
   let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi)];
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned X86_GR16_AO_64[] = {
-      X86::AX,  X86::CX,   X86::DX,   X86::SI,   X86::DI,
-      X86::R8W, X86::R9W,  X86::R10W, X86::R11W,
-      X86::BX, X86::R14W, X86::R15W,  X86::R12W, X86::R13W, X86::BP
-    };
-
-    GR16Class::iterator
-    GR16Class::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      if (Subtarget.is64Bit())
-        return X86_GR16_AO_64;
-      else
-        return begin();
-    }
-
-    GR16Class::iterator
-    GR16Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
-      if (Subtarget.is64Bit()) {
-        // Does the function dedicate RBP to being a frame ptr?
-        if (TFI->hasFP(MF) || MFI->getReserveFP())
-          // If so, don't allocate SP or BP.
-          return array_endof(X86_GR16_AO_64) - 1;
-        else
-          // If not, just don't allocate SP.
-          return array_endof(X86_GR16_AO_64);
-      } else {
-        // Does the function dedicate EBP to being a frame ptr?
-        if (TFI->hasFP(MF) || MFI->getReserveFP())
-          // If so, don't allocate SP or BP.
-          return begin() + 6;
-        else
-          // If not, just don't allocate SP.
-          return begin() + 7;
-      }
-    }
-  }];
 }
 
 def GR32 : RegisterClass<"X86", [i32], 32,
-                         [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
-                          R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> {
+                         (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
+                              R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)> {
   let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned X86_GR32_AO_64[] = {
-      X86::EAX, X86::ECX,  X86::EDX,  X86::ESI,  X86::EDI,
-      X86::R8D, X86::R9D,  X86::R10D, X86::R11D,
-      X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP
-    };
-
-    GR32Class::iterator
-    GR32Class::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      if (Subtarget.is64Bit())
-        return X86_GR32_AO_64;
-      else
-        return begin();
-    }
-
-    GR32Class::iterator
-    GR32Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
-      if (Subtarget.is64Bit()) {
-        // Does the function dedicate RBP to being a frame ptr?
-        if (TFI->hasFP(MF) || MFI->getReserveFP())
-          // If so, don't allocate ESP or EBP.
-          return array_endof(X86_GR32_AO_64) - 1;
-        else
-          // If not, just don't allocate ESP.
-          return array_endof(X86_GR32_AO_64);
-      } else {
-        // Does the function dedicate EBP to being a frame ptr?
-        if (TFI->hasFP(MF) || MFI->getReserveFP())
-          // If so, don't allocate ESP or EBP.
-          return begin() + 6;
-        else
-          // If not, just don't allocate ESP.
-          return begin() + 7;
-      }
-    }
-  }];
 }
 
 // GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since
 // RIP isn't really a register and it can't be used anywhere except in an
 // address, but it doesn't cause trouble.
 def GR64 : RegisterClass<"X86", [i64], 64,
-                         [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
-                          RBX, R14, R15, R12, R13, RBP, RSP, RIP]> {
+                         (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+                              RBX, R14, R15, R12, R13, RBP, RSP, RIP)> {
   let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi),
                        (GR16 sub_16bit),
                        (GR32 sub_32bit)];
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GR64Class::iterator
-    GR64Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
-      if (!Subtarget.is64Bit())
-        return begin();  // None of these are allocatable in 32-bit.
-      // Does the function dedicate RBP to being a frame ptr?
-      if (TFI->hasFP(MF) || MFI->getReserveFP())
-        return end()-3;  // If so, don't allocate RIP, RSP or RBP
-      else
-        return end()-2;  // If not, just don't allocate RIP or RSP
-    }
-  }];
 }
 
 // Segment registers for use by MOV instructions (and others) that have a
 //   segment register as one operand.  Always contain a 16-bit segment
 //   descriptor.
-def SEGMENT_REG : RegisterClass<"X86", [i16], 16, [CS, DS, SS, ES, FS, GS]>;
+def SEGMENT_REG : RegisterClass<"X86", [i16], 16, (add CS, DS, SS, ES, FS, GS)>;
 
 // Debug registers.
-def DEBUG_REG : RegisterClass<"X86", [i32], 32,
-                              [DR0, DR1, DR2, DR3, DR4, DR5, DR6, DR7]>;
+def DEBUG_REG : RegisterClass<"X86", [i32], 32, (sequence "DR%u", 0, 7)>;
 
 // Control registers.
-def CONTROL_REG : RegisterClass<"X86", [i64], 64,
-                                [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7, CR8,
-                                 CR9, CR10, CR11, CR12, CR13, CR14, CR15]>;
+def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>;
 
 // GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of
 // GR8, GR16, GR32, and GR64 which contain just the "a" "b", "c", and "d"
@@ -470,38 +362,38 @@ def CONTROL_REG : RegisterClass<"X86", [i64], 64,
 // that support 8-bit subreg operations. On x86-64, GR16_ABCD, GR32_ABCD,
 // and GR64_ABCD are classes for registers that support 8-bit h-register
 // operations.
-def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, [AL, CL, DL, BL]>;
-def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, [AH, CH, DH, BH]>;
-def GR16_ABCD : RegisterClass<"X86", [i16], 16, [AX, CX, DX, BX]> {
+def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, BL)>;
+def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>;
+def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)> {
   let SubRegClasses = [(GR8_ABCD_L sub_8bit), (GR8_ABCD_H sub_8bit_hi)];
 }
-def GR32_ABCD : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX, EBX]> {
+def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)> {
   let SubRegClasses = [(GR8_ABCD_L sub_8bit),
                        (GR8_ABCD_H sub_8bit_hi),
                        (GR16_ABCD sub_16bit)];
 }
-def GR64_ABCD : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, RBX]> {
+def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)> {
   let SubRegClasses = [(GR8_ABCD_L sub_8bit),
                        (GR8_ABCD_H sub_8bit_hi),
                        (GR16_ABCD sub_16bit),
                        (GR32_ABCD sub_32bit)];
 }
-def GR32_TC   : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX]> {
+def GR32_TC   : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)> {
   let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
 }
-def GR64_TC   : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, RSI, RDI,
-                                                 R8, R9, R11]> {
+def GR64_TC   : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI,
+                                                     R8, R9, R11, RIP)> {
   let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi),
                        (GR16 sub_16bit),
                        (GR32_TC sub_32bit)];
 }
 
-def GR64_TCW64   : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX,
-                                                    R8, R9, R11]>;
+def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX,
+                                                      R8, R9, R11)>;
 
 // GR8_NOREX - GR8 registers which do not require a REX prefix.
 def GR8_NOREX : RegisterClass<"X86", [i8], 8,
-                              [AL, CL, DL, AH, CH, DH, BL, BH]> {
+                              (add AL, CL, DL, AH, CH, DH, BL, BH)> {
   let MethodProtos = [{
     iterator allocation_order_begin(const MachineFunction &MF) const;
     iterator allocation_order_end(const MachineFunction &MF) const;
@@ -535,232 +427,62 @@ def GR8_NOREX : RegisterClass<"X86", [i8], 8,
 }
 // GR16_NOREX - GR16 registers which do not require a REX prefix.
 def GR16_NOREX : RegisterClass<"X86", [i16], 16,
-                               [AX, CX, DX, SI, DI, BX, BP, SP]> {
+                               (add AX, CX, DX, SI, DI, BX, BP, SP)> {
   let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi)];
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GR16_NOREXClass::iterator
-    GR16_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
-      // Does the function dedicate RBP / EBP to being a frame ptr?
-      if (TFI->hasFP(MF) || MFI->getReserveFP())
-        // If so, don't allocate SP or BP.
-        return end() - 2;
-      else
-        // If not, just don't allocate SP.
-        return end() - 1;
-    }
-  }];
 }
 // GR32_NOREX - GR32 registers which do not require a REX prefix.
 def GR32_NOREX : RegisterClass<"X86", [i32], 32,
-                               [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP]> {
+                               (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)> {
   let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi),
                        (GR16_NOREX sub_16bit)];
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GR32_NOREXClass::iterator
-    GR32_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
-      // Does the function dedicate RBP / EBP to being a frame ptr?
-      if (TFI->hasFP(MF) || MFI->getReserveFP())
-        // If so, don't allocate ESP or EBP.
-        return end() - 2;
-      else
-        // If not, just don't allocate ESP.
-        return end() - 1;
-    }
-  }];
 }
 // GR64_NOREX - GR64 registers which do not require a REX prefix.
 def GR64_NOREX : RegisterClass<"X86", [i64], 64,
-                               [RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP]> {
+                            (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)> {
   let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi),
                        (GR16_NOREX sub_16bit),
                        (GR32_NOREX sub_32bit)];
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GR64_NOREXClass::iterator
-    GR64_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
-      // Does the function dedicate RBP to being a frame ptr?
-      if (TFI->hasFP(MF) || MFI->getReserveFP())
-        // If so, don't allocate RIP, RSP or RBP.
-        return end() - 3;
-      else
-        // If not, just don't allocate RIP or RSP.
-        return end() - 2;
-    }
-  }];
 }
 
 // GR32_NOSP - GR32 registers except ESP.
-def GR32_NOSP : RegisterClass<"X86", [i32], 32,
-                              [EAX, ECX, EDX, ESI, EDI, EBX, EBP,
-                               R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> {
+def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)> {
   let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned X86_GR32_NOSP_AO_64[] = {
-      X86::EAX, X86::ECX,  X86::EDX,  X86::ESI,  X86::EDI,
-      X86::R8D, X86::R9D,  X86::R10D, X86::R11D,
-      X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP
-    };
-
-    GR32_NOSPClass::iterator
-    GR32_NOSPClass::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      if (Subtarget.is64Bit())
-        return X86_GR32_NOSP_AO_64;
-      else
-        return begin();
-    }
-
-    GR32_NOSPClass::iterator
-    GR32_NOSPClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
-      if (Subtarget.is64Bit()) {
-        // Does the function dedicate RBP to being a frame ptr?
-        if (TFI->hasFP(MF) || MFI->getReserveFP())
-          // If so, don't allocate EBP.
-          return array_endof(X86_GR32_NOSP_AO_64) - 1;
-        else
-          // If not, any reg in this class is ok.
-          return array_endof(X86_GR32_NOSP_AO_64);
-      } else {
-        // Does the function dedicate EBP to being a frame ptr?
-        if (TFI->hasFP(MF) || MFI->getReserveFP())
-          // If so, don't allocate EBP.
-          return begin() + 6;
-        else
-          // If not, any reg in this class is ok.
-          return begin() + 7;
-      }
-    }
-  }];
 }
 
 // GR64_NOSP - GR64 registers except RSP (and RIP).
-def GR64_NOSP : RegisterClass<"X86", [i64], 64,
-                              [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
-                               RBX, R14, R15, R12, R13, RBP]> {
+def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)> {
   let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi),
                        (GR16 sub_16bit),
                        (GR32_NOSP sub_32bit)];
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GR64_NOSPClass::iterator
-    GR64_NOSPClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
-      if (!Subtarget.is64Bit())
-        return begin();  // None of these are allocatable in 32-bit.
-      // Does the function dedicate RBP to being a frame ptr?
-      if (TFI->hasFP(MF) || MFI->getReserveFP())
-        return end()-1;  // If so, don't allocate RBP
-      else
-        return end();  // If not, any reg in this class is ok.
-    }
-  }];
+}
+
+// GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except
+// ESP.
+def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32,
+                                    (and GR32_NOREX, GR32_NOSP)> {
+  let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi),
+                       (GR16_NOREX sub_16bit)];
 }
 
 // GR64_NOREX_NOSP - GR64_NOREX registers except RSP.
 def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64,
-                                    [RAX, RCX, RDX, RSI, RDI, RBX, RBP]> {
+                                    (and GR64_NOREX, GR64_NOSP)> {
   let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi),
                        (GR16_NOREX sub_16bit),
-                       (GR32_NOREX sub_32bit)];
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GR64_NOREX_NOSPClass::iterator
-    GR64_NOREX_NOSPClass::allocation_order_end(const MachineFunction &MF) const
-  {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
-      // Does the function dedicate RBP to being a frame ptr?
-      if (TFI->hasFP(MF) || MFI->getReserveFP())
-        // If so, don't allocate RBP.
-        return end() - 1;
-      else
-        // If not, any reg in this class is ok.
-        return end();
-    }
-  }];
+                       (GR32_NOREX_NOSP sub_32bit)];
 }
 
 // A class to support the 'A' assembler constraint: EAX then EDX.
-def GR32_AD : RegisterClass<"X86", [i32], 32, [EAX, EDX]> {
+def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)> {
   let SubRegClasses = [(GR8_ABCD_L sub_8bit),
                        (GR8_ABCD_H sub_8bit_hi),
                        (GR16_ABCD sub_16bit)];
 }
 
 // Scalar SSE2 floating point registers.
-def FR32 : RegisterClass<"X86", [f32], 32,
-                         [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
-                          XMM8, XMM9, XMM10, XMM11,
-                          XMM12, XMM13, XMM14, XMM15]> {
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    FR32Class::iterator
-    FR32Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      if (!Subtarget.is64Bit())
-        return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
-      else
-        return end();
-    }
-  }];
-}
+def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
 
-def FR64 : RegisterClass<"X86", [f64], 64,
-                         [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
-                          XMM8, XMM9, XMM10, XMM11,
-                          XMM12, XMM13, XMM14, XMM15]> {
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    FR64Class::iterator
-    FR64Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      if (!Subtarget.is64Bit())
-        return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
-      else
-        return end();
-    }
-  }];
-}
+def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>;
 
 
 // FIXME: This sets up the floating point register files as though they are f64
@@ -769,85 +491,31 @@ def FR64 : RegisterClass<"X86", [f64], 64,
 // faster on common hardware.  In reality, this should be controlled by a
 // command line option or something.
 
-def RFP32 : RegisterClass<"X86",[f32], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
-def RFP64 : RegisterClass<"X86",[f64], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
-def RFP80 : RegisterClass<"X86",[f80], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
+def RFP32 : RegisterClass<"X86",[f32], 32, (sequence "FP%u", 0, 6)>;
+def RFP64 : RegisterClass<"X86",[f64], 32, (add RFP32)>;
+def RFP80 : RegisterClass<"X86",[f80], 32, (add RFP32)>;
 
 // Floating point stack registers (these are not allocatable by the
 // register allocator - the floating point stackifier is responsible
 // for transforming FPn allocations to STn registers)
-def RST : RegisterClass<"X86", [f80, f64, f32], 32,
-                        [ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7]> {
-    let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    RSTClass::iterator
-    RSTClass::allocation_order_end(const MachineFunction &MF) const {
-      return begin();
-    }
-  }];
+def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> {
+  let isAllocatable = 0;
 }
 
 // Generic vector registers: VR64 and VR128.
-def VR64: RegisterClass<"X86", [x86mmx], 64,
-                          [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7]>;
-def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128,
-                          [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
-                           XMM8, XMM9, XMM10, XMM11,
-                           XMM12, XMM13, XMM14, XMM15]> {
+def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>;
+def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                          128, (add FR32)> {
   let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd)];
-
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    VR128Class::iterator
-    VR128Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      if (!Subtarget.is64Bit())
-        return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
-      else
-        return end();
-    }
-  }];
 }
 
 def VR256 : RegisterClass<"X86", [v32i8, v8i32, v4i64, v8f32, v4f64], 256,
-                          [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
-                           YMM8, YMM9, YMM10, YMM11,
-                           YMM12, YMM13, YMM14, YMM15]> {
+                          (sequence "YMM%u", 0, 15)> {
   let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd), (VR128 sub_xmm)];
-
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    VR256Class::iterator
-    VR256Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      if (!Subtarget.is64Bit())
-        return end()-8; // Only YMM0 to YMM7 are available in 32-bit mode.
-      else
-        return end();
-    }
-  }];
 }
 
 // Status flags registers.
-def CCR : RegisterClass<"X86", [i32], 32, [EFLAGS]> {
+def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> {
   let CopyCost = -1;  // Don't allow copying of status registers.
-
-  // EFLAGS is not allocatable.
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    CCRClass::iterator
-    CCRClass::allocation_order_end(const MachineFunction &MF) const {
-      return allocation_order_begin(MF);
-    }
-  }];
+  let isAllocatable = 0;
 }
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index 42e8193..02754f9 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -178,7 +178,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
                                         bool isVolatile, bool AlwaysInline,
                                          MachinePointerInfo DstPtrInfo,
                                          MachinePointerInfo SrcPtrInfo) const {
-  // This requires the copy size to be a constant, preferrably
+  // This requires the copy size to be a constant, preferably
   // within a subtarget-specific limit.
   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
   if (!ConstantSize)
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 1ee7312..481e821 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -144,7 +144,8 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
 /// passed as the second argument. Otherwise it returns null.
 const char *X86Subtarget::getBZeroEntry() const {
   // Darwin 10 has a __bzero entry point for this purpose.
-  if (getDarwinVers() >= 10)
+  if (getTargetTriple().isMacOSX() &&
+      !getTargetTriple().isMacOSXVersionLT(10, 6))
     return "__bzero";
 
   return 0;
@@ -264,6 +265,7 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
 
   HasCLMUL = IsIntel && ((ECX >> 1) & 0x1);
   HasFMA3  = IsIntel && ((ECX >> 12) & 0x1);
+  HasPOPCNT = IsIntel && ((ECX >> 23) & 0x1);
   HasAES   = IsIntel && ((ECX >> 25) & 0x1);
 
   if (IsIntel || IsAMD) {
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 0a62a02..286a798 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -165,9 +165,15 @@ public:
   bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
   bool hasVectorUAMem() const { return HasVectorUAMem; }
 
-  bool isTargetDarwin() const { return TargetTriple.getOS() == Triple::Darwin; }
-  bool isTargetFreeBSD() const { return TargetTriple.getOS() == Triple::FreeBSD; }
-  bool isTargetSolaris() const { return TargetTriple.getOS() == Triple::Solaris; }
+  const Triple &getTargetTriple() const { return TargetTriple; }
+
+  bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+  bool isTargetFreeBSD() const {
+    return TargetTriple.getOS() == Triple::FreeBSD;
+  }
+  bool isTargetSolaris() const {
+    return TargetTriple.getOS() == Triple::Solaris;
+  }
 
   // ELF is a reasonably sane default and the only other X86 targets we
   // support are Darwin and Windows. Just use "not those".
@@ -215,13 +221,6 @@ public:
     return PICStyle == PICStyles::StubDynamicNoPIC ||
            PICStyle == PICStyles::StubPIC; }
 
-  /// getDarwinVers - Return the darwin version number, 8 = Tiger, 9 = Leopard,
-  /// 10 = Snow Leopard, etc.
-  unsigned getDarwinVers() const {
-    if (isTargetDarwin()) return TargetTriple.getDarwinMajorNumber();
-    return 0;
-  }
-
   /// ClassifyGlobalReference - Classify a global variable reference for the
   /// current subtarget according to how we should reference it in a non-pcrel
   /// context.
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 8fb9470..7483329 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -26,19 +26,18 @@ using namespace llvm;
 
 static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
   Triple TheTriple(TT);
-  switch (TheTriple.getOS()) {
-  case Triple::Darwin:
-    return new X86MCAsmInfoDarwin(TheTriple);
-  case Triple::MinGW32:
-  case Triple::Cygwin:
-  case Triple::Win32:
-    if (TheTriple.getEnvironment() == Triple::MachO)
-      return new X86MCAsmInfoDarwin(TheTriple);
+
+  if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) {
+    if (TheTriple.getArch() == Triple::x86_64)
+      return new X86_64MCAsmInfoDarwin(TheTriple);
     else
-      return new X86MCAsmInfoCOFF(TheTriple);
-  default:
-    return new X86ELFMCAsmInfo(TheTriple);
+      return new X86MCAsmInfoDarwin(TheTriple);
   }
+
+  if (TheTriple.isOSWindows())
+    return new X86MCAsmInfoCOFF(TheTriple);
+
+  return new X86ELFMCAsmInfo(TheTriple);
 }
 
 static MCStreamer *createMCStreamer(const Target &T, const std::string &TT,
@@ -48,19 +47,14 @@ static MCStreamer *createMCStreamer(const Target &T, const std::string &TT,
                                     bool RelaxAll,
                                     bool NoExecStack) {
   Triple TheTriple(TT);
-  switch (TheTriple.getOS()) {
-  case Triple::Darwin:
+
+  if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
     return createMachOStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll);
-  case Triple::MinGW32:
-  case Triple::Cygwin:
-  case Triple::Win32:
-    if (TheTriple.getEnvironment() == Triple::MachO)
-      return createMachOStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll);
-    else
-      return createWinCOFFStreamer(Ctx, TAB, *_Emitter, _OS, RelaxAll);
-  default:
-    return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, NoExecStack);
-  }
+
+  if (TheTriple.isOSWindows())
+    return createWinCOFFStreamer(Ctx, TAB, *_Emitter, _OS, RelaxAll);
+
+  return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, NoExecStack);
 }
 
 extern "C" void LLVMInitializeX86Target() {
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index c15dfbb..1231798 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -38,6 +38,12 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
     getExprForDwarfGlobalReference(GV, Mang, MMI, Encoding, Streamer);
 }
 
+MCSymbol *X8664_MachoTargetObjectFile::
+getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
+                        MachineModuleInfo *MMI) const {
+  return Mang->getSymbol(GV);
+}
+
 unsigned X8632_ELFTargetObjectFile::getPersonalityEncoding() const {
   if (TM.getRelocationModel() == Reloc::PIC_)
     return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4;
@@ -52,7 +58,7 @@ unsigned X8632_ELFTargetObjectFile::getLSDAEncoding() const {
     return DW_EH_PE_absptr;
 }
 
-unsigned X8632_ELFTargetObjectFile::getFDEEncoding() const {
+unsigned X8632_ELFTargetObjectFile::getFDEEncoding(bool FDE) const {
   if (TM.getRelocationModel() == Reloc::PIC_)
     return DW_EH_PE_pcrel | DW_EH_PE_sdata4;
   else
@@ -91,17 +97,14 @@ unsigned X8664_ELFTargetObjectFile::getLSDAEncoding() const {
   return DW_EH_PE_absptr;
 }
 
-unsigned X8664_ELFTargetObjectFile::getFDEEncoding() const {
-  CodeModel::Model Model = TM.getCodeModel();
-  if (TM.getRelocationModel() == Reloc::PIC_)
-    return DW_EH_PE_pcrel | (Model == CodeModel::Small ||
-                             Model == CodeModel::Medium ?
-                             DW_EH_PE_sdata4 : DW_EH_PE_sdata8);
+unsigned X8664_ELFTargetObjectFile::getFDEEncoding(bool CFI) const {
+  if (CFI)
+    return DW_EH_PE_pcrel | DW_EH_PE_sdata4;
 
-  if (Model == CodeModel::Small || Model == CodeModel::Medium)
-    return DW_EH_PE_udata4;
+  if (TM.getRelocationModel() == Reloc::PIC_)
+    return DW_EH_PE_pcrel | DW_EH_PE_sdata4;
 
-  return DW_EH_PE_absptr;
+  return DW_EH_PE_udata4;
 }
 
 unsigned X8664_ELFTargetObjectFile::getTTypeEncoding() const {
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index f2fd49c..e21b5bf 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -25,6 +25,12 @@ namespace llvm {
     getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
                                    MachineModuleInfo *MMI, unsigned Encoding,
                                    MCStreamer &Streamer) const;
+
+    // getCFIPersonalitySymbol - The symbol that gets passed to
+    // .cfi_personality.
+    virtual MCSymbol *
+    getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
+                            MachineModuleInfo *MMI) const;
   };
 
   class X8632_ELFTargetObjectFile : public TargetLoweringObjectFileELF {
@@ -34,7 +40,7 @@ namespace llvm {
       :TM(tm) { }
     virtual unsigned getPersonalityEncoding() const;
     virtual unsigned getLSDAEncoding() const;
-    virtual unsigned getFDEEncoding() const;
+    virtual unsigned getFDEEncoding(bool CFI) const;
     virtual unsigned getTTypeEncoding() const;
   };
 
@@ -45,7 +51,7 @@ namespace llvm {
       :TM(tm) { }
     virtual unsigned getPersonalityEncoding() const;
     virtual unsigned getLSDAEncoding() const;
-    virtual unsigned getFDEEncoding() const;
+    virtual unsigned getFDEEncoding(bool CFI) const;
     virtual unsigned getTTypeEncoding() const;
   };
 
diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index 8ce93fd..a8dd847 100644
--- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -30,8 +30,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include <queue>
-#include <set>
 using namespace llvm;
 
 /// XCoreDAGToDAGISel - XCore specific code to select XCore machine
@@ -207,6 +205,16 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
     return CurDAG->getMachineNode(XCore::LMUL_l6r, dl, MVT::i32, MVT::i32,
                                   Ops, 4);
   }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+    switch (IntNo) {
+    case Intrinsic::xcore_crc8:
+      SDValue Ops[] = { N->getOperand(1), N->getOperand(2), N->getOperand(3) };
+      return CurDAG->getMachineNode(XCore::CRC8_l4r, dl, MVT::i32, MVT::i32,
+                                    Ops, 3);
+    }
+    break;
+  }
   case ISD::BRIND:
     if (SDNode *ResNode = SelectBRIND(N))
       return ResNode;
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 4817787..8cabbbf 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -37,8 +37,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/ADT/VectorExtras.h"
-#include <queue>
-#include <set>
 using namespace llvm;
 
 const char *XCoreTargetLowering::
@@ -158,6 +156,8 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::ADD);
+
+  setMinFunctionAlignment(1);
 }
 
 SDValue XCoreTargetLowering::
@@ -203,12 +203,6 @@ void XCoreTargetLowering::ReplaceNodeResults(SDNode *N,
   }
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned XCoreTargetLowering::
-getFunctionAlignment(const Function *) const {
-  return 1;
-}
-
 //===----------------------------------------------------------------------===//
 //  Misc Lower Operation implementation
 //===----------------------------------------------------------------------===//
@@ -250,9 +244,6 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
 {
   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), MVT::i32);
-  // If it's a debug information descriptor, don't mess with it.
-  if (DAG.isVerifiedDebugInfoDesc(Op))
-    return GA;
   return getGlobalAddressWrapper(GA, GV, DAG);
 }
 
@@ -906,8 +897,8 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
 
   // The ABI dictates there should be one stack slot available to the callee
   // on function entry (for saving lr).
@@ -967,7 +958,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
 
   // Build a sequence of copy-to-reg nodes chained together with token
   // chain and flag operands which copy the outgoing args into registers.
-  // The InFlag in necessary since all emited instructions must be
+  // The InFlag in necessary since all emitted instructions must be
   // stuck together.
   SDValue InFlag;
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
@@ -1029,8 +1020,8 @@ XCoreTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
   CCInfo.AnalyzeCallResult(Ins, RetCC_XCore);
 
@@ -1089,8 +1080,8 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
 
   CCInfo.AnalyzeFormalArguments(Ins, CC_XCore);
 
@@ -1194,12 +1185,12 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
 //===----------------------------------------------------------------------===//
 
 bool XCoreTargetLowering::
-CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
+CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+	       bool isVarArg,
                const SmallVectorImpl<ISD::OutputArg> &Outs,
                LLVMContext &Context) const {
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, Context);
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
   return CCInfo.CheckReturn(Outs, RetCC_XCore);
 }
 
@@ -1215,10 +1206,10 @@ XCoreTargetLowering::LowerReturn(SDValue Chain,
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
-  // Analize return values.
+  // Analyze return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_XCore);
 
   // If this is the first return lowered for this function, add
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index bb3f2cc..a8d67d4 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -103,9 +103,6 @@ namespace llvm {
     virtual bool isLegalAddressingMode(const AddrMode &AM,
                                        const Type *Ty) const;
 
-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const;
-
   private:
     const XCoreTargetMachine &TM;
     const XCoreSubtarget &Subtarget;
@@ -194,7 +191,8 @@ namespace llvm {
                   DebugLoc dl, SelectionDAG &DAG) const;
 
     virtual bool
-      CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
+      CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+		     bool isVarArg,
                      const SmallVectorImpl<ISD::OutputArg> &ArgsFlags,
                      LLVMContext &Context) const;
   };
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index 789546e..55c7527 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -472,7 +472,16 @@ def REMU_l3r : FL3R<"remu", urem>;
 }
 def XOR_l3r : FL3R<"xor", xor>;
 defm ASHR : FL3R_L2RBITP<"ashr", sra>;
-// TODO crc32, crc8, inpw, outpw
+
+let Constraints = "$src1 = $dst" in
+def CRC_l3r : _FL3R<(outs GRRegs:$dst),
+                     (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+                     "crc32 $dst, $src2, $src3",
+                     [(set GRRegs:$dst,
+                        (int_xcore_crc32 GRRegs:$src1, GRRegs:$src2,
+                                         GRRegs:$src3))]>;
+
+// TODO inpw, outpw
 let mayStore=1 in {
 def ST16_l3r : _FL3R<(outs), (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
                 "st16 $val, $addr[$offset]",
@@ -498,6 +507,12 @@ def MACCS_l4r : _L4R<(outs GRRegs:$dst1, GRRegs:$dst2),
                     []>;
 }
 
+let Constraints = "$src1 = $dst1" in
+def CRC8_l4r : _L4R<(outs GRRegs:$dst1, GRRegs:$dst2),
+                    (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+                    "crc8 $dst1, $dst2, $src2, $src3",
+                    []>;
+
 // Five operand long
 
 def LADD_l5r : _L5R<(outs GRRegs:$dst1, GRRegs:$dst2),
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index 0287a51..46c9e57 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -68,8 +68,8 @@ unsigned XCoreRegisterInfo::getNumArgRegs(const MachineFunction *MF)
 }
 
 bool XCoreRegisterInfo::needsFrameMoves(const MachineFunction &MF) {
-  return MF.getMMI().hasDebugInfo() || !MF.getFunction()->doesNotThrow() ||
-          UnwindTablesMandatory;
+  return MF.getMMI().hasDebugInfo() ||
+    MF.getFunction()->needsUnwindTableEntry();
 }
 
 const unsigned* XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
@@ -315,6 +315,10 @@ int XCoreRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   return XCoreGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
 }
 
+int XCoreRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
+  return XCoreGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
+}
+
 unsigned XCoreRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
 
diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h
index 770483b..7a9bc9f 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.h
+++ b/lib/Target/XCore/XCoreRegisterInfo.h
@@ -75,6 +75,7 @@ public:
 
   //! Get DWARF debugging register number
   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/XCore/XCoreRegisterInfo.td b/lib/Target/XCore/XCoreRegisterInfo.td
index 765f717..c354230 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.td
+++ b/lib/Target/XCore/XCoreRegisterInfo.td
@@ -44,48 +44,13 @@ def LR : Ri<15, "lr">, DwarfRegNum<[15]>;
 //
 def GRRegs : RegisterClass<"XCore", [i32], 32,
   // Return values and arguments
-  [R0, R1, R2, R3,
+  (add R0, R1, R2, R3,
   // Not preserved across procedure calls
   R11,
   // Callee save
-  R4, R5, R6, R7, R8, R9, R10]> {
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GRRegsClass::iterator
-    GRRegsClass::allocation_order_begin(const MachineFunction &MF) const {
-      return begin();
-    }
-    GRRegsClass::iterator
-    GRRegsClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return end()-1;  // don't allocate R10
-      else
-        return end();
-    }
-  }];
-}
+  R4, R5, R6, R7, R8, R9, R10)>;
 
-def RRegs : RegisterClass<"XCore", [i32], 32,
-  // Reserved
-  [CP, DP, SP, LR]> {
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    RRegsClass::iterator
-    RRegsClass::allocation_order_begin(const MachineFunction &MF) const {
-      return begin();
-    }
-    RRegsClass::iterator
-    RRegsClass::allocation_order_end(const MachineFunction &MF) const {
-      // No allocatable registers
-      return begin();
-    }
-  }];
+// Reserved
+def RRegs : RegisterClass<"XCore", [i32], 32, (add CP, DP, SP, LR)> {
+  let isAllocatable = 0;
 }
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index 0c650cf..54a7f67 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -771,8 +771,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
   // function empty.
   NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
 
-  // Loop over the argument list, transfering uses of the old arguments over to
-  // the new arguments, also transfering over the names as well.
+  // Loop over the argument list, transferring uses of the old arguments over to
+  // the new arguments, also transferring over the names as well.
   //
   for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
        I2 = NF->arg_begin(); I != E; ++I) {
diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt
index efdeec5..179b150 100644
--- a/lib/Transforms/IPO/CMakeLists.txt
+++ b/lib/Transforms/IPO/CMakeLists.txt
@@ -20,5 +20,4 @@ add_llvm_library(LLVMipo
   PruneEH.cpp
   StripDeadPrototypes.cpp
   StripSymbols.cpp
-  StructRetPromotion.cpp
   )
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 4d1f7ab..d4eaf0c 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -49,7 +49,7 @@ namespace {
 
     /// Struct that represents (part of) either a return value or a function
     /// argument.  Used so that arguments and return values can be used
-    /// interchangably.
+    /// interchangeably.
     struct RetOrArg {
       RetOrArg(const Function *F, unsigned Idx, bool IsArg) : F(F), Idx(Idx),
                IsArg(IsArg) {}
@@ -273,8 +273,8 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
   // function empty.
   NF->getBasicBlockList().splice(NF->begin(), Fn.getBasicBlockList());
 
-  // Loop over the argument list, transfering uses of the old arguments over to
-  // the new arguments, also transfering over the names as well.  While we're at
+  // Loop over the argument list, transferring uses of the old arguments over to
+  // the new arguments, also transferring over the names as well.  While we're at
   // it, remove the dead arguments from the DeadArguments list.
   //
   for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(),
@@ -379,7 +379,7 @@ DAE::Liveness DAE::SurveyUse(Value::const_use_iterator U,
       // The value is returned from a function. It's only live when the
       // function's return value is live. We use RetValNum here, for the case
       // that U is really a use of an insertvalue instruction that uses the
-      // orginal Use.
+      // original Use.
       RetOrArg Use = CreateRet(RI->getParent()->getParent(), RetValNum);
       // We might be live, depending on the liveness of Use.
       return MarkIfNotLive(Use, MaybeLiveUses);
@@ -894,8 +894,8 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
   // function empty.
   NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
 
-  // Loop over the argument list, transfering uses of the old arguments over to
-  // the new arguments, also transfering over the names as well.
+  // Loop over the argument list, transferring uses of the old arguments over to
+  // the new arguments, also transferring over the names as well.
   i = 0;
   for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
        I2 = NF->arg_begin(); I != E; ++I, ++i)
diff --git a/lib/Transforms/IPO/DeadTypeElimination.cpp b/lib/Transforms/IPO/DeadTypeElimination.cpp
index a509931..d3d4963 100644
--- a/lib/Transforms/IPO/DeadTypeElimination.cpp
+++ b/lib/Transforms/IPO/DeadTypeElimination.cpp
@@ -83,7 +83,8 @@ bool DTE::runOnModule(Module &M) {
   bool Changed = false;
 
   TypeSymbolTable &ST = M.getTypeSymbolTable();
-  std::set<const Type *> UsedTypes = getAnalysis<FindUsedTypes>().getTypes();
+  const SetVector<const Type*> &T = getAnalysis<FindUsedTypes>().getTypes();
+  std::set<const Type*> UsedTypes(T.begin(), T.end());
 
   // Check the symbol table for superfluous type entries...
   //
diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp
index 9d432de..d9911bf 100644
--- a/lib/Transforms/IPO/ExtractGV.cpp
+++ b/lib/Transforms/IPO/ExtractGV.cpp
@@ -51,20 +51,32 @@ namespace {
       // Visit the GlobalVariables.
       for (Module::global_iterator I = M.global_begin(), E = M.global_end();
            I != E; ++I) {
+        if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) {
+          I->setInitializer(0);
+	} else {
+	  if (I->hasAvailableExternallyLinkage())
+	    continue;
+	  if (I->getName() == "llvm.global_ctors")
+	    continue;
+	}
+
         if (I->hasLocalLinkage())
           I->setVisibility(GlobalValue::HiddenVisibility);
         I->setLinkage(GlobalValue::ExternalLinkage);
-        if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration())
-          I->setInitializer(0);
       }
 
       // Visit the Functions.
       for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+        if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) {
+          I->deleteBody();
+	} else {
+	  if (I->hasAvailableExternallyLinkage())
+	    continue;
+	}
+
         if (I->hasLocalLinkage())
           I->setVisibility(GlobalValue::HiddenVisibility);
         I->setLinkage(GlobalValue::ExternalLinkage);
-        if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration())
-          I->deleteBody();
       }
 
       return true;
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 5b6ed2c..cdf7b76 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Module.h"
+#include "llvm/Operator.h"
 #include "llvm/Pass.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
@@ -240,15 +241,15 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS,
         GS.HasPHIUser = true;
       } else if (isa<CmpInst>(I)) {
         GS.isCompared = true;
-      } else if (isa<MemTransferInst>(I)) {
-        const MemTransferInst *MTI = cast<MemTransferInst>(I);
+      } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
+        if (MTI->isVolatile()) return true;
         if (MTI->getArgOperand(0) == V)
           GS.StoredType = GlobalStatus::isStored;
         if (MTI->getArgOperand(1) == V)
           GS.isLoaded = true;
-      } else if (isa<MemSetInst>(I)) {
-        assert(cast<MemSetInst>(I)->getArgOperand(0) == V &&
-               "Memset only takes one pointer!");
+      } else if (const MemSetInst *MSI = dyn_cast<MemSetInst>(I)) {
+        assert(MSI->getArgOperand(0) == V && "Memset only takes one pointer!");
+        if (MSI->isVolatile()) return true;
         GS.StoredType = GlobalStatus::isStored;
       } else {
         return true;  // Any other non-load instruction might take address!
@@ -798,7 +799,8 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV) {
       // If we get here we could have other crazy uses that are transitively
       // loaded.
       assert((isa<PHINode>(GlobalUser) || isa<SelectInst>(GlobalUser) ||
-              isa<ConstantExpr>(GlobalUser)) && "Only expect load and stores!");
+              isa<ConstantExpr>(GlobalUser) || isa<CmpInst>(GlobalUser)) &&
+             "Only expect load and stores!");
     }
   }
 
@@ -1588,8 +1590,7 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
       GV->getInitializer()->isNullValue()) {
     if (Constant *SOVC = dyn_cast<Constant>(StoredOnceVal)) {
       if (GV->getInitializer()->getType() != SOVC->getType())
-        SOVC =
-         ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType());
+        SOVC = ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType());
 
       // Optimize away any trapping uses of the loaded value.
       if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC))
@@ -1953,12 +1954,15 @@ GlobalVariable *GlobalOpt::FindGlobalCtors(Module &M) {
   // Verify that the initializer is simple enough for us to handle. We are
   // only allowed to optimize the initializer if it is unique.
   if (!GV->hasUniqueInitializer()) return 0;
-  
+
+  if (isa<ConstantAggregateZero>(GV->getInitializer()))
+    return GV;
   ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
-  
+
   for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i) {
+    if (isa<ConstantAggregateZero>(*i))
+      continue;
     ConstantStruct *CS = cast<ConstantStruct>(*i);
-    
     if (isa<ConstantPointerNull>(CS->getOperand(1)))
       continue;
 
@@ -1978,6 +1982,8 @@ GlobalVariable *GlobalOpt::FindGlobalCtors(Module &M) {
 /// ParseGlobalCtors - Given a llvm.global_ctors list that we can understand,
 /// return a list of the functions and null terminator as a vector.
 static std::vector<Function*> ParseGlobalCtors(GlobalVariable *GV) {
+  if (GV->getInitializer()->isNullValue())
+    return std::vector<Function*>();
   ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
   std::vector<Function*> Result;
   Result.reserve(CA->getNumOperands());
@@ -2008,7 +2014,7 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL,
       const PointerType *PFTy = PointerType::getUnqual(FTy);
       CSVals[1] = Constant::getNullValue(PFTy);
       CSVals[0] = ConstantInt::get(Type::getInt32Ty(GCL->getContext()),
-                                   2147483647);
+                                   0x7fffffff);
     }
     CAList.push_back(ConstantStruct::get(GCL->getContext(), CSVals, false));
   }
@@ -2432,6 +2438,20 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
       // Cannot handle inline asm.
       if (isa<InlineAsm>(CI->getCalledValue())) return false;
 
+      if (MemSetInst *MSI = dyn_cast<MemSetInst>(CI)) {
+        if (MSI->isVolatile()) return false;
+        Constant *Ptr = getVal(Values, MSI->getDest());
+        Constant *Val = getVal(Values, MSI->getValue());
+        Constant *DestVal = ComputeLoadResult(getVal(Values, Ptr),
+                                              MutatedMemory);
+        if (Val->isNullValue() && DestVal && DestVal->isNullValue()) {
+          // This memset is a no-op.
+          ++CurInst;
+          continue;
+        }
+        return false;
+      }
+
       // Resolve function pointers.
       Function *Callee = dyn_cast<Function>(getVal(Values,
                                                    CI->getCalledValue()));
diff --git a/lib/Transforms/IPO/IPO.cpp b/lib/Transforms/IPO/IPO.cpp
index fbe90ce..21dcb51 100644
--- a/lib/Transforms/IPO/IPO.cpp
+++ b/lib/Transforms/IPO/IPO.cpp
@@ -45,7 +45,6 @@ void llvm::initializeIPO(PassRegistry &Registry) {
   initializeStripDebugDeclarePass(Registry);
   initializeStripDeadDebugInfoPass(Registry);
   initializeStripNonDebugSymbolsPass(Registry);
-  initializeSRETPromotionPass(Registry);
 }
 
 void LLVMInitializeIPO(LLVMPassRegistryRef R) {
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 37eafd7..57f3e77 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -29,7 +29,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
-#include <set>
 using namespace llvm;
 
 STATISTIC(NumInlined, "Number of functions inlined");
diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index a38d2c2..f741443 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -55,6 +55,7 @@
 #include "llvm/Instructions.h"
 #include "llvm/LLVMContext.h"
 #include "llvm/Module.h"
+#include "llvm/Operator.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/Debug.h"
diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp
index d91c2c4..2f3baeb 100644
--- a/lib/Transforms/IPO/PruneEH.cpp
+++ b/lib/Transforms/IPO/PruneEH.cpp
@@ -27,7 +27,6 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Support/CFG.h"
-#include <set>
 #include <algorithm>
 using namespace llvm;
 
@@ -181,6 +180,7 @@ bool PruneEH::SimplifyFunction(Function *F) {
         Call->takeName(II);
         Call->setCallingConv(II->getCallingConv());
         Call->setAttributes(II->getAttributes());
+        Call->setDebugLoc(II->getDebugLoc());
 
         // Anything that used the value produced by the invoke instruction
         // now uses the value produced by the call instruction.  Note that we
@@ -239,7 +239,7 @@ void PruneEH::DeleteBasicBlock(BasicBlock *BB) {
   for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; ) {
     --I;
     if (CallInst *CI = dyn_cast<CallInst>(I)) {
-      if (!isa<DbgInfoIntrinsic>(I))
+      if (!isa<IntrinsicInst>(I))
         CGN->removeCallEdgeFor(CI);
     } else if (InvokeInst *II = dyn_cast<InvokeInst>(I))
       CGN->removeCallEdgeFor(II);
diff --git a/lib/Transforms/IPO/StructRetPromotion.cpp b/lib/Transforms/IPO/StructRetPromotion.cpp
deleted file mode 100644
index 584deac..0000000
--- a/lib/Transforms/IPO/StructRetPromotion.cpp
+++ /dev/null
@@ -1,357 +0,0 @@
-//===-- StructRetPromotion.cpp - Promote sret arguments -------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass finds functions that return a struct (using a pointer to the struct
-// as the first argument of the function, marked with the 'sret' attribute) and
-// replaces them with a new function that simply returns each of the elements of
-// that struct (using multiple return values).
-//
-// This pass works under a number of conditions:
-//  1. The returned struct must not contain other structs
-//  2. The returned struct must only be used to load values from
-//  3. The placeholder struct passed in is the result of an alloca
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "sretpromotion"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/CallGraphSCCPass.h"
-#include "llvm/Instructions.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Support/CallSite.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-STATISTIC(NumRejectedSRETUses , "Number of sret rejected due to unexpected uses");
-STATISTIC(NumSRET , "Number of sret promoted");
-namespace {
-  /// SRETPromotion - This pass removes sret parameter and updates
-  /// function to use multiple return value.
-  ///
-  struct SRETPromotion : public CallGraphSCCPass {
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      CallGraphSCCPass::getAnalysisUsage(AU);
-    }
-
-    virtual bool runOnSCC(CallGraphSCC &SCC);
-    static char ID; // Pass identification, replacement for typeid
-    SRETPromotion() : CallGraphSCCPass(ID) {
-      initializeSRETPromotionPass(*PassRegistry::getPassRegistry());
-    }
-
-  private:
-    CallGraphNode *PromoteReturn(CallGraphNode *CGN);
-    bool isSafeToUpdateAllCallers(Function *F);
-    Function *cloneFunctionBody(Function *F, const StructType *STy);
-    CallGraphNode *updateCallSites(Function *F, Function *NF);
-  };
-}
-
-char SRETPromotion::ID = 0;
-INITIALIZE_PASS_BEGIN(SRETPromotion, "sretpromotion",
-                "Promote sret arguments to multiple ret values", false, false)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
-INITIALIZE_PASS_END(SRETPromotion, "sretpromotion",
-                "Promote sret arguments to multiple ret values", false, false)
-
-Pass *llvm::createStructRetPromotionPass() {
-  return new SRETPromotion();
-}
-
-bool SRETPromotion::runOnSCC(CallGraphSCC &SCC) {
-  bool Changed = false;
-
-  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I)
-    if (CallGraphNode *NewNode = PromoteReturn(*I)) {
-      SCC.ReplaceNode(*I, NewNode);
-      Changed = true;
-    }
-
-  return Changed;
-}
-
-/// PromoteReturn - This method promotes function that uses StructRet paramater 
-/// into a function that uses multiple return values.
-CallGraphNode *SRETPromotion::PromoteReturn(CallGraphNode *CGN) {
-  Function *F = CGN->getFunction();
-
-  if (!F || F->isDeclaration() || !F->hasLocalLinkage())
-    return 0;
-
-  // Make sure that function returns struct.
-  if (F->arg_size() == 0 || !F->hasStructRetAttr() || F->doesNotReturn())
-    return 0;
-
-  DEBUG(dbgs() << "SretPromotion: Looking at sret function " 
-        << F->getName() << "\n");
-
-  assert(F->getReturnType()->isVoidTy() && "Invalid function return type");
-  Function::arg_iterator AI = F->arg_begin();
-  const llvm::PointerType *FArgType = dyn_cast<PointerType>(AI->getType());
-  assert(FArgType && "Invalid sret parameter type");
-  const llvm::StructType *STy = 
-    dyn_cast<StructType>(FArgType->getElementType());
-  assert(STy && "Invalid sret parameter element type");
-
-  // Check if it is ok to perform this promotion.
-  if (isSafeToUpdateAllCallers(F) == false) {
-    DEBUG(dbgs() << "SretPromotion: Not all callers can be updated\n");
-    ++NumRejectedSRETUses;
-    return 0;
-  }
-
-  DEBUG(dbgs() << "SretPromotion: sret argument will be promoted\n");
-  ++NumSRET;
-  // [1] Replace use of sret parameter 
-  AllocaInst *TheAlloca = new AllocaInst(STy, NULL, "mrv", 
-                                         F->getEntryBlock().begin());
-  Value *NFirstArg = F->arg_begin();
-  NFirstArg->replaceAllUsesWith(TheAlloca);
-
-  // [2] Find and replace ret instructions
-  for (Function::iterator FI = F->begin(), FE = F->end();  FI != FE; ++FI) 
-    for(BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ) {
-      Instruction *I = BI;
-      ++BI;
-      if (isa<ReturnInst>(I)) {
-        Value *NV = new LoadInst(TheAlloca, "mrv.ld", I);
-        ReturnInst *NR = ReturnInst::Create(F->getContext(), NV, I);
-        I->replaceAllUsesWith(NR);
-        I->eraseFromParent();
-      }
-    }
-
-  // [3] Create the new function body and insert it into the module.
-  Function *NF = cloneFunctionBody(F, STy);
-
-  // [4] Update all call sites to use new function
-  CallGraphNode *NF_CFN = updateCallSites(F, NF);
-
-  CallGraph &CG = getAnalysis<CallGraph>();
-  NF_CFN->stealCalledFunctionsFrom(CG[F]);
-
-  delete CG.removeFunctionFromModule(F);
-  return NF_CFN;
-}
-
-// Check if it is ok to perform this promotion.
-bool SRETPromotion::isSafeToUpdateAllCallers(Function *F) {
-
-  if (F->use_empty())
-    // No users. OK to modify signature.
-    return true;
-
-  for (Value::use_iterator FnUseI = F->use_begin(), FnUseE = F->use_end();
-       FnUseI != FnUseE; ++FnUseI) {
-    // The function is passed in as an argument to (possibly) another function,
-    // we can't change it!
-    CallSite CS(*FnUseI);
-    Instruction *Call = CS.getInstruction();
-    // The function is used by something else than a call or invoke instruction,
-    // we can't change it!
-    if (!Call || !CS.isCallee(FnUseI))
-      return false;
-    CallSite::arg_iterator AI = CS.arg_begin();
-    Value *FirstArg = *AI;
-
-    if (!isa<AllocaInst>(FirstArg))
-      return false;
-
-    // Check FirstArg's users.
-    for (Value::use_iterator ArgI = FirstArg->use_begin(), 
-           ArgE = FirstArg->use_end(); ArgI != ArgE; ++ArgI) {
-      User *U = *ArgI;
-      // If FirstArg user is a CallInst that does not correspond to current
-      // call site then this function F is not suitable for sret promotion.
-      if (CallInst *CI = dyn_cast<CallInst>(U)) {
-        if (CI != Call)
-          return false;
-      }
-      // If FirstArg user is a GEP whose all users are not LoadInst then
-      // this function F is not suitable for sret promotion.
-      else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
-        // TODO : Use dom info and insert PHINodes to collect get results
-        // from multiple call sites for this GEP.
-        if (GEP->getParent() != Call->getParent())
-          return false;
-        for (Value::use_iterator GEPI = GEP->use_begin(), GEPE = GEP->use_end();
-             GEPI != GEPE; ++GEPI) 
-          if (!isa<LoadInst>(*GEPI))
-            return false;
-      } 
-      // Any other FirstArg users make this function unsuitable for sret 
-      // promotion.
-      else
-        return false;
-    }
-  }
-
-  return true;
-}
-
-/// cloneFunctionBody - Create a new function based on F and
-/// insert it into module. Remove first argument. Use STy as
-/// the return type for new function.
-Function *SRETPromotion::cloneFunctionBody(Function *F, 
-                                           const StructType *STy) {
-
-  const FunctionType *FTy = F->getFunctionType();
-  std::vector<const Type*> Params;
-
-  // Attributes - Keep track of the parameter attributes for the arguments.
-  SmallVector<AttributeWithIndex, 8> AttributesVec;
-  const AttrListPtr &PAL = F->getAttributes();
-
-  // Add any return attributes.
-  if (Attributes attrs = PAL.getRetAttributes())
-    AttributesVec.push_back(AttributeWithIndex::get(0, attrs));
-
-  // Skip first argument.
-  Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
-  ++I;
-  // 0th parameter attribute is reserved for return type.
-  // 1th parameter attribute is for first 1st sret argument.
-  unsigned ParamIndex = 2; 
-  while (I != E) {
-    Params.push_back(I->getType());
-    if (Attributes Attrs = PAL.getParamAttributes(ParamIndex))
-      AttributesVec.push_back(AttributeWithIndex::get(ParamIndex - 1, Attrs));
-    ++I;
-    ++ParamIndex;
-  }
-
-  // Add any fn attributes.
-  if (Attributes attrs = PAL.getFnAttributes())
-    AttributesVec.push_back(AttributeWithIndex::get(~0, attrs));
-
-
-  FunctionType *NFTy = FunctionType::get(STy, Params, FTy->isVarArg());
-  Function *NF = Function::Create(NFTy, F->getLinkage());
-  NF->takeName(F);
-  NF->copyAttributesFrom(F);
-  NF->setAttributes(AttrListPtr::get(AttributesVec.begin(), AttributesVec.end()));
-  F->getParent()->getFunctionList().insert(F, NF);
-  NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
-
-  // Replace arguments
-  I = F->arg_begin();
-  E = F->arg_end();
-  Function::arg_iterator NI = NF->arg_begin();
-  ++I;
-  while (I != E) {
-    I->replaceAllUsesWith(NI);
-    NI->takeName(I);
-    ++I;
-    ++NI;
-  }
-
-  return NF;
-}
-
-/// updateCallSites - Update all sites that call F to use NF.
-CallGraphNode *SRETPromotion::updateCallSites(Function *F, Function *NF) {
-  CallGraph &CG = getAnalysis<CallGraph>();
-  SmallVector<Value*, 16> Args;
-
-  // Attributes - Keep track of the parameter attributes for the arguments.
-  SmallVector<AttributeWithIndex, 8> ArgAttrsVec;
-
-  // Get a new callgraph node for NF.
-  CallGraphNode *NF_CGN = CG.getOrInsertFunction(NF);
-
-  while (!F->use_empty()) {
-    CallSite CS(*F->use_begin());
-    Instruction *Call = CS.getInstruction();
-
-    const AttrListPtr &PAL = F->getAttributes();
-    // Add any return attributes.
-    if (Attributes attrs = PAL.getRetAttributes())
-      ArgAttrsVec.push_back(AttributeWithIndex::get(0, attrs));
-
-    // Copy arguments, however skip first one.
-    CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
-    Value *FirstCArg = *AI;
-    ++AI;
-    // 0th parameter attribute is reserved for return type.
-    // 1th parameter attribute is for first 1st sret argument.
-    unsigned ParamIndex = 2; 
-    while (AI != AE) {
-      Args.push_back(*AI); 
-      if (Attributes Attrs = PAL.getParamAttributes(ParamIndex))
-        ArgAttrsVec.push_back(AttributeWithIndex::get(ParamIndex - 1, Attrs));
-      ++ParamIndex;
-      ++AI;
-    }
-
-    // Add any function attributes.
-    if (Attributes attrs = PAL.getFnAttributes())
-      ArgAttrsVec.push_back(AttributeWithIndex::get(~0, attrs));
-    
-    AttrListPtr NewPAL = AttrListPtr::get(ArgAttrsVec.begin(), ArgAttrsVec.end());
-    
-    // Build new call instruction.
-    Instruction *New;
-    if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
-      New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
-                               Args.begin(), Args.end(), "", Call);
-      cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
-      cast<InvokeInst>(New)->setAttributes(NewPAL);
-    } else {
-      New = CallInst::Create(NF, Args.begin(), Args.end(), "", Call);
-      cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
-      cast<CallInst>(New)->setAttributes(NewPAL);
-      if (cast<CallInst>(Call)->isTailCall())
-        cast<CallInst>(New)->setTailCall();
-    }
-    Args.clear();
-    ArgAttrsVec.clear();
-    New->takeName(Call);
-
-    // Update the callgraph to know that the callsite has been transformed.
-    CallGraphNode *CalleeNode = CG[Call->getParent()->getParent()];
-    CalleeNode->removeCallEdgeFor(Call);
-    CalleeNode->addCalledFunction(New, NF_CGN);
-    
-    // Update all users of sret parameter to extract value using extractvalue.
-    for (Value::use_iterator UI = FirstCArg->use_begin(), 
-           UE = FirstCArg->use_end(); UI != UE; ) {
-      User *U2 = *UI++;
-      CallInst *C2 = dyn_cast<CallInst>(U2);
-      if (C2 && (C2 == Call))
-        continue;
-      
-      GetElementPtrInst *UGEP = cast<GetElementPtrInst>(U2);
-      ConstantInt *Idx = cast<ConstantInt>(UGEP->getOperand(2));
-      Value *GR = ExtractValueInst::Create(New, Idx->getZExtValue(),
-                                           "evi", UGEP);
-      while(!UGEP->use_empty()) {
-        // isSafeToUpdateAllCallers has checked that all GEP uses are
-        // LoadInsts
-        LoadInst *L = cast<LoadInst>(*UGEP->use_begin());
-        L->replaceAllUsesWith(GR);
-        L->eraseFromParent();
-      }
-      UGEP->eraseFromParent();
-      continue;
-    }
-    Call->eraseFromParent();
-  }
-  
-  return NF_CGN;
-}
-
diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h
index 625f546..8257d6b 100644
--- a/lib/Transforms/InstCombine/InstCombine.h
+++ b/lib/Transforms/InstCombine/InstCombine.h
@@ -11,6 +11,7 @@
 #define INSTCOMBINE_INSTCOMBINE_H
 
 #include "InstCombineWorklist.h"
+#include "llvm/Operator.h"
 #include "llvm/Pass.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Support/IRBuilder.h"
@@ -69,7 +70,6 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner
                              : public FunctionPass,
                                public InstVisitor<InstCombiner, Instruction*> {
   TargetData *TD;
-  bool MustPreserveLCSSA;
   bool MadeIRChange;
 public:
   /// Worklist - All of the instructions that need to be simplified.
@@ -233,7 +233,15 @@ public:
     Worklist.Add(New);
     return New;
   }
-      
+
+  // InsertNewInstWith - same as InsertNewInstBefore, but also sets the 
+  // debug loc.
+  //
+  Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) {
+    New->setDebugLoc(Old.getDebugLoc());
+    return InsertNewInstBefore(New, Old);
+  }
+
   // ReplaceInstUsesWith - This method is to be used when an instruction is
   // found to be dead, replacable with another preexisting expression.  Here
   // we add all uses of I to the worklist, replace all uses of I with the new
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 1cb18e1..a08446e 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -331,7 +331,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
 
 
 /// InsertRangeTest - Emit a computation of: (V >= Lo && V < Hi) if Inside is
-/// true, otherwise (V < Lo || V >= Hi).  In pratice, we emit the more efficient
+/// true, otherwise (V < Lo || V >= Hi).  In practice, we emit the more efficient
 /// (V-Lo) <u Hi-Lo.  This method expects that Lo <= Hi. isSigned indicates
 /// whether to treat the V, Lo and HI as signed or not. IB is the location to
 /// insert new instructions.
@@ -769,6 +769,42 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
       return Builder->CreateICmp(LHSCC, NewOr, LHSCst);
     }
   }
+
+  // (trunc x) == C1 & (and x, CA) == C2 -> (and x, CA|CMAX) == C1|C2
+  // where CMAX is the all ones value for the truncated type,
+  // iff the lower bits of C2 and CA are zero.
+  if (LHSCC == RHSCC && ICmpInst::isEquality(LHSCC) &&
+      LHS->hasOneUse() && RHS->hasOneUse()) {
+    Value *V;
+    ConstantInt *AndCst, *SmallCst = 0, *BigCst = 0;
+
+    // (trunc x) == C1 & (and x, CA) == C2
+    if (match(Val2, m_Trunc(m_Value(V))) &&
+        match(Val, m_And(m_Specific(V), m_ConstantInt(AndCst)))) {
+      SmallCst = RHSCst;
+      BigCst = LHSCst;
+    }
+    // (and x, CA) == C2 & (trunc x) == C1
+    else if (match(Val, m_Trunc(m_Value(V))) &&
+             match(Val2, m_And(m_Specific(V), m_ConstantInt(AndCst)))) {
+      SmallCst = LHSCst;
+      BigCst = RHSCst;
+    }
+
+    if (SmallCst && BigCst) {
+      unsigned BigBitSize = BigCst->getType()->getBitWidth();
+      unsigned SmallBitSize = SmallCst->getType()->getBitWidth();
+
+      // Check that the low bits are zero.
+      APInt Low = APInt::getLowBitsSet(BigBitSize, SmallBitSize);
+      if ((Low & AndCst->getValue()) == 0 && (Low & BigCst->getValue()) == 0) {
+        Value *NewAnd = Builder->CreateAnd(V, Low | AndCst->getValue());
+        APInt N = SmallCst->getValue().zext(BigBitSize) | BigCst->getValue();
+        Value *NewVal = ConstantInt::get(AndCst->getType()->getContext(), N);
+        return Builder->CreateICmp(LHSCC, NewAnd, NewVal);
+      }
+    }
+  }
   
   // From here on, we only handle:
   //    (icmp1 A, C1) & (icmp2 A, C2) --> something simpler.
@@ -2003,7 +2039,14 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
       }
     }
   }
-  
+
+  // or(sext(A), B) -> A ? -1 : B where A is an i1
+  // or(A, sext(B)) -> B ? -1 : A where B is an i1
+  if (match(Op0, m_SExt(m_Value(A))) && A->getType()->isIntegerTy(1))
+    return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op1);
+  if (match(Op1, m_SExt(m_Value(A))) && A->getType()->isIntegerTy(1))
+    return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op0);
+
   // Note: If we've gotten to the point of visiting the outer OR, then the
   // inner one couldn't be simplified.  If it was a constant, then it won't
   // be simplified by a later pass either, so we try swapping the inner/outer
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 875e9ca..ef67701 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -111,10 +111,10 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
   
   Value *Src = Builder->CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
   Value *Dest = Builder->CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
-  Instruction *L = new LoadInst(Src, "tmp", MI->isVolatile(), SrcAlign);
-  InsertNewInstBefore(L, *MI);
-  InsertNewInstBefore(new StoreInst(L, Dest, MI->isVolatile(), DstAlign),
-                      *MI);
+  LoadInst *L = Builder->CreateLoad(Src, MI->isVolatile());
+  L->setAlignment(SrcAlign);
+  StoreInst *S = Builder->CreateStore(L, Dest, MI->isVolatile());
+  S->setAlignment(DstAlign);
 
   // Set the size of the copy to 0, it will be deleted on the next iteration.
   MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType()));
@@ -154,8 +154,9 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
     
     // Extract the fill value and store.
     uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
-    InsertNewInstBefore(new StoreInst(ConstantInt::get(ITy, Fill),
-                                      Dest, false, Alignment), *MI);
+    StoreInst *S = Builder->CreateStore(ConstantInt::get(ITy, Fill), Dest,
+                                        MI->isVolatile());
+    S->setAlignment(Alignment);
     
     // Set the size of the copy to 0, it will be deleted on the next iteration.
     MI->setLength(Constant::getNullValue(LenC->getType()));
@@ -405,20 +406,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
       if (LHSKnownNegative && RHSKnownNegative) {
         // The sign bit is set in both cases: this MUST overflow.
         // Create a simple add instruction, and insert it into the struct.
-        Instruction *Add = BinaryOperator::CreateAdd(LHS, RHS, "", &CI);
-        Worklist.Add(Add);
+        Value *Add = Builder->CreateAdd(LHS, RHS);
+        Add->takeName(&CI);
         Constant *V[] = {
-          UndefValue::get(LHS->getType()),ConstantInt::getTrue(II->getContext())
+          UndefValue::get(LHS->getType()),
+          ConstantInt::getTrue(II->getContext())
         };
         Constant *Struct = ConstantStruct::get(II->getContext(), V, 2, false);
         return InsertValueInst::Create(Struct, Add, 0);
       }
-      
+
       if (LHSKnownPositive && RHSKnownPositive) {
         // The sign bit is clear in both cases: this CANNOT overflow.
         // Create a simple add instruction, and insert it into the struct.
-        Instruction *Add = BinaryOperator::CreateNUWAdd(LHS, RHS, "", &CI);
-        Worklist.Add(Add);
+        Value *Add = Builder->CreateNUWAdd(LHS, RHS);
+        Add->takeName(&CI);
         Constant *V[] = {
           UndefValue::get(LHS->getType()),
           ConstantInt::getFalse(II->getContext())
@@ -537,11 +539,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     break;
   case Intrinsic::ppc_altivec_lvx:
   case Intrinsic::ppc_altivec_lvxl:
-  case Intrinsic::x86_sse_loadu_ps:
-  case Intrinsic::x86_sse2_loadu_pd:
-  case Intrinsic::x86_sse2_loadu_dq:
-    // Turn PPC lvx     -> load if the pointer is known aligned.
-    // Turn X86 loadups -> load if the pointer is known aligned.
+    // Turn PPC lvx -> load if the pointer is known aligned.
     if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, TD) >= 16) {
       Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
                                          PointerType::getUnqual(II->getType()));
@@ -592,6 +590,28 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     break;
   }
 
+
+  case Intrinsic::x86_sse41_pmovsxbw:
+  case Intrinsic::x86_sse41_pmovsxwd:
+  case Intrinsic::x86_sse41_pmovsxdq:
+  case Intrinsic::x86_sse41_pmovzxbw:
+  case Intrinsic::x86_sse41_pmovzxwd:
+  case Intrinsic::x86_sse41_pmovzxdq: {
+    // pmov{s|z}x ignores the upper half of their input vectors.
+    unsigned VWidth =
+      cast<VectorType>(II->getArgOperand(0)->getType())->getNumElements();
+    unsigned LowHalfElts = VWidth / 2;
+    APInt InputDemandedElts(APInt::getBitsSet(VWidth, 0, LowHalfElts));
+    APInt UndefElts(VWidth, 0);
+    if (Value *TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0),
+                                                 InputDemandedElts,
+                                                 UndefElts)) {
+      II->setArgOperand(0, TmpV);
+      return II;
+    }
+    break;
+  }
+
   case Intrinsic::ppc_altivec_vperm:
     // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
     if (ConstantVector *Mask = dyn_cast<ConstantVector>(II->getArgOperand(2))) {
@@ -817,7 +837,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
       // If OldCall dues not return void then replaceAllUsesWith undef.
       // This allows ValueHandlers and custom metadata to adjust itself.
       if (!OldCall->getType()->isVoidTy())
-        OldCall->replaceAllUsesWith(UndefValue::get(OldCall->getType()));
+        ReplaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType()));
       if (isa<CallInst>(OldCall))
         return EraseInstFromFunction(*OldCall);
       
@@ -839,8 +859,8 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
     // If CS does not return void then replaceAllUsesWith undef.
     // This allows ValueHandlers and custom metadata to adjust itself.
     if (!CS.getInstruction()->getType()->isVoidTy())
-      CS.getInstruction()->
-        replaceAllUsesWith(UndefValue::get(CS.getInstruction()->getType()));
+      ReplaceInstUsesWith(*CS.getInstruction(),
+                          UndefValue::get(CS.getInstruction()->getType()));
 
     if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
       // Don't break the CFG, insert a dummy cond branch.
@@ -1088,15 +1108,15 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
 
   Instruction *NC;
   if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
-    NC = InvokeInst::Create(Callee, II->getNormalDest(), II->getUnwindDest(),
-                            Args.begin(), Args.end(),
-                            Caller->getName(), Caller);
+    NC = Builder->CreateInvoke(Callee, II->getNormalDest(),
+                               II->getUnwindDest(), Args.begin(), Args.end());
+    NC->takeName(II);
     cast<InvokeInst>(NC)->setCallingConv(II->getCallingConv());
     cast<InvokeInst>(NC)->setAttributes(NewCallerPAL);
   } else {
-    NC = CallInst::Create(Callee, Args.begin(), Args.end(),
-                          Caller->getName(), Caller);
     CallInst *CI = cast<CallInst>(Caller);
+    NC = Builder->CreateCall(Callee, Args.begin(), Args.end());
+    NC->takeName(CI);
     if (CI->isTailCall())
       cast<CallInst>(NC)->setTailCall();
     cast<CallInst>(NC)->setCallingConv(CI->getCallingConv());
@@ -1110,6 +1130,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
       Instruction::CastOps opcode =
         CastInst::getCastOpcode(NC, false, OldRetTy, false);
       NV = NC = CastInst::Create(opcode, NC, OldRetTy, "tmp");
+      NC->setDebugLoc(Caller->getDebugLoc());
 
       // If this is an invoke instruction, we should insert it after the first
       // non-phi, instruction in the normal successor block.
@@ -1127,8 +1148,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
   }
 
   if (!Caller->use_empty())
-    Caller->replaceAllUsesWith(NV);
-  
+    ReplaceInstUsesWith(*Caller, NV);
+
   EraseInstFromFunction(*Caller);
   return true;
 }
@@ -1193,7 +1214,7 @@ Instruction *InstCombiner::transformCallThroughTrampoline(CallSite CS) {
             // Add the chain argument and attributes.
             Value *NestVal = Tramp->getArgOperand(2);
             if (NestVal->getType() != NestTy)
-              NestVal = new BitCastInst(NestVal, NestTy, "nest", Caller);
+              NestVal = Builder->CreateBitCast(NestVal, NestTy, "nest");
             NewArgs.push_back(NestVal);
             NewAttrs.push_back(AttributeWithIndex::get(NestIdx, NestAttr));
           }
@@ -1259,24 +1280,19 @@ Instruction *InstCombiner::transformCallThroughTrampoline(CallSite CS) {
       if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
         NewCaller = InvokeInst::Create(NewCallee,
                                        II->getNormalDest(), II->getUnwindDest(),
-                                       NewArgs.begin(), NewArgs.end(),
-                                       Caller->getName(), Caller);
+                                       NewArgs.begin(), NewArgs.end());
         cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv());
         cast<InvokeInst>(NewCaller)->setAttributes(NewPAL);
       } else {
-        NewCaller = CallInst::Create(NewCallee, NewArgs.begin(), NewArgs.end(),
-                                     Caller->getName(), Caller);
+        NewCaller = CallInst::Create(NewCallee, NewArgs.begin(), NewArgs.end());
         if (cast<CallInst>(Caller)->isTailCall())
           cast<CallInst>(NewCaller)->setTailCall();
         cast<CallInst>(NewCaller)->
           setCallingConv(cast<CallInst>(Caller)->getCallingConv());
         cast<CallInst>(NewCaller)->setAttributes(NewPAL);
       }
-      if (!Caller->getType()->isVoidTy())
-        Caller->replaceAllUsesWith(NewCaller);
-      Caller->eraseFromParent();
-      Worklist.Remove(Caller);
-      return 0;
+
+      return NewCaller;
     }
   }
 
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 6f70de8..601d9b4 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -71,6 +71,11 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
   // This requires TargetData to get the alloca alignment and size information.
   if (!TD) return 0;
 
+  // Insist that the amount-to-allocate not overflow.
+  OverflowingBinaryOperator *OBI =
+    dyn_cast<OverflowingBinaryOperator>(AI.getOperand(0));
+  if (OBI && !(OBI->hasNoSignedWrap() || OBI->hasNoUnsignedWrap())) return 0;
+
   const PointerType *PTy = cast<PointerType>(CI.getType());
   
   BuilderTy AllocaBuilder(*Builder);
@@ -133,7 +138,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
     // New is the allocation instruction, pointer typed. AI is the original
     // allocation instruction, also pointer typed. Thus, cast to use is BitCast.
     Value *NewCast = AllocaBuilder.CreateBitCast(New, AI.getType(), "tmpcast");
-    AI.replaceAllUsesWith(NewCast);
+    ReplaceInstUsesWith(AI, NewCast);
   }
   return ReplaceInstUsesWith(CI, New);
 }
@@ -211,7 +216,7 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, const Type *Ty,
   }
   
   Res->takeName(I);
-  return InsertNewInstBefore(Res, *I);
+  return InsertNewInstWith(Res, *I);
 }
 
 
@@ -1228,7 +1233,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
       
       
       // Remove the old Call.  With -fmath-errno, it won't get marked readnone.
-      Call->replaceAllUsesWith(UndefValue::get(Call->getType()));
+      ReplaceInstUsesWith(*Call, UndefValue::get(Call->getType()));
       EraseInstFromFunction(*Call);
       return ret;
     }
@@ -1684,8 +1689,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
     // If we found a path from the src to dest, create the getelementptr now.
     if (SrcElTy == DstElTy) {
       SmallVector<Value*, 8> Idxs(NumZeros+1, ZeroUInt);
-      return GetElementPtrInst::CreateInBounds(Src, Idxs.begin(), Idxs.end(),"",
-                                               ((Instruction*)NULL));
+      return GetElementPtrInst::CreateInBounds(Src, Idxs.begin(), Idxs.end());
     }
   }
   
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 8afd2f8..42db444 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -469,8 +469,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
 ///
 /// If we can't emit an optimized form for this expression, this returns null.
 /// 
-static Value *EvaluateGEPOffsetExpression(User *GEP, Instruction &I,
-                                          InstCombiner &IC) {
+static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) {
   TargetData &TD = *IC.getTargetData();
   gep_type_iterator GTI = gep_type_begin(GEP);
   
@@ -533,10 +532,10 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, Instruction &I,
     // Cast to intptrty in case a truncation occurs.  If an extension is needed,
     // we don't need to bother extending: the extension won't affect where the
     // computation crosses zero.
-    if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth)
-      VariableIdx = new TruncInst(VariableIdx, 
-                                  TD.getIntPtrType(VariableIdx->getContext()),
-                                  VariableIdx->getName(), &I);
+    if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth) {
+      const Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext());
+      VariableIdx = IC.Builder->CreateTrunc(VariableIdx, IntPtrTy);
+    }
     return VariableIdx;
   }
   
@@ -558,11 +557,10 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, Instruction &I,
   // Okay, we can do this evaluation.  Start by converting the index to intptr.
   const Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext());
   if (VariableIdx->getType() != IntPtrTy)
-    VariableIdx = CastInst::CreateIntegerCast(VariableIdx, IntPtrTy,
-                                              true /*SExt*/, 
-                                              VariableIdx->getName(), &I);
+    VariableIdx = IC.Builder->CreateIntCast(VariableIdx, IntPtrTy,
+                                            true /*Signed*/);
   Constant *OffsetVal = ConstantInt::get(IntPtrTy, NewOffs);
-  return BinaryOperator::CreateAdd(VariableIdx, OffsetVal, "offset", &I);
+  return IC.Builder->CreateAdd(VariableIdx, OffsetVal, "offset");
 }
 
 /// FoldGEPICmp - Fold comparisons between a GEP instruction and something
@@ -580,7 +578,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
     // This transformation (ignoring the base and scales) is valid because we
     // know pointers can't overflow since the gep is inbounds.  See if we can
     // output an optimized form.
-    Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, I, *this);
+    Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, *this);
     
     // If not, synthesize the offset the hard way.
     if (Offset == 0)
@@ -634,6 +632,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
     if (AllZeros)
       return FoldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I);
 
+    bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds();
     if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands()) {
       // If the GEPs only differ by one index, compare it.
       unsigned NumDifferences = 0;  // Keep track of # differences.
@@ -656,7 +655,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
                                ConstantInt::get(Type::getInt1Ty(I.getContext()),
                                              ICmpInst::isTrueWhenEqual(Cond)));
 
-      else if (NumDifferences == 1) {
+      else if (NumDifferences == 1 && GEPsInBounds) {
         Value *LHSV = GEPLHS->getOperand(DiffOperand);
         Value *RHSV = GEPRHS->getOperand(DiffOperand);
         // Make sure we do a signed comparison here.
@@ -667,6 +666,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
     // Only lower this if the icmp is the only user of the GEP or if we expect
     // the result to fold to a constant!
     if (TD &&
+        GEPsInBounds &&
         (isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) &&
         (isa<ConstantExpr>(GEPRHS) || GEPRHS->hasOneUse())) {
       // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2)  --->  (OFFSET1 cmp OFFSET2)
@@ -699,7 +699,7 @@ Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI,
     return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(X->getContext()));
 
   // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0,
-  // so the values can never be equal.  Similiarly for all other "or equals"
+  // so the values can never be equal.  Similarly for all other "or equals"
   // operators.
   
   // (X+1) <u X        --> X >u (MAXUINT-1)        --> X == 255
@@ -919,11 +919,11 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr,
     if (ICI.isSigned() != (Shr->getOpcode() == Instruction::AShr))
       return 0;
     
-    // Otherwise, all lshr and all exact ashr's are equivalent to a udiv/sdiv by
-    // a power of 2.  Since we already have logic to simplify these, transform
-    // to div and then simplify the resultant comparison.
+    // Otherwise, all lshr and most exact ashr's are equivalent to a udiv/sdiv
+    // by a power of 2.  Since we already have logic to simplify these,
+    // transform to div and then simplify the resultant comparison.
     if (Shr->getOpcode() == Instruction::AShr &&
-        !Shr->isExact())
+        (!Shr->isExact() || ShAmtVal == TypeBits - 1))
       return 0;
     
     // Revisit the shift (to delete it).
@@ -1087,22 +1087,33 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
         // have its sign bit set or if it is an equality comparison. 
         // Extending a relational comparison when we're checking the sign
         // bit would not work.
-        if (Cast->hasOneUse() &&
-            (ICI.isEquality() ||
-             (AndCST->getValue().isNonNegative() && RHSV.isNonNegative()))) {
-          uint32_t BitWidth = 
-            cast<IntegerType>(Cast->getOperand(0)->getType())->getBitWidth();
-          APInt NewCST = AndCST->getValue().zext(BitWidth);
-          APInt NewCI = RHSV.zext(BitWidth);
-          Value *NewAnd = 
+        if (ICI.isEquality() ||
+            (AndCST->getValue().isNonNegative() && RHSV.isNonNegative())) {
+          Value *NewAnd =
             Builder->CreateAnd(Cast->getOperand(0),
-                           ConstantInt::get(ICI.getContext(), NewCST),
-                               LHSI->getName());
+                               ConstantExpr::getZExt(AndCST, Cast->getSrcTy()));
+          NewAnd->takeName(LHSI);
           return new ICmpInst(ICI.getPredicate(), NewAnd,
-                              ConstantInt::get(ICI.getContext(), NewCI));
+                              ConstantExpr::getZExt(RHS, Cast->getSrcTy()));
         }
       }
-      
+
+      // If the LHS is an AND of a zext, and we have an equality compare, we can
+      // shrink the and/compare to the smaller type, eliminating the cast.
+      if (ZExtInst *Cast = dyn_cast<ZExtInst>(LHSI->getOperand(0))) {
+        const IntegerType *Ty = cast<IntegerType>(Cast->getSrcTy());
+        // Make sure we don't compare the upper bits, SimplifyDemandedBits
+        // should fold the icmp to true/false in that case.
+        if (ICI.isEquality() && RHSV.getActiveBits() <= Ty->getBitWidth()) {
+          Value *NewAnd =
+            Builder->CreateAnd(Cast->getOperand(0),
+                               ConstantExpr::getTrunc(AndCST, Ty));
+          NewAnd->takeName(LHSI);
+          return new ICmpInst(ICI.getPredicate(), NewAnd,
+                              ConstantExpr::getTrunc(RHS, Ty));
+        }
+      }
+
       // If this is: (X >> C1) & C2 != C3 (where any shift and any compare
       // could exist), turn it into (X & (C2 << C1)) != (C3 << C1).  This
       // happens a LOT in code produced by the C front-end, for bitfield
@@ -1384,9 +1395,9 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
           
           if (Value *NegVal = dyn_castNegVal(BOp1))
             return new ICmpInst(ICI.getPredicate(), BOp0, NegVal);
-          else if (Value *NegVal = dyn_castNegVal(BOp0))
+          if (Value *NegVal = dyn_castNegVal(BOp0))
             return new ICmpInst(ICI.getPredicate(), NegVal, BOp1);
-          else if (BO->hasOneUse()) {
+          if (BO->hasOneUse()) {
             Value *Neg = Builder->CreateNeg(BOp1);
             Neg->takeName(BO);
             return new ICmpInst(ICI.getPredicate(), BOp0, Neg);
@@ -1396,18 +1407,27 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
       case Instruction::Xor:
         // For the xor case, we can xor two constants together, eliminating
         // the explicit xor.
-        if (Constant *BOC = dyn_cast<Constant>(BO->getOperand(1)))
-          return new ICmpInst(ICI.getPredicate(), BO->getOperand(0), 
+        if (Constant *BOC = dyn_cast<Constant>(BO->getOperand(1))) {
+          return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
                               ConstantExpr::getXor(RHS, BOC));
-        
-        // FALLTHROUGH
+        } else if (RHSV == 0) {
+          // Replace ((xor A, B) != 0) with (A != B)
+          return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
+                              BO->getOperand(1));
+        }
+        break;
       case Instruction::Sub:
-        // Replace (([sub|xor] A, B) != 0) with (A != B)
-        if (RHSV == 0)
+        // Replace ((sub A, B) != C) with (B != A-C) if A & C are constants.
+        if (ConstantInt *BOp0C = dyn_cast<ConstantInt>(BO->getOperand(0))) {
+          if (BO->hasOneUse())
+            return new ICmpInst(ICI.getPredicate(), BO->getOperand(1),
+                                ConstantExpr::getSub(BOp0C, RHS));
+        } else if (RHSV == 0) {
+          // Replace ((sub A, B) != 0) with (A != B)
           return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
                               BO->getOperand(1));
+        }
         break;
-        
       case Instruction::Or:
         // If bits are being or'd in that are not present in the constant we
         // are comparing against, then the comparison could never succeed!
@@ -2400,7 +2420,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
         // fall-through
       case Instruction::SDiv:
       case Instruction::AShr:
-        if (!BO0->isExact() && !BO1->isExact())
+        if (!BO0->isExact() || !BO1->isExact())
           break;
         return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
                             BO1->getOperand(0));
@@ -2483,9 +2503,8 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
     }
 
     // (X&Z) == (Y&Z) -> (X^Y) & Z == 0
-    if (Op0->hasOneUse() && Op1->hasOneUse() &&
-        match(Op0, m_And(m_Value(A), m_Value(B))) && 
-        match(Op1, m_And(m_Value(C), m_Value(D)))) {
+    if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B)))) && 
+        match(Op1, m_OneUse(m_And(m_Value(C), m_Value(D))))) {
       Value *X = 0, *Y = 0, *Z = 0;
       
       if (A == C) {
@@ -2506,6 +2525,32 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
         return &I;
       }
     }
+    
+    // Transform "icmp eq (trunc (lshr(X, cst1)), cst" to
+    // "icmp (and X, mask), cst"
+    uint64_t ShAmt = 0;
+    ConstantInt *Cst1;
+    if (Op0->hasOneUse() &&
+        match(Op0, m_Trunc(m_OneUse(m_LShr(m_Value(A),
+                                           m_ConstantInt(ShAmt))))) &&
+        match(Op1, m_ConstantInt(Cst1)) &&
+        // Only do this when A has multiple uses.  This is most important to do
+        // when it exposes other optimizations.
+        !A->hasOneUse()) {
+      unsigned ASize =cast<IntegerType>(A->getType())->getPrimitiveSizeInBits();
+      
+      if (ShAmt < ASize) {
+        APInt MaskV =
+          APInt::getLowBitsSet(ASize, Op0->getType()->getPrimitiveSizeInBits());
+        MaskV <<= ShAmt;
+        
+        APInt CmpV = Cst1->getValue().zext(ASize);
+        CmpV <<= ShAmt;
+        
+        Value *Mask = Builder->CreateAnd(A, Builder->getInt(MaskV));
+        return new ICmpInst(I.getPredicate(), Mask, Builder->getInt(CmpV));
+      }
+    }
   }
   
   {
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 432adc9..f499290 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -57,12 +57,14 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
       Value *Idx[2];
       Idx[0] = NullIdx;
       Idx[1] = NullIdx;
-      Value *V = GetElementPtrInst::CreateInBounds(New, Idx, Idx + 2,
-                                                   New->getName()+".sub", It);
+      Instruction *GEP =
+           GetElementPtrInst::CreateInBounds(New, Idx, Idx + 2,
+                                             New->getName()+".sub");
+      InsertNewInstBefore(GEP, *It);
 
       // Now make everything use the getelementptr instead of the original
       // allocation.
-      return ReplaceInstUsesWith(AI, V);
+      return ReplaceInstUsesWith(AI, GEP);
     } else if (isa<UndefValue>(AI.getArraySize())) {
       return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
     }
@@ -600,10 +602,12 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
   // Advance to a place where it is safe to insert the new store and
   // insert it.
   BBI = DestBB->getFirstNonPHI();
-  InsertNewInstBefore(new StoreInst(MergedVal, SI.getOperand(1),
-                                    OtherStore->isVolatile(),
-                                    SI.getAlignment()), *BBI);
-  
+  StoreInst *NewSI = new StoreInst(MergedVal, SI.getOperand(1),
+                                   OtherStore->isVolatile(),
+                                   SI.getAlignment());
+  InsertNewInstBefore(NewSI, *BBI);
+  NewSI->setDebugLoc(OtherStore->getDebugLoc()); 
+
   // Nuke the old stores.
   EraseInstFromFunction(SI);
   EraseInstFromFunction(*OtherStore);
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 6651387..2d29403 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -19,6 +19,60 @@
 using namespace llvm;
 using namespace PatternMatch;
 
+
+/// simplifyValueKnownNonZero - The specific integer value is used in a context
+/// where it is known to be non-zero.  If this allows us to simplify the
+/// computation, do so and return the new operand, otherwise return null.
+static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) {
+  // If V has multiple uses, then we would have to do more analysis to determine
+  // if this is safe.  For example, the use could be in dynamically unreached
+  // code.
+  if (!V->hasOneUse()) return 0;
+  
+  bool MadeChange = false;
+
+  // ((1 << A) >>u B) --> (1 << (A-B))
+  // Because V cannot be zero, we know that B is less than A.
+  Value *A = 0, *B = 0, *PowerOf2 = 0;
+  if (match(V, m_LShr(m_OneUse(m_Shl(m_Value(PowerOf2), m_Value(A))),
+                      m_Value(B))) &&
+      // The "1" can be any value known to be a power of 2.
+      isPowerOfTwo(PowerOf2, IC.getTargetData())) {
+    A = IC.Builder->CreateSub(A, B, "tmp");
+    return IC.Builder->CreateShl(PowerOf2, A);
+  }
+  
+  // (PowerOfTwo >>u B) --> isExact since shifting out the result would make it
+  // inexact.  Similarly for <<.
+  if (BinaryOperator *I = dyn_cast<BinaryOperator>(V))
+    if (I->isLogicalShift() &&
+        isPowerOfTwo(I->getOperand(0), IC.getTargetData())) {
+      // We know that this is an exact/nuw shift and that the input is a
+      // non-zero context as well.
+      if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC)) {
+        I->setOperand(0, V2);
+        MadeChange = true;
+      }
+      
+      if (I->getOpcode() == Instruction::LShr && !I->isExact()) {
+        I->setIsExact();
+        MadeChange = true;
+      }
+      
+      if (I->getOpcode() == Instruction::Shl && !I->hasNoUnsignedWrap()) {
+        I->setHasNoUnsignedWrap();
+        MadeChange = true;
+      }
+    }
+
+  // TODO: Lots more we could do here:
+  //    If V is a phi node, we can call this on each of its operands.
+  //    "select cond, X, 0" can simplify to "X".
+  
+  return MadeChange ? V : 0;
+}
+
+
 /// MultiplyOverflows - True if the multiply can not be expressed in an int
 /// this size.
 static bool MultiplyOverflows(ConstantInt *C1, ConstantInt *C2, bool sign) {
@@ -81,6 +135,29 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
         return BinaryOperator::CreateAdd(Add, Builder->CreateMul(C1, CI));
       }
     }
+
+    // (Y - X) * (-(2**n)) -> (X - Y) * (2**n), for positive nonzero n
+    // (Y + const) * (-(2**n)) -> (-constY) * (2**n), for positive nonzero n
+    // The "* (2**n)" thus becomes a potential shifting opportunity.
+    {
+      const APInt &   Val = CI->getValue();
+      const APInt &PosVal = Val.abs();
+      if (Val.isNegative() && PosVal.isPowerOf2()) {
+        Value *X = 0, *Y = 0;
+        if (Op0->hasOneUse()) {
+          ConstantInt *C1;
+          Value *Sub = 0;
+          if (match(Op0, m_Sub(m_Value(Y), m_Value(X))))
+            Sub = Builder->CreateSub(X, Y, "suba");
+          else if (match(Op0, m_Add(m_Value(Y), m_ConstantInt(C1))))
+            Sub = Builder->CreateSub(Builder->CreateNeg(C1), Y, "subc");
+          if (Sub)
+            return
+              BinaryOperator::CreateMul(Sub,
+                                        ConstantInt::get(Y->getType(), PosVal));
+        }
+      }
+    }
   }
   
   // Simplify mul instructions with a constant RHS.
@@ -293,6 +370,12 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) {
 Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
 
+  // The RHS is known non-zero.
+  if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this)) {
+    I.setOperand(1, V);
+    return &I;
+  }
+  
   // Handle cases involving: [su]div X, (select Cond, Y, Z)
   // This does not apply for fdiv.
   if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I))
@@ -320,6 +403,10 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
     }
   }
 
+  // See if we can fold away this div instruction.
+  if (SimplifyDemandedInstructionBits(I))
+    return &I;
+
   // (X - (X rem Y)) / Y -> X / Y; usually originates as ((X / Y) * Y) / Y
   Value *X = 0, *Z = 0;
   if (match(Op0, m_Sub(m_Value(X), m_Value(Z)))) { // (X - Z) / Y; Y = Op1
@@ -332,6 +419,19 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
   return 0;
 }
 
+/// dyn_castZExtVal - Checks if V is a zext or constant that can
+/// be truncated to Ty without losing bits.
+static Value *dyn_castZExtVal(Value *V, const Type *Ty) {
+  if (ZExtInst *Z = dyn_cast<ZExtInst>(V)) {
+    if (Z->getSrcTy() == Ty)
+      return Z->getOperand(0);
+  } else if (ConstantInt *C = dyn_cast<ConstantInt>(V)) {
+    if (C->getValue().getActiveBits() <= cast<IntegerType>(Ty)->getBitWidth())
+      return ConstantExpr::getTrunc(C, Ty);
+  }
+  return 0;
+}
+
 Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
 
@@ -390,6 +490,14 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
       return SelectInst::Create(Cond, TSI, FSI);
     }
   }
+
+  // (zext A) udiv (zext B) --> zext (A udiv B)
+  if (ZExtInst *ZOp0 = dyn_cast<ZExtInst>(Op0))
+    if (Value *ZOp1 = dyn_castZExtVal(Op1, ZOp0->getSrcTy()))
+      return new ZExtInst(Builder->CreateUDiv(ZOp0->getOperand(0), ZOp1, "div",
+                                              I.isExact()),
+                          I.getType());
+
   return 0;
 }
 
@@ -467,28 +575,6 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
   return 0;
 }
 
-/// This function implements the transforms on rem instructions that work
-/// regardless of the kind of rem instruction it is (urem, srem, or frem). It 
-/// is used by the visitors to those instructions.
-/// @brief Transforms common to all three rem instructions
-Instruction *InstCombiner::commonRemTransforms(BinaryOperator &I) {
-  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
-  if (isa<UndefValue>(Op0)) {             // undef % X -> 0
-    if (I.getType()->isFPOrFPVectorTy())
-      return ReplaceInstUsesWith(I, Op0);  // X % undef -> undef (could be SNaN)
-    return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
-  }
-  if (isa<UndefValue>(Op1))
-    return ReplaceInstUsesWith(I, Op1);  // X % undef -> undef
-
-  // Handle cases involving: rem X, (select Cond, Y, Z)
-  if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I))
-    return &I;
-
-  return 0;
-}
-
 /// This function implements the transforms common to both integer remainder
 /// instructions (urem and srem). It is called by the visitors to those integer
 /// remainder instructions.
@@ -496,26 +582,17 @@ Instruction *InstCombiner::commonRemTransforms(BinaryOperator &I) {
 Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) {
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
 
-  if (Instruction *common = commonRemTransforms(I))
-    return common;
-
-  // X % X == 0
-  if (Op0 == Op1)
-    return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
-
-  // 0 % X == 0 for integer, we don't need to preserve faults!
-  if (Constant *LHS = dyn_cast<Constant>(Op0))
-    if (LHS->isNullValue())
-      return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+  // The RHS is known non-zero.
+  if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this)) {
+    I.setOperand(1, V);
+    return &I;
+  }
 
-  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
-    // X % 0 == undef, we don't need to preserve faults!
-    if (RHS->equalsInt(0))
-      return ReplaceInstUsesWith(I, UndefValue::get(I.getType()));
-    
-    if (RHS->equalsInt(1))  // X % 1 == 0
-      return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+  // Handle cases involving: rem X, (select Cond, Y, Z)
+  if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I))
+    return &I;
 
+  if (isa<ConstantInt>(Op1)) {
     if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) {
       if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) {
         if (Instruction *R = FoldOpIntoSelect(I, SI))
@@ -537,6 +614,9 @@ Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) {
 Instruction *InstCombiner::visitURem(BinaryOperator &I) {
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
 
+  if (Value *V = SimplifyURemInst(Op0, Op1, TD))
+    return ReplaceInstUsesWith(I, V);
+
   if (Instruction *common = commonIRemTransforms(I))
     return common;
   
@@ -564,13 +644,22 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) {
       return SelectInst::Create(Cond, TrueAnd, FalseAnd);
     }
   }
-  
+
+  // (zext A) urem (zext B) --> zext (A urem B)
+  if (ZExtInst *ZOp0 = dyn_cast<ZExtInst>(Op0))
+    if (Value *ZOp1 = dyn_castZExtVal(Op1, ZOp0->getSrcTy()))
+      return new ZExtInst(Builder->CreateURem(ZOp0->getOperand(0), ZOp1),
+                          I.getType());
+
   return 0;
 }
 
 Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
 
+  if (Value *V = SimplifySRemInst(Op0, Op1, TD))
+    return ReplaceInstUsesWith(I, V);
+
   // Handle the integer rem common cases
   if (Instruction *Common = commonIRemTransforms(I))
     return Common;
@@ -629,6 +718,14 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
 }
 
 Instruction *InstCombiner::visitFRem(BinaryOperator &I) {
-  return commonRemTransforms(I);
-}
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
 
+  if (Value *V = SimplifyFRemInst(Op0, Op1, TD))
+    return ReplaceInstUsesWith(I, V);
+
+  // Handle cases involving: rem X, (select Cond, Y, Z)
+  if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I))
+    return &I;
+
+  return 0;
+}
diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp
index c5f31fb..3777340 100644
--- a/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -110,16 +110,20 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
     }
   }
     
-  if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst))
-    return CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
-                           LHSVal, RHSVal);
-  
+  if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst)) {
+    CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
+                                     LHSVal, RHSVal);
+    NewCI->setDebugLoc(FirstInst->getDebugLoc());
+    return NewCI;
+  }
+
   BinaryOperator *BinOp = cast<BinaryOperator>(FirstInst);
   BinaryOperator *NewBinOp =
     BinaryOperator::Create(BinOp->getOpcode(), LHSVal, RHSVal);
   if (isNUW) NewBinOp->setHasNoUnsignedWrap();
   if (isNSW) NewBinOp->setHasNoSignedWrap();
   if (isExact) NewBinOp->setIsExact();
+  NewBinOp->setDebugLoc(FirstInst->getDebugLoc());
   return NewBinOp;
 }
 
@@ -228,6 +232,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
     GetElementPtrInst::Create(Base, FixedOperands.begin()+1,
                               FixedOperands.end());
   if (AllInBounds) NewGEP->setIsInBounds();
+  NewGEP->setDebugLoc(FirstInst->getDebugLoc());
   return NewGEP;
 }
 
@@ -237,7 +242,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
 /// obvious the value of the load is not changed from the point of the load to
 /// the end of the block it is in.
 ///
-/// Finally, it is safe, but not profitable, to sink a load targetting a
+/// Finally, it is safe, but not profitable, to sink a load targeting a
 /// non-address-taken alloca.  Doing so will cause us to not promote the alloca
 /// to a register.
 static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
@@ -369,7 +374,9 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) {
     for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
       cast<LoadInst>(PN.getIncomingValue(i))->setVolatile(false);
   
-  return new LoadInst(PhiVal, "", isVolatile, LoadAlignment);
+  LoadInst *NewLI = new LoadInst(PhiVal, "", isVolatile, LoadAlignment);
+  NewLI->setDebugLoc(FirstLI->getDebugLoc());
+  return NewLI;
 }
 
 
@@ -469,20 +476,27 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
   }
 
   // Insert and return the new operation.
-  if (CastInst *FirstCI = dyn_cast<CastInst>(FirstInst))
-    return CastInst::Create(FirstCI->getOpcode(), PhiVal, PN.getType());
+  if (CastInst *FirstCI = dyn_cast<CastInst>(FirstInst)) {
+    CastInst *NewCI = CastInst::Create(FirstCI->getOpcode(), PhiVal,
+                                       PN.getType());
+    NewCI->setDebugLoc(FirstInst->getDebugLoc());
+    return NewCI;
+  }
   
   if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst)) {
     BinOp = BinaryOperator::Create(BinOp->getOpcode(), PhiVal, ConstantOp);
     if (isNUW) BinOp->setHasNoUnsignedWrap();
     if (isNSW) BinOp->setHasNoSignedWrap();
     if (isExact) BinOp->setIsExact();
+    BinOp->setDebugLoc(FirstInst->getDebugLoc());
     return BinOp;
   }
   
   CmpInst *CIOp = cast<CmpInst>(FirstInst);
-  return CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
-                         PhiVal, ConstantOp);
+  CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
+                                   PhiVal, ConstantOp);
+  NewCI->setDebugLoc(FirstInst->getDebugLoc());
+  return NewCI;
 }
 
 /// DeadPHICycle - Return true if this PHI node is only used by a PHI node cycle
@@ -774,9 +788,6 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
 // PHINode simplification
 //
 Instruction *InstCombiner::visitPHINode(PHINode &PN) {
-  // If LCSSA is around, don't mess with Phi nodes
-  if (MustPreserveLCSSA) return 0;
-
   if (Value *V = SimplifyInstruction(&PN, TD))
     return ReplaceInstUsesWith(PN, V);
 
@@ -824,18 +835,18 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) {
   // quick check to see if the PHI node only contains a single non-phi value, if
   // so, scan to see if the phi cycle is actually equal to that value.
   {
-    unsigned InValNo = 0, NumOperandVals = PN.getNumIncomingValues();
+    unsigned InValNo = 0, NumIncomingVals = PN.getNumIncomingValues();
     // Scan for the first non-phi operand.
-    while (InValNo != NumOperandVals && 
+    while (InValNo != NumIncomingVals &&
            isa<PHINode>(PN.getIncomingValue(InValNo)))
       ++InValNo;
 
-    if (InValNo != NumOperandVals) {
-      Value *NonPhiInVal = PN.getOperand(InValNo);
+    if (InValNo != NumIncomingVals) {
+      Value *NonPhiInVal = PN.getIncomingValue(InValNo);
       
       // Scan the rest of the operands to see if there are any conflicts, if so
       // there is no need to recursively scan other phis.
-      for (++InValNo; InValNo != NumOperandVals; ++InValNo) {
+      for (++InValNo; InValNo != NumIncomingVals; ++InValNo) {
         Value *OpVal = PN.getIncomingValue(InValNo);
         if (OpVal != NonPhiInVal && !isa<PHINode>(OpVal))
           break;
@@ -844,7 +855,7 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) {
       // If we scanned over all operands, then we have one unique value plus
       // phi values.  Scan PHI nodes to see if they all merge in each other or
       // the value.
-      if (InValNo == NumOperandVals) {
+      if (InValNo == NumIncomingVals) {
         SmallPtrSet<PHINode*, 16> ValueEqualPHIs;
         if (PHIsEqualValue(&PN, NonPhiInVal, ValueEqualPHIs))
           return ReplaceInstUsesWith(PN, NonPhiInVal);
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 61a433a..aeb3c3e 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -133,9 +133,8 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
     }
 
     // Fold this by inserting a select from the input values.
-    SelectInst *NewSI = SelectInst::Create(SI.getCondition(), TI->getOperand(0),
-                                          FI->getOperand(0), SI.getName()+".v");
-    InsertNewInstBefore(NewSI, SI);
+    Value *NewSI = Builder->CreateSelect(SI.getCondition(), TI->getOperand(0),
+                                         FI->getOperand(0), SI.getName()+".v");
     return CastInst::Create(Instruction::CastOps(TI->getOpcode()), NewSI,
                             TI->getType());
   }
@@ -174,9 +173,8 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
   }
 
   // If we reach here, they do have operations in common.
-  SelectInst *NewSI = SelectInst::Create(SI.getCondition(), OtherOpT,
-                                         OtherOpF, SI.getName()+".v");
-  InsertNewInstBefore(NewSI, SI);
+  Value *NewSI = Builder->CreateSelect(SI.getCondition(), OtherOpT,
+                                       OtherOpF, SI.getName()+".v");
 
   if (BinaryOperator *BO = dyn_cast<BinaryOperator>(TI)) {
     if (MatchIsOpZero)
@@ -224,8 +222,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,
           // Avoid creating select between 2 constants unless it's selecting
           // between 0, 1 and -1.
           if (!isa<Constant>(OOp) || isSelect01(C, cast<Constant>(OOp))) {
-            Instruction *NewSel = SelectInst::Create(SI.getCondition(), OOp, C);
-            InsertNewInstBefore(NewSel, SI);
+            Value *NewSel = Builder->CreateSelect(SI.getCondition(), OOp, C);
             NewSel->takeName(TVI);
             BinaryOperator *TVI_BO = cast<BinaryOperator>(TVI);
             BinaryOperator *BO = BinaryOperator::Create(TVI_BO->getOpcode(),
@@ -260,8 +257,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,
           // Avoid creating select between 2 constants unless it's selecting
           // between 0, 1 and -1.
           if (!isa<Constant>(OOp) || isSelect01(C, cast<Constant>(OOp))) {
-            Instruction *NewSel = SelectInst::Create(SI.getCondition(), C, OOp);
-            InsertNewInstBefore(NewSel, SI);
+            Value *NewSel = Builder->CreateSelect(SI.getCondition(), C, OOp);
             NewSel->takeName(FVI);
             BinaryOperator *FVI_BO = cast<BinaryOperator>(FVI);
             BinaryOperator *BO = BinaryOperator::Create(FVI_BO->getOpcode(),
@@ -282,6 +278,59 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,
   return 0;
 }
 
+/// SimplifyWithOpReplaced - See if V simplifies when its operand Op is
+/// replaced with RepOp.
+static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
+                                     const TargetData *TD) {
+  // Trivial replacement.
+  if (V == Op)
+    return RepOp;
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return 0;
+
+  // If this is a binary operator, try to simplify it with the replaced op.
+  if (BinaryOperator *B = dyn_cast<BinaryOperator>(I)) {
+    if (B->getOperand(0) == Op)
+      return SimplifyBinOp(B->getOpcode(), RepOp, B->getOperand(1), TD);
+    if (B->getOperand(1) == Op)
+      return SimplifyBinOp(B->getOpcode(), B->getOperand(0), RepOp, TD);
+  }
+
+  // Same for CmpInsts.
+  if (CmpInst *C = dyn_cast<CmpInst>(I)) {
+    if (C->getOperand(0) == Op)
+      return SimplifyCmpInst(C->getPredicate(), RepOp, C->getOperand(1), TD);
+    if (C->getOperand(1) == Op)
+      return SimplifyCmpInst(C->getPredicate(), C->getOperand(0), RepOp, TD);
+  }
+
+  // TODO: We could hand off more cases to instsimplify here.
+
+  // If all operands are constant after substituting Op for RepOp then we can
+  // constant fold the instruction.
+  if (Constant *CRepOp = dyn_cast<Constant>(RepOp)) {
+    // Build a list of all constant operands.
+    SmallVector<Constant*, 8> ConstOps;
+    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+      if (I->getOperand(i) == Op)
+        ConstOps.push_back(CRepOp);
+      else if (Constant *COp = dyn_cast<Constant>(I->getOperand(i)))
+        ConstOps.push_back(COp);
+      else
+        break;
+    }
+
+    // All operands were constants, fold it.
+    if (ConstOps.size() == I->getNumOperands())
+      return ConstantFoldInstOperands(I->getOpcode(), I->getType(),
+                                      ConstOps.data(), ConstOps.size(), TD);
+  }
+
+  return 0;
+}
+
 /// visitSelectInstWithICmp - Visit a SelectInst that has an
 /// ICmpInst as its first operand.
 ///
@@ -420,25 +469,21 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,
     }
   }
 
-  if (CmpLHS == TrueVal && CmpRHS == FalseVal) {
-    // Transform (X == Y) ? X : Y  -> Y
-    if (Pred == ICmpInst::ICMP_EQ)
+  // If we have an equality comparison then we know the value in one of the
+  // arms of the select. See if substituting this value into the arm and
+  // simplifying the result yields the same value as the other arm.
+  if (Pred == ICmpInst::ICMP_EQ) {
+    if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TD) == TrueVal ||
+        SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TD) == TrueVal)
       return ReplaceInstUsesWith(SI, FalseVal);
-    // Transform (X != Y) ? X : Y  -> X
-    if (Pred == ICmpInst::ICMP_NE)
+  } else if (Pred == ICmpInst::ICMP_NE) {
+    if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TD) == FalseVal ||
+        SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TD) == FalseVal)
       return ReplaceInstUsesWith(SI, TrueVal);
-    /// NOTE: if we wanted to, this is where to detect integer MIN/MAX
-
-  } else if (CmpLHS == FalseVal && CmpRHS == TrueVal) {
-    // Transform (X == Y) ? Y : X  -> X
-    if (Pred == ICmpInst::ICMP_EQ)
-      return ReplaceInstUsesWith(SI, FalseVal);
-    // Transform (X != Y) ? Y : X  -> Y
-    if (Pred == ICmpInst::ICMP_NE)
-      return ReplaceInstUsesWith(SI, TrueVal);
-    /// NOTE: if we wanted to, this is where to detect integer MIN/MAX
   }
 
+  // NOTE: if we wanted to, this is where to detect integer MIN/MAX
+
   if (isa<Constant>(CmpRHS)) {
     if (CmpLHS == TrueVal && Pred == ICmpInst::ICMP_EQ) {
       // Transform (X == C) ? X : Y -> (X == C) ? C : Y
@@ -604,9 +649,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
         return BinaryOperator::CreateOr(CondVal, FalseVal);
       }
       // Change: A = select B, false, C --> A = and !B, C
-      Value *NotCond =
-        InsertNewInstBefore(BinaryOperator::CreateNot(CondVal,
-                                           "not."+CondVal->getName()), SI);
+      Value *NotCond = Builder->CreateNot(CondVal, "not."+CondVal->getName());
       return BinaryOperator::CreateAnd(NotCond, FalseVal);
     } else if (ConstantInt *C = dyn_cast<ConstantInt>(FalseVal)) {
       if (C->getZExtValue() == false) {
@@ -614,9 +657,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
         return BinaryOperator::CreateAnd(CondVal, TrueVal);
       }
       // Change: A = select B, C, true --> A = or !B, C
-      Value *NotCond =
-        InsertNewInstBefore(BinaryOperator::CreateNot(CondVal,
-                                           "not."+CondVal->getName()), SI);
+      Value *NotCond = Builder->CreateNot(CondVal, "not."+CondVal->getName());
       return BinaryOperator::CreateOr(NotCond, TrueVal);
     }
 
@@ -755,27 +796,20 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
             // So at this point we know we have (Y -> OtherAddOp):
             //        select C, (add X, Y), (sub X, Z)
             Value *NegVal;  // Compute -Z
-            if (Constant *C = dyn_cast<Constant>(SubOp->getOperand(1))) {
-              NegVal = ConstantExpr::getNeg(C);
-            } else if (SI.getType()->isFloatingPointTy()) {
-              NegVal = InsertNewInstBefore(
-                    BinaryOperator::CreateFNeg(SubOp->getOperand(1),
-                                              "tmp"), SI);
+            if (SI.getType()->isFloatingPointTy()) {
+              NegVal = Builder->CreateFNeg(SubOp->getOperand(1));
             } else {
-              NegVal = InsertNewInstBefore(
-                    BinaryOperator::CreateNeg(SubOp->getOperand(1),
-                                              "tmp"), SI);
+              NegVal = Builder->CreateNeg(SubOp->getOperand(1));
             }
 
             Value *NewTrueOp = OtherAddOp;
             Value *NewFalseOp = NegVal;
             if (AddOp != TI)
               std::swap(NewTrueOp, NewFalseOp);
-            Instruction *NewSel =
-              SelectInst::Create(CondVal, NewTrueOp,
-                                 NewFalseOp, SI.getName() + ".p");
+            Value *NewSel = 
+              Builder->CreateSelect(CondVal, NewTrueOp,
+                                    NewFalseOp, SI.getName() + ".p");
 
-            NewSel = InsertNewInstBefore(NewSel, SI);
             if (SI.getType()->isFloatingPointTy())
               return BinaryOperator::CreateFAdd(SubOp->getOperand(0), NewSel);
             else
diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp
index a7f8005..811f949 100644
--- a/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -644,7 +644,14 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
       return &I;
     }
   }
-  
+
+  // (C1 << A) << C2 -> (C1 << C2) << A
+  Constant *C1, *C2;
+  Value *A;
+  if (match(I.getOperand(0), m_OneUse(m_Shl(m_Constant(C1), m_Value(A)))) &&
+      match(I.getOperand(1), m_Constant(C2)))
+    return BinaryOperator::CreateShl(ConstantExpr::getShl(C1, C2), A);
+
   return 0;    
 }
 
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 6e727ce..8fea8eb 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -313,7 +313,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
       Instruction *Or = 
         BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
                                  I->getName());
-      return InsertNewInstBefore(Or, *I);
+      return InsertNewInstWith(Or, *I);
     }
     
     // If all of the demanded bits on one side are known, and all of the set
@@ -327,7 +327,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
                                                    ~RHSKnownOne & DemandedMask);
         Instruction *And = 
           BinaryOperator::CreateAnd(I->getOperand(0), AndC, "tmp");
-        return InsertNewInstBefore(And, *I);
+        return InsertNewInstWith(And, *I);
       }
     }
     
@@ -353,13 +353,13 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
           ConstantInt::get(I->getType(), NewMask & AndRHS->getValue());
         Instruction *NewAnd = 
           BinaryOperator::CreateAnd(I->getOperand(0), AndC, "tmp");
-        InsertNewInstBefore(NewAnd, *I);
+        InsertNewInstWith(NewAnd, *I);
         
         Constant *XorC =
           ConstantInt::get(I->getType(), NewMask & XorRHS->getValue());
         Instruction *NewXor =
           BinaryOperator::CreateXor(NewAnd, XorC, "tmp");
-        return InsertNewInstBefore(NewXor, *I);
+        return InsertNewInstWith(NewXor, *I);
       }
 
     // Output known-0 bits are known if clear or set in both the LHS & RHS.
@@ -472,7 +472,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
     if (KnownZero[SrcBitWidth-1] || (NewBits & ~DemandedMask) == NewBits) {
       // Convert to ZExt cast
       CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName());
-      return InsertNewInstBefore(NewCast, *I);
+      return InsertNewInstWith(NewCast, *I);
     } else if (KnownOne[SrcBitWidth-1]) {    // Input sign bit known set
       KnownOne |= NewBits;
     }
@@ -515,7 +515,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
         Instruction *Or =
           BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
                                    I->getName());
-        return InsertNewInstBefore(Or, *I);
+        return InsertNewInstWith(Or, *I);
       }
       
       // We can say something about the output known-zero and known-one bits,
@@ -632,7 +632,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
       // Perform the logical shift right.
       Instruction *NewVal = BinaryOperator::CreateLShr(
                         I->getOperand(0), I->getOperand(1), I->getName());
-      return InsertNewInstBefore(NewVal, *I);
+      return InsertNewInstWith(NewVal, *I);
     }    
 
     // If the sign bit is the only bit demanded by this ashr, then there is no
@@ -676,7 +676,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
         // Perform the logical shift right.
         Instruction *NewVal = BinaryOperator::CreateLShr(
                           I->getOperand(0), SA, I->getName());
-        return InsertNewInstBefore(NewVal, *I);
+        return InsertNewInstWith(NewVal, *I);
       } else if ((KnownOne & SignBit) != 0) { // New bits are known one.
         KnownOne |= HighBits;
       }
@@ -774,12 +774,16 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
             NewVal = BinaryOperator::CreateShl(II->getArgOperand(0),
                     ConstantInt::get(I->getType(), ResultBit-InputBit));
           NewVal->takeName(I);
-          return InsertNewInstBefore(NewVal, *I);
+          return InsertNewInstWith(NewVal, *I);
         }
           
         // TODO: Could compute known zero/one bits based on the input.
         break;
       }
+      case Intrinsic::x86_sse42_crc32_64_8:
+      case Intrinsic::x86_sse42_crc32_64_64:
+        KnownZero = APInt::getHighBitsSet(64, 32);
+        return 0;
       }
     }
     ComputeMaskedBits(V, DemandedMask, KnownZero, KnownOne, Depth);
@@ -867,7 +871,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
   if (Depth == 10)
     return 0;
 
-  // If multiple users are using the root value, procede with
+  // If multiple users are using the root value, proceed with
   // simplification conservatively assuming that all elements
   // are needed.
   if (!V->hasOneUse()) {
@@ -1108,21 +1112,21 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
           Value *LHS = II->getArgOperand(0);
           Value *RHS = II->getArgOperand(1);
           // Extract the element as scalars.
-          LHS = InsertNewInstBefore(ExtractElementInst::Create(LHS, 
+          LHS = InsertNewInstWith(ExtractElementInst::Create(LHS, 
             ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U)), *II);
-          RHS = InsertNewInstBefore(ExtractElementInst::Create(RHS,
+          RHS = InsertNewInstWith(ExtractElementInst::Create(RHS,
             ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U)), *II);
           
           switch (II->getIntrinsicID()) {
           default: llvm_unreachable("Case stmts out of sync!");
           case Intrinsic::x86_sse_sub_ss:
           case Intrinsic::x86_sse2_sub_sd:
-            TmpV = InsertNewInstBefore(BinaryOperator::CreateFSub(LHS, RHS,
+            TmpV = InsertNewInstWith(BinaryOperator::CreateFSub(LHS, RHS,
                                                         II->getName()), *II);
             break;
           case Intrinsic::x86_sse_mul_ss:
           case Intrinsic::x86_sse2_mul_sd:
-            TmpV = InsertNewInstBefore(BinaryOperator::CreateFMul(LHS, RHS,
+            TmpV = InsertNewInstWith(BinaryOperator::CreateFMul(LHS, RHS,
                                                          II->getName()), *II);
             break;
           }
@@ -1132,7 +1136,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
               UndefValue::get(II->getType()), TmpV,
               ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U, false),
                                       II->getName());
-          InsertNewInstBefore(New, *II);
+          InsertNewInstWith(New, *II);
           return New;
         }            
       }
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 4be671f..92c10f5 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -76,7 +76,6 @@ INITIALIZE_PASS(InstCombiner, "instcombine",
                 "Combine redundant instructions", false, false)
 
 void InstCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addPreservedID(LCSSAID);
   AU.setPreservesCFG();
 }
 
@@ -241,9 +240,9 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
         Constant *C2 = cast<Constant>(Op1->getOperand(1));
 
         Constant *Folded = ConstantExpr::get(Opcode, C1, C2);
-        Instruction *New = BinaryOperator::Create(Opcode, A, B, Op1->getName(),
-                                                  &I);
-        Worklist.Add(New);
+        Instruction *New = BinaryOperator::Create(Opcode, A, B);
+        InsertNewInstWith(New, I);
+        New->takeName(Op1);
         I.setOperand(0, New);
         I.setOperand(1, Folded);
         // Conservatively clear the optional flags, since they may not be
@@ -600,7 +599,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
   }
 
   // Okay, we can do the transformation: create the new PHI node.
-  PHINode *NewPN = PHINode::Create(I.getType(), PN->getNumIncomingValues(), "");
+  PHINode *NewPN = PHINode::Create(I.getType(), PN->getNumIncomingValues());
   InsertNewInstBefore(NewPN, *PN);
   NewPN->takeName(PN);
   
@@ -1089,8 +1088,8 @@ Instruction *InstCombiner::visitFree(CallInst &FI) {
   // free undef -> unreachable.
   if (isa<UndefValue>(Op)) {
     // Insert a new store to null because we cannot modify the CFG here.
-    new StoreInst(ConstantInt::getTrue(FI.getContext()),
-           UndefValue::get(Type::getInt1PtrTy(FI.getContext())), &FI);
+    Builder->CreateStore(ConstantInt::getTrue(FI.getContext()),
+                         UndefValue::get(Type::getInt1PtrTy(FI.getContext())));
     return EraseInstFromFunction(FI);
   }
   
@@ -1262,7 +1261,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
       case Intrinsic::sadd_with_overflow:
         if (*EV.idx_begin() == 0) {  // Normal result.
           Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
-          II->replaceAllUsesWith(UndefValue::get(II->getType()));
+          ReplaceInstUsesWith(*II, UndefValue::get(II->getType()));
           EraseInstFromFunction(*II);
           return BinaryOperator::CreateAdd(LHS, RHS);
         }
@@ -1279,7 +1278,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
       case Intrinsic::ssub_with_overflow:
         if (*EV.idx_begin() == 0) {  // Normal result.
           Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
-          II->replaceAllUsesWith(UndefValue::get(II->getType()));
+          ReplaceInstUsesWith(*II, UndefValue::get(II->getType()));
           EraseInstFromFunction(*II);
           return BinaryOperator::CreateSub(LHS, RHS);
         }
@@ -1288,7 +1287,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
       case Intrinsic::smul_with_overflow:
         if (*EV.idx_begin() == 0) {  // Normal result.
           Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
-          II->replaceAllUsesWith(UndefValue::get(II->getType()));
+          ReplaceInstUsesWith(*II, UndefValue::get(II->getType()));
           EraseInstFromFunction(*II);
           return BinaryOperator::CreateMul(LHS, RHS);
         }
@@ -1386,8 +1385,8 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
   Worklist.push_back(BB);
 
   SmallVector<Instruction*, 128> InstrsForInstCombineWorklist;
-  SmallPtrSet<ConstantExpr*, 64> FoldedConstants;
-  
+  DenseMap<ConstantExpr*, Constant*> FoldedConstants;
+
   do {
     BB = Worklist.pop_back_val();
     
@@ -1422,14 +1421,15 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
              i != e; ++i) {
           ConstantExpr *CE = dyn_cast<ConstantExpr>(i);
           if (CE == 0) continue;
-          
-          // If we already folded this constant, don't try again.
-          if (!FoldedConstants.insert(CE))
-            continue;
-          
-          Constant *NewC = ConstantFoldConstantExpression(CE, TD);
-          if (NewC && NewC != CE) {
-            *i = NewC;
+
+          Constant*& FoldRes = FoldedConstants[CE];
+          if (!FoldRes)
+            FoldRes = ConstantFoldConstantExpression(CE, TD);
+          if (!FoldRes)
+            FoldRes = CE;
+
+          if (FoldRes != CE) {
+            *i = FoldRes;
             MadeIRChange = true;
           }
         }
@@ -1576,6 +1576,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
 
     // Now that we have an instruction, try combining it to simplify it.
     Builder->SetInsertPoint(I->getParent(), I);
+    Builder->SetCurrentDebugLocation(I->getDebugLoc());
     
 #ifndef NDEBUG
     std::string OrigI;
@@ -1590,7 +1591,8 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
         DEBUG(errs() << "IC: Old = " << *I << '\n'
                      << "    New = " << *Result << '\n');
 
-        Result->setDebugLoc(I->getDebugLoc());
+        if (!I->getDebugLoc().isUnknown())
+          Result->setDebugLoc(I->getDebugLoc());
         // Everything uses the new instruction now.
         I->replaceAllUsesWith(Result);
 
@@ -1637,7 +1639,6 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
 
 
 bool InstCombiner::runOnFunction(Function &F) {
-  MustPreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
   TD = getAnalysisIfAvailable<TargetData>();
 
   
diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt
index 0ac1cb0..5700ac8 100644
--- a/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_llvm_library(LLVMInstrumentation
   EdgeProfiling.cpp
+  GCOVProfiling.cpp
   Instrumentation.cpp
   OptimalEdgeProfiling.cpp
   PathProfiling.cpp
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
new file mode 100644
index 0000000..b902213
--- /dev/null
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -0,0 +1,673 @@
+//===- GCOVProfiling.cpp - Insert edge counters for gcov profiling --------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements GCOV-style profiling. When this pass is run it emits
+// "gcno" files next to the existing source, and instruments the code that runs
+// to records the edges between blocks that run and emit a complementary "gcda"
+// file on exit.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "insert-gcov-profiling"
+
+#include "ProfilingUtils.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Instructions.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugLoc.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Support/PathV2.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/UniqueVector.h"
+#include <string>
+#include <utility>
+using namespace llvm;
+
+namespace {
+  class GCOVProfiler : public ModulePass {
+  public:
+    static char ID;
+    GCOVProfiler()
+        : ModulePass(ID), EmitNotes(true), EmitData(true), Use402Format(false) {
+      initializeGCOVProfilerPass(*PassRegistry::getPassRegistry());
+    }
+    GCOVProfiler(bool EmitNotes, bool EmitData, bool use402Format = false)
+        : ModulePass(ID), EmitNotes(EmitNotes), EmitData(EmitData),
+          Use402Format(use402Format) {
+      assert((EmitNotes || EmitData) && "GCOVProfiler asked to do nothing?");
+      initializeGCOVProfilerPass(*PassRegistry::getPassRegistry());
+    }
+    virtual const char *getPassName() const {
+      return "GCOV Profiler";
+    }
+
+  private:
+    bool runOnModule(Module &M);
+
+    // Create the GCNO files for the Module based on DebugInfo.
+    void emitGCNO(DebugInfoFinder &DIF);
+
+    // Modify the program to track transitions along edges and call into the
+    // profiling runtime to emit .gcda files when run.
+    bool emitProfileArcs(DebugInfoFinder &DIF);
+
+    // Get pointers to the functions in the runtime library.
+    Constant *getStartFileFunc();
+    Constant *getIncrementIndirectCounterFunc();
+    Constant *getEmitFunctionFunc();
+    Constant *getEmitArcsFunc();
+    Constant *getEndFileFunc();
+
+    // Create or retrieve an i32 state value that is used to represent the
+    // pred block number for certain non-trivial edges.
+    GlobalVariable *getEdgeStateValue();
+
+    // Produce a table of pointers to counters, by predecessor and successor
+    // block number.
+    GlobalVariable *buildEdgeLookupTable(Function *F,
+                                         GlobalVariable *Counter,
+                                         const UniqueVector<BasicBlock *> &Preds,
+                                         const UniqueVector<BasicBlock *> &Succs);
+
+    // Add the function to write out all our counters to the global destructor
+    // list.
+    void insertCounterWriteout(DebugInfoFinder &,
+                               SmallVector<std::pair<GlobalVariable *,
+                                                     MDNode *>, 8> &);
+
+    std::string mangleName(DICompileUnit CU, std::string NewStem);
+
+    bool EmitNotes;
+    bool EmitData;
+    bool Use402Format;
+
+    Module *M;
+    LLVMContext *Ctx;
+  };
+}
+
+char GCOVProfiler::ID = 0;
+INITIALIZE_PASS(GCOVProfiler, "insert-gcov-profiling",
+                "Insert instrumentation for GCOV profiling", false, false)
+
+ModulePass *llvm::createGCOVProfilerPass(bool EmitNotes, bool EmitData,
+                                         bool Use402Format) {
+  return new GCOVProfiler(EmitNotes, EmitData, Use402Format);
+}
+
+static DISubprogram findSubprogram(DIScope Scope) {
+  while (!Scope.isSubprogram()) {
+    assert(Scope.isLexicalBlock() &&
+           "Debug location not lexical block or subprogram");
+    Scope = DILexicalBlock(Scope).getContext();
+  }
+  return DISubprogram(Scope);
+}
+
+namespace {
+  class GCOVRecord {
+   protected:
+    static const char *LinesTag;
+    static const char *FunctionTag;
+    static const char *BlockTag;
+    static const char *EdgeTag;
+
+    GCOVRecord() {}
+
+    void writeBytes(const char *Bytes, int Size) {
+      os->write(Bytes, Size);
+    }
+
+    void write(uint32_t i) {
+      writeBytes(reinterpret_cast<char*>(&i), 4);
+    }
+
+    // Returns the length measured in 4-byte blocks that will be used to
+    // represent this string in a GCOV file
+    unsigned lengthOfGCOVString(StringRef s) {
+      // A GCOV string is a length, followed by a NUL, then between 0 and 3 NULs
+      // padding out to the next 4-byte word. The length is measured in 4-byte
+      // words including padding, not bytes of actual string.
+      return (s.size() / 4) + 1;
+    }
+
+    void writeGCOVString(StringRef s) {
+      uint32_t Len = lengthOfGCOVString(s);
+      write(Len);
+      writeBytes(s.data(), s.size());
+
+      // Write 1 to 4 bytes of NUL padding.
+      assert((unsigned)(4 - (s.size() % 4)) > 0);
+      assert((unsigned)(4 - (s.size() % 4)) <= 4);
+      writeBytes("\0\0\0\0", 4 - (s.size() % 4));
+    }
+
+    raw_ostream *os;
+  };
+  const char *GCOVRecord::LinesTag = "\0\0\x45\x01";
+  const char *GCOVRecord::FunctionTag = "\0\0\0\1";
+  const char *GCOVRecord::BlockTag = "\0\0\x41\x01";
+  const char *GCOVRecord::EdgeTag = "\0\0\x43\x01";
+
+  class GCOVFunction;
+  class GCOVBlock;
+
+  // Constructed only by requesting it from a GCOVBlock, this object stores a
+  // list of line numbers and a single filename, representing lines that belong
+  // to the block.
+  class GCOVLines : public GCOVRecord {
+   public:
+    void addLine(uint32_t Line) {
+      Lines.push_back(Line);
+    }
+
+    uint32_t length() {
+      return lengthOfGCOVString(Filename) + 2 + Lines.size();
+    }
+
+   private:
+    friend class GCOVBlock;
+
+    GCOVLines(std::string Filename, raw_ostream *os)
+        : Filename(Filename) {
+      this->os = os;
+    }
+
+    std::string Filename;
+    SmallVector<uint32_t, 32> Lines;
+  };
+
+  // Represent a basic block in GCOV. Each block has a unique number in the
+  // function, number of lines belonging to each block, and a set of edges to
+  // other blocks.
+  class GCOVBlock : public GCOVRecord {
+   public:
+    GCOVLines &getFile(std::string Filename) {
+      GCOVLines *&Lines = LinesByFile[Filename];
+      if (!Lines) {
+        Lines = new GCOVLines(Filename, os);
+      }
+      return *Lines;
+    }
+
+    void addEdge(GCOVBlock &Successor) {
+      OutEdges.push_back(&Successor);
+    }
+
+    void writeOut() {
+      uint32_t Len = 3;
+      for (StringMap<GCOVLines *>::iterator I = LinesByFile.begin(),
+               E = LinesByFile.end(); I != E; ++I) {
+        Len += I->second->length();
+      }
+
+      writeBytes(LinesTag, 4);
+      write(Len);
+      write(Number);
+      for (StringMap<GCOVLines *>::iterator I = LinesByFile.begin(),
+               E = LinesByFile.end(); I != E; ++I) {
+        write(0);
+        writeGCOVString(I->second->Filename);
+        for (int i = 0, e = I->second->Lines.size(); i != e; ++i) {
+          write(I->second->Lines[i]);
+        }
+      }
+      write(0);
+      write(0);
+    }
+
+    ~GCOVBlock() {
+      DeleteContainerSeconds(LinesByFile);
+    }
+
+   private:
+    friend class GCOVFunction;
+
+    GCOVBlock(uint32_t Number, raw_ostream *os)
+        : Number(Number) {
+      this->os = os;
+    }
+
+    uint32_t Number;
+    StringMap<GCOVLines *> LinesByFile;
+    SmallVector<GCOVBlock *, 4> OutEdges;
+  };
+
+  // A function has a unique identifier, a checksum (we leave as zero) and a
+  // set of blocks and a map of edges between blocks. This is the only GCOV
+  // object users can construct, the blocks and lines will be rooted here.
+  class GCOVFunction : public GCOVRecord {
+   public:
+    GCOVFunction(DISubprogram SP, raw_ostream *os, bool Use402Format) {
+      this->os = os;
+
+      Function *F = SP.getFunction();
+      uint32_t i = 0;
+      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+        Blocks[BB] = new GCOVBlock(i++, os);
+      }
+      ReturnBlock = new GCOVBlock(i++, os);
+
+      writeBytes(FunctionTag, 4);
+      uint32_t BlockLen = 1 + 1 + 1 + lengthOfGCOVString(SP.getName()) +
+          1 + lengthOfGCOVString(SP.getFilename()) + 1;
+      if (!Use402Format)
+        ++BlockLen; // For second checksum.
+      write(BlockLen);
+      uint32_t Ident = reinterpret_cast<intptr_t>((MDNode*)SP);
+      write(Ident);
+      write(0);  // checksum #1
+      if (!Use402Format)
+        write(0);  // checksum #2
+      writeGCOVString(SP.getName());
+      writeGCOVString(SP.getFilename());
+      write(SP.getLineNumber());
+    }
+
+    ~GCOVFunction() {
+      DeleteContainerSeconds(Blocks);
+      delete ReturnBlock;
+    }
+
+    GCOVBlock &getBlock(BasicBlock *BB) {
+      return *Blocks[BB];
+    }
+
+    GCOVBlock &getReturnBlock() {
+      return *ReturnBlock;
+    }
+
+    void writeOut() {
+      // Emit count of blocks.
+      writeBytes(BlockTag, 4);
+      write(Blocks.size() + 1);
+      for (int i = 0, e = Blocks.size() + 1; i != e; ++i) {
+        write(0);  // No flags on our blocks.
+      }
+
+      // Emit edges between blocks.
+      for (DenseMap<BasicBlock *, GCOVBlock *>::iterator I = Blocks.begin(),
+               E = Blocks.end(); I != E; ++I) {
+        GCOVBlock &Block = *I->second;
+        if (Block.OutEdges.empty()) continue;
+
+        writeBytes(EdgeTag, 4);
+        write(Block.OutEdges.size() * 2 + 1);
+        write(Block.Number);
+        for (int i = 0, e = Block.OutEdges.size(); i != e; ++i) {
+          write(Block.OutEdges[i]->Number);
+          write(0);  // no flags
+        }
+      }
+
+      // Emit lines for each block.
+      for (DenseMap<BasicBlock *, GCOVBlock *>::iterator I = Blocks.begin(),
+               E = Blocks.end(); I != E; ++I) {
+        I->second->writeOut();
+      }
+    }
+
+   private:
+    DenseMap<BasicBlock *, GCOVBlock *> Blocks;
+    GCOVBlock *ReturnBlock;
+  };
+}
+
+std::string GCOVProfiler::mangleName(DICompileUnit CU, std::string NewStem) {
+  if (NamedMDNode *GCov = M->getNamedMetadata("llvm.gcov")) {
+    for (int i = 0, e = GCov->getNumOperands(); i != e; ++i) {
+      MDNode *N = GCov->getOperand(i);
+      if (N->getNumOperands() != 2) continue;
+      MDString *GCovFile = dyn_cast<MDString>(N->getOperand(0));
+      MDNode *CompileUnit = dyn_cast<MDNode>(N->getOperand(1));
+      if (!GCovFile || !CompileUnit) continue;
+      if (CompileUnit == CU) {
+        SmallString<128> Filename = GCovFile->getString();
+        sys::path::replace_extension(Filename, NewStem);
+        return Filename.str();
+      }
+    }
+  }
+
+  SmallString<128> Filename = CU.getFilename();
+  sys::path::replace_extension(Filename, NewStem);
+  return sys::path::filename(Filename.str());
+}
+
+bool GCOVProfiler::runOnModule(Module &M) {
+  this->M = &M;
+  Ctx = &M.getContext();
+
+  DebugInfoFinder DIF;
+  DIF.processModule(M);
+
+  if (EmitNotes) emitGCNO(DIF);
+  if (EmitData) return emitProfileArcs(DIF);
+  return false;
+}
+
+void GCOVProfiler::emitGCNO(DebugInfoFinder &DIF) {
+  DenseMap<const MDNode *, raw_fd_ostream *> GcnoFiles;
+  for (DebugInfoFinder::iterator I = DIF.compile_unit_begin(),
+           E = DIF.compile_unit_end(); I != E; ++I) {
+    // Each compile unit gets its own .gcno file. This means that whether we run
+    // this pass over the original .o's as they're produced, or run it after
+    // LTO, we'll generate the same .gcno files.
+
+    DICompileUnit CU(*I);
+    raw_fd_ostream *&out = GcnoFiles[CU];
+    std::string ErrorInfo;
+    out = new raw_fd_ostream(mangleName(CU, "gcno").c_str(), ErrorInfo,
+                             raw_fd_ostream::F_Binary);
+    if (!Use402Format)
+      out->write("oncg*404MVLL", 12);
+    else
+      out->write("oncg*402MVLL", 12);
+  }
+
+  for (DebugInfoFinder::iterator SPI = DIF.subprogram_begin(),
+           SPE = DIF.subprogram_end(); SPI != SPE; ++SPI) {
+    DISubprogram SP(*SPI);
+    raw_fd_ostream *&os = GcnoFiles[SP.getCompileUnit()];
+
+    Function *F = SP.getFunction();
+    if (!F) continue;
+    GCOVFunction Func(SP, os, Use402Format);
+
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+      GCOVBlock &Block = Func.getBlock(BB);
+      TerminatorInst *TI = BB->getTerminator();
+      if (int successors = TI->getNumSuccessors()) {
+        for (int i = 0; i != successors; ++i) {
+          Block.addEdge(Func.getBlock(TI->getSuccessor(i)));
+        }
+      } else if (isa<ReturnInst>(TI)) {
+        Block.addEdge(Func.getReturnBlock());
+      }
+
+      uint32_t Line = 0;
+      for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE; ++I) {
+        const DebugLoc &Loc = I->getDebugLoc();
+        if (Loc.isUnknown()) continue;
+        if (Line == Loc.getLine()) continue;
+        Line = Loc.getLine();
+        if (SP != findSubprogram(DIScope(Loc.getScope(*Ctx)))) continue;
+
+        GCOVLines &Lines = Block.getFile(SP.getFilename());
+        Lines.addLine(Loc.getLine());
+      }
+    }
+    Func.writeOut();
+  }
+
+  for (DenseMap<const MDNode *, raw_fd_ostream *>::iterator
+           I = GcnoFiles.begin(), E = GcnoFiles.end(); I != E; ++I) {
+    raw_fd_ostream *&out = I->second;
+    out->write("\0\0\0\0\0\0\0\0", 8);  // EOF
+    out->close();
+    delete out;
+  }
+}
+
+bool GCOVProfiler::emitProfileArcs(DebugInfoFinder &DIF) {
+  if (DIF.subprogram_begin() == DIF.subprogram_end())
+    return false;
+
+  SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP;
+  for (DebugInfoFinder::iterator SPI = DIF.subprogram_begin(),
+           SPE = DIF.subprogram_end(); SPI != SPE; ++SPI) {
+    DISubprogram SP(*SPI);
+    Function *F = SP.getFunction();
+    if (!F) continue;
+
+    unsigned Edges = 0;
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+      TerminatorInst *TI = BB->getTerminator();
+      if (isa<ReturnInst>(TI))
+        ++Edges;
+      else
+        Edges += TI->getNumSuccessors();
+    }
+
+    const ArrayType *CounterTy =
+        ArrayType::get(Type::getInt64Ty(*Ctx), Edges);
+    GlobalVariable *Counters =
+        new GlobalVariable(*M, CounterTy, false,
+                           GlobalValue::InternalLinkage,
+                           Constant::getNullValue(CounterTy),
+                           "__llvm_gcov_ctr", 0, false, 0);
+    CountersBySP.push_back(std::make_pair(Counters, (MDNode*)SP));
+
+    UniqueVector<BasicBlock *> ComplexEdgePreds;
+    UniqueVector<BasicBlock *> ComplexEdgeSuccs;
+
+    unsigned Edge = 0;
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+      TerminatorInst *TI = BB->getTerminator();
+      int Successors = isa<ReturnInst>(TI) ? 1 : TI->getNumSuccessors();
+      if (Successors) {
+        IRBuilder<> Builder(TI);
+
+        if (Successors == 1) {
+          Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0,
+                                                              Edge);
+          Value *Count = Builder.CreateLoad(Counter);
+          Count = Builder.CreateAdd(Count,
+                                    ConstantInt::get(Type::getInt64Ty(*Ctx),1));
+          Builder.CreateStore(Count, Counter);
+        } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+          Value *Sel = Builder.CreateSelect(
+              BI->getCondition(),
+              ConstantInt::get(Type::getInt64Ty(*Ctx), Edge),
+              ConstantInt::get(Type::getInt64Ty(*Ctx), Edge + 1));
+          SmallVector<Value *, 2> Idx;
+          Idx.push_back(Constant::getNullValue(Type::getInt64Ty(*Ctx)));
+          Idx.push_back(Sel);
+          Value *Counter = Builder.CreateInBoundsGEP(Counters,
+                                                     Idx.begin(), Idx.end());
+          Value *Count = Builder.CreateLoad(Counter);
+          Count = Builder.CreateAdd(Count,
+                                    ConstantInt::get(Type::getInt64Ty(*Ctx),1));
+          Builder.CreateStore(Count, Counter);
+        } else {
+          ComplexEdgePreds.insert(BB);
+          for (int i = 0; i != Successors; ++i)
+            ComplexEdgeSuccs.insert(TI->getSuccessor(i));
+        }
+        Edge += Successors;
+      }
+    }
+
+    if (!ComplexEdgePreds.empty()) {
+      GlobalVariable *EdgeTable =
+          buildEdgeLookupTable(F, Counters,
+                               ComplexEdgePreds, ComplexEdgeSuccs);
+      GlobalVariable *EdgeState = getEdgeStateValue();
+
+      const Type *Int32Ty = Type::getInt32Ty(*Ctx);
+      for (int i = 0, e = ComplexEdgePreds.size(); i != e; ++i) {
+        IRBuilder<> Builder(ComplexEdgePreds[i+1]->getTerminator());
+        Builder.CreateStore(ConstantInt::get(Int32Ty, i), EdgeState);
+      }
+      for (int i = 0, e = ComplexEdgeSuccs.size(); i != e; ++i) {
+        // call runtime to perform increment
+        IRBuilder<> Builder(ComplexEdgeSuccs[i+1]->getFirstNonPHI());
+        Value *CounterPtrArray =
+            Builder.CreateConstInBoundsGEP2_64(EdgeTable, 0,
+                                               i * ComplexEdgePreds.size());
+        Builder.CreateCall2(getIncrementIndirectCounterFunc(),
+                            EdgeState, CounterPtrArray);
+        // clear the predecessor number
+        Builder.CreateStore(ConstantInt::get(Int32Ty, 0xffffffff), EdgeState);
+      }
+    }
+  }
+
+  insertCounterWriteout(DIF, CountersBySP);
+
+  return true;
+}
+
+// All edges with successors that aren't branches are "complex", because it
+// requires complex logic to pick which counter to update.
+GlobalVariable *GCOVProfiler::buildEdgeLookupTable(
+    Function *F,
+    GlobalVariable *Counters,
+    const UniqueVector<BasicBlock *> &Preds,
+    const UniqueVector<BasicBlock *> &Succs) {
+  // TODO: support invoke, threads. We rely on the fact that nothing can modify
+  // the whole-Module pred edge# between the time we set it and the time we next
+  // read it. Threads and invoke make this untrue.
+
+  // emit [(succs * preds) x i64*], logically [succ x [pred x i64*]].
+  const Type *Int64PtrTy = Type::getInt64PtrTy(*Ctx);
+  const ArrayType *EdgeTableTy = ArrayType::get(
+      Int64PtrTy, Succs.size() * Preds.size());
+
+  Constant **EdgeTable = new Constant*[Succs.size() * Preds.size()];
+  Constant *NullValue = Constant::getNullValue(Int64PtrTy);
+  for (int i = 0, ie = Succs.size() * Preds.size(); i != ie; ++i)
+    EdgeTable[i] = NullValue;
+
+  unsigned Edge = 0;
+  for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+    TerminatorInst *TI = BB->getTerminator();
+    int Successors = isa<ReturnInst>(TI) ? 1 : TI->getNumSuccessors();
+    if (Successors > 1 && !isa<BranchInst>(TI) && !isa<ReturnInst>(TI)) {
+      for (int i = 0; i != Successors; ++i) {
+        BasicBlock *Succ = TI->getSuccessor(i);
+        IRBuilder<> builder(Succ);
+        Value *Counter = builder.CreateConstInBoundsGEP2_64(Counters, 0,
+                                                            Edge + i);
+        EdgeTable[((Succs.idFor(Succ)-1) * Preds.size()) +
+                  (Preds.idFor(BB)-1)] = cast<Constant>(Counter);
+      }
+    }
+    Edge += Successors;
+  }
+
+  GlobalVariable *EdgeTableGV =
+      new GlobalVariable(
+          *M, EdgeTableTy, true, GlobalValue::InternalLinkage,
+          ConstantArray::get(EdgeTableTy,
+                             &EdgeTable[0], Succs.size() * Preds.size()),
+          "__llvm_gcda_edge_table");
+  EdgeTableGV->setUnnamedAddr(true);
+  return EdgeTableGV;
+}
+
+Constant *GCOVProfiler::getStartFileFunc() {
+  const Type *Args[] = { Type::getInt8PtrTy(*Ctx) };
+  const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx),
+                                              Args, false);
+  return M->getOrInsertFunction("llvm_gcda_start_file", FTy);
+}
+
+Constant *GCOVProfiler::getIncrementIndirectCounterFunc() {
+  const Type *Args[] = {
+    Type::getInt32PtrTy(*Ctx),                  // uint32_t *predecessor
+    Type::getInt64PtrTy(*Ctx)->getPointerTo(),  // uint64_t **state_table_row
+  };
+  const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx),
+                                              Args, false);
+  return M->getOrInsertFunction("llvm_gcda_increment_indirect_counter", FTy);
+}
+
+Constant *GCOVProfiler::getEmitFunctionFunc() {
+  const Type *Args[2] = {
+    Type::getInt32Ty(*Ctx),    // uint32_t ident
+    Type::getInt8PtrTy(*Ctx),  // const char *function_name
+  };
+  const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx),
+                                              Args, false);
+  return M->getOrInsertFunction("llvm_gcda_emit_function", FTy);
+}
+
+Constant *GCOVProfiler::getEmitArcsFunc() {
+  const Type *Args[] = {
+    Type::getInt32Ty(*Ctx),     // uint32_t num_counters
+    Type::getInt64PtrTy(*Ctx),  // uint64_t *counters
+  };
+  const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx),
+                                              Args, false);
+  return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy);
+}
+
+Constant *GCOVProfiler::getEndFileFunc() {
+  const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+  return M->getOrInsertFunction("llvm_gcda_end_file", FTy);
+}
+
+GlobalVariable *GCOVProfiler::getEdgeStateValue() {
+  GlobalVariable *GV = M->getGlobalVariable("__llvm_gcov_global_state_pred");
+  if (!GV) {
+    GV = new GlobalVariable(*M, Type::getInt32Ty(*Ctx), false,
+                            GlobalValue::InternalLinkage,
+                            ConstantInt::get(Type::getInt32Ty(*Ctx),
+                                             0xffffffff),
+                            "__llvm_gcov_global_state_pred");
+    GV->setUnnamedAddr(true);
+  }
+  return GV;
+}
+
+void GCOVProfiler::insertCounterWriteout(
+    DebugInfoFinder &DIF,
+    SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> &CountersBySP) {
+  const FunctionType *WriteoutFTy =
+      FunctionType::get(Type::getVoidTy(*Ctx), false);
+  Function *WriteoutF = Function::Create(WriteoutFTy,
+                                         GlobalValue::InternalLinkage,
+                                         "__llvm_gcov_writeout", M);
+  WriteoutF->setUnnamedAddr(true);
+  BasicBlock *BB = BasicBlock::Create(*Ctx, "", WriteoutF);
+  IRBuilder<> Builder(BB);
+
+  Constant *StartFile = getStartFileFunc();
+  Constant *EmitFunction = getEmitFunctionFunc();
+  Constant *EmitArcs = getEmitArcsFunc();
+  Constant *EndFile = getEndFileFunc();
+
+  for (DebugInfoFinder::iterator CUI = DIF.compile_unit_begin(),
+           CUE = DIF.compile_unit_end(); CUI != CUE; ++CUI) {
+    DICompileUnit compile_unit(*CUI);
+    std::string FilenameGcda = mangleName(compile_unit, "gcda");
+    Builder.CreateCall(StartFile,
+                       Builder.CreateGlobalStringPtr(FilenameGcda));
+    for (SmallVector<std::pair<GlobalVariable *, MDNode *>, 8>::iterator
+             I = CountersBySP.begin(), E = CountersBySP.end();
+         I != E; ++I) {
+      DISubprogram SP(I->second);
+      intptr_t ident = reinterpret_cast<intptr_t>(I->second);
+      Builder.CreateCall2(EmitFunction,
+                          ConstantInt::get(Type::getInt32Ty(*Ctx), ident),
+                          Builder.CreateGlobalStringPtr(SP.getName()));
+                                                        
+      GlobalVariable *GV = I->first;
+      unsigned Arcs =
+          cast<ArrayType>(GV->getType()->getElementType())->getNumElements();
+      Builder.CreateCall2(EmitArcs,
+                          ConstantInt::get(Type::getInt32Ty(*Ctx), Arcs),
+                          Builder.CreateConstGEP2_64(GV, 0, 0));
+    }
+    Builder.CreateCall(EndFile);
+  }
+  Builder.CreateRetVoid();
+
+  InsertProfilingShutdownCall(WriteoutF, M);
+}
diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp
index 96ed4fa..71adc1e 100644
--- a/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -23,6 +23,7 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) {
   initializeEdgeProfilerPass(Registry);
   initializeOptimalEdgeProfilerPass(Registry);
   initializePathProfilerPass(Registry);
+  initializeGCOVProfilerPass(Registry);
 }
 
 /// LLVMInitializeInstrumentation - C binding for
diff --git a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp
index ae2f2e2..e09f882 100644
--- a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp
+++ b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp
@@ -14,6 +14,7 @@
 //===----------------------------------------------------------------------===//
 #define DEBUG_TYPE "insert-optimal-edge-profiling"
 #include "ProfilingUtils.h"
+#include "llvm/Constants.h"
 #include "llvm/Module.h"
 #include "llvm/Pass.h"
 #include "llvm/Analysis/Passes.h"
@@ -26,7 +27,6 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "MaximumSpanningTree.h"
-#include <set>
 using namespace llvm;
 
 STATISTIC(NumEdgesInserted, "The # of edges inserted.");
diff --git a/lib/Transforms/Instrumentation/PathProfiling.cpp b/lib/Transforms/Instrumentation/PathProfiling.cpp
index 830251c..182a43d 100644
--- a/lib/Transforms/Instrumentation/PathProfiling.cpp
+++ b/lib/Transforms/Instrumentation/PathProfiling.cpp
@@ -63,7 +63,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Instrumentation.h"
-#include <map>
 #include <vector>
 
 #define HASH_THRESHHOLD 100000
@@ -259,7 +258,7 @@ private:
 };
 
 // ---------------------------------------------------------------------------
-// PathProfiler is a module pass which intruments path profiling instructions
+// PathProfiler is a module pass which instruments path profiling instructions
 // ---------------------------------------------------------------------------
 class PathProfiler : public ModulePass {
 private:
@@ -389,6 +388,9 @@ namespace llvm {
 
   // BallLarusEdge << operator overloading
   raw_ostream& operator<<(raw_ostream& os,
+                          const BLInstrumentationEdge& edge)
+      LLVM_ATTRIBUTE_USED;
+  raw_ostream& operator<<(raw_ostream& os,
                           const BLInstrumentationEdge& edge) {
     os << "[" << edge.getSource()->getName() << " -> "
        << edge.getTarget()->getName() << "] init: "
@@ -1349,8 +1351,6 @@ bool PathProfiler::runOnModule(Module &M) {
     return false;
   }
 
-  BasicBlock::iterator insertPoint = Main->getEntryBlock().getFirstNonPHI();
-
   llvmIncrementHashFunction = M.getOrInsertFunction(
     "llvm_increment_path_count",
     Type::getVoidTy(*Context), // return type
diff --git a/lib/Transforms/Instrumentation/ProfilingUtils.cpp b/lib/Transforms/Instrumentation/ProfilingUtils.cpp
index b57bbf6..7435bc3 100644
--- a/lib/Transforms/Instrumentation/ProfilingUtils.cpp
+++ b/lib/Transforms/Instrumentation/ProfilingUtils.cpp
@@ -110,7 +110,7 @@ void llvm::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
                                    GlobalValue *CounterArray, bool beginning) {
   // Insert the increment after any alloca or PHI instructions...
   BasicBlock::iterator InsertPos = beginning ? BB->getFirstNonPHI() :
-                BB->getTerminator();
+                                   BB->getTerminator();
   while (isa<AllocaInst>(InsertPos))
     ++InsertPos;
 
@@ -121,8 +121,7 @@ void llvm::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
   Indices[0] = Constant::getNullValue(Type::getInt32Ty(Context));
   Indices[1] = ConstantInt::get(Type::getInt32Ty(Context), CounterNum);
   Constant *ElementPtr =
-    ConstantExpr::getGetElementPtr(CounterArray, &Indices[0],
-                                          Indices.size());
+    ConstantExpr::getGetElementPtr(CounterArray, &Indices[0], Indices.size());
 
   // Load, increment and store the value back.
   Value *OldVal = new LoadInst(ElementPtr, "OldFuncCounter", InsertPos);
@@ -131,3 +130,41 @@ void llvm::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
                                          "NewFuncCounter", InsertPos);
   new StoreInst(NewVal, ElementPtr, InsertPos);
 }
+
+void llvm::InsertProfilingShutdownCall(Function *Callee, Module *Mod) {
+  // llvm.global_dtors is an array of type { i32, void ()* }. Prepare those
+  // types.
+  const Type *GlobalDtorElems[2] = {
+    Type::getInt32Ty(Mod->getContext()),
+    FunctionType::get(Type::getVoidTy(Mod->getContext()), false)->getPointerTo()
+  };
+  const StructType *GlobalDtorElemTy =
+      StructType::get(Mod->getContext(), GlobalDtorElems, false);
+
+  // Construct the new element we'll be adding.
+  Constant *Elem[2] = {
+    ConstantInt::get(Type::getInt32Ty(Mod->getContext()), 65535),
+    ConstantExpr::getBitCast(Callee, GlobalDtorElems[1])
+  };
+
+  // If llvm.global_dtors exists, make a copy of the things in its list and
+  // delete it, to replace it with one that has a larger array type.
+  std::vector<Constant *> dtors;
+  if (GlobalVariable *GlobalDtors = Mod->getNamedGlobal("llvm.global_dtors")) {
+    if (ConstantArray *InitList =
+        dyn_cast<ConstantArray>(GlobalDtors->getInitializer())) {
+      for (unsigned i = 0, e = InitList->getType()->getNumElements();
+           i != e; ++i)
+        dtors.push_back(cast<Constant>(InitList->getOperand(i)));
+    }
+    GlobalDtors->eraseFromParent();
+  }
+
+  // Build up llvm.global_dtors with our new item in it.
+  GlobalVariable *GlobalDtors = new GlobalVariable(
+      *Mod, ArrayType::get(GlobalDtorElemTy, 1), false,
+      GlobalValue::AppendingLinkage, NULL, "llvm.global_dtors");
+  dtors.push_back(ConstantStruct::get(Mod->getContext(), Elem, 2, false));
+  GlobalDtors->setInitializer(ConstantArray::get(
+      cast<ArrayType>(GlobalDtors->getType()->getElementType()), dtors));
+}
diff --git a/lib/Transforms/Instrumentation/ProfilingUtils.h b/lib/Transforms/Instrumentation/ProfilingUtils.h
index a76e357..09b2217 100644
--- a/lib/Transforms/Instrumentation/ProfilingUtils.h
+++ b/lib/Transforms/Instrumentation/ProfilingUtils.h
@@ -18,9 +18,10 @@
 #define PROFILINGUTILS_H
 
 namespace llvm {
+  class BasicBlock;
   class Function;
   class GlobalValue;
-  class BasicBlock;
+  class Module;
   class PointerType;
 
   void InsertProfilingInitCall(Function *MainFn, const char *FnName,
@@ -29,6 +30,7 @@ namespace llvm {
   void IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
                                GlobalValue *CounterArray,
                                bool beginning = true);
+  void InsertProfilingShutdownCall(Function *Callee, Module *Mod);
 }
 
 #endif
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index fcf914f..c223da6 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -20,6 +20,7 @@ add_llvm_library(LLVMScalarOpts
   LoopUnswitch.cpp
   LowerAtomic.cpp
   MemCpyOptimizer.cpp
+  ObjCARC.cpp
   Reassociate.cpp
   Reg2Mem.cpp
   SCCP.cpp
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
index 2f7ccea..0af14ed 100644
--- a/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -147,7 +147,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
   if (!DisableBranchOpts) {
     MadeChange = false;
     for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
-      MadeChange |= ConstantFoldTerminator(BB);
+      MadeChange |= ConstantFoldTerminator(BB, true);
 
     if (MadeChange)
       ModifiedDT = true;
@@ -371,9 +371,11 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){
   // If these values will be promoted, find out what they will be promoted
   // to.  This helps us consider truncates on PPC as noop copies when they
   // are.
-  if (TLI.getTypeAction(SrcVT) == TargetLowering::Promote)
+  if (TLI.getTypeAction(CI->getContext(), SrcVT) ==
+      TargetLowering::TypePromoteInteger)
     SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT);
-  if (TLI.getTypeAction(DstVT) == TargetLowering::Promote)
+  if (TLI.getTypeAction(CI->getContext(), DstVT) ==
+      TargetLowering::TypePromoteInteger)
     DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT);
 
   // If, after promotion, these are the same types, this is a noop copy.
@@ -548,7 +550,23 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
 
   // From here on out we're working with named functions.
   if (CI->getCalledFunction() == 0) return false;
-  
+
+  // llvm.dbg.value is far away from the value then iSel may not be able
+  // handle it properly. iSel will drop llvm.dbg.value if it can not 
+  // find a node corresponding to the value.
+  if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(CI))
+    if (Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue()))
+      if (!VI->isTerminator() &&
+          (DVI->getParent() != VI->getParent() || DT->dominates(DVI, VI))) {
+        DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI);
+        DVI->removeFromParent();
+        if (isa<PHINode>(VI))
+          DVI->insertBefore(VI->getParent()->getFirstNonPHI());
+        else
+          DVI->insertAfter(VI);
+        return true;
+      }
+
   // We'll need TargetData from here on out.
   const TargetData *TD = TLI ? TLI->getTargetData() : 0;
   if (!TD) return false;
@@ -889,11 +907,26 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
 
   MemoryInst->replaceUsesOfWith(Repl, SunkAddr);
 
+  // If we have no uses, recursively delete the value and all dead instructions
+  // using it.
   if (Repl->use_empty()) {
+    // This can cause recursive deletion, which can invalidate our iterator.
+    // Use a WeakVH to hold onto it in case this happens.
+    WeakVH IterHandle(CurInstIterator);
+    BasicBlock *BB = CurInstIterator->getParent();
+    
     RecursivelyDeleteTriviallyDeadInstructions(Repl);
-    // This address is now available for reassignment, so erase the table entry;
-    // we don't want to match some completely different instruction.
-    SunkAddrs[Addr] = 0;
+
+    if (IterHandle != CurInstIterator) {
+      // If the iterator instruction was recursively deleted, start over at the
+      // start of the block.
+      CurInstIterator = BB->begin();
+      SunkAddrs.clear();
+    } else {
+      // This address is now available for reassignment, so erase the table
+      // entry; we don't want to match some completely different instruction.
+      SunkAddrs[Addr] = 0;
+    }    
   }
   ++NumMemoryInsts;
   return true;
diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index be12973..e275268 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -13,6 +13,7 @@
 
 #define DEBUG_TYPE "correlated-value-propagation"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
 #include "llvm/Function.h"
 #include "llvm/Instructions.h"
 #include "llvm/Pass.h"
diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp
index dbb68f3..8dbcc23 100644
--- a/lib/Transforms/Scalar/DCE.cpp
+++ b/lib/Transforms/Scalar/DCE.cpp
@@ -23,7 +23,6 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/InstIterator.h"
 #include "llvm/ADT/Statistic.h"
-#include <set>
 using namespace llvm;
 
 STATISTIC(DIEEliminated, "Number of insts removed by DIE pass");
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 9f36a38..cb9b5be 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -437,12 +437,9 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
 
     MemDepResult InstDep = MD->getDependency(Inst);
     
-    // Ignore non-local store liveness.
+    // Ignore any store where we can't find a local dependence.
     // FIXME: cross-block DSE would be fun. :)
-    if (InstDep.isNonLocal() || 
-        // Ignore self dependence, which happens in the entry block of the
-        // function.
-        InstDep.getInst() == Inst)
+    if (InstDep.isNonLocal() || InstDep.isUnknown())
       continue;
      
     // If we're storing the same value back to a pointer that we just
@@ -478,14 +475,14 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
     if (Loc.Ptr == 0)
       continue;
     
-    while (!InstDep.isNonLocal()) {
+    while (!InstDep.isNonLocal() && !InstDep.isUnknown()) {
       // Get the memory clobbered by the instruction we depend on.  MemDep will
       // skip any instructions that 'Loc' clearly doesn't interact with.  If we
       // end up depending on a may- or must-aliased load, then we can't optimize
       // away the store and we bail out.  However, if we depend on on something
       // that overwrites the memory location we *can* potentially optimize it.
       //
-      // Find out what memory location the dependant instruction stores.
+      // Find out what memory location the dependent instruction stores.
       Instruction *DepWrite = InstDep.getInst();
       AliasAnalysis::Location DepLoc = getLocForWrite(DepWrite, *AA);
       // If we didn't get a useful location, or if it isn't a size, bail out.
@@ -542,24 +539,26 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
 /// HandleFree - Handle frees of entire structures whose dependency is a store
 /// to a field of that structure.
 bool DSE::HandleFree(CallInst *F) {
+  bool MadeChange = false;
+
   MemDepResult Dep = MD->getDependency(F);
-  do {
-    if (Dep.isNonLocal()) return false;
-    
+
+  while (!Dep.isNonLocal() && !Dep.isUnknown()) {
     Instruction *Dependency = Dep.getInst();
     if (!hasMemoryWrite(Dependency) || !isRemovable(Dependency))
-      return false;
+      return MadeChange;
   
     Value *DepPointer =
       GetUnderlyingObject(getStoredPointerOperand(Dependency));
 
     // Check for aliasing.
     if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
-      return false;
+      return MadeChange;
   
     // DCE instructions only used to calculate that store
     DeleteDeadInstruction(Dependency, *MD);
     ++NumFastStores;
+    MadeChange = true;
 
     // Inst's old Dependency is now deleted. Compute the next dependency,
     // which may also be dead, as in
@@ -567,9 +566,9 @@ bool DSE::HandleFree(CallInst *F) {
     //    s[1] = 0; // This has just been deleted.
     //    free(s);
     Dep = MD->getDependency(F);
-  } while (!Dep.isNonLocal());
+  };
   
-  return true;
+  return MadeChange;
 }
 
 /// handleEndBlock - Remove dead stores to stack-allocated locations in the
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 45fd665..0e1e6f3 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -63,50 +63,48 @@ static cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true));
 namespace {
   struct Expression {
     uint32_t opcode;
-    const Type* type;
+    const Type *type;
     SmallVector<uint32_t, 4> varargs;
 
-    Expression() { }
-    Expression(uint32_t o) : opcode(o) { }
+    Expression(uint32_t o = ~2U) : opcode(o) { }
 
     bool operator==(const Expression &other) const {
       if (opcode != other.opcode)
         return false;
-      else if (opcode == ~0U || opcode == ~1U)
+      if (opcode == ~0U || opcode == ~1U)
         return true;
-      else if (type != other.type)
+      if (type != other.type)
         return false;
-      else if (varargs != other.varargs)
+      if (varargs != other.varargs)
         return false;
       return true;
     }
   };
 
   class ValueTable {
-    private:
-      DenseMap<Value*, uint32_t> valueNumbering;
-      DenseMap<Expression, uint32_t> expressionNumbering;
-      AliasAnalysis* AA;
-      MemoryDependenceAnalysis* MD;
-      DominatorTree* DT;
-
-      uint32_t nextValueNumber;
-
-      Expression create_expression(Instruction* I);
-      uint32_t lookup_or_add_call(CallInst* C);
-    public:
-      ValueTable() : nextValueNumber(1) { }
-      uint32_t lookup_or_add(Value *V);
-      uint32_t lookup(Value *V) const;
-      void add(Value *V, uint32_t num);
-      void clear();
-      void erase(Value *v);
-      void setAliasAnalysis(AliasAnalysis* A) { AA = A; }
-      AliasAnalysis *getAliasAnalysis() const { return AA; }
-      void setMemDep(MemoryDependenceAnalysis* M) { MD = M; }
-      void setDomTree(DominatorTree* D) { DT = D; }
-      uint32_t getNextUnusedValueNumber() { return nextValueNumber; }
-      void verifyRemoved(const Value *) const;
+    DenseMap<Value*, uint32_t> valueNumbering;
+    DenseMap<Expression, uint32_t> expressionNumbering;
+    AliasAnalysis *AA;
+    MemoryDependenceAnalysis *MD;
+    DominatorTree *DT;
+
+    uint32_t nextValueNumber;
+
+    Expression create_expression(Instruction* I);
+    uint32_t lookup_or_add_call(CallInst* C);
+  public:
+    ValueTable() : nextValueNumber(1) { }
+    uint32_t lookup_or_add(Value *V);
+    uint32_t lookup(Value *V) const;
+    void add(Value *V, uint32_t num);
+    void clear();
+    void erase(Value *v);
+    void setAliasAnalysis(AliasAnalysis* A) { AA = A; }
+    AliasAnalysis *getAliasAnalysis() const { return AA; }
+    void setMemDep(MemoryDependenceAnalysis* M) { MD = M; }
+    void setDomTree(DominatorTree* D) { DT = D; }
+    uint32_t getNextUnusedValueNumber() { return nextValueNumber; }
+    void verifyRemoved(const Value *) const;
   };
 }
 
@@ -229,21 +227,19 @@ uint32_t ValueTable::lookup_or_add_call(CallInst* C) {
     // Non-local case.
     const MemoryDependenceAnalysis::NonLocalDepInfo &deps =
       MD->getNonLocalCallDependency(CallSite(C));
-    // FIXME: call/call dependencies for readonly calls should return def, not
-    // clobber!  Move the checking logic to MemDep!
+    // FIXME: Move the checking logic to MemDep!
     CallInst* cdep = 0;
 
     // Check to see if we have a single dominating call instruction that is
     // identical to C.
     for (unsigned i = 0, e = deps.size(); i != e; ++i) {
       const NonLocalDepEntry *I = &deps[i];
-      // Ignore non-local dependencies.
       if (I->getResult().isNonLocal())
         continue;
 
-      // We don't handle non-depedencies.  If we already have a call, reject
+      // We don't handle non-definitions.  If we already have a call, reject
       // instruction dependencies.
-      if (I->getResult().isClobber() || cdep != 0) {
+      if (!I->getResult().isDef() || cdep != 0) {
         cdep = 0;
         break;
       }
@@ -364,14 +360,14 @@ uint32_t ValueTable::lookup(Value *V) const {
   return VI->second;
 }
 
-/// clear - Remove all entries from the ValueTable
+/// clear - Remove all entries from the ValueTable.
 void ValueTable::clear() {
   valueNumbering.clear();
   expressionNumbering.clear();
   nextValueNumber = 1;
 }
 
-/// erase - Remove a value from the value numbering
+/// erase - Remove a value from the value numbering.
 void ValueTable::erase(Value *V) {
   valueNumbering.erase(V);
 }
@@ -392,20 +388,11 @@ void ValueTable::verifyRemoved(const Value *V) const {
 namespace {
 
   class GVN : public FunctionPass {
-    bool runOnFunction(Function &F);
-  public:
-    static char ID; // Pass identification, replacement for typeid
-    explicit GVN(bool noloads = false)
-        : FunctionPass(ID), NoLoads(noloads), MD(0) {
-      initializeGVNPass(*PassRegistry::getPassRegistry());
-    }
-
-  private:
     bool NoLoads;
     MemoryDependenceAnalysis *MD;
     DominatorTree *DT;
-    const TargetData* TD;
-
+    const TargetData *TD;
+    
     ValueTable VN;
     
     /// LeaderTable - A mapping from value numbers to lists of Value*'s that
@@ -418,17 +405,39 @@ namespace {
     DenseMap<uint32_t, LeaderTableEntry> LeaderTable;
     BumpPtrAllocator TableAllocator;
     
+    SmallVector<Instruction*, 8> InstrsToErase;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit GVN(bool noloads = false)
+        : FunctionPass(ID), NoLoads(noloads), MD(0) {
+      initializeGVNPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F);
+    
+    /// markInstructionForDeletion - This removes the specified instruction from
+    /// our various maps and marks it for deletion.
+    void markInstructionForDeletion(Instruction *I) {
+      VN.erase(I);
+      InstrsToErase.push_back(I);
+    }
+    
+    const TargetData *getTargetData() const { return TD; }
+    DominatorTree &getDominatorTree() const { return *DT; }
+    AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); }
+    MemoryDependenceAnalysis &getMemDep() const { return *MD; }
+  private:
     /// addToLeaderTable - Push a new Value to the LeaderTable onto the list for
     /// its value number.
     void addToLeaderTable(uint32_t N, Value *V, BasicBlock *BB) {
-      LeaderTableEntry& Curr = LeaderTable[N];
+      LeaderTableEntry &Curr = LeaderTable[N];
       if (!Curr.Val) {
         Curr.Val = V;
         Curr.BB = BB;
         return;
       }
       
-      LeaderTableEntry* Node = TableAllocator.Allocate<LeaderTableEntry>();
+      LeaderTableEntry *Node = TableAllocator.Allocate<LeaderTableEntry>();
       Node->Val = V;
       Node->BB = BB;
       Node->Next = Curr.Next;
@@ -474,19 +483,17 @@ namespace {
       AU.addPreserved<DominatorTree>();
       AU.addPreserved<AliasAnalysis>();
     }
+    
 
     // Helper fuctions
     // FIXME: eliminate or document these better
-    bool processLoad(LoadInst* L,
-                     SmallVectorImpl<Instruction*> &toErase);
-    bool processInstruction(Instruction *I,
-                            SmallVectorImpl<Instruction*> &toErase);
-    bool processNonLocalLoad(LoadInst* L,
-                             SmallVectorImpl<Instruction*> &toErase);
+    bool processLoad(LoadInst *L);
+    bool processInstruction(Instruction *I);
+    bool processNonLocalLoad(LoadInst *L);
     bool processBlock(BasicBlock *BB);
-    void dump(DenseMap<uint32_t, Value*>& d);
+    void dump(DenseMap<uint32_t, Value*> &d);
     bool iterateOnFunction(Function &F);
-    bool performPRE(Function& F);
+    bool performPRE(Function &F);
     Value *findLeader(BasicBlock *BB, uint32_t num);
     void cleanupGlobalSets();
     void verifyRemoved(const Instruction *I) const;
@@ -629,17 +636,17 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
   if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, TD))
     return 0;
   
+  // If this is already the right type, just return it.
   const Type *StoredValTy = StoredVal->getType();
   
   uint64_t StoreSize = TD.getTypeStoreSizeInBits(StoredValTy);
-  uint64_t LoadSize = TD.getTypeSizeInBits(LoadedTy);
+  uint64_t LoadSize = TD.getTypeStoreSizeInBits(LoadedTy);
   
   // If the store and reload are the same size, we can always reuse it.
   if (StoreSize == LoadSize) {
-    if (StoredValTy->isPointerTy() && LoadedTy->isPointerTy()) {
-      // Pointer to Pointer -> use bitcast.
+    // Pointer to Pointer -> use bitcast.
+    if (StoredValTy->isPointerTy() && LoadedTy->isPointerTy())
       return new BitCastInst(StoredVal, LoadedTy, "", InsertPt);
-    }
     
     // Convert source pointers to integers, which can be bitcast.
     if (StoredValTy->isPointerTy()) {
@@ -796,6 +803,36 @@ static int AnalyzeLoadFromClobberingStore(const Type *LoadTy, Value *LoadPtr,
                                         StorePtr, StoreSize, TD);
 }
 
+/// AnalyzeLoadFromClobberingLoad - This function is called when we have a
+/// memdep query of a load that ends up being clobbered by another load.  See if
+/// the other load can feed into the second load.
+static int AnalyzeLoadFromClobberingLoad(const Type *LoadTy, Value *LoadPtr,
+                                         LoadInst *DepLI, const TargetData &TD){
+  // Cannot handle reading from store of first-class aggregate yet.
+  if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
+    return -1;
+  
+  Value *DepPtr = DepLI->getPointerOperand();
+  uint64_t DepSize = TD.getTypeSizeInBits(DepLI->getType());
+  int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, TD);
+  if (R != -1) return R;
+  
+  // If we have a load/load clobber an DepLI can be widened to cover this load,
+  // then we should widen it!
+  int64_t LoadOffs = 0;
+  const Value *LoadBase =
+    GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, TD);
+  unsigned LoadSize = TD.getTypeStoreSize(LoadTy);
+  
+  unsigned Size = MemoryDependenceAnalysis::
+    getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI, TD);
+  if (Size == 0) return -1;
+  
+  return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, TD);
+}
+
+
+
 static int AnalyzeLoadFromClobberingMemInst(const Type *LoadTy, Value *LoadPtr,
                                             MemIntrinsic *MI,
                                             const TargetData &TD) {
@@ -843,9 +880,9 @@ static int AnalyzeLoadFromClobberingMemInst(const Type *LoadTy, Value *LoadPtr,
 
 /// GetStoreValueForLoad - This function is called when we have a
 /// memdep query of a load that ends up being a clobbering store.  This means
-/// that the store *may* provide bits used by the load but we can't be sure
-/// because the pointers don't mustalias.  Check this case to see if there is
-/// anything more we can do before we give up.
+/// that the store provides bits used by the load but we the pointers don't
+/// mustalias.  Check this case to see if there is anything more we can do
+/// before we give up.
 static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
                                    const Type *LoadTy,
                                    Instruction *InsertPt, const TargetData &TD){
@@ -881,6 +918,69 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
   return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, TD);
 }
 
+/// GetStoreValueForLoad - This function is called when we have a
+/// memdep query of a load that ends up being a clobbering load.  This means
+/// that the load *may* provide bits used by the load but we can't be sure
+/// because the pointers don't mustalias.  Check this case to see if there is
+/// anything more we can do before we give up.
+static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
+                                  const Type *LoadTy, Instruction *InsertPt,
+                                  GVN &gvn) {
+  const TargetData &TD = *gvn.getTargetData();
+  // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
+  // widen SrcVal out to a larger load.
+  unsigned SrcValSize = TD.getTypeStoreSize(SrcVal->getType());
+  unsigned LoadSize = TD.getTypeStoreSize(LoadTy);
+  if (Offset+LoadSize > SrcValSize) {
+    assert(!SrcVal->isVolatile() && "Cannot widen volatile load!");
+    assert(isa<IntegerType>(SrcVal->getType())&&"Can't widen non-integer load");
+    // If we have a load/load clobber an DepLI can be widened to cover this
+    // load, then we should widen it to the next power of 2 size big enough!
+    unsigned NewLoadSize = Offset+LoadSize;
+    if (!isPowerOf2_32(NewLoadSize))
+      NewLoadSize = NextPowerOf2(NewLoadSize);
+
+    Value *PtrVal = SrcVal->getPointerOperand();
+    
+    // Insert the new load after the old load.  This ensures that subsequent
+    // memdep queries will find the new load.  We can't easily remove the old
+    // load completely because it is already in the value numbering table.
+    IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal));
+    const Type *DestPTy = 
+      IntegerType::get(LoadTy->getContext(), NewLoadSize*8);
+    DestPTy = PointerType::get(DestPTy, 
+                       cast<PointerType>(PtrVal->getType())->getAddressSpace());
+    Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
+    PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
+    LoadInst *NewLoad = Builder.CreateLoad(PtrVal);
+    NewLoad->takeName(SrcVal);
+    NewLoad->setAlignment(SrcVal->getAlignment());
+
+    DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
+    DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
+    
+    // Replace uses of the original load with the wider load.  On a big endian
+    // system, we need to shift down to get the relevant bits.
+    Value *RV = NewLoad;
+    if (TD.isBigEndian())
+      RV = Builder.CreateLShr(RV,
+                    NewLoadSize*8-SrcVal->getType()->getPrimitiveSizeInBits());
+    RV = Builder.CreateTrunc(RV, SrcVal->getType());
+    SrcVal->replaceAllUsesWith(RV);
+    
+    // We would like to use gvn.markInstructionForDeletion here, but we can't
+    // because the load is already memoized into the leader map table that GVN
+    // tracks.  It is potentially possible to remove the load from the table,
+    // but then there all of the operations based on it would need to be
+    // rehashed.  Just leave the dead load around.
+    gvn.getMemDep().removeInstruction(SrcVal);
+    SrcVal = NewLoad;
+  }
+  
+  return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, TD);
+}
+
+
 /// GetMemInstValueForLoad - This function is called when we have a
 /// memdep query of a load that ends up being a clobbering mem intrinsic.
 static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
@@ -943,11 +1043,12 @@ struct AvailableValueInBlock {
   BasicBlock *BB;
   enum ValType {
     SimpleVal,  // A simple offsetted value that is accessed.
+    LoadVal,    // A value produced by a load.
     MemIntrin   // A memory intrinsic which is loaded from.
   };
   
   /// V - The value that is live out of the block.
-  PointerIntPair<Value *, 1, ValType> Val;
+  PointerIntPair<Value *, 2, ValType> Val;
   
   /// Offset - The byte offset in Val that is interesting for the load query.
   unsigned Offset;
@@ -972,37 +1073,69 @@ struct AvailableValueInBlock {
     return Res;
   }
   
+  static AvailableValueInBlock getLoad(BasicBlock *BB, LoadInst *LI,
+                                       unsigned Offset = 0) {
+    AvailableValueInBlock Res;
+    Res.BB = BB;
+    Res.Val.setPointer(LI);
+    Res.Val.setInt(LoadVal);
+    Res.Offset = Offset;
+    return Res;
+  }
+
   bool isSimpleValue() const { return Val.getInt() == SimpleVal; }
+  bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; }
+  bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; }
+
   Value *getSimpleValue() const {
     assert(isSimpleValue() && "Wrong accessor");
     return Val.getPointer();
   }
   
+  LoadInst *getCoercedLoadValue() const {
+    assert(isCoercedLoadValue() && "Wrong accessor");
+    return cast<LoadInst>(Val.getPointer());
+  }
+  
   MemIntrinsic *getMemIntrinValue() const {
-    assert(!isSimpleValue() && "Wrong accessor");
+    assert(isMemIntrinValue() && "Wrong accessor");
     return cast<MemIntrinsic>(Val.getPointer());
   }
   
   /// MaterializeAdjustedValue - Emit code into this block to adjust the value
   /// defined here to the specified type.  This handles various coercion cases.
-  Value *MaterializeAdjustedValue(const Type *LoadTy,
-                                  const TargetData *TD) const {
+  Value *MaterializeAdjustedValue(const Type *LoadTy, GVN &gvn) const {
     Value *Res;
     if (isSimpleValue()) {
       Res = getSimpleValue();
       if (Res->getType() != LoadTy) {
+        const TargetData *TD = gvn.getTargetData();
         assert(TD && "Need target data to handle type mismatch case");
         Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(),
                                    *TD);
         
-        DEBUG(errs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << "  "
+        DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << "  "
                      << *getSimpleValue() << '\n'
                      << *Res << '\n' << "\n\n\n");
       }
+    } else if (isCoercedLoadValue()) {
+      LoadInst *Load = getCoercedLoadValue();
+      if (Load->getType() == LoadTy && Offset == 0) {
+        Res = Load;
+      } else {
+        Res = GetLoadValueForLoad(Load, Offset, LoadTy, BB->getTerminator(),
+                                  gvn);
+        
+        DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << "  "
+                     << *getCoercedLoadValue() << '\n'
+                     << *Res << '\n' << "\n\n\n");
+      }
     } else {
+      const TargetData *TD = gvn.getTargetData();
+      assert(TD && "Need target data to handle type mismatch case");
       Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset,
                                    LoadTy, BB->getTerminator(), *TD);
-      DEBUG(errs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
+      DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
                    << "  " << *getMemIntrinValue() << '\n'
                    << *Res << '\n' << "\n\n\n");
     }
@@ -1010,21 +1143,20 @@ struct AvailableValueInBlock {
   }
 };
 
-}
+} // end anonymous namespace
 
 /// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock,
 /// construct SSA form, allowing us to eliminate LI.  This returns the value
 /// that should be used at LI's definition site.
 static Value *ConstructSSAForLoadSet(LoadInst *LI, 
                          SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock,
-                                     const TargetData *TD,
-                                     const DominatorTree &DT,
-                                     AliasAnalysis *AA) {
+                                     GVN &gvn) {
   // Check for the fully redundant, dominating load case.  In this case, we can
   // just use the dominating value directly.
   if (ValuesPerBlock.size() == 1 && 
-      DT.properlyDominates(ValuesPerBlock[0].BB, LI->getParent()))
-    return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), TD);
+      gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB,
+                                               LI->getParent()))
+    return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), gvn);
 
   // Otherwise, we have to construct SSA form.
   SmallVector<PHINode*, 8> NewPHIs;
@@ -1040,14 +1172,16 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
     if (SSAUpdate.HasValueForBlock(BB))
       continue;
 
-    SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LoadTy, TD));
+    SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LoadTy, gvn));
   }
   
   // Perform PHI construction.
   Value *V = SSAUpdate.GetValueInMiddleOfBlock(LI->getParent());
   
   // If new PHI nodes were created, notify alias analysis.
-  if (V->getType()->isPointerTy())
+  if (V->getType()->isPointerTy()) {
+    AliasAnalysis *AA = gvn.getAliasAnalysis();
+    
     for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i)
       AA->copyValue(LI, NewPHIs[i]);
     
@@ -1059,6 +1193,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
       for (unsigned ii = 0, ee = P->getNumIncomingValues(); ii != ee; ++ii)
         AA->addEscapingUse(P->getOperandUse(2*ii));
     }
+  }
 
   return V;
 }
@@ -1071,8 +1206,7 @@ static bool isLifetimeStart(const Instruction *Inst) {
 
 /// processNonLocalLoad - Attempt to eliminate a load whose dependencies are
 /// non-local by performing PHI construction.
-bool GVN::processNonLocalLoad(LoadInst *LI,
-                              SmallVectorImpl<Instruction*> &toErase) {
+bool GVN::processNonLocalLoad(LoadInst *LI) {
   // Find the non-local dependencies of the load.
   SmallVector<NonLocalDepResult, 64> Deps;
   AliasAnalysis::Location Loc = VN.getAliasAnalysis()->getLocation(LI);
@@ -1088,11 +1222,11 @@ bool GVN::processNonLocalLoad(LoadInst *LI,
 
   // If we had a phi translation failure, we'll have a single entry which is a
   // clobber in the current block.  Reject this early.
-  if (Deps.size() == 1 && Deps[0].getResult().isClobber()) {
+  if (Deps.size() == 1 && Deps[0].getResult().isUnknown()) {
     DEBUG(
       dbgs() << "GVN: non-local load ";
       WriteAsOperand(dbgs(), LI);
-      dbgs() << " is clobbered by " << *Deps[0].getResult().getInst() << '\n';
+      dbgs() << " has unknown dependencies\n";
     );
     return false;
   }
@@ -1108,6 +1242,11 @@ bool GVN::processNonLocalLoad(LoadInst *LI,
     BasicBlock *DepBB = Deps[i].getBB();
     MemDepResult DepInfo = Deps[i].getResult();
 
+    if (DepInfo.isUnknown()) {
+      UnavailableBlocks.push_back(DepBB);
+      continue;
+    }
+
     if (DepInfo.isClobber()) {
       // The address being loaded in this non-local block may not be the same as
       // the pointer operand of the load if PHI translation occurs.  Make sure
@@ -1129,6 +1268,26 @@ bool GVN::processNonLocalLoad(LoadInst *LI,
           }
         }
       }
+      
+      // Check to see if we have something like this:
+      //    load i32* P
+      //    load i8* (P+1)
+      // if we have this, replace the later with an extraction from the former.
+      if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInfo.getInst())) {
+        // If this is a clobber and L is the first instruction in its block, then
+        // we have the first instruction in the entry block.
+        if (DepLI != LI && Address && TD) {
+          int Offset = AnalyzeLoadFromClobberingLoad(LI->getType(),
+                                                     LI->getPointerOperand(),
+                                                     DepLI, *TD);
+          
+          if (Offset != -1) {
+            ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB,DepLI,
+                                                                    Offset));
+            continue;
+          }
+        }
+      }
 
       // If the clobbering value is a memset/memcpy/memmove, see if we can
       // forward a value on from it.
@@ -1148,6 +1307,8 @@ bool GVN::processNonLocalLoad(LoadInst *LI,
       continue;
     }
 
+    assert(DepInfo.isDef() && "Expecting def here");
+
     Instruction *DepInst = DepInfo.getInst();
 
     // Loading the allocation -> undef.
@@ -1187,7 +1348,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI,
           continue;
         }          
       }
-      ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, LD));
+      ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB, LD));
       continue;
     }
     
@@ -1206,16 +1367,14 @@ bool GVN::processNonLocalLoad(LoadInst *LI,
     DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n');
     
     // Perform PHI construction.
-    Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, TD, *DT,
-                                      VN.getAliasAnalysis());
+    Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
     LI->replaceAllUsesWith(V);
 
     if (isa<PHINode>(V))
       V->takeName(LI);
     if (V->getType()->isPointerTy())
       MD->invalidateCachedPointerInfo(V);
-    VN.erase(LI);
-    toErase.push_back(LI);
+    markInstructionForDeletion(LI);
     ++NumGVNLoad;
     return true;
   }
@@ -1421,6 +1580,9 @@ bool GVN::processNonLocalLoad(LoadInst *LI,
     if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa))
       NewLoad->setMetadata(LLVMContext::MD_tbaa, Tag);
 
+    // Transfer DebugLoc.
+    NewLoad->setDebugLoc(LI->getDebugLoc());
+
     // Add the newly created load.
     ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred,
                                                         NewLoad));
@@ -1429,33 +1591,37 @@ bool GVN::processNonLocalLoad(LoadInst *LI,
   }
 
   // Perform PHI construction.
-  Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, TD, *DT,
-                                    VN.getAliasAnalysis());
+  Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
   LI->replaceAllUsesWith(V);
   if (isa<PHINode>(V))
     V->takeName(LI);
   if (V->getType()->isPointerTy())
     MD->invalidateCachedPointerInfo(V);
-  VN.erase(LI);
-  toErase.push_back(LI);
+  markInstructionForDeletion(LI);
   ++NumPRELoad;
   return true;
 }
 
 /// processLoad - Attempt to eliminate a load, first by eliminating it
 /// locally, and then attempting non-local elimination if that fails.
-bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
+bool GVN::processLoad(LoadInst *L) {
   if (!MD)
     return false;
 
   if (L->isVolatile())
     return false;
 
+  if (L->use_empty()) {
+    markInstructionForDeletion(L);
+    return true;
+  }
+  
   // ... to a pointer that has been loaded from before...
   MemDepResult Dep = MD->getDependency(L);
 
-  // If the value isn't available, don't do anything!
-  if (Dep.isClobber()) {
+  // If we have a clobber and target data is around, see if this is a clobber
+  // that we can fix up through code synthesis.
+  if (Dep.isClobber() && TD) {
     // Check to see if we have something like this:
     //   store i32 123, i32* %P
     //   %A = bitcast i32* %P to i8*
@@ -1467,26 +1633,40 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
     // completely covers this load.  This sort of thing can happen in bitfield
     // access code.
     Value *AvailVal = 0;
-    if (StoreInst *DepSI = dyn_cast<StoreInst>(Dep.getInst()))
-      if (TD) {
-        int Offset = AnalyzeLoadFromClobberingStore(L->getType(),
-                                                    L->getPointerOperand(),
-                                                    DepSI, *TD);
-        if (Offset != -1)
-          AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset,
-                                          L->getType(), L, *TD);
-      }
+    if (StoreInst *DepSI = dyn_cast<StoreInst>(Dep.getInst())) {
+      int Offset = AnalyzeLoadFromClobberingStore(L->getType(),
+                                                  L->getPointerOperand(),
+                                                  DepSI, *TD);
+      if (Offset != -1)
+        AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset,
+                                        L->getType(), L, *TD);
+    }
+    
+    // Check to see if we have something like this:
+    //    load i32* P
+    //    load i8* (P+1)
+    // if we have this, replace the later with an extraction from the former.
+    if (LoadInst *DepLI = dyn_cast<LoadInst>(Dep.getInst())) {
+      // If this is a clobber and L is the first instruction in its block, then
+      // we have the first instruction in the entry block.
+      if (DepLI == L)
+        return false;
+      
+      int Offset = AnalyzeLoadFromClobberingLoad(L->getType(),
+                                                 L->getPointerOperand(),
+                                                 DepLI, *TD);
+      if (Offset != -1)
+        AvailVal = GetLoadValueForLoad(DepLI, Offset, L->getType(), L, *this);
+    }
     
     // If the clobbering value is a memset/memcpy/memmove, see if we can forward
     // a value on from it.
     if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) {
-      if (TD) {
-        int Offset = AnalyzeLoadFromClobberingMemInst(L->getType(),
-                                                      L->getPointerOperand(),
-                                                      DepMI, *TD);
-        if (Offset != -1)
-          AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L,*TD);
-      }
+      int Offset = AnalyzeLoadFromClobberingMemInst(L->getType(),
+                                                    L->getPointerOperand(),
+                                                    DepMI, *TD);
+      if (Offset != -1)
+        AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, *TD);
     }
         
     if (AvailVal) {
@@ -1497,14 +1677,16 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
       L->replaceAllUsesWith(AvailVal);
       if (AvailVal->getType()->isPointerTy())
         MD->invalidateCachedPointerInfo(AvailVal);
-      VN.erase(L);
-      toErase.push_back(L);
+      markInstructionForDeletion(L);
       ++NumGVNLoad;
       return true;
     }
-        
+  }
+  
+  // If the value isn't available, don't do anything!
+  if (Dep.isClobber()) {
     DEBUG(
-      // fast print dep, using operator<< on instruction would be too slow
+      // fast print dep, using operator<< on instruction is too slow.
       dbgs() << "GVN: load ";
       WriteAsOperand(dbgs(), L);
       Instruction *I = Dep.getInst();
@@ -1513,9 +1695,21 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
     return false;
   }
 
+  if (Dep.isUnknown()) {
+    DEBUG(
+      // fast print dep, using operator<< on instruction is too slow.
+      dbgs() << "GVN: load ";
+      WriteAsOperand(dbgs(), L);
+      dbgs() << " has unknown dependence\n";
+    );
+    return false;
+  }
+
   // If it is defined in another block, try harder.
   if (Dep.isNonLocal())
-    return processNonLocalLoad(L, toErase);
+    return processNonLocalLoad(L);
+
+  assert(Dep.isDef() && "Expecting def here");
 
   Instruction *DepInst = Dep.getInst();
   if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) {
@@ -1542,8 +1736,7 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
     L->replaceAllUsesWith(StoredVal);
     if (StoredVal->getType()->isPointerTy())
       MD->invalidateCachedPointerInfo(StoredVal);
-    VN.erase(L);
-    toErase.push_back(L);
+    markInstructionForDeletion(L);
     ++NumGVNLoad;
     return true;
   }
@@ -1556,7 +1749,8 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
     // (depending on its type).
     if (DepLI->getType() != L->getType()) {
       if (TD) {
-        AvailableVal = CoerceAvailableValueToLoadType(DepLI, L->getType(), L,*TD);
+        AvailableVal = CoerceAvailableValueToLoadType(DepLI, L->getType(),
+                                                      L, *TD);
         if (AvailableVal == 0)
           return false;
       
@@ -1571,8 +1765,7 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
     L->replaceAllUsesWith(AvailableVal);
     if (DepLI->getType()->isPointerTy())
       MD->invalidateCachedPointerInfo(DepLI);
-    VN.erase(L);
-    toErase.push_back(L);
+    markInstructionForDeletion(L);
     ++NumGVNLoad;
     return true;
   }
@@ -1582,19 +1775,17 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
   // intervening stores, for example.
   if (isa<AllocaInst>(DepInst) || isMalloc(DepInst)) {
     L->replaceAllUsesWith(UndefValue::get(L->getType()));
-    VN.erase(L);
-    toErase.push_back(L);
+    markInstructionForDeletion(L);
     ++NumGVNLoad;
     return true;
   }
   
   // If this load occurs either right after a lifetime begin,
   // then the loaded value is undefined.
-  if (IntrinsicInst* II = dyn_cast<IntrinsicInst>(DepInst)) {
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(DepInst)) {
     if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
       L->replaceAllUsesWith(UndefValue::get(L->getType()));
-      VN.erase(L);
-      toErase.push_back(L);
+      markInstructionForDeletion(L);
       ++NumGVNLoad;
       return true;
     }
@@ -1634,8 +1825,7 @@ Value *GVN::findLeader(BasicBlock *BB, uint32_t num) {
 
 /// processInstruction - When calculating availability, handle an instruction
 /// by inserting it into the appropriate sets
-bool GVN::processInstruction(Instruction *I,
-                             SmallVectorImpl<Instruction*> &toErase) {
+bool GVN::processInstruction(Instruction *I) {
   // Ignore dbg info intrinsics.
   if (isa<DbgInfoIntrinsic>(I))
     return false;
@@ -1648,20 +1838,17 @@ bool GVN::processInstruction(Instruction *I,
     I->replaceAllUsesWith(V);
     if (MD && V->getType()->isPointerTy())
       MD->invalidateCachedPointerInfo(V);
-    VN.erase(I);
-    toErase.push_back(I);
+    markInstructionForDeletion(I);
     return true;
   }
 
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
-    bool Changed = processLoad(LI, toErase);
-
-    if (!Changed) {
-      unsigned Num = VN.lookup_or_add(LI);
-      addToLeaderTable(Num, LI, LI->getParent());
-    }
+    if (processLoad(LI))
+      return true;
 
-    return Changed;
+    unsigned Num = VN.lookup_or_add(LI);
+    addToLeaderTable(Num, LI, LI->getParent());
+    return false;
   }
 
   // For conditions branches, we can perform simple conditional propagation on
@@ -1720,11 +1907,10 @@ bool GVN::processInstruction(Instruction *I,
   }
   
   // Remove it!
-  VN.erase(I);
   I->replaceAllUsesWith(repl);
   if (MD && repl->getType()->isPointerTy())
     MD->invalidateCachedPointerInfo(repl);
-  toErase.push_back(I);
+  markInstructionForDeletion(I);
   return true;
 }
 
@@ -1781,35 +1967,36 @@ bool GVN::runOnFunction(Function& F) {
 
 
 bool GVN::processBlock(BasicBlock *BB) {
-  // FIXME: Kill off toErase by doing erasing eagerly in a helper function (and
-  // incrementing BI before processing an instruction).
-  SmallVector<Instruction*, 8> toErase;
+  // FIXME: Kill off InstrsToErase by doing erasing eagerly in a helper function
+  // (and incrementing BI before processing an instruction).
+  assert(InstrsToErase.empty() &&
+         "We expect InstrsToErase to be empty across iterations");
   bool ChangedFunction = false;
 
   for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
        BI != BE;) {
-    ChangedFunction |= processInstruction(BI, toErase);
-    if (toErase.empty()) {
+    ChangedFunction |= processInstruction(BI);
+    if (InstrsToErase.empty()) {
       ++BI;
       continue;
     }
 
     // If we need some instructions deleted, do it now.
-    NumGVNInstr += toErase.size();
+    NumGVNInstr += InstrsToErase.size();
 
     // Avoid iterator invalidation.
     bool AtStart = BI == BB->begin();
     if (!AtStart)
       --BI;
 
-    for (SmallVector<Instruction*, 4>::iterator I = toErase.begin(),
-         E = toErase.end(); I != E; ++I) {
+    for (SmallVector<Instruction*, 4>::iterator I = InstrsToErase.begin(),
+         E = InstrsToErase.end(); I != E; ++I) {
       DEBUG(dbgs() << "GVN removed: " << **I << '\n');
       if (MD) MD->removeInstruction(*I);
       (*I)->eraseFromParent();
       DEBUG(verifyRemoved(*I));
     }
-    toErase.clear();
+    InstrsToErase.clear();
 
     if (AtStart)
       BI = BB->begin();
@@ -1936,6 +2123,7 @@ bool GVN::performPRE(Function &F) {
 
       PREInstr->insertBefore(PREPred->getTerminator());
       PREInstr->setName(CurInst->getName() + ".pre");
+      PREInstr->setDebugLoc(CurInst->getDebugLoc());
       predMap[PREPred] = PREInstr;
       VN.add(PREInstr, ValNo);
       ++NumGVNPRE;
@@ -1955,7 +2143,7 @@ bool GVN::performPRE(Function &F) {
 
       VN.add(Phi, ValNo);
       addToLeaderTable(ValNo, Phi, CurrentBlock);
-
+      Phi->setDebugLoc(CurInst->getDebugLoc());
       CurInst->replaceAllUsesWith(Phi);
       if (Phi->getType()->isPointerTy()) {
         // Because we have added a PHI-use of the pointer value, it has now
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index eebcc69..04ee7c8 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -52,20 +52,30 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Support/CFG.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Target/TargetData.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/STLExtras.h"
 using namespace llvm;
 
 STATISTIC(NumRemoved , "Number of aux indvars removed");
+STATISTIC(NumWidened , "Number of indvars widened");
 STATISTIC(NumInserted, "Number of canonical indvars added");
 STATISTIC(NumReplaced, "Number of exit values replaced");
 STATISTIC(NumLFTR    , "Number of loop exit tests replaced");
+STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated");
+STATISTIC(NumElimRem , "Number of IV remainder operations eliminated");
+STATISTIC(NumElimCmp , "Number of IV comparisons eliminated");
+
+// DisableIVRewrite mode currently affects IVUsers, so is defined in libAnalysis
+// and referenced here.
+namespace llvm {
+  extern bool DisableIVRewrite;
+}
 
 namespace {
   class IndVarSimplify : public LoopPass {
@@ -73,12 +83,13 @@ namespace {
     LoopInfo        *LI;
     ScalarEvolution *SE;
     DominatorTree   *DT;
+    TargetData      *TD;
     SmallVector<WeakVH, 16> DeadInsts;
     bool Changed;
   public:
 
     static char ID; // Pass identification, replacement for typeid
-    IndVarSimplify() : LoopPass(ID) {
+    IndVarSimplify() : LoopPass(ID), IU(0), LI(0), SE(0), DT(0), TD(0) {
       initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry());
     }
 
@@ -101,15 +112,18 @@ namespace {
   private:
     bool isValidRewrite(Value *FromVal, Value *ToVal);
 
-    void EliminateIVComparisons();
-    void EliminateIVRemainders();
+    void SimplifyIVUsers(SCEVExpander &Rewriter);
+    void EliminateIVComparison(ICmpInst *ICmp, Value *IVOperand);
+    void EliminateIVRemainder(BinaryOperator *Rem,
+                              Value *IVOperand,
+                              bool IsSigned,
+                              PHINode *IVPhi);
     void RewriteNonIntegerIVs(Loop *L);
 
     ICmpInst *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount,
-                                   PHINode *IndVar,
-                                   BasicBlock *ExitingBlock,
-                                   BranchInst *BI,
-                                   SCEVExpander &Rewriter);
+                                        PHINode *IndVar,
+                                        SCEVExpander &Rewriter);
+
     void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter);
 
     void RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter);
@@ -122,7 +136,7 @@ namespace {
 
 char IndVarSimplify::ID = 0;
 INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars",
-                "Canonicalize Induction Variables", false, false)
+                "Induction Variable Simplification", false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTree)
 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
@@ -130,7 +144,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_DEPENDENCY(IVUsers)
 INITIALIZE_PASS_END(IndVarSimplify, "indvars",
-                "Canonicalize Induction Variables", false, false)
+                "Induction Variable Simplification", false, false)
 
 Pass *llvm::createIndVarSimplifyPass() {
   return new IndVarSimplify();
@@ -183,17 +197,23 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) {
   return true;
 }
 
-/// LinearFunctionTestReplace - This method rewrites the exit condition of the
-/// loop to be a canonical != comparison against the incremented loop induction
-/// variable.  This pass is able to rewrite the exit tests of any loop where the
-/// SCEV analysis can determine a loop-invariant trip count of the loop, which
-/// is actually a much broader range than just linear tests.
-ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L,
-                                   const SCEV *BackedgeTakenCount,
-                                   PHINode *IndVar,
-                                   BasicBlock *ExitingBlock,
-                                   BranchInst *BI,
-                                   SCEVExpander &Rewriter) {
+/// canExpandBackedgeTakenCount - Return true if this loop's backedge taken
+/// count expression can be safely and cheaply expanded into an instruction
+/// sequence that can be used by LinearFunctionTestReplace.
+static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE) {
+  const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount) ||
+      BackedgeTakenCount->isZero())
+    return false;
+
+  if (!L->getExitingBlock())
+    return false;
+
+  // Can't rewrite non-branch yet.
+  BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator());
+  if (!BI)
+    return false;
+
   // Special case: If the backedge-taken count is a UDiv, it's very likely a
   // UDiv that ScalarEvolution produced in order to compute a precise
   // expression, rather than a UDiv from the user's code. If we can't find a
@@ -201,23 +221,68 @@ ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L,
   // rewriting the loop.
   if (isa<SCEVUDivExpr>(BackedgeTakenCount)) {
     ICmpInst *OrigCond = dyn_cast<ICmpInst>(BI->getCondition());
-    if (!OrigCond) return 0;
+    if (!OrigCond) return false;
     const SCEV *R = SE->getSCEV(OrigCond->getOperand(1));
     R = SE->getMinusSCEV(R, SE->getConstant(R->getType(), 1));
     if (R != BackedgeTakenCount) {
       const SCEV *L = SE->getSCEV(OrigCond->getOperand(0));
       L = SE->getMinusSCEV(L, SE->getConstant(L->getType(), 1));
       if (L != BackedgeTakenCount)
-        return 0;
+        return false;
     }
   }
+  return true;
+}
+
+/// getBackedgeIVType - Get the widest type used by the loop test after peeking
+/// through Truncs.
+///
+/// TODO: Unnecessary once LinearFunctionTestReplace is removed.
+static const Type *getBackedgeIVType(Loop *L) {
+  if (!L->getExitingBlock())
+    return 0;
+
+  // Can't rewrite non-branch yet.
+  BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator());
+  if (!BI)
+    return 0;
+
+  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
+  if (!Cond)
+    return 0;
+
+  const Type *Ty = 0;
+  for(User::op_iterator OI = Cond->op_begin(), OE = Cond->op_end();
+      OI != OE; ++OI) {
+    assert((!Ty || Ty == (*OI)->getType()) && "bad icmp operand types");
+    TruncInst *Trunc = dyn_cast<TruncInst>(*OI);
+    if (!Trunc)
+      continue;
+
+    return Trunc->getSrcTy();
+  }
+  return Ty;
+}
+
+/// LinearFunctionTestReplace - This method rewrites the exit condition of the
+/// loop to be a canonical != comparison against the incremented loop induction
+/// variable.  This pass is able to rewrite the exit tests of any loop where the
+/// SCEV analysis can determine a loop-invariant trip count of the loop, which
+/// is actually a much broader range than just linear tests.
+ICmpInst *IndVarSimplify::
+LinearFunctionTestReplace(Loop *L,
+                          const SCEV *BackedgeTakenCount,
+                          PHINode *IndVar,
+                          SCEVExpander &Rewriter) {
+  assert(canExpandBackedgeTakenCount(L, SE) && "precondition");
+  BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator());
 
   // If the exiting block is not the same as the backedge block, we must compare
   // against the preincremented value, otherwise we prefer to compare against
   // the post-incremented value.
   Value *CmpIndVar;
   const SCEV *RHS = BackedgeTakenCount;
-  if (ExitingBlock == L->getLoopLatch()) {
+  if (L->getExitingBlock() == L->getLoopLatch()) {
     // Add one to the "backedge-taken" count to get the trip count.
     // If this addition may overflow, we have to be more pessimistic and
     // cast the induction variable before doing the add.
@@ -240,7 +305,7 @@ ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L,
     // The BackedgeTaken expression contains the number of times that the
     // backedge branches to the loop header.  This is one less than the
     // number of times the loop executes, so use the incremented indvar.
-    CmpIndVar = IndVar->getIncomingValueForBlock(ExitingBlock);
+    CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock());
   } else {
     // We have to use the preincremented value...
     RHS = SE->getTruncateOrZeroExtend(BackedgeTakenCount,
@@ -275,7 +340,7 @@ ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L,
   // update the branch to use the new comparison; in the common case this
   // will make old comparison dead.
   BI->setCondition(Cond);
-  RecursivelyDeleteTriviallyDeadInstructions(OrigCond);
+  DeadInsts.push_back(OrigCond);
 
   ++NumLFTR;
   Changed = true;
@@ -418,96 +483,519 @@ void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) {
     SE->forgetLoop(L);
 }
 
-void IndVarSimplify::EliminateIVComparisons() {
-  // Look for ICmp users.
-  for (IVUsers::iterator I = IU->begin(), E = IU->end(); I != E; ++I) {
-    IVStrideUse &UI = *I;
-    ICmpInst *ICmp = dyn_cast<ICmpInst>(UI.getUser());
-    if (!ICmp) continue;
-
-    bool Swapped = UI.getOperandValToReplace() == ICmp->getOperand(1);
-    ICmpInst::Predicate Pred = ICmp->getPredicate();
-    if (Swapped) Pred = ICmpInst::getSwappedPredicate(Pred);
-
-    // Get the SCEVs for the ICmp operands.
-    const SCEV *S = IU->getReplacementExpr(UI);
-    const SCEV *X = SE->getSCEV(ICmp->getOperand(!Swapped));
-
-    // Simplify unnecessary loops away.
-    const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent());
-    S = SE->getSCEVAtScope(S, ICmpLoop);
-    X = SE->getSCEVAtScope(X, ICmpLoop);
-
-    // If the condition is always true or always false, replace it with
-    // a constant value.
-    if (SE->isKnownPredicate(Pred, S, X))
-      ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext()));
-    else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X))
-      ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext()));
-    else
-      continue;
+namespace {
+  // Collect information about induction variables that are used by sign/zero
+  // extend operations. This information is recorded by CollectExtend and
+  // provides the input to WidenIV.
+  struct WideIVInfo {
+    const Type *WidestNativeType; // Widest integer type created [sz]ext
+    bool IsSigned;                // Was an sext user seen before a zext?
+
+    WideIVInfo() : WidestNativeType(0), IsSigned(false) {}
+  };
+  typedef std::map<PHINode *, WideIVInfo> WideIVMap;
+}
+
+/// CollectExtend - Update information about the induction variable that is
+/// extended by this sign or zero extend operation. This is used to determine
+/// the final width of the IV before actually widening it.
+static void CollectExtend(CastInst *Cast, PHINode *Phi, bool IsSigned,
+                          WideIVMap &IVMap, ScalarEvolution *SE,
+                          const TargetData *TD) {
+  const Type *Ty = Cast->getType();
+  uint64_t Width = SE->getTypeSizeInBits(Ty);
+  if (TD && !TD->isLegalInteger(Width))
+    return;
 
-    DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
-    DeadInsts.push_back(ICmp);
+  WideIVInfo &IVInfo = IVMap[Phi];
+  if (!IVInfo.WidestNativeType) {
+    IVInfo.WidestNativeType = SE->getEffectiveSCEVType(Ty);
+    IVInfo.IsSigned = IsSigned;
+    return;
   }
+
+  // We extend the IV to satisfy the sign of its first user, arbitrarily.
+  if (IVInfo.IsSigned != IsSigned)
+    return;
+
+  if (Width > SE->getTypeSizeInBits(IVInfo.WidestNativeType))
+    IVInfo.WidestNativeType = SE->getEffectiveSCEVType(Ty);
 }
 
-void IndVarSimplify::EliminateIVRemainders() {
-  // Look for SRem and URem users.
-  for (IVUsers::iterator I = IU->begin(), E = IU->end(); I != E; ++I) {
-    IVStrideUse &UI = *I;
-    BinaryOperator *Rem = dyn_cast<BinaryOperator>(UI.getUser());
-    if (!Rem) continue;
+namespace {
+/// WidenIV - The goal of this transform is to remove sign and zero extends
+/// without creating any new induction variables. To do this, it creates a new
+/// phi of the wider type and redirects all users, either removing extends or
+/// inserting truncs whenever we stop propagating the type.
+///
+class WidenIV {
+  PHINode *OrigPhi;
+  const Type *WideType;
+  bool IsSigned;
+
+  IVUsers *IU;
+  LoopInfo *LI;
+  Loop *L;
+  ScalarEvolution *SE;
+  DominatorTree *DT;
+  SmallVectorImpl<WeakVH> &DeadInsts;
+
+  PHINode *WidePhi;
+  Instruction *WideInc;
+  const SCEV *WideIncExpr;
+
+  SmallPtrSet<Instruction*,16> Processed;
+
+public:
+  WidenIV(PHINode *PN, const WideIVInfo &IVInfo, IVUsers *IUsers,
+          LoopInfo *LInfo, ScalarEvolution *SEv, DominatorTree *DTree,
+          SmallVectorImpl<WeakVH> &DI) :
+    OrigPhi(PN),
+    WideType(IVInfo.WidestNativeType),
+    IsSigned(IVInfo.IsSigned),
+    IU(IUsers),
+    LI(LInfo),
+    L(LI->getLoopFor(OrigPhi->getParent())),
+    SE(SEv),
+    DT(DTree),
+    DeadInsts(DI),
+    WidePhi(0),
+    WideInc(0),
+    WideIncExpr(0) {
+    assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV");
+  }
 
-    bool isSigned = Rem->getOpcode() == Instruction::SRem;
-    if (!isSigned && Rem->getOpcode() != Instruction::URem)
-      continue;
+  bool CreateWideIV(SCEVExpander &Rewriter);
 
-    // We're only interested in the case where we know something about
-    // the numerator.
-    if (UI.getOperandValToReplace() != Rem->getOperand(0))
-      continue;
+protected:
+  Instruction *CloneIVUser(Instruction *NarrowUse,
+                           Instruction *NarrowDef,
+                           Instruction *WideDef);
 
-    // Get the SCEVs for the ICmp operands.
-    const SCEV *S = SE->getSCEV(Rem->getOperand(0));
-    const SCEV *X = SE->getSCEV(Rem->getOperand(1));
-
-    // Simplify unnecessary loops away.
-    const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent());
-    S = SE->getSCEVAtScope(S, ICmpLoop);
-    X = SE->getSCEVAtScope(X, ICmpLoop);
-
-    // i % n  -->  i  if i is in [0,n).
-    if ((!isSigned || SE->isKnownNonNegative(S)) &&
-        SE->isKnownPredicate(isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
-                             S, X))
-      Rem->replaceAllUsesWith(Rem->getOperand(0));
-    else {
-      // (i+1) % n  -->  (i+1)==n?0:(i+1)  if i is in [0,n).
-      const SCEV *LessOne =
-        SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1));
-      if ((!isSigned || SE->isKnownNonNegative(LessOne)) &&
-          SE->isKnownPredicate(isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
-                               LessOne, X)) {
-        ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ,
-                                      Rem->getOperand(0), Rem->getOperand(1),
-                                      "tmp");
-        SelectInst *Sel =
-          SelectInst::Create(ICmp,
-                             ConstantInt::get(Rem->getType(), 0),
-                             Rem->getOperand(0), "tmp", Rem);
-        Rem->replaceAllUsesWith(Sel);
-      } else
+  const SCEVAddRecExpr *GetWideRecurrence(Instruction *NarrowUse);
+
+  Instruction *WidenIVUse(Instruction *NarrowUse,
+                          Instruction *NarrowDef,
+                          Instruction *WideDef);
+};
+} // anonymous namespace
+
+/// SimplifyIVUsers - Iteratively perform simplification on IVUsers within this
+/// loop. IVUsers is treated as a worklist. Each successive simplification may
+/// push more users which may themselves be candidates for simplification.
+///
+void IndVarSimplify::SimplifyIVUsers(SCEVExpander &Rewriter) {
+  WideIVMap IVMap;
+
+  // Each round of simplification involves a round of eliminating operations
+  // followed by a round of widening IVs. A single IVUsers worklist is used
+  // across all rounds. The inner loop advances the user. If widening exposes
+  // more uses, then another pass through the outer loop is triggered.
+  for (IVUsers::iterator I = IU->begin(), E = IU->end(); I != E;) {
+    for(; I != E; ++I) {
+      Instruction *UseInst = I->getUser();
+      Value *IVOperand = I->getOperandValToReplace();
+
+      if (DisableIVRewrite) {
+        if (CastInst *Cast = dyn_cast<CastInst>(UseInst)) {
+          bool IsSigned = Cast->getOpcode() == Instruction::SExt;
+          if (IsSigned || Cast->getOpcode() == Instruction::ZExt) {
+            CollectExtend(Cast, I->getPhi(), IsSigned, IVMap, SE, TD);
+            continue;
+          }
+        }
+      }
+      if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
+        EliminateIVComparison(ICmp, IVOperand);
         continue;
+      }
+      if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) {
+        bool IsSigned = Rem->getOpcode() == Instruction::SRem;
+        if (IsSigned || Rem->getOpcode() == Instruction::URem) {
+          EliminateIVRemainder(Rem, IVOperand, IsSigned, I->getPhi());
+          continue;
+        }
+      }
+    }
+    for (WideIVMap::const_iterator I = IVMap.begin(), E = IVMap.end();
+         I != E; ++I) {
+      WidenIV Widener(I->first, I->second, IU, LI, SE, DT, DeadInsts);
+      if (Widener.CreateWideIV(Rewriter))
+        Changed = true;
     }
+  }
+}
 
-    // Inform IVUsers about the new users.
-    if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0)))
-      IU->AddUsersIfInteresting(I);
+static Value *getExtend( Value *NarrowOper, const Type *WideType,
+                               bool IsSigned, IRBuilder<> &Builder) {
+  return IsSigned ? Builder.CreateSExt(NarrowOper, WideType) :
+                    Builder.CreateZExt(NarrowOper, WideType);
+}
 
-    DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
-    DeadInsts.push_back(Rem);
+/// CloneIVUser - Instantiate a wide operation to replace a narrow
+/// operation. This only needs to handle operations that can evaluation to
+/// SCEVAddRec. It can safely return 0 for any operation we decide not to clone.
+Instruction *WidenIV::CloneIVUser(Instruction *NarrowUse,
+                                  Instruction *NarrowDef,
+                                  Instruction *WideDef) {
+  unsigned Opcode = NarrowUse->getOpcode();
+  switch (Opcode) {
+  default:
+    return 0;
+  case Instruction::Add:
+  case Instruction::Mul:
+  case Instruction::UDiv:
+  case Instruction::Sub:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    DEBUG(dbgs() << "Cloning IVUser: " << *NarrowUse << "\n");
+
+    IRBuilder<> Builder(NarrowUse);
+
+    // Replace NarrowDef operands with WideDef. Otherwise, we don't know
+    // anything about the narrow operand yet so must insert a [sz]ext. It is
+    // probably loop invariant and will be folded or hoisted. If it actually
+    // comes from a widened IV, it should be removed during a future call to
+    // WidenIVUse.
+    Value *LHS = (NarrowUse->getOperand(0) == NarrowDef) ? WideDef :
+      getExtend(NarrowUse->getOperand(0), WideType, IsSigned, Builder);
+    Value *RHS = (NarrowUse->getOperand(1) == NarrowDef) ? WideDef :
+      getExtend(NarrowUse->getOperand(1), WideType, IsSigned, Builder);
+
+    BinaryOperator *NarrowBO = cast<BinaryOperator>(NarrowUse);
+    BinaryOperator *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(),
+                                                    LHS, RHS,
+                                                    NarrowBO->getName());
+    Builder.Insert(WideBO);
+    if (NarrowBO->hasNoUnsignedWrap()) WideBO->setHasNoUnsignedWrap();
+    if (NarrowBO->hasNoSignedWrap()) WideBO->setHasNoSignedWrap();
+
+    return WideBO;
   }
+  llvm_unreachable(0);
+}
+
+// GetWideRecurrence - Is this instruction potentially interesting from IVUsers'
+// perspective after widening it's type? In other words, can the extend be
+// safely hoisted out of the loop with SCEV reducing the value to a recurrence
+// on the same loop. If so, return the sign or zero extended
+// recurrence. Otherwise return NULL.
+const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) {
+  if (!SE->isSCEVable(NarrowUse->getType()))
+    return 0;
+
+  const SCEV *NarrowExpr = SE->getSCEV(NarrowUse);
+  const SCEV *WideExpr = IsSigned ?
+    SE->getSignExtendExpr(NarrowExpr, WideType) :
+    SE->getZeroExtendExpr(NarrowExpr, WideType);
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(WideExpr);
+  if (!AddRec || AddRec->getLoop() != L)
+    return 0;
+
+  return AddRec;
+}
+
+/// HoistStep - Attempt to hoist an IV increment above a potential use.
+///
+/// To successfully hoist, two criteria must be met:
+/// - IncV operands dominate InsertPos and
+/// - InsertPos dominates IncV
+///
+/// Meeting the second condition means that we don't need to check all of IncV's
+/// existing uses (it's moving up in the domtree).
+///
+/// This does not yet recursively hoist the operands, although that would
+/// not be difficult.
+static bool HoistStep(Instruction *IncV, Instruction *InsertPos,
+                      const DominatorTree *DT)
+{
+  if (DT->dominates(IncV, InsertPos))
+    return true;
+
+  if (!DT->dominates(InsertPos->getParent(), IncV->getParent()))
+    return false;
+
+  if (IncV->mayHaveSideEffects())
+    return false;
+
+  // Attempt to hoist IncV
+  for (User::op_iterator OI = IncV->op_begin(), OE = IncV->op_end();
+       OI != OE; ++OI) {
+    Instruction *OInst = dyn_cast<Instruction>(OI);
+    if (OInst && !DT->dominates(OInst, InsertPos))
+      return false;
+  }
+  IncV->moveBefore(InsertPos);
+  return true;
+}
+
+/// WidenIVUse - Determine whether an individual user of the narrow IV can be
+/// widened. If so, return the wide clone of the user.
+Instruction *WidenIV::WidenIVUse(Instruction *NarrowUse,
+                                 Instruction *NarrowDef,
+                                 Instruction *WideDef) {
+  // To be consistent with IVUsers, stop traversing the def-use chain at
+  // inner-loop phis or post-loop phis.
+  if (isa<PHINode>(NarrowUse) && LI->getLoopFor(NarrowUse->getParent()) != L)
+    return 0;
+
+  // Handle data flow merges and bizarre phi cycles.
+  if (!Processed.insert(NarrowUse))
+    return 0;
+
+  // Our raison d'etre! Eliminate sign and zero extension.
+  if (IsSigned ? isa<SExtInst>(NarrowUse) : isa<ZExtInst>(NarrowUse)) {
+    Value *NewDef = WideDef;
+    if (NarrowUse->getType() != WideType) {
+      unsigned CastWidth = SE->getTypeSizeInBits(NarrowUse->getType());
+      unsigned IVWidth = SE->getTypeSizeInBits(WideType);
+      if (CastWidth < IVWidth) {
+        // The cast isn't as wide as the IV, so insert a Trunc.
+        IRBuilder<> Builder(NarrowUse);
+        NewDef = Builder.CreateTrunc(WideDef, NarrowUse->getType());
+      }
+      else {
+        // A wider extend was hidden behind a narrower one. This may induce
+        // another round of IV widening in which the intermediate IV becomes
+        // dead. It should be very rare.
+        DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi
+              << " not wide enough to subsume " << *NarrowUse << "\n");
+        NarrowUse->replaceUsesOfWith(NarrowDef, WideDef);
+        NewDef = NarrowUse;
+      }
+    }
+    if (NewDef != NarrowUse) {
+      DEBUG(dbgs() << "INDVARS: eliminating " << *NarrowUse
+            << " replaced by " << *WideDef << "\n");
+      ++NumElimExt;
+      NarrowUse->replaceAllUsesWith(NewDef);
+      DeadInsts.push_back(NarrowUse);
+    }
+    // Now that the extend is gone, expose it's uses to IVUsers for potential
+    // further simplification within SimplifyIVUsers.
+    IU->AddUsersIfInteresting(WideDef, WidePhi);
+
+    // No further widening is needed. The deceased [sz]ext had done it for us.
+    return 0;
+  }
+  const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(NarrowUse);
+  if (!WideAddRec) {
+    // This user does not evaluate to a recurence after widening, so don't
+    // follow it. Instead insert a Trunc to kill off the original use,
+    // eventually isolating the original narrow IV so it can be removed.
+    IRBuilder<> Builder(NarrowUse);
+    Value *Trunc = Builder.CreateTrunc(WideDef, NarrowDef->getType());
+    NarrowUse->replaceUsesOfWith(NarrowDef, Trunc);
+    return 0;
+  }
+  // Reuse the IV increment that SCEVExpander created as long as it dominates
+  // NarrowUse.
+  Instruction *WideUse = 0;
+  if (WideAddRec == WideIncExpr && HoistStep(WideInc, NarrowUse, DT)) {
+    WideUse = WideInc;
+  }
+  else {
+    WideUse = CloneIVUser(NarrowUse, NarrowDef, WideDef);
+    if (!WideUse)
+      return 0;
+  }
+  // GetWideRecurrence ensured that the narrow expression could be extended
+  // outside the loop without overflow. This suggests that the wide use
+  // evaluates to the same expression as the extended narrow use, but doesn't
+  // absolutely guarantee it. Hence the following failsafe check. In rare cases
+  // where it fails, we simple throw away the newly created wide use.
+  if (WideAddRec != SE->getSCEV(WideUse)) {
+    DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse
+          << ": " << *SE->getSCEV(WideUse) << " != " << *WideAddRec << "\n");
+    DeadInsts.push_back(WideUse);
+    return 0;
+  }
+
+  // Returning WideUse pushes it on the worklist.
+  return WideUse;
+}
+
+/// CreateWideIV - Process a single induction variable. First use the
+/// SCEVExpander to create a wide induction variable that evaluates to the same
+/// recurrence as the original narrow IV. Then use a worklist to forward
+/// traverse the narrow IV's def-use chain. After WidenIVUse as processed all
+/// interesting IV users, the narrow IV will be isolated for removal by
+/// DeleteDeadPHIs.
+///
+/// It would be simpler to delete uses as they are processed, but we must avoid
+/// invalidating SCEV expressions.
+///
+bool WidenIV::CreateWideIV(SCEVExpander &Rewriter) {
+  // Is this phi an induction variable?
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(OrigPhi));
+  if (!AddRec)
+    return false;
+
+  // Widen the induction variable expression.
+  const SCEV *WideIVExpr = IsSigned ?
+    SE->getSignExtendExpr(AddRec, WideType) :
+    SE->getZeroExtendExpr(AddRec, WideType);
+
+  assert(SE->getEffectiveSCEVType(WideIVExpr->getType()) == WideType &&
+         "Expect the new IV expression to preserve its type");
+
+  // Can the IV be extended outside the loop without overflow?
+  AddRec = dyn_cast<SCEVAddRecExpr>(WideIVExpr);
+  if (!AddRec || AddRec->getLoop() != L)
+    return false;
+
+  // An AddRec must have loop-invariant operands. Since this AddRec it
+  // materialized by a loop header phi, the expression cannot have any post-loop
+  // operands, so they must dominate the loop header.
+  assert(SE->properlyDominates(AddRec->getStart(), L->getHeader()) &&
+         SE->properlyDominates(AddRec->getStepRecurrence(*SE), L->getHeader())
+         && "Loop header phi recurrence inputs do not dominate the loop");
+
+  // The rewriter provides a value for the desired IV expression. This may
+  // either find an existing phi or materialize a new one. Either way, we
+  // expect a well-formed cyclic phi-with-increments. i.e. any operand not part
+  // of the phi-SCC dominates the loop entry.
+  Instruction *InsertPt = L->getHeader()->begin();
+  WidePhi = cast<PHINode>(Rewriter.expandCodeFor(AddRec, WideType, InsertPt));
+
+  // Remembering the WideIV increment generated by SCEVExpander allows
+  // WidenIVUse to reuse it when widening the narrow IV's increment. We don't
+  // employ a general reuse mechanism because the call above is the only call to
+  // SCEVExpander. Henceforth, we produce 1-to-1 narrow to wide uses.
+  if (BasicBlock *LatchBlock = L->getLoopLatch()) {
+    WideInc =
+      cast<Instruction>(WidePhi->getIncomingValueForBlock(LatchBlock));
+    WideIncExpr = SE->getSCEV(WideInc);
+  }
+
+  DEBUG(dbgs() << "Wide IV: " << *WidePhi << "\n");
+  ++NumWidened;
+
+  // Traverse the def-use chain using a worklist starting at the original IV.
+  assert(Processed.empty() && "expect initial state" );
+
+  // Each worklist entry has a Narrow def-use link and Wide def.
+  SmallVector<std::pair<Use *, Instruction *>, 8> NarrowIVUsers;
+  for (Value::use_iterator UI = OrigPhi->use_begin(),
+         UE = OrigPhi->use_end(); UI != UE; ++UI) {
+    NarrowIVUsers.push_back(std::make_pair(&UI.getUse(), WidePhi));
+  }
+  while (!NarrowIVUsers.empty()) {
+    Use *NarrowDefUse;
+    Instruction *WideDef;
+    tie(NarrowDefUse, WideDef) = NarrowIVUsers.pop_back_val();
+
+    // Process a def-use edge. This may replace the use, so don't hold a
+    // use_iterator across it.
+    Instruction *NarrowDef = cast<Instruction>(NarrowDefUse->get());
+    Instruction *NarrowUse = cast<Instruction>(NarrowDefUse->getUser());
+    Instruction *WideUse = WidenIVUse(NarrowUse, NarrowDef, WideDef);
+
+    // Follow all def-use edges from the previous narrow use.
+    if (WideUse) {
+      for (Value::use_iterator UI = NarrowUse->use_begin(),
+             UE = NarrowUse->use_end(); UI != UE; ++UI) {
+        NarrowIVUsers.push_back(std::make_pair(&UI.getUse(), WideUse));
+      }
+    }
+    // WidenIVUse may have removed the def-use edge.
+    if (NarrowDef->use_empty())
+      DeadInsts.push_back(NarrowDef);
+  }
+  return true;
+}
+
+void IndVarSimplify::EliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
+  unsigned IVOperIdx = 0;
+  ICmpInst::Predicate Pred = ICmp->getPredicate();
+  if (IVOperand != ICmp->getOperand(0)) {
+    // Swapped
+    assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand");
+    IVOperIdx = 1;
+    Pred = ICmpInst::getSwappedPredicate(Pred);
+  }
+
+  // Get the SCEVs for the ICmp operands.
+  const SCEV *S = SE->getSCEV(ICmp->getOperand(IVOperIdx));
+  const SCEV *X = SE->getSCEV(ICmp->getOperand(1 - IVOperIdx));
+
+  // Simplify unnecessary loops away.
+  const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent());
+  S = SE->getSCEVAtScope(S, ICmpLoop);
+  X = SE->getSCEVAtScope(X, ICmpLoop);
+
+  // If the condition is always true or always false, replace it with
+  // a constant value.
+  if (SE->isKnownPredicate(Pred, S, X))
+    ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext()));
+  else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X))
+    ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext()));
+  else
+    return;
+
+  DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
+  ++NumElimCmp;
+  Changed = true;
+  DeadInsts.push_back(ICmp);
+}
+
+void IndVarSimplify::EliminateIVRemainder(BinaryOperator *Rem,
+                                          Value *IVOperand,
+                                          bool IsSigned,
+                                          PHINode *IVPhi) {
+  // We're only interested in the case where we know something about
+  // the numerator.
+  if (IVOperand != Rem->getOperand(0))
+    return;
+
+  // Get the SCEVs for the ICmp operands.
+  const SCEV *S = SE->getSCEV(Rem->getOperand(0));
+  const SCEV *X = SE->getSCEV(Rem->getOperand(1));
+
+  // Simplify unnecessary loops away.
+  const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent());
+  S = SE->getSCEVAtScope(S, ICmpLoop);
+  X = SE->getSCEVAtScope(X, ICmpLoop);
+
+  // i % n  -->  i  if i is in [0,n).
+  if ((!IsSigned || SE->isKnownNonNegative(S)) &&
+      SE->isKnownPredicate(IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
+                           S, X))
+    Rem->replaceAllUsesWith(Rem->getOperand(0));
+  else {
+    // (i+1) % n  -->  (i+1)==n?0:(i+1)  if i is in [0,n).
+    const SCEV *LessOne =
+      SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1));
+    if (IsSigned && !SE->isKnownNonNegative(LessOne))
+      return;
+
+    if (!SE->isKnownPredicate(IsSigned ?
+                              ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
+                              LessOne, X))
+      return;
+
+    ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ,
+                                  Rem->getOperand(0), Rem->getOperand(1),
+                                  "tmp");
+    SelectInst *Sel =
+      SelectInst::Create(ICmp,
+                         ConstantInt::get(Rem->getType(), 0),
+                         Rem->getOperand(0), "tmp", Rem);
+    Rem->replaceAllUsesWith(Sel);
+  }
+
+  // Inform IVUsers about the new users.
+  if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0)))
+    IU->AddUsersIfInteresting(I, IVPhi);
+
+  DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
+  ++NumElimRem;
+  Changed = true;
+  DeadInsts.push_back(Rem);
 }
 
 bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
@@ -526,6 +1014,8 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
   LI = &getAnalysis<LoopInfo>();
   SE = &getAnalysis<ScalarEvolution>();
   DT = &getAnalysis<DominatorTree>();
+  TD = getAnalysisIfAvailable<TargetData>();
+
   DeadInsts.clear();
   Changed = false;
 
@@ -533,11 +1023,12 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
   // transform them to use integer recurrences.
   RewriteNonIntegerIVs(L);
 
-  BasicBlock *ExitingBlock = L->getExitingBlock(); // may be null
   const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
 
   // Create a rewriter object which we'll use to transform the code with.
   SCEVExpander Rewriter(*SE);
+  if (DisableIVRewrite)
+    Rewriter.disableCanonicalMode();
 
   // Check to see if this loop has a computable loop-invariant execution count.
   // If so, this means that we can compute the final value of any expressions
@@ -548,33 +1039,42 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
   if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount))
     RewriteLoopExitValues(L, Rewriter);
 
-  // Simplify ICmp IV users.
-  EliminateIVComparisons();
-
-  // Simplify SRem and URem IV users.
-  EliminateIVRemainders();
+  // Eliminate redundant IV users.
+  SimplifyIVUsers(Rewriter);
 
   // Compute the type of the largest recurrence expression, and decide whether
   // a canonical induction variable should be inserted.
   const Type *LargestType = 0;
   bool NeedCannIV = false;
-  if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
-    LargestType = BackedgeTakenCount->getType();
-    LargestType = SE->getEffectiveSCEVType(LargestType);
+  bool ExpandBECount = canExpandBackedgeTakenCount(L, SE);
+  if (ExpandBECount) {
     // If we have a known trip count and a single exit block, we'll be
     // rewriting the loop exit test condition below, which requires a
     // canonical induction variable.
-    if (ExitingBlock)
-      NeedCannIV = true;
-  }
-  for (IVUsers::const_iterator I = IU->begin(), E = IU->end(); I != E; ++I) {
-    const Type *Ty =
-      SE->getEffectiveSCEVType(I->getOperandValToReplace()->getType());
+    NeedCannIV = true;
+    const Type *Ty = BackedgeTakenCount->getType();
+    if (DisableIVRewrite) {
+      // In this mode, SimplifyIVUsers may have already widened the IV used by
+      // the backedge test and inserted a Trunc on the compare's operand. Get
+      // the wider type to avoid creating a redundant narrow IV only used by the
+      // loop test.
+      LargestType = getBackedgeIVType(L);
+    }
     if (!LargestType ||
         SE->getTypeSizeInBits(Ty) >
+        SE->getTypeSizeInBits(LargestType))
+      LargestType = SE->getEffectiveSCEVType(Ty);
+  }
+  if (!DisableIVRewrite) {
+    for (IVUsers::const_iterator I = IU->begin(), E = IU->end(); I != E; ++I) {
+      NeedCannIV = true;
+      const Type *Ty =
+        SE->getEffectiveSCEVType(I->getOperandValToReplace()->getType());
+      if (!LargestType ||
+          SE->getTypeSizeInBits(Ty) >
           SE->getTypeSizeInBits(LargestType))
-      LargestType = Ty;
-    NeedCannIV = true;
+        LargestType = Ty;
+    }
   }
 
   // Now that we know the largest of the induction variable expressions
@@ -614,19 +1114,17 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
   // If we have a trip count expression, rewrite the loop's exit condition
   // using it.  We can currently only handle loops with a single exit.
   ICmpInst *NewICmp = 0;
-  if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
-      !BackedgeTakenCount->isZero() &&
-      ExitingBlock) {
+  if (ExpandBECount) {
+    assert(canExpandBackedgeTakenCount(L, SE) &&
+           "canonical IV disrupted BackedgeTaken expansion");
     assert(NeedCannIV &&
            "LinearFunctionTestReplace requires a canonical induction variable");
-    // Can't rewrite non-branch yet.
-    if (BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator()))
-      NewICmp = LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar,
-                                          ExitingBlock, BI, Rewriter);
+    NewICmp = LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar,
+                                        Rewriter);
   }
-
   // Rewrite IV-derived expressions.
-  RewriteIVExpressions(L, Rewriter);
+  if (!DisableIVRewrite)
+    RewriteIVExpressions(L, Rewriter);
 
   // Clear the rewriter cache, because values that are in the rewriter's cache
   // can be deleted in the loop below, causing the AssertingVH in the cache to
@@ -649,7 +1147,8 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
   // For completeness, inform IVUsers of the IV use in the newly-created
   // loop exit test instruction.
   if (NewICmp)
-    IU->AddUsersIfInteresting(cast<Instruction>(NewICmp->getOperand(0)));
+    IU->AddUsersIfInteresting(cast<Instruction>(NewICmp->getOperand(0)),
+                              IndVar);
 
   // Clean up dead instructions.
   Changed |= DeleteDeadPHIs(L->getHeader());
@@ -1080,5 +1579,5 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) {
   }
 
   // Add a new IVUsers entry for the newly-created integer PHI.
-  IU->AddUsersIfInteresting(NewPHI);
+  IU->AddUsersIfInteresting(NewPHI, NewPHI);
 }
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 8f90dfe..cf18ff0 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -16,6 +16,7 @@
 #include "llvm/IntrinsicInst.h"
 #include "llvm/LLVMContext.h"
 #include "llvm/Pass.h"
+#include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LazyValueInfo.h"
 #include "llvm/Analysis/Loads.h"
@@ -170,9 +171,9 @@ bool JumpThreading::runOnFunction(Function &F) {
         Changed = true;
         continue;
       }
-      
+
       BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
-      
+
       // Can't thread an unconditional jump, but if the block is "almost
       // empty", we can replace uses of it with uses of the successor and make
       // this dead.
@@ -608,7 +609,7 @@ static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) {
 
 static bool hasAddressTakenAndUsed(BasicBlock *BB) {
   if (!BB->hasAddressTaken()) return false;
-  
+
   // If the block has its address taken, it may be a tree of dead constants
   // hanging off of it.  These shouldn't keep the block alive.
   BlockAddress *BA = BlockAddress::get(BB);
@@ -668,6 +669,17 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
     return false; // Must be an invoke.
   }
 
+  // Run constant folding to see if we can reduce the condition to a simple
+  // constant.
+  if (Instruction *I = dyn_cast<Instruction>(Condition)) {
+    Value *SimpleVal = ConstantFoldInstruction(I, TD);
+    if (SimpleVal) {
+      I->replaceAllUsesWith(SimpleVal);
+      I->eraseFromParent();
+      Condition = SimpleVal;
+    }
+  }
+
   // If the terminator is branching on an undef, we can pick any of the
   // successors to branch to.  Let GetBestDestForJumpOnUndef decide.
   if (isa<UndefValue>(Condition)) {
@@ -694,7 +706,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
     DEBUG(dbgs() << "  In block '" << BB->getName()
           << "' folding terminator: " << *BB->getTerminator() << '\n');
     ++NumFolds;
-    ConstantFoldTerminator(BB);
+    ConstantFoldTerminator(BB, true);
     return true;
   }
 
@@ -917,9 +929,10 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
   if (UnavailablePred) {
     assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
            "Can't handle critical edge here!");
-    Value *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr", false,
+    LoadInst *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr", false,
                                  LI->getAlignment(),
                                  UnavailablePred->getTerminator());
+    NewVal->setDebugLoc(LI->getDebugLoc());
     AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal));
   }
 
@@ -932,6 +945,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
   PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "",
                                 LoadBB->begin());
   PN->takeName(LI);
+  PN->setDebugLoc(LI->getDebugLoc());
 
   // Insert new entries into the PHI for each predecessor.  A single block may
   // have multiple entries here.
@@ -1363,7 +1377,8 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
 
   // We didn't copy the terminator from BB over to NewBB, because there is now
   // an unconditional jump to SuccBB.  Insert the unconditional jump.
-  BranchInst::Create(SuccBB, NewBB);
+  BranchInst *NewBI =BranchInst::Create(SuccBB, NewBB);
+  NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc());
 
   // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the
   // PHI nodes for NewBB now.
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 93de9cf..13bd022 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -372,7 +372,11 @@ bool LICM::canSinkOrHoistInst(Instruction &I) {
     return !pointerInvalidatedByLoop(LI->getOperand(0), Size,
                                      LI->getMetadata(LLVMContext::MD_tbaa));
   } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
-    // Handle obvious cases efficiently.
+    // Don't sink or hoist dbg info; it's legal, but not useful.
+    if (isa<DbgInfoIntrinsic>(I))
+      return false;
+
+    // Handle simple cases by querying alias analysis.
     AliasAnalysis::ModRefBehavior Behavior = AA->getModRefBehavior(CI);
     if (Behavior == AliasAnalysis::DoesNotAccessMemory)
       return true;
@@ -445,8 +449,7 @@ void LICM::sink(Instruction &I) {
   // enough that we handle it as a special (more efficient) case.  It is more
   // efficient to handle because there are no PHI nodes that need to be placed.
   if (ExitBlocks.size() == 1) {
-    if (!isa<DbgInfoIntrinsic>(I) && 
-        !DT->dominates(I.getParent(), ExitBlocks[0])) {
+    if (!DT->dominates(I.getParent(), ExitBlocks[0])) {
       // Instruction is not used, just delete it.
       CurAST->deleteValue(&I);
       // If I has users in unreachable blocks, eliminate.
@@ -602,13 +605,15 @@ namespace {
     SmallPtrSet<Value*, 4> &PointerMustAliases;
     SmallVectorImpl<BasicBlock*> &LoopExitBlocks;
     AliasSetTracker &AST;
+    DebugLoc DL;
   public:
     LoopPromoter(Value *SP,
                  const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S,
                  SmallPtrSet<Value*, 4> &PMA,
-                 SmallVectorImpl<BasicBlock*> &LEB, AliasSetTracker &ast)
-      : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
-        LoopExitBlocks(LEB), AST(ast) {}
+                 SmallVectorImpl<BasicBlock*> &LEB, AliasSetTracker &ast,
+                 DebugLoc dl)
+      : LoadAndStorePromoter(Insts, S, 0, 0), SomePtr(SP),
+        PointerMustAliases(PMA), LoopExitBlocks(LEB), AST(ast), DL(dl) {}
     
     virtual bool isInstInList(Instruction *I,
                               const SmallVectorImpl<Instruction*> &) const {
@@ -629,7 +634,8 @@ namespace {
         BasicBlock *ExitBlock = LoopExitBlocks[i];
         Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
         Instruction *InsertPos = ExitBlock->getFirstNonPHI();
-        new StoreInst(LiveInValue, SomePtr, InsertPos);
+        StoreInst *NewSI = new StoreInst(LiveInValue, SomePtr, InsertPos);
+        NewSI->setDebugLoc(DL);
       }
     }
 
@@ -727,6 +733,12 @@ void LICM::PromoteAliasSet(AliasSet &AS) {
   Changed = true;
   ++NumPromoted;
 
+  // Grab a debug location for the inserted loads/stores; given that the
+  // inserted loads/stores have little relation to the original loads/stores,
+  // this code just arbitrarily picks a location from one, since any debug
+  // location is better than none.
+  DebugLoc DL = LoopUses[0]->getDebugLoc();
+
   SmallVector<BasicBlock*, 8> ExitBlocks;
   CurLoop->getUniqueExitBlocks(ExitBlocks);
   
@@ -734,13 +746,14 @@ void LICM::PromoteAliasSet(AliasSet &AS) {
   SmallVector<PHINode*, 16> NewPHIs;
   SSAUpdater SSA(&NewPHIs);
   LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
-                        *CurAST);
+                        *CurAST, DL);
   
   // Set up the preheader to have a definition of the value.  It is the live-out
   // value from the preheader that uses in the loop will use.
   LoadInst *PreheaderLoad =
     new LoadInst(SomePtr, SomePtr->getName()+".promoted",
                  Preheader->getTerminator());
+  PreheaderLoad->setDebugLoc(DL);
   SSA.AddAvailableValue(Preheader, PreheaderLoad);
 
   // Rewrite all the loads in the loop and remember all the definitions from
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 1366231..dbf6eec 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -128,11 +128,11 @@ INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
 
 Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); }
 
-/// DeleteDeadInstruction - Delete this instruction.  Before we do, go through
+/// deleteDeadInstruction - Delete this instruction.  Before we do, go through
 /// and zero out all the operands of this instruction.  If any of them become
 /// dead, delete them and the computation tree that feeds them.
 ///
-static void DeleteDeadInstruction(Instruction *I, ScalarEvolution &SE) {
+static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE) {
   SmallVector<Instruction*, 32> NowDeadInsts;
 
   NowDeadInsts.push_back(I);
@@ -162,6 +162,14 @@ static void DeleteDeadInstruction(Instruction *I, ScalarEvolution &SE) {
   } while (!NowDeadInsts.empty());
 }
 
+/// deleteIfDeadInstruction - If the specified value is a dead instruction,
+/// delete it and any recursively used instructions.
+static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE) {
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    if (isInstructionTriviallyDead(I))
+      deleteDeadInstruction(I, SE);    
+}
+
 bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
   CurLoop = L;
 
@@ -454,31 +462,35 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
     return false;
   }
 
-
-  // Okay, we have a strided store "p[i]" of a splattable value.  We can turn
-  // this into a memset in the loop preheader now if we want.  However, this
-  // would be unsafe to do if there is anything else in the loop that may read
-  // or write to the aliased location.  Check for an alias.
-  if (mayLoopAccessLocation(DestPtr, AliasAnalysis::ModRef,
-                            CurLoop, BECount,
-                            StoreSize, getAnalysis<AliasAnalysis>(), TheStore))
-    return false;
-
-  // Okay, everything looks good, insert the memset.
-  BasicBlock *Preheader = CurLoop->getLoopPreheader();
-
-  IRBuilder<> Builder(Preheader->getTerminator());
-
   // The trip count of the loop and the base pointer of the addrec SCEV is
   // guaranteed to be loop invariant, which means that it should dominate the
-  // header.  Just insert code for it in the preheader.
+  // header.  This allows us to insert code for it in the preheader.
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  IRBuilder<> Builder(Preheader->getTerminator());
   SCEVExpander Expander(*SE);
-
+  
+  // Okay, we have a strided store "p[i]" of a splattable value.  We can turn
+  // this into a memset in the loop preheader now if we want.  However, this
+  // would be unsafe to do if there is anything else in the loop that may read
+  // or write to the aliased location.  Check for any overlap by generating the
+  // base pointer and checking the region.
   unsigned AddrSpace = cast<PointerType>(DestPtr->getType())->getAddressSpace();
   Value *BasePtr =
     Expander.expandCodeFor(Ev->getStart(), Builder.getInt8PtrTy(AddrSpace),
                            Preheader->getTerminator());
 
+
+  if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef,
+                            CurLoop, BECount,
+                            StoreSize, getAnalysis<AliasAnalysis>(), TheStore)){
+    Expander.clear();
+    // If we generated new code for the base pointer, clean up.
+    deleteIfDeadInstruction(BasePtr, *SE);
+    return false;
+  }
+  
+  // Okay, everything looks good, insert the memset.
+
   // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
   // pointer size if it isn't already.
   const Type *IntPtr = TD->getIntPtrType(DestPtr->getContext());
@@ -521,7 +533,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
 
   // Okay, the memset has been formed.  Zap the original store and anything that
   // feeds into it.
-  DeleteDeadInstruction(TheStore, *SE);
+  deleteDeadInstruction(TheStore, *SE);
   ++NumMemSet;
   return true;
 }
@@ -539,41 +551,51 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
 
   LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
 
+  // The trip count of the loop and the base pointer of the addrec SCEV is
+  // guaranteed to be loop invariant, which means that it should dominate the
+  // header.  This allows us to insert code for it in the preheader.
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  IRBuilder<> Builder(Preheader->getTerminator());
+  SCEVExpander Expander(*SE);
+  
   // Okay, we have a strided store "p[i]" of a loaded value.  We can turn
   // this into a memcpy in the loop preheader now if we want.  However, this
   // would be unsafe to do if there is anything else in the loop that may read
-  // or write to the stored location (including the load feeding the stores).
-  // Check for an alias.
-  if (mayLoopAccessLocation(SI->getPointerOperand(), AliasAnalysis::ModRef,
+  // or write the memory region we're storing to.  This includes the load that
+  // feeds the stores.  Check for an alias by generating the base address and
+  // checking everything.
+  Value *StoreBasePtr =
+    Expander.expandCodeFor(StoreEv->getStart(),
+                           Builder.getInt8PtrTy(SI->getPointerAddressSpace()),
+                           Preheader->getTerminator());
+  
+  if (mayLoopAccessLocation(StoreBasePtr, AliasAnalysis::ModRef,
                             CurLoop, BECount, StoreSize,
-                            getAnalysis<AliasAnalysis>(), SI))
+                            getAnalysis<AliasAnalysis>(), SI)) {
+    Expander.clear();
+    // If we generated new code for the base pointer, clean up.
+    deleteIfDeadInstruction(StoreBasePtr, *SE);
     return false;
+  }
 
   // For a memcpy, we have to make sure that the input array is not being
   // mutated by the loop.
-  if (mayLoopAccessLocation(LI->getPointerOperand(), AliasAnalysis::Mod,
-                            CurLoop, BECount, StoreSize,
-                            getAnalysis<AliasAnalysis>(), SI))
-    return false;
-
-  // Okay, everything looks good, insert the memcpy.
-  BasicBlock *Preheader = CurLoop->getLoopPreheader();
-
-  IRBuilder<> Builder(Preheader->getTerminator());
-
-  // The trip count of the loop and the base pointer of the addrec SCEV is
-  // guaranteed to be loop invariant, which means that it should dominate the
-  // header.  Just insert code for it in the preheader.
-  SCEVExpander Expander(*SE);
-
   Value *LoadBasePtr =
     Expander.expandCodeFor(LoadEv->getStart(),
                            Builder.getInt8PtrTy(LI->getPointerAddressSpace()),
                            Preheader->getTerminator());
-  Value *StoreBasePtr =
-    Expander.expandCodeFor(StoreEv->getStart(),
-                           Builder.getInt8PtrTy(SI->getPointerAddressSpace()),
-                           Preheader->getTerminator());
+
+  if (mayLoopAccessLocation(LoadBasePtr, AliasAnalysis::Mod, CurLoop, BECount,
+                            StoreSize, getAnalysis<AliasAnalysis>(), SI)) {
+    Expander.clear();
+    // If we generated new code for the base pointer, clean up.
+    deleteIfDeadInstruction(LoadBasePtr, *SE);
+    deleteIfDeadInstruction(StoreBasePtr, *SE);
+    return false;
+  }
+  
+  // Okay, everything is safe, we can transform this!
+  
 
   // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
   // pointer size if it isn't already.
@@ -589,18 +611,19 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
   Value *NumBytes =
     Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
 
-  Value *NewCall =
+  CallInst *NewCall =
     Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes,
                          std::min(SI->getAlignment(), LI->getAlignment()));
+  NewCall->setDebugLoc(SI->getDebugLoc());
 
   DEBUG(dbgs() << "  Formed memcpy: " << *NewCall << "\n"
                << "    from load ptr=" << *LoadEv << " at: " << *LI << "\n"
                << "    from store ptr=" << *StoreEv << " at: " << *SI << "\n");
-  (void)NewCall;
+  
 
   // Okay, the memset has been formed.  Zap the original store and anything that
   // feeds into it.
-  DeleteDeadInstruction(SI, *SE);
+  deleteDeadInstruction(SI, *SE);
   ++NumMemCpy;
   return true;
 }
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index 95e1578..47dced3 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -184,7 +184,11 @@ bool LoopRotate::rotateLoop(Loop *L) {
   // Now, this loop is suitable for rotation.
   BasicBlock *OrigPreheader = L->getLoopPreheader();
   BasicBlock *OrigLatch = L->getLoopLatch();
-  assert(OrigPreheader && OrigLatch && "Loop not in canonical form?");
+  
+  // If the loop could not be converted to canonical form, it must have an
+  // indirectbr in it, just give up.
+  if (OrigPreheader == 0 || OrigLatch == 0)
+    return false;
 
   // Anything ScalarEvolution may know about this loop or the PHI nodes
   // in its header will soon be invalidated.
@@ -322,7 +326,8 @@ bool LoopRotate::rotateLoop(Loop *L) {
     // We can fold the conditional branch in the preheader, this makes things
     // simpler. The first step is to remove the extra edge to the Exit block.
     Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
-    BranchInst::Create(NewHeader, PHBI);
+    BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
+    NewBI->setDebugLoc(PHBI->getDebugLoc());
     PHBI->eraseFromParent();
     
     // With our CFG finalized, update DomTree if it is available.
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 87e78fa..73ebd61 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -209,7 +209,12 @@ struct Formula {
   /// when AM.Scale is not zero.
   const SCEV *ScaledReg;
 
-  Formula() : ScaledReg(0) {}
+  /// UnfoldedOffset - An additional constant offset which added near the
+  /// use. This requires a temporary register, but the offset itself can
+  /// live in an add immediate field rather than a register.
+  int64_t UnfoldedOffset;
+
+  Formula() : ScaledReg(0), UnfoldedOffset(0) {}
 
   void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
 
@@ -379,6 +384,10 @@ void Formula::print(raw_ostream &OS) const {
       OS << "<unknown>";
     OS << ')';
   }
+  if (UnfoldedOffset != 0) {
+    if (!First) OS << " + "; else First = false;
+    OS << "imm(" << UnfoldedOffset << ')';
+  }
 }
 
 void Formula::dump() const {
@@ -572,9 +581,6 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
     switch (II->getIntrinsicID()) {
       default: break;
       case Intrinsic::prefetch:
-      case Intrinsic::x86_sse2_loadu_dq:
-      case Intrinsic::x86_sse2_loadu_pd:
-      case Intrinsic::x86_sse_loadu_ps:
       case Intrinsic::x86_sse_storeu_ps:
       case Intrinsic::x86_sse2_storeu_pd:
       case Intrinsic::x86_sse2_storeu_dq:
@@ -774,8 +780,10 @@ void Cost::RateFormula(const Formula &F,
     RatePrimaryRegister(BaseReg, Regs, L, SE, DT);
   }
 
-  if (F.BaseRegs.size() > 1)
-    NumBaseAdds += F.BaseRegs.size() - 1;
+  // Determine how many (unfolded) adds we'll need inside the loop.
+  size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0);
+  if (NumBaseParts > 1)
+    NumBaseAdds += NumBaseParts - 1;
 
   // Tally up the non-zero immediates.
   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
@@ -789,7 +797,7 @@ void Cost::RateFormula(const Formula &F,
   }
 }
 
-/// Loose - Set this cost to a loosing value.
+/// Loose - Set this cost to a losing value.
 void Cost::Loose() {
   NumRegs = ~0u;
   AddRecCost = ~0u;
@@ -1796,7 +1804,8 @@ LSRInstance::OptimizeLoopTermCond() {
         ExitingBlock->getInstList().insert(TermBr, Cond);
 
         // Clone the IVUse, as the old use still exists!
-        CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
+        CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace(),
+                              CondUse->getPhi());
         TermBr->replaceUsesOfWith(OldCond, Cond);
       }
     }
@@ -1827,7 +1836,7 @@ LSRInstance::OptimizeLoopTermCond() {
   }
 }
 
-/// reconcileNewOffset - Determine if the given use can accomodate a fixup
+/// reconcileNewOffset - Determine if the given use can accommodate a fixup
 /// at the given offset and other details. If so, update the use and
 /// return true.
 bool
@@ -1948,7 +1957,8 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
         if (F.BaseRegs == OrigF.BaseRegs &&
             F.ScaledReg == OrigF.ScaledReg &&
             F.AM.BaseGV == OrigF.AM.BaseGV &&
-            F.AM.Scale == OrigF.AM.Scale) {
+            F.AM.Scale == OrigF.AM.Scale &&
+            F.UnfoldedOffset == OrigF.UnfoldedOffset) {
           if (F.AM.BaseOffs == 0)
             return &LU;
           // This is the formula where all the registers and symbols matched;
@@ -2064,6 +2074,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
         // x == y  -->  x - y == 0
         const SCEV *N = SE.getSCEV(NV);
         if (SE.isLoopInvariant(N, L)) {
+          // S is normalized, so normalize N before folding it into S
+          // to keep the result normalized.
+          N = TransformForPostIncUse(Normalize, N, CI, 0,
+                                     LF.PostIncLoops, SE, DT);
           Kind = LSRUse::ICmpZero;
           S = SE.getMinusSCEV(N, S);
         }
@@ -2316,8 +2330,29 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
       if (InnerSum->isZero())
         continue;
       Formula F = Base;
-      F.BaseRegs[i] = InnerSum;
-      F.BaseRegs.push_back(*J);
+
+      // Add the remaining pieces of the add back into the new formula.
+      const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
+      if (TLI && InnerSumSC &&
+          SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
+          TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+                                   InnerSumSC->getValue()->getZExtValue())) {
+        F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
+                           InnerSumSC->getValue()->getZExtValue();
+        F.BaseRegs.erase(F.BaseRegs.begin() + i);
+      } else
+        F.BaseRegs[i] = InnerSum;
+
+      // Add J as its own register, or an unfolded immediate.
+      const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
+      if (TLI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
+          TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+                                   SC->getValue()->getZExtValue()))
+        F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
+                           SC->getValue()->getZExtValue();
+      else
+        F.BaseRegs.push_back(*J);
+
       if (InsertFormula(LU, LUIdx, F))
         // If that formula hadn't been seen before, recurse to find more like
         // it.
@@ -2485,6 +2520,15 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
         continue;
     }
 
+    // Check that multiplying with the unfolded offset doesn't overflow.
+    if (F.UnfoldedOffset != 0) {
+      if (F.UnfoldedOffset == INT64_MIN && Factor == -1)
+        continue;
+      F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
+      if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
+        continue;
+    }
+
     // If we make it here and it's legal, add it.
     (void)InsertFormula(LU, LUIdx, F);
   next:;
@@ -2667,7 +2711,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
       // other orig regs.
       ImmMapTy::const_iterator OtherImms[] = {
         Imms.begin(), prior(Imms.end()),
-        Imms.upper_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
+        Imms.lower_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
       };
       for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
         ImmMapTy::const_iterator M = OtherImms[i];
@@ -2741,8 +2785,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
           Formula NewF = F;
           NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
           if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
-                          LU.Kind, LU.AccessTy, TLI))
-            continue;
+                          LU.Kind, LU.AccessTy, TLI)) {
+            if (!TLI ||
+                !TLI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
+              continue;
+            NewF = F;
+            NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
+          }
           NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
 
           // If the new formula has a constant in a register, and adding the
@@ -3491,6 +3540,14 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
     }
   }
 
+  // Expand the unfolded offset portion.
+  int64_t UnfoldedOffset = F.UnfoldedOffset;
+  if (UnfoldedOffset != 0) {
+    // Just add the immediate values.
+    Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
+                                                       UnfoldedOffset)));
+  }
+
   // Emit instructions summing all the operands.
   const SCEV *FullS = Ops.empty() ?
                       SE.getConstant(IntTy, 0) :
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 80b263a..fef6bc3 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -43,7 +43,13 @@ namespace {
   class LoopUnroll : public LoopPass {
   public:
     static char ID; // Pass ID, replacement for typeid
-    LoopUnroll() : LoopPass(ID) {
+    LoopUnroll(int T = -1, int C = -1,  int P = -1) : LoopPass(ID) {
+      CurrentThreshold = (T == -1) ? UnrollThreshold : unsigned(T);
+      CurrentCount = (C == -1) ? UnrollCount : unsigned(C);
+      CurrentAllowPartial = (P == -1) ? UnrollAllowPartial : (bool)P;
+
+      UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0);
+     
       initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
     }
 
@@ -56,7 +62,10 @@ namespace {
     // explicit -unroll-threshold).
     static const unsigned OptSizeUnrollThreshold = 50;
     
+    unsigned CurrentCount;
     unsigned CurrentThreshold;
+    bool     CurrentAllowPartial;
+    bool     UserThreshold;        // CurrentThreshold is user-specified.
 
     bool runOnLoop(Loop *L, LPPassManager &LPM);
 
@@ -87,7 +96,9 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
 
-Pass *llvm::createLoopUnrollPass() { return new LoopUnroll(); }
+Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) {
+  return new LoopUnroll(Threshold, Count, AllowPartial);
+}
 
 /// ApproximateLoopSize - Approximate the size of the loop.
 static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls) {
@@ -119,14 +130,14 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
   // from UnrollThreshold, it is overridden to a smaller value if the current
   // function is marked as optimize-for-size, and the unroll threshold was
   // not user specified.
-  CurrentThreshold = UnrollThreshold;
-  if (Header->getParent()->hasFnAttr(Attribute::OptimizeForSize) &&
-      UnrollThreshold.getNumOccurrences() == 0)
-    CurrentThreshold = OptSizeUnrollThreshold;
+  unsigned Threshold = CurrentThreshold;
+  if (!UserThreshold && 
+      Header->getParent()->hasFnAttr(Attribute::OptimizeForSize))
+    Threshold = OptSizeUnrollThreshold;
 
   // Find trip count
   unsigned TripCount = L->getSmallConstantTripCount();
-  unsigned Count = UnrollCount;
+  unsigned Count = CurrentCount;
 
   // Automatically select an unroll count.
   if (Count == 0) {
@@ -140,7 +151,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
   }
 
   // Enforce the threshold.
-  if (CurrentThreshold != NoThreshold) {
+  if (Threshold != NoThreshold) {
     unsigned NumInlineCandidates;
     unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates);
     DEBUG(dbgs() << "  Loop Size = " << LoopSize << "\n");
@@ -149,16 +160,16 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
       return false;
     }
     uint64_t Size = (uint64_t)LoopSize*Count;
-    if (TripCount != 1 && Size > CurrentThreshold) {
+    if (TripCount != 1 && Size > Threshold) {
       DEBUG(dbgs() << "  Too large to fully unroll with count: " << Count
-            << " because size: " << Size << ">" << CurrentThreshold << "\n");
-      if (!UnrollAllowPartial) {
+            << " because size: " << Size << ">" << Threshold << "\n");
+      if (!CurrentAllowPartial) {
         DEBUG(dbgs() << "  will not try to unroll partially because "
               << "-unroll-allow-partial not given\n");
         return false;
       }
       // Reduce unroll count to be modulo of TripCount for partial unrolling
-      Count = CurrentThreshold / LoopSize;
+      Count = Threshold / LoopSize;
       while (Count != 0 && TripCount%Count != 0) {
         Count--;
       }
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index b4e3d31..e05f29c 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -258,6 +258,7 @@ bool LoopUnswitch::processCurrentLoop() {
       if (LoopCond && SI->getNumCases() > 1) {
         // Find a value to unswitch on:
         // FIXME: this should chose the most expensive case!
+        // FIXME: scan for a case with a non-critical edge?
         Constant *UnswitchVal = SI->getCaseValue(1);
         // Do not process same value again and again.
         if (!UnswitchedVals.insert(UnswitchVal))
@@ -560,6 +561,8 @@ void LoopUnswitch::SplitExitEdges(Loop *L,
     BasicBlock *ExitBlock = ExitBlocks[i];
     SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock),
                                        pred_end(ExitBlock));
+    // Although SplitBlockPredecessors doesn't preserve loop-simplify in
+    // general, if we call it on all predecessors of all exits then it does.
     SplitBlockPredecessors(ExitBlock, Preds.data(), Preds.size(),
                            ".us-lcssa", this);
   }
@@ -786,8 +789,13 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB,
   // If this is the edge to the header block for a loop, remove the loop and
   // promote all subloops.
   if (Loop *BBLoop = LI->getLoopFor(BB)) {
-    if (BBLoop->getLoopLatch() == BB)
+    if (BBLoop->getLoopLatch() == BB) {
       RemoveLoopFromHierarchy(BBLoop);
+      if (currentLoop == BBLoop) {
+        currentLoop = 0;
+        redoLoop = false;
+      }
+    }
   }
 
   // Remove the block from the loop info, which removes it from any loops it
@@ -859,7 +867,6 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
   
   // FOLD boolean conditions (X|LIC), (X&LIC).  Fold conditional branches,
   // selects, switches.
-  std::vector<User*> Users(LIC->use_begin(), LIC->use_end());
   std::vector<Instruction*> Worklist;
   LLVMContext &Context = Val->getContext();
 
@@ -875,13 +882,14 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
       Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()), 
                                      !cast<ConstantInt>(Val)->getZExtValue());
     
-    for (unsigned i = 0, e = Users.size(); i != e; ++i)
-      if (Instruction *U = cast<Instruction>(Users[i])) {
-        if (!L->contains(U))
-          continue;
-        U->replaceUsesOfWith(LIC, Replacement);
-        Worklist.push_back(U);
-      }
+    for (Value::use_iterator UI = LIC->use_begin(), E = LIC->use_end();
+         UI != E; ++UI) {
+      Instruction *U = dyn_cast<Instruction>(*UI);
+      if (!U || !L->contains(U))
+        continue;
+      U->replaceUsesOfWith(LIC, Replacement);
+      Worklist.push_back(U);
+    }
     SimplifyCode(Worklist, L);
     return;
   }
@@ -889,9 +897,10 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
   // Otherwise, we don't know the precise value of LIC, but we do know that it
   // is certainly NOT "Val".  As such, simplify any uses in the loop that we
   // can.  This case occurs when we unswitch switch statements.
-  for (unsigned i = 0, e = Users.size(); i != e; ++i) {
-    Instruction *U = cast<Instruction>(Users[i]);
-    if (!L->contains(U))
+  for (Value::use_iterator UI = LIC->use_begin(), E = LIC->use_end();
+       UI != E; ++UI) {
+    Instruction *U = dyn_cast<Instruction>(*UI);
+    if (!U || !L->contains(U))
       continue;
 
     Worklist.push_back(U);
@@ -909,13 +918,22 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
     // Found a dead case value.  Don't remove PHI nodes in the 
     // successor if they become single-entry, those PHI nodes may
     // be in the Users list.
-        
+
+    BasicBlock *Switch = SI->getParent();
+    BasicBlock *SISucc = SI->getSuccessor(DeadCase);
+    BasicBlock *Latch = L->getLoopLatch();
+    if (!SI->findCaseDest(SISucc)) continue;  // Edge is critical.
+    // If the DeadCase successor dominates the loop latch, then the
+    // transformation isn't safe since it will delete the sole predecessor edge
+    // to the latch.
+    if (Latch && DT->dominates(SISucc, Latch))
+      continue;
+
     // FIXME: This is a hack.  We need to keep the successor around
     // and hooked up so as to preserve the loop structure, because
     // trying to update it is complicated.  So instead we preserve the
     // loop structure and put the block on a dead code path.
-    BasicBlock *Switch = SI->getParent();
-    SplitEdge(Switch, SI->getSuccessor(DeadCase), this);
+    SplitEdge(Switch, SISucc, this);
     // Compute the successors instead of relying on the return value
     // of SplitEdge, since it may have split the switch successor
     // after PHI nodes.
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index bde0e53..bd4c2d6 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -23,11 +23,13 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/IRBuilder.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLibraryInfo.h"
 #include <list>
 using namespace llvm;
 
@@ -299,12 +301,15 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
 namespace {
   class MemCpyOpt : public FunctionPass {
     MemoryDependenceAnalysis *MD;
+    TargetLibraryInfo *TLI;
     const TargetData *TD;
   public:
     static char ID; // Pass identification, replacement for typeid
     MemCpyOpt() : FunctionPass(ID) {
       initializeMemCpyOptPass(*PassRegistry::getPassRegistry());
       MD = 0;
+      TLI = 0;
+      TD = 0;
     }
 
     bool runOnFunction(Function &F);
@@ -316,6 +321,7 @@ namespace {
       AU.addRequired<DominatorTree>();
       AU.addRequired<MemoryDependenceAnalysis>();
       AU.addRequired<AliasAnalysis>();
+      AU.addRequired<TargetLibraryInfo>();
       AU.addPreserved<AliasAnalysis>();
       AU.addPreserved<MemoryDependenceAnalysis>();
     }
@@ -346,6 +352,7 @@ INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
                       false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTree)
 INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
                     false, false)
@@ -453,7 +460,10 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
           for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i)
             dbgs() << *Range.TheStores[i] << '\n';
           dbgs() << "With: " << *AMemSet << '\n');
-    
+
+    if (!Range.TheStores.empty())
+      AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
+
     // Zap all the stores.
     for (SmallVector<Instruction*, 16>::const_iterator
          SI = Range.TheStores.begin(),
@@ -477,12 +487,27 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
   // happen to be using a load-store pair to implement it, rather than
   // a memcpy.
   if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) {
-    if (!LI->isVolatile() && LI->hasOneUse()) {
-      MemDepResult dep = MD->getDependency(LI);
+    if (!LI->isVolatile() && LI->hasOneUse() &&
+        LI->getParent() == SI->getParent()) {
+      MemDepResult ldep = MD->getDependency(LI);
       CallInst *C = 0;
-      if (dep.isClobber() && !isa<MemCpyInst>(dep.getInst()))
-        C = dyn_cast<CallInst>(dep.getInst());
-      
+      if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
+        C = dyn_cast<CallInst>(ldep.getInst());
+
+      if (C) {
+        // Check that nothing touches the dest of the "copy" between
+        // the call and the store.
+        AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+        AliasAnalysis::Location StoreLoc = AA.getLocation(SI);
+        for (BasicBlock::iterator I = --BasicBlock::iterator(SI),
+                                  E = C; I != E; --I) {
+          if (AA.getModRefInfo(&*I, StoreLoc) != AliasAnalysis::NoModRef) {
+            C = 0;
+            break;
+          }
+        }
+      }
+
       if (C) {
         bool changed = performCallSlotOptzn(LI,
                         SI->getPointerOperand()->stripPointerCasts(), 
@@ -688,7 +713,7 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
   if (M->getSource() == MDep->getSource())
     return false;
   
-  // Second, the length of the memcpy's must be the same, or the preceeding one
+  // Second, the length of the memcpy's must be the same, or the preceding one
   // must be larger than the following one.
   ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
   ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength());
@@ -804,6 +829,9 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
 bool MemCpyOpt::processMemMove(MemMoveInst *M) {
   AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
 
+  if (!TLI->has(LibFunc::memmove))
+    return false;
+  
   // See if the pointers alias.
   if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(M)))
     return false;
@@ -854,12 +882,16 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
   if (C1 == 0 || C1->getValue().getZExtValue() < ByValSize)
     return false;
 
-  // Get the alignment of the byval.  If it is greater than the memcpy, then we
-  // can't do the substitution.  If the call doesn't specify the alignment, then
-  // it is some target specific value that we can't know.
+  // Get the alignment of the byval.  If the call doesn't specify the alignment,
+  // then it is some target specific value that we can't know.
   unsigned ByValAlign = CS.getParamAlignment(ArgNo+1);
-  if (ByValAlign == 0 || MDep->getAlignment() < ByValAlign)
-    return false;  
+  if (ByValAlign == 0) return false;
+  
+  // If it is greater than the memcpy, then we check to see if we can force the
+  // source of the memcpy to the alignment we need.  If we fail, we bail out.
+  if (MDep->getAlignment() < ByValAlign &&
+      getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, TD) < ByValAlign)
+    return false;
   
   // Verify that the copied-from memory doesn't change in between the memcpy and
   // the byval call.
@@ -935,6 +967,14 @@ bool MemCpyOpt::runOnFunction(Function &F) {
   bool MadeChange = false;
   MD = &getAnalysis<MemoryDependenceAnalysis>();
   TD = getAnalysisIfAvailable<TargetData>();
+  TLI = &getAnalysis<TargetLibraryInfo>();
+  
+  // If we don't have at least memset and memcpy, there is little point of doing
+  // anything here.  These are required by a freestanding implementation, so if
+  // even they are disabled, there is no point in trying hard.
+  if (!TLI->has(LibFunc::memset) || !TLI->has(LibFunc::memcpy))
+    return false;
+  
   while (1) {
     if (!iterateOnFunction(F))
       break;
diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp
new file mode 100644
index 0000000..6cd35e5
--- /dev/null
+++ b/lib/Transforms/Scalar/ObjCARC.cpp
@@ -0,0 +1,3537 @@
+//===- ObjCARC.cpp - ObjC ARC Optimization --------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines ObjC ARC optimizations. ARC stands for
+// Automatic Reference Counting and is a system for managing reference counts
+// for objects in Objective C.
+//
+// The optimizations performed include elimination of redundant, partially
+// redundant, and inconsequential reference count operations, elimination of
+// redundant weak pointer operations, pattern-matching and replacement of
+// low-level operations into higher-level operations, and numerous minor
+// simplifications.
+//
+// This file also defines a simple ARC-aware AliasAnalysis.
+//
+// WARNING: This file knows about certain library functions. It recognizes them
+// by name, and hardwires knowedge of their semantics.
+//
+// WARNING: This file knows about how certain Objective-C library functions are
+// used. Naive LLVM IR transformations which would otherwise be
+// behavior-preserving may break these assumptions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "objc-arc"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+using namespace llvm;
+
+// A handy option to enable/disable all optimizations in this file.
+static cl::opt<bool> EnableARCOpts("enable-objc-arc-opts", cl::init(true));
+
+//===----------------------------------------------------------------------===//
+// Misc. Utilities
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// MapVector - An associative container with fast insertion-order
+  /// (deterministic) iteration over its elements. Plus the special
+  /// blot operation.
+  template<class KeyT, class ValueT>
+  class MapVector {
+    /// Map - Map keys to indices in Vector.
+    typedef DenseMap<KeyT, size_t> MapTy;
+    MapTy Map;
+
+    /// Vector - Keys and values.
+    typedef std::vector<std::pair<KeyT, ValueT> > VectorTy;
+    VectorTy Vector;
+
+  public:
+    typedef typename VectorTy::iterator iterator;
+    typedef typename VectorTy::const_iterator const_iterator;
+    iterator begin() { return Vector.begin(); }
+    iterator end() { return Vector.end(); }
+    const_iterator begin() const { return Vector.begin(); }
+    const_iterator end() const { return Vector.end(); }
+
+#ifdef XDEBUG
+    ~MapVector() {
+      assert(Vector.size() >= Map.size()); // May differ due to blotting.
+      for (typename MapTy::const_iterator I = Map.begin(), E = Map.end();
+           I != E; ++I) {
+        assert(I->second < Vector.size());
+        assert(Vector[I->second].first == I->first);
+      }
+      for (typename VectorTy::const_iterator I = Vector.begin(),
+           E = Vector.end(); I != E; ++I)
+        assert(!I->first ||
+               (Map.count(I->first) &&
+                Map[I->first] == size_t(I - Vector.begin())));
+    }
+#endif
+
+    ValueT &operator[](KeyT Arg) {
+      std::pair<typename MapTy::iterator, bool> Pair =
+        Map.insert(std::make_pair(Arg, size_t(0)));
+      if (Pair.second) {
+        Pair.first->second = Vector.size();
+        Vector.push_back(std::make_pair(Arg, ValueT()));
+        return Vector.back().second;
+      }
+      return Vector[Pair.first->second].second;
+    }
+
+    std::pair<iterator, bool>
+    insert(const std::pair<KeyT, ValueT> &InsertPair) {
+      std::pair<typename MapTy::iterator, bool> Pair =
+        Map.insert(std::make_pair(InsertPair.first, size_t(0)));
+      if (Pair.second) {
+        Pair.first->second = Vector.size();
+        Vector.push_back(InsertPair);
+        return std::make_pair(llvm::prior(Vector.end()), true);
+      }
+      return std::make_pair(Vector.begin() + Pair.first->second, false);
+    }
+
+    const_iterator find(KeyT Key) const {
+      typename MapTy::const_iterator It = Map.find(Key);
+      if (It == Map.end()) return Vector.end();
+      return Vector.begin() + It->second;
+    }
+
+    /// blot - This is similar to erase, but instead of removing the element
+    /// from the vector, it just zeros out the key in the vector. This leaves
+    /// iterators intact, but clients must be prepared for zeroed-out keys when
+    /// iterating.
+    void blot(KeyT Key) {
+      typename MapTy::iterator It = Map.find(Key);
+      if (It == Map.end()) return;
+      Vector[It->second].first = KeyT();
+      Map.erase(It);
+    }
+
+    void clear() {
+      Map.clear();
+      Vector.clear();
+    }
+  };
+}
+
+//===----------------------------------------------------------------------===//
+// ARC Utilities.
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// InstructionClass - A simple classification for instructions.
+  enum InstructionClass {
+    IC_Retain,              ///< objc_retain
+    IC_RetainRV,            ///< objc_retainAutoreleasedReturnValue
+    IC_RetainBlock,         ///< objc_retainBlock
+    IC_Release,             ///< objc_release
+    IC_Autorelease,         ///< objc_autorelease
+    IC_AutoreleaseRV,       ///< objc_autoreleaseReturnValue
+    IC_AutoreleasepoolPush, ///< objc_autoreleasePoolPush
+    IC_AutoreleasepoolPop,  ///< objc_autoreleasePoolPop
+    IC_NoopCast,            ///< objc_retainedObject, etc.
+    IC_FusedRetainAutorelease, ///< objc_retainAutorelease
+    IC_FusedRetainAutoreleaseRV, ///< objc_retainAutoreleaseReturnValue
+    IC_LoadWeakRetained,    ///< objc_loadWeakRetained (primitive)
+    IC_StoreWeak,           ///< objc_storeWeak (primitive)
+    IC_InitWeak,            ///< objc_initWeak (derived)
+    IC_LoadWeak,            ///< objc_loadWeak (derived)
+    IC_MoveWeak,            ///< objc_moveWeak (derived)
+    IC_CopyWeak,            ///< objc_copyWeak (derived)
+    IC_DestroyWeak,         ///< objc_destroyWeak (derived)
+    IC_CallOrUser,          ///< could call objc_release and/or "use" pointers
+    IC_Call,                ///< could call objc_release
+    IC_User,                ///< could "use" a pointer
+    IC_None                 ///< anything else
+  };
+}
+
+/// IsPotentialUse - Test whether the given value is possible a
+/// reference-counted pointer.
+static bool IsPotentialUse(const Value *Op) {
+  // Pointers to static or stack storage are not reference-counted pointers.
+  if (isa<Constant>(Op) || isa<AllocaInst>(Op))
+    return false;
+  // Special arguments are not reference-counted.
+  if (const Argument *Arg = dyn_cast<Argument>(Op))
+    if (Arg->hasByValAttr() ||
+        Arg->hasNestAttr() ||
+        Arg->hasStructRetAttr())
+      return false;
+  // Only consider values with pointer types, and not function pointers.
+  const PointerType *Ty = dyn_cast<PointerType>(Op->getType());
+  if (!Ty || isa<FunctionType>(Ty->getElementType()))
+    return false;
+  // Conservatively assume anything else is a potential use.
+  return true;
+}
+
+/// GetCallSiteClass - Helper for GetInstructionClass. Determines what kind
+/// of construct CS is.
+static InstructionClass GetCallSiteClass(ImmutableCallSite CS) {
+  for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
+       I != E; ++I)
+    if (IsPotentialUse(*I))
+      return CS.onlyReadsMemory() ? IC_User : IC_CallOrUser;
+
+  return CS.onlyReadsMemory() ? IC_None : IC_Call;
+}
+
+/// GetFunctionClass - Determine if F is one of the special known Functions.
+/// If it isn't, return IC_CallOrUser.
+static InstructionClass GetFunctionClass(const Function *F) {
+  Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+
+  // No arguments.
+  if (AI == AE)
+    return StringSwitch<InstructionClass>(F->getName())
+      .Case("objc_autoreleasePoolPush",  IC_AutoreleasepoolPush)
+      .Default(IC_CallOrUser);
+
+  // One argument.
+  const Argument *A0 = AI++;
+  if (AI == AE)
+    // Argument is a pointer.
+    if (const PointerType *PTy = dyn_cast<PointerType>(A0->getType())) {
+      const Type *ETy = PTy->getElementType();
+      // Argument is i8*.
+      if (ETy->isIntegerTy(8))
+        return StringSwitch<InstructionClass>(F->getName())
+          .Case("objc_retain",                IC_Retain)
+          .Case("objc_retainAutoreleasedReturnValue", IC_RetainRV)
+          .Case("objc_retainBlock",           IC_RetainBlock)
+          .Case("objc_release",               IC_Release)
+          .Case("objc_autorelease",           IC_Autorelease)
+          .Case("objc_autoreleaseReturnValue", IC_AutoreleaseRV)
+          .Case("objc_autoreleasePoolPop",    IC_AutoreleasepoolPop)
+          .Case("objc_retainedObject",        IC_NoopCast)
+          .Case("objc_unretainedObject",      IC_NoopCast)
+          .Case("objc_unretainedPointer",     IC_NoopCast)
+          .Case("objc_retain_autorelease",    IC_FusedRetainAutorelease)
+          .Case("objc_retainAutorelease",     IC_FusedRetainAutorelease)
+          .Case("objc_retainAutoreleaseReturnValue",IC_FusedRetainAutoreleaseRV)
+          .Default(IC_CallOrUser);
+
+      // Argument is i8**
+      if (const PointerType *Pte = dyn_cast<PointerType>(ETy))
+        if (Pte->getElementType()->isIntegerTy(8))
+          return StringSwitch<InstructionClass>(F->getName())
+            .Case("objc_loadWeakRetained",      IC_LoadWeakRetained)
+            .Case("objc_loadWeak",              IC_LoadWeak)
+            .Case("objc_destroyWeak",           IC_DestroyWeak)
+            .Default(IC_CallOrUser);
+    }
+
+  // Two arguments, first is i8**.
+  const Argument *A1 = AI++;
+  if (AI == AE)
+    if (const PointerType *PTy = dyn_cast<PointerType>(A0->getType()))
+      if (const PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType()))
+        if (Pte->getElementType()->isIntegerTy(8))
+          if (const PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) {
+            const Type *ETy1 = PTy1->getElementType();
+            // Second argument is i8*
+            if (ETy1->isIntegerTy(8))
+              return StringSwitch<InstructionClass>(F->getName())
+                     .Case("objc_storeWeak",             IC_StoreWeak)
+                     .Case("objc_initWeak",              IC_InitWeak)
+                     .Default(IC_CallOrUser);
+            // Second argument is i8**.
+            if (const PointerType *Pte1 = dyn_cast<PointerType>(ETy1))
+              if (Pte1->getElementType()->isIntegerTy(8))
+                return StringSwitch<InstructionClass>(F->getName())
+                       .Case("objc_moveWeak",              IC_MoveWeak)
+                       .Case("objc_copyWeak",              IC_CopyWeak)
+                       .Default(IC_CallOrUser);
+          }
+
+  // Anything else.
+  return IC_CallOrUser;
+}
+
+/// GetInstructionClass - Determine what kind of construct V is.
+static InstructionClass GetInstructionClass(const Value *V) {
+  if (const Instruction *I = dyn_cast<Instruction>(V)) {
+    // Any instruction other than bitcast and gep with a pointer operand have a
+    // use of an objc pointer. Bitcasts, GEPs, Selects, PHIs transfer a pointer
+    // to a subsequent use, rather than using it themselves, in this sense.
+    // As a short cut, several other opcodes are known to have no pointer
+    // operands of interest. And ret is never followed by a release, so it's
+    // not interesting to examine.
+    switch (I->getOpcode()) {
+    case Instruction::Call: {
+      const CallInst *CI = cast<CallInst>(I);
+      // Check for calls to special functions.
+      if (const Function *F = CI->getCalledFunction()) {
+        InstructionClass Class = GetFunctionClass(F);
+        if (Class != IC_CallOrUser)
+          return Class;
+
+        // None of the intrinsic functions do objc_release. For intrinsics, the
+        // only question is whether or not they may be users.
+        switch (F->getIntrinsicID()) {
+        case 0: break;
+        case Intrinsic::bswap: case Intrinsic::ctpop:
+        case Intrinsic::ctlz: case Intrinsic::cttz:
+        case Intrinsic::returnaddress: case Intrinsic::frameaddress:
+        case Intrinsic::stacksave: case Intrinsic::stackrestore:
+        case Intrinsic::vastart: case Intrinsic::vacopy: case Intrinsic::vaend:
+        // Don't let dbg info affect our results.
+        case Intrinsic::dbg_declare: case Intrinsic::dbg_value:
+          // Short cut: Some intrinsics obviously don't use ObjC pointers.
+          return IC_None;
+        default:
+          for (Function::const_arg_iterator AI = F->arg_begin(),
+               AE = F->arg_end(); AI != AE; ++AI)
+            if (IsPotentialUse(AI))
+              return IC_User;
+          return IC_None;
+        }
+      }
+      return GetCallSiteClass(CI);
+    }
+    case Instruction::Invoke:
+      return GetCallSiteClass(cast<InvokeInst>(I));
+    case Instruction::BitCast:
+    case Instruction::GetElementPtr:
+    case Instruction::Select: case Instruction::PHI:
+    case Instruction::Ret: case Instruction::Br:
+    case Instruction::Switch: case Instruction::IndirectBr:
+    case Instruction::Alloca: case Instruction::VAArg:
+    case Instruction::Add: case Instruction::FAdd:
+    case Instruction::Sub: case Instruction::FSub:
+    case Instruction::Mul: case Instruction::FMul:
+    case Instruction::SDiv: case Instruction::UDiv: case Instruction::FDiv:
+    case Instruction::SRem: case Instruction::URem: case Instruction::FRem:
+    case Instruction::Shl: case Instruction::LShr: case Instruction::AShr:
+    case Instruction::And: case Instruction::Or: case Instruction::Xor:
+    case Instruction::SExt: case Instruction::ZExt: case Instruction::Trunc:
+    case Instruction::IntToPtr: case Instruction::FCmp:
+    case Instruction::FPTrunc: case Instruction::FPExt:
+    case Instruction::FPToUI: case Instruction::FPToSI:
+    case Instruction::UIToFP: case Instruction::SIToFP:
+    case Instruction::InsertElement: case Instruction::ExtractElement:
+    case Instruction::ShuffleVector:
+    case Instruction::ExtractValue:
+      break;
+    case Instruction::ICmp:
+      // Comparing a pointer with null, or any other constant, isn't an
+      // interesting use, because we don't care what the pointer points to, or
+      // about the values of any other dynamic reference-counted pointers.
+      if (IsPotentialUse(I->getOperand(1)))
+        return IC_User;
+      break;
+    default:
+      // For anything else, check all the operands.
+      for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end();
+           OI != OE; ++OI)
+        if (IsPotentialUse(*OI))
+          return IC_User;
+    }
+  }
+
+  // Otherwise, it's totally inert for ARC purposes.
+  return IC_None;
+}
+
+/// GetBasicInstructionClass - Determine what kind of construct V is. This is
+/// similar to GetInstructionClass except that it only detects objc runtine
+/// calls. This allows it to be faster.
+static InstructionClass GetBasicInstructionClass(const Value *V) {
+  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+    if (const Function *F = CI->getCalledFunction())
+      return GetFunctionClass(F);
+    // Otherwise, be conservative.
+    return IC_CallOrUser;
+  }
+
+  // Otherwise, be conservative.
+  return IC_User;
+}
+
+/// IsRetain - Test if the the given class is objc_retain or
+/// equivalent.
+static bool IsRetain(InstructionClass Class) {
+  return Class == IC_Retain ||
+         Class == IC_RetainRV;
+}
+
+/// IsAutorelease - Test if the the given class is objc_autorelease or
+/// equivalent.
+static bool IsAutorelease(InstructionClass Class) {
+  return Class == IC_Autorelease ||
+         Class == IC_AutoreleaseRV;
+}
+
+/// IsForwarding - Test if the given class represents instructions which return
+/// their argument verbatim.
+static bool IsForwarding(InstructionClass Class) {
+  // objc_retainBlock technically doesn't always return its argument
+  // verbatim, but it doesn't matter for our purposes here.
+  return Class == IC_Retain ||
+         Class == IC_RetainRV ||
+         Class == IC_Autorelease ||
+         Class == IC_AutoreleaseRV ||
+         Class == IC_RetainBlock ||
+         Class == IC_NoopCast;
+}
+
+/// IsNoopOnNull - Test if the given class represents instructions which do
+/// nothing if passed a null pointer.
+static bool IsNoopOnNull(InstructionClass Class) {
+  return Class == IC_Retain ||
+         Class == IC_RetainRV ||
+         Class == IC_Release ||
+         Class == IC_Autorelease ||
+         Class == IC_AutoreleaseRV ||
+         Class == IC_RetainBlock;
+}
+
+/// IsAlwaysTail - Test if the given class represents instructions which are
+/// always safe to mark with the "tail" keyword.
+static bool IsAlwaysTail(InstructionClass Class) {
+  // IC_RetainBlock may be given a stack argument.
+  return Class == IC_Retain ||
+         Class == IC_RetainRV ||
+         Class == IC_Autorelease ||
+         Class == IC_AutoreleaseRV;
+}
+
+/// IsNoThrow - Test if the given class represents instructions which are always
+/// safe to mark with the nounwind attribute..
+static bool IsNoThrow(InstructionClass Class) {
+  return Class == IC_Retain ||
+         Class == IC_RetainRV ||
+         Class == IC_RetainBlock ||
+         Class == IC_Release ||
+         Class == IC_Autorelease ||
+         Class == IC_AutoreleaseRV ||
+         Class == IC_AutoreleasepoolPush ||
+         Class == IC_AutoreleasepoolPop;
+}
+
+/// EraseInstruction - Erase the given instruction. ObjC calls return their
+/// argument verbatim, so if it's such a call and the return value has users,
+/// replace them with the argument value.
+static void EraseInstruction(Instruction *CI) {
+  Value *OldArg = cast<CallInst>(CI)->getArgOperand(0);
+
+  bool Unused = CI->use_empty();
+
+  if (!Unused) {
+    // Replace the return value with the argument.
+    assert(IsForwarding(GetBasicInstructionClass(CI)) &&
+           "Can't delete non-forwarding instruction with users!");
+    CI->replaceAllUsesWith(OldArg);
+  }
+
+  CI->eraseFromParent();
+
+  if (Unused)
+    RecursivelyDeleteTriviallyDeadInstructions(OldArg);
+}
+
+/// GetUnderlyingObjCPtr - This is a wrapper around getUnderlyingObject which
+/// also knows how to look through objc_retain and objc_autorelease calls, which
+/// we know to return their argument verbatim.
+static const Value *GetUnderlyingObjCPtr(const Value *V) {
+  for (;;) {
+    V = GetUnderlyingObject(V);
+    if (!IsForwarding(GetBasicInstructionClass(V)))
+      break;
+    V = cast<CallInst>(V)->getArgOperand(0);
+  }
+
+  return V;
+}
+
+/// StripPointerCastsAndObjCCalls - This is a wrapper around
+/// Value::stripPointerCasts which also knows how to look through objc_retain
+/// and objc_autorelease calls, which we know to return their argument verbatim.
+static const Value *StripPointerCastsAndObjCCalls(const Value *V) {
+  for (;;) {
+    V = V->stripPointerCasts();
+    if (!IsForwarding(GetBasicInstructionClass(V)))
+      break;
+    V = cast<CallInst>(V)->getArgOperand(0);
+  }
+  return V;
+}
+
+/// StripPointerCastsAndObjCCalls - This is a wrapper around
+/// Value::stripPointerCasts which also knows how to look through objc_retain
+/// and objc_autorelease calls, which we know to return their argument verbatim.
+static Value *StripPointerCastsAndObjCCalls(Value *V) {
+  for (;;) {
+    V = V->stripPointerCasts();
+    if (!IsForwarding(GetBasicInstructionClass(V)))
+      break;
+    V = cast<CallInst>(V)->getArgOperand(0);
+  }
+  return V;
+}
+
+/// GetObjCArg - Assuming the given instruction is one of the special calls such
+/// as objc_retain or objc_release, return the argument value, stripped of no-op
+/// casts and forwarding calls.
+static Value *GetObjCArg(Value *Inst) {
+  return StripPointerCastsAndObjCCalls(cast<CallInst>(Inst)->getArgOperand(0));
+}
+
+/// IsObjCIdentifiedObject - This is similar to AliasAnalysis'
+/// isObjCIdentifiedObject, except that it uses special knowledge of
+/// ObjC conventions...
+static bool IsObjCIdentifiedObject(const Value *V) {
+  // Assume that call results and arguments have their own "provenance".
+  // Constants (including GlobalVariables) and Allocas are never
+  // reference-counted.
+  if (isa<CallInst>(V) || isa<InvokeInst>(V) ||
+      isa<Argument>(V) || isa<Constant>(V) ||
+      isa<AllocaInst>(V))
+    return true;
+
+  if (const LoadInst *LI = dyn_cast<LoadInst>(V)) {
+    const Value *Pointer =
+      StripPointerCastsAndObjCCalls(LI->getPointerOperand());
+    if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) {
+      StringRef Name = GV->getName();
+      // These special variables are known to hold values which are not
+      // reference-counted pointers.
+      if (Name.startswith("\01L_OBJC_SELECTOR_REFERENCES_") ||
+          Name.startswith("\01L_OBJC_CLASSLIST_REFERENCES_") ||
+          Name.startswith("\01L_OBJC_CLASSLIST_SUP_REFS_$_") ||
+          Name.startswith("\01L_OBJC_METH_VAR_NAME_") ||
+          Name.startswith("\01l_objc_msgSend_fixup_"))
+        return true;
+    }
+  }
+
+  return false;
+}
+
+/// FindSingleUseIdentifiedObject - This is similar to
+/// StripPointerCastsAndObjCCalls but it stops as soon as it finds a value
+/// with multiple uses.
+static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
+  if (Arg->hasOneUse()) {
+    if (const BitCastInst *BC = dyn_cast<BitCastInst>(Arg))
+      return FindSingleUseIdentifiedObject(BC->getOperand(0));
+    if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Arg))
+      if (GEP->hasAllZeroIndices())
+        return FindSingleUseIdentifiedObject(GEP->getPointerOperand());
+    if (IsForwarding(GetBasicInstructionClass(Arg)))
+      return FindSingleUseIdentifiedObject(
+               cast<CallInst>(Arg)->getArgOperand(0));
+    if (!IsObjCIdentifiedObject(Arg))
+      return 0;
+    return Arg;
+  }
+
+  // If we found an identifiable object but it has multiple uses, but they
+  // are trivial uses, we can still consider this to be a single-use
+  // value.
+  if (IsObjCIdentifiedObject(Arg)) {
+    for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end();
+         UI != UE; ++UI) {
+      const User *U = *UI;
+      if (!U->use_empty() || StripPointerCastsAndObjCCalls(U) != Arg)
+         return 0;
+    }
+
+    return Arg;
+  }
+
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// ARC AliasAnalysis.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
+
+namespace {
+  /// ObjCARCAliasAnalysis - This is a simple alias analysis
+  /// implementation that uses knowledge of ARC constructs to answer queries.
+  ///
+  /// TODO: This class could be generalized to know about other ObjC-specific
+  /// tricks. Such as knowing that ivars in the non-fragile ABI are non-aliasing
+  /// even though their offsets are dynamic.
+  class ObjCARCAliasAnalysis : public ImmutablePass,
+                               public AliasAnalysis {
+  public:
+    static char ID; // Class identification, replacement for typeinfo
+    ObjCARCAliasAnalysis() : ImmutablePass(ID) {
+      initializeObjCARCAliasAnalysisPass(*PassRegistry::getPassRegistry());
+    }
+
+  private:
+    virtual void initializePass() {
+      InitializeAliasAnalysis(this);
+    }
+
+    /// getAdjustedAnalysisPointer - This method is used when a pass implements
+    /// an analysis interface through multiple inheritance.  If needed, it
+    /// should override this to adjust the this pointer as needed for the
+    /// specified pass info.
+    virtual void *getAdjustedAnalysisPointer(const void *PI) {
+      if (PI == &AliasAnalysis::ID)
+        return (AliasAnalysis*)this;
+      return this;
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    virtual AliasResult alias(const Location &LocA, const Location &LocB);
+    virtual bool pointsToConstantMemory(const Location &Loc, bool OrLocal);
+    virtual ModRefBehavior getModRefBehavior(ImmutableCallSite CS);
+    virtual ModRefBehavior getModRefBehavior(const Function *F);
+    virtual ModRefResult getModRefInfo(ImmutableCallSite CS,
+                                       const Location &Loc);
+    virtual ModRefResult getModRefInfo(ImmutableCallSite CS1,
+                                       ImmutableCallSite CS2);
+  };
+}  // End of anonymous namespace
+
+// Register this pass...
+char ObjCARCAliasAnalysis::ID = 0;
+INITIALIZE_AG_PASS(ObjCARCAliasAnalysis, AliasAnalysis, "objc-arc-aa",
+                   "ObjC-ARC-Based Alias Analysis", false, true, false)
+
+ImmutablePass *llvm::createObjCARCAliasAnalysisPass() {
+  return new ObjCARCAliasAnalysis();
+}
+
+void
+ObjCARCAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AliasAnalysis::getAnalysisUsage(AU);
+}
+
+AliasAnalysis::AliasResult
+ObjCARCAliasAnalysis::alias(const Location &LocA, const Location &LocB) {
+  if (!EnableARCOpts)
+    return AliasAnalysis::alias(LocA, LocB);
+
+  // First, strip off no-ops, including ObjC-specific no-ops, and try making a
+  // precise alias query.
+  const Value *SA = StripPointerCastsAndObjCCalls(LocA.Ptr);
+  const Value *SB = StripPointerCastsAndObjCCalls(LocB.Ptr);
+  AliasResult Result =
+    AliasAnalysis::alias(Location(SA, LocA.Size, LocA.TBAATag),
+                         Location(SB, LocB.Size, LocB.TBAATag));
+  if (Result != MayAlias)
+    return Result;
+
+  // If that failed, climb to the underlying object, including climbing through
+  // ObjC-specific no-ops, and try making an imprecise alias query.
+  const Value *UA = GetUnderlyingObjCPtr(SA);
+  const Value *UB = GetUnderlyingObjCPtr(SB);
+  if (UA != SA || UB != SB) {
+    Result = AliasAnalysis::alias(Location(UA), Location(UB));
+    // We can't use MustAlias or PartialAlias results here because
+    // GetUnderlyingObjCPtr may return an offsetted pointer value.
+    if (Result == NoAlias)
+      return NoAlias;
+  }
+
+  // If that failed, fail. We don't need to chain here, since that's covered
+  // by the earlier precise query.
+  return MayAlias;
+}
+
+bool
+ObjCARCAliasAnalysis::pointsToConstantMemory(const Location &Loc,
+                                             bool OrLocal) {
+  if (!EnableARCOpts)
+    return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal);
+
+  // First, strip off no-ops, including ObjC-specific no-ops, and try making
+  // a precise alias query.
+  const Value *S = StripPointerCastsAndObjCCalls(Loc.Ptr);
+  if (AliasAnalysis::pointsToConstantMemory(Location(S, Loc.Size, Loc.TBAATag),
+                                            OrLocal))
+    return true;
+
+  // If that failed, climb to the underlying object, including climbing through
+  // ObjC-specific no-ops, and try making an imprecise alias query.
+  const Value *U = GetUnderlyingObjCPtr(S);
+  if (U != S)
+    return AliasAnalysis::pointsToConstantMemory(Location(U), OrLocal);
+
+  // If that failed, fail. We don't need to chain here, since that's covered
+  // by the earlier precise query.
+  return false;
+}
+
+AliasAnalysis::ModRefBehavior
+ObjCARCAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) {
+  // We have nothing to do. Just chain to the next AliasAnalysis.
+  return AliasAnalysis::getModRefBehavior(CS);
+}
+
+AliasAnalysis::ModRefBehavior
+ObjCARCAliasAnalysis::getModRefBehavior(const Function *F) {
+  if (!EnableARCOpts)
+    return AliasAnalysis::getModRefBehavior(F);
+
+  switch (GetFunctionClass(F)) {
+  case IC_NoopCast:
+    return DoesNotAccessMemory;
+  default:
+    break;
+  }
+
+  return AliasAnalysis::getModRefBehavior(F);
+}
+
+AliasAnalysis::ModRefResult
+ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) {
+  if (!EnableARCOpts)
+    return AliasAnalysis::getModRefInfo(CS, Loc);
+
+  switch (GetBasicInstructionClass(CS.getInstruction())) {
+  case IC_Retain:
+  case IC_RetainRV:
+  case IC_RetainBlock:
+  case IC_Autorelease:
+  case IC_AutoreleaseRV:
+  case IC_NoopCast:
+  case IC_AutoreleasepoolPush:
+  case IC_FusedRetainAutorelease:
+  case IC_FusedRetainAutoreleaseRV:
+    // These functions don't access any memory visible to the compiler.
+    return NoModRef;
+  default:
+    break;
+  }
+
+  return AliasAnalysis::getModRefInfo(CS, Loc);
+}
+
+AliasAnalysis::ModRefResult
+ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS1,
+                                    ImmutableCallSite CS2) {
+  // TODO: Theoretically we could check for dependencies between objc_* calls
+  // and OnlyAccessesArgumentPointees calls or other well-behaved calls.
+  return AliasAnalysis::getModRefInfo(CS1, CS2);
+}
+
+//===----------------------------------------------------------------------===//
+// ARC expansion.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Transforms/Scalar.h"
+
+namespace {
+  /// ObjCARCExpand - Early ARC transformations.
+  class ObjCARCExpand : public FunctionPass {
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    virtual bool runOnFunction(Function &F);
+
+  public:
+    static char ID;
+    ObjCARCExpand() : FunctionPass(ID) {
+      initializeObjCARCExpandPass(*PassRegistry::getPassRegistry());
+    }
+  };
+}
+
+char ObjCARCExpand::ID = 0;
+INITIALIZE_PASS(ObjCARCExpand,
+                "objc-arc-expand", "ObjC ARC expansion", false, false)
+
+Pass *llvm::createObjCARCExpandPass() {
+  return new ObjCARCExpand();
+}
+
+void ObjCARCExpand::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+}
+
+bool ObjCARCExpand::runOnFunction(Function &F) {
+  if (!EnableARCOpts)
+    return false;
+
+  bool Changed = false;
+
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
+    Instruction *Inst = &*I;
+
+    switch (GetBasicInstructionClass(Inst)) {
+    case IC_Retain:
+    case IC_RetainRV:
+    case IC_Autorelease:
+    case IC_AutoreleaseRV:
+    case IC_FusedRetainAutorelease:
+    case IC_FusedRetainAutoreleaseRV:
+      // These calls return their argument verbatim, as a low-level
+      // optimization. However, this makes high-level optimizations
+      // harder. Undo any uses of this optimization that the front-end
+      // emitted here. We'll redo them in a later pass.
+      Changed = true;
+      Inst->replaceAllUsesWith(cast<CallInst>(Inst)->getArgOperand(0));
+      break;
+    default:
+      break;
+    }
+  }
+
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// ARC optimization.
+//===----------------------------------------------------------------------===//
+
+// TODO: On code like this:
+//
+// objc_retain(%x)
+// stuff_that_cannot_release()
+// objc_autorelease(%x)
+// stuff_that_cannot_release()
+// objc_retain(%x)
+// stuff_that_cannot_release()
+// objc_autorelease(%x)
+//
+// The second retain and autorelease can be deleted.
+
+// TODO: It should be possible to delete
+// objc_autoreleasePoolPush and objc_autoreleasePoolPop
+// pairs if nothing is actually autoreleased between them. Also, autorelease
+// calls followed by objc_autoreleasePoolPop calls (perhaps in ObjC++ code
+// after inlining) can be turned into plain release calls.
+
+// TODO: Critical-edge splitting. If the optimial insertion point is
+// a critical edge, the current algorithm has to fail, because it doesn't
+// know how to split edges. It should be possible to make the optimizer
+// think in terms of edges, rather than blocks, and then split critical
+// edges on demand.
+
+// TODO: OptimizeSequences could generalized to be Interprocedural.
+
+// TODO: Recognize that a bunch of other objc runtime calls have
+// non-escaping arguments and non-releasing arguments, and may be
+// non-autoreleasing.
+
+// TODO: Sink autorelease calls as far as possible. Unfortunately we
+// usually can't sink them past other calls, which would be the main
+// case where it would be useful.
+
+#include "llvm/GlobalAlias.h"
+#include "llvm/Module.h"
+#include "llvm/Constants.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+
+STATISTIC(NumNoops,       "Number of no-op objc calls eliminated");
+STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated");
+STATISTIC(NumAutoreleases,"Number of autoreleases converted to releases");
+STATISTIC(NumRets,        "Number of return value forwarding "
+                          "retain+autoreleaes eliminated");
+STATISTIC(NumRRs,         "Number of retain+release paths eliminated");
+STATISTIC(NumPeeps,       "Number of calls peephole-optimized");
+
+namespace {
+  /// ProvenanceAnalysis - This is similar to BasicAliasAnalysis, and it
+  /// uses many of the same techniques, except it uses special ObjC-specific
+  /// reasoning about pointer relationships.
+  class ProvenanceAnalysis {
+    AliasAnalysis *AA;
+
+    typedef std::pair<const Value *, const Value *> ValuePairTy;
+    typedef DenseMap<ValuePairTy, bool> CachedResultsTy;
+    CachedResultsTy CachedResults;
+
+    bool relatedCheck(const Value *A, const Value *B);
+    bool relatedSelect(const SelectInst *A, const Value *B);
+    bool relatedPHI(const PHINode *A, const Value *B);
+
+    // Do not implement.
+    void operator=(const ProvenanceAnalysis &);
+    ProvenanceAnalysis(const ProvenanceAnalysis &);
+
+  public:
+    ProvenanceAnalysis() {}
+
+    void setAA(AliasAnalysis *aa) { AA = aa; }
+
+    AliasAnalysis *getAA() const { return AA; }
+
+    bool related(const Value *A, const Value *B);
+
+    void clear() {
+      CachedResults.clear();
+    }
+  };
+}
+
+bool ProvenanceAnalysis::relatedSelect(const SelectInst *A, const Value *B) {
+  // If the values are Selects with the same condition, we can do a more precise
+  // check: just check for relations between the values on corresponding arms.
+  if (const SelectInst *SB = dyn_cast<SelectInst>(B))
+    if (A->getCondition() == SB->getCondition()) {
+      if (related(A->getTrueValue(), SB->getTrueValue()))
+        return true;
+      if (related(A->getFalseValue(), SB->getFalseValue()))
+        return true;
+      return false;
+    }
+
+  // Check both arms of the Select node individually.
+  if (related(A->getTrueValue(), B))
+    return true;
+  if (related(A->getFalseValue(), B))
+    return true;
+
+  // The arms both checked out.
+  return false;
+}
+
+bool ProvenanceAnalysis::relatedPHI(const PHINode *A, const Value *B) {
+  // If the values are PHIs in the same block, we can do a more precise as well
+  // as efficient check: just check for relations between the values on
+  // corresponding edges.
+  if (const PHINode *PNB = dyn_cast<PHINode>(B))
+    if (PNB->getParent() == A->getParent()) {
+      for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i)
+        if (related(A->getIncomingValue(i),
+                    PNB->getIncomingValueForBlock(A->getIncomingBlock(i))))
+          return true;
+      return false;
+    }
+
+  // Check each unique source of the PHI node against B.
+  SmallPtrSet<const Value *, 4> UniqueSrc;
+  for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i) {
+    const Value *PV1 = A->getIncomingValue(i);
+    if (UniqueSrc.insert(PV1) && related(PV1, B))
+      return true;
+  }
+
+  // All of the arms checked out.
+  return false;
+}
+
+/// isStoredObjCPointer - Test if the value of P, or any value covered by its
+/// provenance, is ever stored within the function (not counting callees).
+static bool isStoredObjCPointer(const Value *P) {
+  SmallPtrSet<const Value *, 8> Visited;
+  SmallVector<const Value *, 8> Worklist;
+  Worklist.push_back(P);
+  Visited.insert(P);
+  do {
+    P = Worklist.pop_back_val();
+    for (Value::const_use_iterator UI = P->use_begin(), UE = P->use_end();
+         UI != UE; ++UI) {
+      const User *Ur = *UI;
+      if (isa<StoreInst>(Ur)) {
+        if (UI.getOperandNo() == 0)
+          // The pointer is stored.
+          return true;
+        // The pointed is stored through.
+        continue;
+      }
+      if (isa<CallInst>(Ur))
+        // The pointer is passed as an argument, ignore this.
+        continue;
+      if (isa<PtrToIntInst>(P))
+        // Assume the worst.
+        return true;
+      if (Visited.insert(Ur))
+        Worklist.push_back(Ur);
+    }
+  } while (!Worklist.empty());
+
+  // Everything checked out.
+  return false;
+}
+
+bool ProvenanceAnalysis::relatedCheck(const Value *A, const Value *B) {
+  // Skip past provenance pass-throughs.
+  A = GetUnderlyingObjCPtr(A);
+  B = GetUnderlyingObjCPtr(B);
+
+  // Quick check.
+  if (A == B)
+    return true;
+
+  // Ask regular AliasAnalysis, for a first approximation.
+  switch (AA->alias(A, B)) {
+  case AliasAnalysis::NoAlias:
+    return false;
+  case AliasAnalysis::MustAlias:
+  case AliasAnalysis::PartialAlias:
+    return true;
+  case AliasAnalysis::MayAlias:
+    break;
+  }
+
+  bool AIsIdentified = IsObjCIdentifiedObject(A);
+  bool BIsIdentified = IsObjCIdentifiedObject(B);
+
+  // An ObjC-Identified object can't alias a load if it is never locally stored.
+  if (AIsIdentified) {
+    if (BIsIdentified) {
+      // If both pointers have provenance, they can be directly compared.
+      if (A != B)
+        return false;
+    } else {
+      if (isa<LoadInst>(B))
+        return isStoredObjCPointer(A);
+    }
+  } else {
+    if (BIsIdentified && isa<LoadInst>(A))
+      return isStoredObjCPointer(B);
+  }
+
+   // Special handling for PHI and Select.
+  if (const PHINode *PN = dyn_cast<PHINode>(A))
+    return relatedPHI(PN, B);
+  if (const PHINode *PN = dyn_cast<PHINode>(B))
+    return relatedPHI(PN, A);
+  if (const SelectInst *S = dyn_cast<SelectInst>(A))
+    return relatedSelect(S, B);
+  if (const SelectInst *S = dyn_cast<SelectInst>(B))
+    return relatedSelect(S, A);
+
+  // Conservative.
+  return true;
+}
+
+bool ProvenanceAnalysis::related(const Value *A, const Value *B) {
+  // Begin by inserting a conservative value into the map. If the insertion
+  // fails, we have the answer already. If it succeeds, leave it there until we
+  // compute the real answer to guard against recursive queries.
+  if (A > B) std::swap(A, B);
+  std::pair<CachedResultsTy::iterator, bool> Pair =
+    CachedResults.insert(std::make_pair(ValuePairTy(A, B), true));
+  if (!Pair.second)
+    return Pair.first->second;
+
+  bool Result = relatedCheck(A, B);
+  CachedResults[ValuePairTy(A, B)] = Result;
+  return Result;
+}
+
+namespace {
+  // Sequence - A sequence of states that a pointer may go through in which an
+  // objc_retain and objc_release are actually needed.
+  enum Sequence {
+    S_None,
+    S_Retain,         ///< objc_retain(x)
+    S_CanRelease,     ///< foo(x) -- x could possibly see a ref count decrement
+    S_Use,            ///< any use of x
+    S_Stop,           ///< like S_Release, but code motion is stopped
+    S_Release,        ///< objc_release(x)
+    S_MovableRelease  ///< objc_release(x), !clang.imprecise_release
+  };
+}
+
+static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) {
+  // The easy cases.
+  if (A == B)
+    return A;
+  if (A == S_None || B == S_None)
+    return S_None;
+
+  // Note that we can't merge S_CanRelease and S_Use.
+  if (A > B) std::swap(A, B);
+  if (TopDown) {
+    // Choose the side which is further along in the sequence.
+    if (A == S_Retain && (B == S_CanRelease || B == S_Use))
+      return B;
+  } else {
+    // Choose the side which is further along in the sequence.
+    if ((A == S_Use || A == S_CanRelease) &&
+        (B == S_Release || B == S_Stop || B == S_MovableRelease))
+      return A;
+    // If both sides are releases, choose the more conservative one.
+    if (A == S_Stop && (B == S_Release || B == S_MovableRelease))
+      return A;
+    if (A == S_Release && B == S_MovableRelease)
+      return A;
+  }
+
+  return S_None;
+}
+
+namespace {
+  /// RRInfo - Unidirectional information about either a
+  /// retain-decrement-use-release sequence or release-use-decrement-retain
+  /// reverese sequence.
+  struct RRInfo {
+    /// KnownIncremented - After an objc_retain, the reference count of the
+    /// referenced object is known to be positive. Similarly, before an
+    /// objc_release, the reference count of the referenced object is known to
+    /// be positive. If there are retain-release pairs in code regions where the
+    /// retain count is known to be positive, they can be eliminated, regardless
+    /// of any side effects between them.
+    bool KnownIncremented;
+
+    /// IsRetainBlock - True if the Calls are objc_retainBlock calls (as
+    /// opposed to objc_retain calls).
+    bool IsRetainBlock;
+
+    /// IsTailCallRelease - True of the objc_release calls are all marked
+    /// with the "tail" keyword.
+    bool IsTailCallRelease;
+
+    /// ReleaseMetadata - If the Calls are objc_release calls and they all have
+    /// a clang.imprecise_release tag, this is the metadata tag.
+    MDNode *ReleaseMetadata;
+
+    /// Calls - For a top-down sequence, the set of objc_retains or
+    /// objc_retainBlocks. For bottom-up, the set of objc_releases.
+    SmallPtrSet<Instruction *, 2> Calls;
+
+    /// ReverseInsertPts - The set of optimal insert positions for
+    /// moving calls in the opposite sequence.
+    SmallPtrSet<Instruction *, 2> ReverseInsertPts;
+
+    RRInfo() :
+      KnownIncremented(false), IsRetainBlock(false), IsTailCallRelease(false),
+      ReleaseMetadata(0) {}
+
+    void clear();
+  };
+}
+
+void RRInfo::clear() {
+  KnownIncremented = false;
+  IsRetainBlock = false;
+  IsTailCallRelease = false;
+  ReleaseMetadata = 0;
+  Calls.clear();
+  ReverseInsertPts.clear();
+}
+
+namespace {
+  /// PtrState - This class summarizes several per-pointer runtime properties
+  /// which are propogated through the flow graph.
+  class PtrState {
+    /// RefCount - The known minimum number of reference count increments.
+    unsigned RefCount;
+
+    /// Seq - The current position in the sequence.
+    Sequence Seq;
+
+  public:
+    /// RRI - Unidirectional information about the current sequence.
+    /// TODO: Encapsulate this better.
+    RRInfo RRI;
+
+    PtrState() : RefCount(0), Seq(S_None) {}
+
+    void IncrementRefCount() {
+      if (RefCount != UINT_MAX) ++RefCount;
+    }
+
+    void DecrementRefCount() {
+      if (RefCount != 0) --RefCount;
+    }
+
+    void ClearRefCount() {
+      RefCount = 0;
+    }
+
+    bool IsKnownIncremented() const {
+      return RefCount > 0;
+    }
+
+    void SetSeq(Sequence NewSeq) {
+      Seq = NewSeq;
+    }
+
+    void SetSeqToRelease(MDNode *M) {
+      if (Seq == S_None || Seq == S_Use) {
+        Seq = M ? S_MovableRelease : S_Release;
+        RRI.ReleaseMetadata = M;
+      } else if (Seq != S_MovableRelease || RRI.ReleaseMetadata != M) {
+        Seq = S_Release;
+        RRI.ReleaseMetadata = 0;
+      }
+    }
+
+    Sequence GetSeq() const {
+      return Seq;
+    }
+
+    void ClearSequenceProgress() {
+      Seq = S_None;
+      RRI.clear();
+    }
+
+    void Merge(const PtrState &Other, bool TopDown);
+  };
+}
+
+void
+PtrState::Merge(const PtrState &Other, bool TopDown) {
+  Seq = MergeSeqs(Seq, Other.Seq, TopDown);
+  RefCount = std::min(RefCount, Other.RefCount);
+
+  // We can't merge a plain objc_retain with an objc_retainBlock.
+  if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock)
+    Seq = S_None;
+
+  if (Seq == S_None) {
+    RRI.clear();
+  } else {
+    // Conservatively merge the ReleaseMetadata information.
+    if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata)
+      RRI.ReleaseMetadata = 0;
+
+    RRI.KnownIncremented = RRI.KnownIncremented && Other.RRI.KnownIncremented;
+    RRI.IsTailCallRelease = RRI.IsTailCallRelease && Other.RRI.IsTailCallRelease;
+    RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end());
+    RRI.ReverseInsertPts.insert(Other.RRI.ReverseInsertPts.begin(),
+                                Other.RRI.ReverseInsertPts.end());
+  }
+}
+
+namespace {
+  /// BBState - Per-BasicBlock state.
+  class BBState {
+    /// TopDownPathCount - The number of unique control paths from the entry
+    /// which can reach this block.
+    unsigned TopDownPathCount;
+
+    /// BottomUpPathCount - The number of unique control paths to exits
+    /// from this block.
+    unsigned BottomUpPathCount;
+
+    /// MapTy - A type for PerPtrTopDown and PerPtrBottomUp.
+    typedef MapVector<const Value *, PtrState> MapTy;
+
+    /// PerPtrTopDown - The top-down traversal uses this to record information
+    /// known about a pointer at the bottom of each block.
+    MapTy PerPtrTopDown;
+
+    /// PerPtrBottomUp - The bottom-up traversal uses this to record information
+    /// known about a pointer at the top of each block.
+    MapTy PerPtrBottomUp;
+
+  public:
+    BBState() : TopDownPathCount(0), BottomUpPathCount(0) {}
+
+    typedef MapTy::iterator ptr_iterator;
+    typedef MapTy::const_iterator ptr_const_iterator;
+
+    ptr_iterator top_down_ptr_begin() { return PerPtrTopDown.begin(); }
+    ptr_iterator top_down_ptr_end() { return PerPtrTopDown.end(); }
+    ptr_const_iterator top_down_ptr_begin() const {
+      return PerPtrTopDown.begin();
+    }
+    ptr_const_iterator top_down_ptr_end() const {
+      return PerPtrTopDown.end();
+    }
+
+    ptr_iterator bottom_up_ptr_begin() { return PerPtrBottomUp.begin(); }
+    ptr_iterator bottom_up_ptr_end() { return PerPtrBottomUp.end(); }
+    ptr_const_iterator bottom_up_ptr_begin() const {
+      return PerPtrBottomUp.begin();
+    }
+    ptr_const_iterator bottom_up_ptr_end() const {
+      return PerPtrBottomUp.end();
+    }
+
+    /// SetAsEntry - Mark this block as being an entry block, which has one
+    /// path from the entry by definition.
+    void SetAsEntry() { TopDownPathCount = 1; }
+
+    /// SetAsExit - Mark this block as being an exit block, which has one
+    /// path to an exit by definition.
+    void SetAsExit()  { BottomUpPathCount = 1; }
+
+    PtrState &getPtrTopDownState(const Value *Arg) {
+      return PerPtrTopDown[Arg];
+    }
+
+    PtrState &getPtrBottomUpState(const Value *Arg) {
+      return PerPtrBottomUp[Arg];
+    }
+
+    void clearBottomUpPointers() {
+      PerPtrTopDown.clear();
+    }
+
+    void clearTopDownPointers() {
+      PerPtrTopDown.clear();
+    }
+
+    void InitFromPred(const BBState &Other);
+    void InitFromSucc(const BBState &Other);
+    void MergePred(const BBState &Other);
+    void MergeSucc(const BBState &Other);
+
+    /// GetAllPathCount - Return the number of possible unique paths from an
+    /// entry to an exit which pass through this block. This is only valid
+    /// after both the top-down and bottom-up traversals are complete.
+    unsigned GetAllPathCount() const {
+      return TopDownPathCount * BottomUpPathCount;
+    }
+  };
+}
+
+void BBState::InitFromPred(const BBState &Other) {
+  PerPtrTopDown = Other.PerPtrTopDown;
+  TopDownPathCount = Other.TopDownPathCount;
+}
+
+void BBState::InitFromSucc(const BBState &Other) {
+  PerPtrBottomUp = Other.PerPtrBottomUp;
+  BottomUpPathCount = Other.BottomUpPathCount;
+}
+
+/// MergePred - The top-down traversal uses this to merge information about
+/// predecessors to form the initial state for a new block.
+void BBState::MergePred(const BBState &Other) {
+  // Other.TopDownPathCount can be 0, in which case it is either dead or a
+  // loop backedge. Loop backedges are special.
+  TopDownPathCount += Other.TopDownPathCount;
+
+  // For each entry in the other set, if our set has an entry with the same key,
+  // merge the entries. Otherwise, copy the entry and merge it with an empty
+  // entry.
+  for (ptr_const_iterator MI = Other.top_down_ptr_begin(),
+       ME = Other.top_down_ptr_end(); MI != ME; ++MI) {
+    std::pair<ptr_iterator, bool> Pair = PerPtrTopDown.insert(*MI);
+    Pair.first->second.Merge(Pair.second ? PtrState() : MI->second,
+                             /*TopDown=*/true);
+  }
+
+  // For each entry in our set, if the other set doens't have an entry with the
+  // same key, force it to merge with an empty entry.
+  for (ptr_iterator MI = top_down_ptr_begin(),
+       ME = top_down_ptr_end(); MI != ME; ++MI)
+    if (Other.PerPtrTopDown.find(MI->first) == Other.PerPtrTopDown.end())
+      MI->second.Merge(PtrState(), /*TopDown=*/true);
+}
+
+/// MergeSucc - The bottom-up traversal uses this to merge information about
+/// successors to form the initial state for a new block.
+void BBState::MergeSucc(const BBState &Other) {
+  // Other.BottomUpPathCount can be 0, in which case it is either dead or a
+  // loop backedge. Loop backedges are special.
+  BottomUpPathCount += Other.BottomUpPathCount;
+
+  // For each entry in the other set, if our set has an entry with the
+  // same key, merge the entries. Otherwise, copy the entry and merge
+  // it with an empty entry.
+  for (ptr_const_iterator MI = Other.bottom_up_ptr_begin(),
+       ME = Other.bottom_up_ptr_end(); MI != ME; ++MI) {
+    std::pair<ptr_iterator, bool> Pair = PerPtrBottomUp.insert(*MI);
+    Pair.first->second.Merge(Pair.second ? PtrState() : MI->second,
+                             /*TopDown=*/false);
+  }
+
+  // For each entry in our set, if the other set doens't have an entry
+  // with the same key, force it to merge with an empty entry.
+  for (ptr_iterator MI = bottom_up_ptr_begin(),
+       ME = bottom_up_ptr_end(); MI != ME; ++MI)
+    if (Other.PerPtrBottomUp.find(MI->first) == Other.PerPtrBottomUp.end())
+      MI->second.Merge(PtrState(), /*TopDown=*/false);
+}
+
+namespace {
+  /// ObjCARCOpt - The main ARC optimization pass.
+  class ObjCARCOpt : public FunctionPass {
+    bool Changed;
+    ProvenanceAnalysis PA;
+
+    /// RetainFunc, RelaseFunc - Declarations for objc_retain,
+    /// objc_retainBlock, and objc_release.
+    Function *RetainFunc, *RetainBlockFunc, *RetainRVFunc, *ReleaseFunc,
+             *AutoreleaseFunc;
+
+    /// RetainRVCallee, etc. - Declarations for ObjC runtime
+    /// functions, for use in creating calls to them. These are initialized
+    /// lazily to avoid cluttering up the Module with unused declarations.
+    Constant *RetainRVCallee, *AutoreleaseRVCallee, *ReleaseCallee,
+             *RetainCallee, *AutoreleaseCallee;
+
+    /// UsedInThisFunciton - Flags which determine whether each of the
+    /// interesting runtine functions is in fact used in the current function.
+    unsigned UsedInThisFunction;
+
+    /// ImpreciseReleaseMDKind - The Metadata Kind for clang.imprecise_release
+    /// metadata.
+    unsigned ImpreciseReleaseMDKind;
+
+    Constant *getRetainRVCallee(Module *M);
+    Constant *getAutoreleaseRVCallee(Module *M);
+    Constant *getReleaseCallee(Module *M);
+    Constant *getRetainCallee(Module *M);
+    Constant *getAutoreleaseCallee(Module *M);
+
+    void OptimizeRetainCall(Function &F, Instruction *Retain);
+    bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV);
+    void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV);
+    void OptimizeIndividualCalls(Function &F);
+
+    void CheckForCFGHazards(const BasicBlock *BB,
+                            DenseMap<const BasicBlock *, BBState> &BBStates,
+                            BBState &MyStates) const;
+    bool VisitBottomUp(BasicBlock *BB,
+                       DenseMap<const BasicBlock *, BBState> &BBStates,
+                       MapVector<Value *, RRInfo> &Retains);
+    bool VisitTopDown(BasicBlock *BB,
+                      DenseMap<const BasicBlock *, BBState> &BBStates,
+                      DenseMap<Value *, RRInfo> &Releases);
+    bool Visit(Function &F,
+               DenseMap<const BasicBlock *, BBState> &BBStates,
+               MapVector<Value *, RRInfo> &Retains,
+               DenseMap<Value *, RRInfo> &Releases);
+
+    void MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove,
+                   MapVector<Value *, RRInfo> &Retains,
+                   DenseMap<Value *, RRInfo> &Releases,
+                   SmallVectorImpl<Instruction *> &DeadInsts);
+
+    bool PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates,
+                              MapVector<Value *, RRInfo> &Retains,
+                              DenseMap<Value *, RRInfo> &Releases);
+
+    void OptimizeWeakCalls(Function &F);
+
+    bool OptimizeSequences(Function &F);
+
+    void OptimizeReturns(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    virtual bool doInitialization(Module &M);
+    virtual bool runOnFunction(Function &F);
+    virtual void releaseMemory();
+
+  public:
+    static char ID;
+    ObjCARCOpt() : FunctionPass(ID) {
+      initializeObjCARCOptPass(*PassRegistry::getPassRegistry());
+    }
+  };
+}
+
+char ObjCARCOpt::ID = 0;
+INITIALIZE_PASS_BEGIN(ObjCARCOpt,
+                      "objc-arc", "ObjC ARC optimization", false, false)
+INITIALIZE_PASS_DEPENDENCY(ObjCARCAliasAnalysis)
+INITIALIZE_PASS_END(ObjCARCOpt,
+                    "objc-arc", "ObjC ARC optimization", false, false)
+
+Pass *llvm::createObjCARCOptPass() {
+  return new ObjCARCOpt();
+}
+
+void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<ObjCARCAliasAnalysis>();
+  AU.addRequired<AliasAnalysis>();
+  // ARC optimization doesn't currently split critical edges.
+  AU.setPreservesCFG();
+}
+
+Constant *ObjCARCOpt::getRetainRVCallee(Module *M) {
+  if (!RetainRVCallee) {
+    LLVMContext &C = M->getContext();
+    const Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+    std::vector<const Type *> Params;
+    Params.push_back(I8X);
+    const FunctionType *FTy =
+      FunctionType::get(I8X, Params, /*isVarArg=*/false);
+    AttrListPtr Attributes;
+    Attributes.addAttr(~0u, Attribute::NoUnwind);
+    RetainRVCallee =
+      M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy,
+                             Attributes);
+  }
+  return RetainRVCallee;
+}
+
+Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) {
+  if (!AutoreleaseRVCallee) {
+    LLVMContext &C = M->getContext();
+    const Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+    std::vector<const Type *> Params;
+    Params.push_back(I8X);
+    const FunctionType *FTy =
+      FunctionType::get(I8X, Params, /*isVarArg=*/false);
+    AttrListPtr Attributes;
+    Attributes.addAttr(~0u, Attribute::NoUnwind);
+    AutoreleaseRVCallee =
+      M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy,
+                             Attributes);
+  }
+  return AutoreleaseRVCallee;
+}
+
+Constant *ObjCARCOpt::getReleaseCallee(Module *M) {
+  if (!ReleaseCallee) {
+    LLVMContext &C = M->getContext();
+    std::vector<const Type *> Params;
+    Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C)));
+    AttrListPtr Attributes;
+    Attributes.addAttr(~0u, Attribute::NoUnwind);
+    ReleaseCallee =
+      M->getOrInsertFunction(
+        "objc_release",
+        FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false),
+        Attributes);
+  }
+  return ReleaseCallee;
+}
+
+Constant *ObjCARCOpt::getRetainCallee(Module *M) {
+  if (!RetainCallee) {
+    LLVMContext &C = M->getContext();
+    std::vector<const Type *> Params;
+    Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C)));
+    AttrListPtr Attributes;
+    Attributes.addAttr(~0u, Attribute::NoUnwind);
+    RetainCallee =
+      M->getOrInsertFunction(
+        "objc_retain",
+        FunctionType::get(Params[0], Params, /*isVarArg=*/false),
+        Attributes);
+  }
+  return RetainCallee;
+}
+
+Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) {
+  if (!AutoreleaseCallee) {
+    LLVMContext &C = M->getContext();
+    std::vector<const Type *> Params;
+    Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C)));
+    AttrListPtr Attributes;
+    Attributes.addAttr(~0u, Attribute::NoUnwind);
+    AutoreleaseCallee =
+      M->getOrInsertFunction(
+        "objc_autorelease",
+        FunctionType::get(Params[0], Params, /*isVarArg=*/false),
+        Attributes);
+  }
+  return AutoreleaseCallee;
+}
+
+/// CanAlterRefCount - Test whether the given instruction can result in a
+/// reference count modification (positive or negative) for the pointer's
+/// object.
+static bool
+CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
+                 ProvenanceAnalysis &PA, InstructionClass Class) {
+  switch (Class) {
+  case IC_Autorelease:
+  case IC_AutoreleaseRV:
+  case IC_User:
+    // These operations never directly modify a reference count.
+    return false;
+  default: break;
+  }
+
+  ImmutableCallSite CS = static_cast<const Value *>(Inst);
+  assert(CS && "Only calls can alter reference counts!");
+
+  // See if AliasAnalysis can help us with the call.
+  AliasAnalysis::ModRefBehavior MRB = PA.getAA()->getModRefBehavior(CS);
+  if (AliasAnalysis::onlyReadsMemory(MRB))
+    return false;
+  if (AliasAnalysis::onlyAccessesArgPointees(MRB)) {
+    for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
+         I != E; ++I) {
+      const Value *Op = *I;
+      if (IsPotentialUse(Op) && PA.related(Ptr, Op))
+        return true;
+    }
+    return false;
+  }
+
+  // Assume the worst.
+  return true;
+}
+
+/// CanUse - Test whether the given instruction can "use" the given pointer's
+/// object in a way that requires the reference count to be positive.
+static bool
+CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA,
+       InstructionClass Class) {
+  // IC_Call operations (as opposed to IC_CallOrUser) never "use" objc pointers.
+  if (Class == IC_Call)
+    return false;
+
+  // Consider various instructions which may have pointer arguments which are
+  // not "uses".
+  if (const ICmpInst *ICI = dyn_cast<ICmpInst>(Inst)) {
+    // Comparing a pointer with null, or any other constant, isn't really a use,
+    // because we don't care what the pointer points to, or about the values
+    // of any other dynamic reference-counted pointers.
+    if (!IsPotentialUse(ICI->getOperand(1)))
+      return false;
+  } else if (ImmutableCallSite CS = static_cast<const Value *>(Inst)) {
+    // For calls, just check the arguments (and not the callee operand).
+    for (ImmutableCallSite::arg_iterator OI = CS.arg_begin(),
+         OE = CS.arg_end(); OI != OE; ++OI) {
+      const Value *Op = *OI;
+      if (IsPotentialUse(Op) && PA.related(Ptr, Op))
+        return true;
+    }
+    return false;
+  } else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+    // Special-case stores, because we don't care about the stored value, just
+    // the store address.
+    const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand());
+    // If we can't tell what the underlying object was, assume there is a
+    // dependence.
+    return IsPotentialUse(Op) && PA.related(Op, Ptr);
+  }
+
+  // Check each operand for a match.
+  for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end();
+       OI != OE; ++OI) {
+    const Value *Op = *OI;
+    if (IsPotentialUse(Op) && PA.related(Ptr, Op))
+      return true;
+  }
+  return false;
+}
+
+/// CanInterruptRV - Test whether the given instruction can autorelease
+/// any pointer or cause an autoreleasepool pop.
+static bool
+CanInterruptRV(InstructionClass Class) {
+  switch (Class) {
+  case IC_AutoreleasepoolPop:
+  case IC_CallOrUser:
+  case IC_Call:
+  case IC_Autorelease:
+  case IC_AutoreleaseRV:
+  case IC_FusedRetainAutorelease:
+  case IC_FusedRetainAutoreleaseRV:
+    return true;
+  default:
+    return false;
+  }
+}
+
+namespace {
+  /// DependenceKind - There are several kinds of dependence-like concepts in
+  /// use here.
+  enum DependenceKind {
+    NeedsPositiveRetainCount,
+    CanChangeRetainCount,
+    RetainAutoreleaseDep,       ///< Blocks objc_retainAutorelease.
+    RetainAutoreleaseRVDep,     ///< Blocks objc_retainAutoreleaseReturnValue.
+    RetainRVDep                 ///< Blocks objc_retainAutoreleasedReturnValue.
+  };
+}
+
+/// Depends - Test if there can be dependencies on Inst through Arg. This
+/// function only tests dependencies relevant for removing pairs of calls.
+static bool
+Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg,
+        ProvenanceAnalysis &PA) {
+  // If we've reached the definition of Arg, stop.
+  if (Inst == Arg)
+    return true;
+
+  switch (Flavor) {
+  case NeedsPositiveRetainCount: {
+    InstructionClass Class = GetInstructionClass(Inst);
+    switch (Class) {
+    case IC_AutoreleasepoolPop:
+    case IC_AutoreleasepoolPush:
+    case IC_None:
+      return false;
+    default:
+      return CanUse(Inst, Arg, PA, Class);
+    }
+  }
+
+  case CanChangeRetainCount: {
+    InstructionClass Class = GetInstructionClass(Inst);
+    switch (Class) {
+    case IC_AutoreleasepoolPop:
+      // Conservatively assume this can decrement any count.
+      return true;
+    case IC_AutoreleasepoolPush:
+    case IC_None:
+      return false;
+    default:
+      return CanAlterRefCount(Inst, Arg, PA, Class);
+    }
+  }
+
+  case RetainAutoreleaseDep:
+    switch (GetBasicInstructionClass(Inst)) {
+    case IC_AutoreleasepoolPop:
+      // Don't merge an objc_autorelease with an objc_retain inside a different
+      // autoreleasepool scope.
+      return true;
+    case IC_Retain:
+    case IC_RetainRV:
+      // Check for a retain of the same pointer for merging.
+      return GetObjCArg(Inst) == Arg;
+    default:
+      // Nothing else matters for objc_retainAutorelease formation.
+      return false;
+    }
+    break;
+
+  case RetainAutoreleaseRVDep: {
+    InstructionClass Class = GetBasicInstructionClass(Inst);
+    switch (Class) {
+    case IC_Retain:
+    case IC_RetainRV:
+      // Check for a retain of the same pointer for merging.
+      return GetObjCArg(Inst) == Arg;
+    default:
+      // Anything that can autorelease interrupts
+      // retainAutoreleaseReturnValue formation.
+      return CanInterruptRV(Class);
+    }
+    break;
+  }
+
+  case RetainRVDep:
+    return CanInterruptRV(GetBasicInstructionClass(Inst));
+  }
+
+  llvm_unreachable("Invalid dependence flavor");
+  return true;
+}
+
+/// FindDependencies - Walk up the CFG from StartPos (which is in StartBB) and
+/// find local and non-local dependencies on Arg.
+/// TODO: Cache results?
+static void
+FindDependencies(DependenceKind Flavor,
+                 const Value *Arg,
+                 BasicBlock *StartBB, Instruction *StartInst,
+                 SmallPtrSet<Instruction *, 4> &DependingInstructions,
+                 SmallPtrSet<const BasicBlock *, 4> &Visited,
+                 ProvenanceAnalysis &PA) {
+  BasicBlock::iterator StartPos = StartInst;
+
+  SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist;
+  Worklist.push_back(std::make_pair(StartBB, StartPos));
+  do {
+    std::pair<BasicBlock *, BasicBlock::iterator> Pair =
+      Worklist.pop_back_val();
+    BasicBlock *LocalStartBB = Pair.first;
+    BasicBlock::iterator LocalStartPos = Pair.second;
+    BasicBlock::iterator StartBBBegin = LocalStartBB->begin();
+    for (;;) {
+      if (LocalStartPos == StartBBBegin) {
+        pred_iterator PI(LocalStartBB), PE(LocalStartBB, false);
+        if (PI == PE)
+          // If we've reached the function entry, produce a null dependence.
+          DependingInstructions.insert(0);
+        else
+          // Add the predecessors to the worklist.
+          do {
+            BasicBlock *PredBB = *PI;
+            if (Visited.insert(PredBB))
+              Worklist.push_back(std::make_pair(PredBB, PredBB->end()));
+          } while (++PI != PE);
+        break;
+      }
+
+      Instruction *Inst = --LocalStartPos;
+      if (Depends(Flavor, Inst, Arg, PA)) {
+        DependingInstructions.insert(Inst);
+        break;
+      }
+    }
+  } while (!Worklist.empty());
+
+  // Determine whether the original StartBB post-dominates all of the blocks we
+  // visited. If not, insert a sentinal indicating that most optimizations are
+  // not safe.
+  for (SmallPtrSet<const BasicBlock *, 4>::const_iterator I = Visited.begin(),
+       E = Visited.end(); I != E; ++I) {
+    const BasicBlock *BB = *I;
+    if (BB == StartBB)
+      continue;
+    const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
+    for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) {
+      const BasicBlock *Succ = *SI;
+      if (Succ != StartBB && !Visited.count(Succ)) {
+        DependingInstructions.insert(reinterpret_cast<Instruction *>(-1));
+        return;
+      }
+    }
+  }
+}
+
+static bool isNullOrUndef(const Value *V) {
+  return isa<ConstantPointerNull>(V) || isa<UndefValue>(V);
+}
+
+static bool isNoopInstruction(const Instruction *I) {
+  return isa<BitCastInst>(I) ||
+         (isa<GetElementPtrInst>(I) &&
+          cast<GetElementPtrInst>(I)->hasAllZeroIndices());
+}
+
+/// OptimizeRetainCall - Turn objc_retain into
+/// objc_retainAutoreleasedReturnValue if the operand is a return value.
+void
+ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) {
+  CallSite CS(GetObjCArg(Retain));
+  Instruction *Call = CS.getInstruction();
+  if (!Call) return;
+  if (Call->getParent() != Retain->getParent()) return;
+
+  // Check that the call is next to the retain.
+  BasicBlock::iterator I = Call;
+  ++I;
+  while (isNoopInstruction(I)) ++I;
+  if (&*I != Retain)
+    return;
+
+  // Turn it to an objc_retainAutoreleasedReturnValue..
+  Changed = true;
+  ++NumPeeps;
+  cast<CallInst>(Retain)->setCalledFunction(getRetainRVCallee(F.getParent()));
+}
+
+/// OptimizeRetainRVCall - Turn objc_retainAutoreleasedReturnValue into
+/// objc_retain if the operand is not a return value.  Or, if it can be
+/// paired with an objc_autoreleaseReturnValue, delete the pair and
+/// return true.
+bool
+ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
+  // Check for the argument being from an immediately preceding call.
+  Value *Arg = GetObjCArg(RetainRV);
+  CallSite CS(Arg);
+  if (Instruction *Call = CS.getInstruction())
+    if (Call->getParent() == RetainRV->getParent()) {
+      BasicBlock::iterator I = Call;
+      ++I;
+      while (isNoopInstruction(I)) ++I;
+      if (&*I == RetainRV)
+        return false;
+    }
+
+  // Check for being preceded by an objc_autoreleaseReturnValue on the same
+  // pointer. In this case, we can delete the pair.
+  BasicBlock::iterator I = RetainRV, Begin = RetainRV->getParent()->begin();
+  if (I != Begin) {
+    do --I; while (I != Begin && isNoopInstruction(I));
+    if (GetBasicInstructionClass(I) == IC_AutoreleaseRV &&
+        GetObjCArg(I) == Arg) {
+      Changed = true;
+      ++NumPeeps;
+      EraseInstruction(I);
+      EraseInstruction(RetainRV);
+      return true;
+    }
+  }
+
+  // Turn it to a plain objc_retain.
+  Changed = true;
+  ++NumPeeps;
+  cast<CallInst>(RetainRV)->setCalledFunction(getRetainCallee(F.getParent()));
+  return false;
+}
+
+/// OptimizeAutoreleaseRVCall - Turn objc_autoreleaseReturnValue into
+/// objc_autorelease if the result is not used as a return value.
+void
+ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV) {
+  // Check for a return of the pointer value.
+  const Value *Ptr = GetObjCArg(AutoreleaseRV);
+  for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end();
+       UI != UE; ++UI) {
+    const User *I = *UI;
+    if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV)
+      return;
+  }
+
+  Changed = true;
+  ++NumPeeps;
+  cast<CallInst>(AutoreleaseRV)->
+    setCalledFunction(getAutoreleaseCallee(F.getParent()));
+}
+
+/// OptimizeIndividualCalls - Visit each call, one at a time, and make
+/// simplifications without doing any additional analysis.
+void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
+  // Reset all the flags in preparation for recomputing them.
+  UsedInThisFunction = 0;
+
+  // Visit all objc_* calls in F.
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+    Instruction *Inst = &*I++;
+    InstructionClass Class = GetBasicInstructionClass(Inst);
+
+    switch (Class) {
+    default: break;
+
+    // Delete no-op casts. These function calls have special semantics, but
+    // the semantics are entirely implemented via lowering in the front-end,
+    // so by the time they reach the optimizer, they are just no-op calls
+    // which return their argument.
+    //
+    // There are gray areas here, as the ability to cast reference-counted
+    // pointers to raw void* and back allows code to break ARC assumptions,
+    // however these are currently considered to be unimportant.
+    case IC_NoopCast:
+      Changed = true;
+      ++NumNoops;
+      EraseInstruction(Inst);
+      continue;
+
+    // If the pointer-to-weak-pointer is null, it's undefined behavior.
+    case IC_StoreWeak:
+    case IC_LoadWeak:
+    case IC_LoadWeakRetained:
+    case IC_InitWeak:
+    case IC_DestroyWeak: {
+      CallInst *CI = cast<CallInst>(Inst);
+      if (isNullOrUndef(CI->getArgOperand(0))) {
+        const Type *Ty = CI->getArgOperand(0)->getType();
+        new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
+                      Constant::getNullValue(Ty),
+                      CI);
+        CI->replaceAllUsesWith(UndefValue::get(CI->getType()));
+        CI->eraseFromParent();
+        continue;
+      }
+      break;
+    }
+    case IC_CopyWeak:
+    case IC_MoveWeak: {
+      CallInst *CI = cast<CallInst>(Inst);
+      if (isNullOrUndef(CI->getArgOperand(0)) ||
+          isNullOrUndef(CI->getArgOperand(1))) {
+        const Type *Ty = CI->getArgOperand(0)->getType();
+        new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
+                      Constant::getNullValue(Ty),
+                      CI);
+        CI->replaceAllUsesWith(UndefValue::get(CI->getType()));
+        CI->eraseFromParent();
+        continue;
+      }
+      break;
+    }
+    case IC_Retain:
+      OptimizeRetainCall(F, Inst);
+      break;
+    case IC_RetainRV:
+      if (OptimizeRetainRVCall(F, Inst))
+        continue;
+      break;
+    case IC_AutoreleaseRV:
+      OptimizeAutoreleaseRVCall(F, Inst);
+      break;
+    }
+
+    // objc_autorelease(x) -> objc_release(x) if x is otherwise unused.
+    if (IsAutorelease(Class) && Inst->use_empty()) {
+      CallInst *Call = cast<CallInst>(Inst);
+      const Value *Arg = Call->getArgOperand(0);
+      Arg = FindSingleUseIdentifiedObject(Arg);
+      if (Arg) {
+        Changed = true;
+        ++NumAutoreleases;
+
+        // Create the declaration lazily.
+        LLVMContext &C = Inst->getContext();
+        CallInst *NewCall =
+          CallInst::Create(getReleaseCallee(F.getParent()),
+                           Call->getArgOperand(0), "", Call);
+        NewCall->setMetadata(ImpreciseReleaseMDKind,
+                             MDNode::get(C, ArrayRef<Value *>()));
+        EraseInstruction(Call);
+        Inst = NewCall;
+        Class = IC_Release;
+      }
+    }
+
+    // For functions which can never be passed stack arguments, add
+    // a tail keyword.
+    if (IsAlwaysTail(Class)) {
+      Changed = true;
+      cast<CallInst>(Inst)->setTailCall();
+    }
+
+    // Set nounwind as needed.
+    if (IsNoThrow(Class)) {
+      Changed = true;
+      cast<CallInst>(Inst)->setDoesNotThrow();
+    }
+
+    if (!IsNoopOnNull(Class)) {
+      UsedInThisFunction |= 1 << Class;
+      continue;
+    }
+
+    const Value *Arg = GetObjCArg(Inst);
+
+    // ARC calls with null are no-ops. Delete them.
+    if (isNullOrUndef(Arg)) {
+      Changed = true;
+      ++NumNoops;
+      EraseInstruction(Inst);
+      continue;
+    }
+
+    // Keep track of which of retain, release, autorelease, and retain_block
+    // are actually present in this function.
+    UsedInThisFunction |= 1 << Class;
+
+    // If Arg is a PHI, and one or more incoming values to the
+    // PHI are null, and the call is control-equivalent to the PHI, and there
+    // are no relevant side effects between the PHI and the call, the call
+    // could be pushed up to just those paths with non-null incoming values.
+    // For now, don't bother splitting critical edges for this.
+    SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist;
+    Worklist.push_back(std::make_pair(Inst, Arg));
+    do {
+      std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val();
+      Inst = Pair.first;
+      Arg = Pair.second;
+
+      const PHINode *PN = dyn_cast<PHINode>(Arg);
+      if (!PN) continue;
+
+      // Determine if the PHI has any null operands, or any incoming
+      // critical edges.
+      bool HasNull = false;
+      bool HasCriticalEdges = false;
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+        Value *Incoming =
+          StripPointerCastsAndObjCCalls(PN->getIncomingValue(i));
+        if (isNullOrUndef(Incoming))
+          HasNull = true;
+        else if (cast<TerminatorInst>(PN->getIncomingBlock(i)->back())
+                   .getNumSuccessors() != 1) {
+          HasCriticalEdges = true;
+          break;
+        }
+      }
+      // If we have null operands and no critical edges, optimize.
+      if (!HasCriticalEdges && HasNull) {
+        SmallPtrSet<Instruction *, 4> DependingInstructions;
+        SmallPtrSet<const BasicBlock *, 4> Visited;
+
+        // Check that there is nothing that cares about the reference
+        // count between the call and the phi.
+        FindDependencies(NeedsPositiveRetainCount, Arg,
+                         Inst->getParent(), Inst,
+                         DependingInstructions, Visited, PA);
+        if (DependingInstructions.size() == 1 &&
+            *DependingInstructions.begin() == PN) {
+          Changed = true;
+          ++NumPartialNoops;
+          // Clone the call into each predecessor that has a non-null value.
+          CallInst *CInst = cast<CallInst>(Inst);
+          const Type *ParamTy = CInst->getArgOperand(0)->getType();
+          for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+            Value *Incoming =
+              StripPointerCastsAndObjCCalls(PN->getIncomingValue(i));
+            if (!isNullOrUndef(Incoming)) {
+              CallInst *Clone = cast<CallInst>(CInst->clone());
+              Value *Op = PN->getIncomingValue(i);
+              Instruction *InsertPos = &PN->getIncomingBlock(i)->back();
+              if (Op->getType() != ParamTy)
+                Op = new BitCastInst(Op, ParamTy, "", InsertPos);
+              Clone->setArgOperand(0, Op);
+              Clone->insertBefore(InsertPos);
+              Worklist.push_back(std::make_pair(Clone, Incoming));
+            }
+          }
+          // Erase the original call.
+          EraseInstruction(CInst);
+          continue;
+        }
+      }
+    } while (!Worklist.empty());
+  }
+}
+
+/// CheckForCFGHazards - Check for critical edges, loop boundaries, irreducible
+/// control flow, or other CFG structures where moving code across the edge
+/// would result in it being executed more.
+void
+ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
+                               DenseMap<const BasicBlock *, BBState> &BBStates,
+                               BBState &MyStates) const {
+  // If any top-down local-use or possible-dec has a succ which is earlier in
+  // the sequence, forget it.
+  for (BBState::ptr_const_iterator I = MyStates.top_down_ptr_begin(),
+       E = MyStates.top_down_ptr_end(); I != E; ++I)
+    switch (I->second.GetSeq()) {
+    default: break;
+    case S_Use: {
+      const Value *Arg = I->first;
+      const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
+      bool SomeSuccHasSame = false;
+      bool AllSuccsHaveSame = true;
+      for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI)
+        switch (BBStates[*SI].getPtrBottomUpState(Arg).GetSeq()) {
+        case S_None:
+        case S_CanRelease:
+          MyStates.getPtrTopDownState(Arg).ClearSequenceProgress();
+          SomeSuccHasSame = false;
+          break;
+        case S_Use:
+          SomeSuccHasSame = true;
+          break;
+        case S_Stop:
+        case S_Release:
+        case S_MovableRelease:
+          AllSuccsHaveSame = false;
+          break;
+        case S_Retain:
+          llvm_unreachable("bottom-up pointer in retain state!");
+        }
+      // If the state at the other end of any of the successor edges
+      // matches the current state, require all edges to match. This
+      // guards against loops in the middle of a sequence.
+      if (SomeSuccHasSame && !AllSuccsHaveSame)
+        MyStates.getPtrTopDownState(Arg).ClearSequenceProgress();
+    }
+    case S_CanRelease: {
+      const Value *Arg = I->first;
+      const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
+      bool SomeSuccHasSame = false;
+      bool AllSuccsHaveSame = true;
+      for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI)
+        switch (BBStates[*SI].getPtrBottomUpState(Arg).GetSeq()) {
+        case S_None:
+          MyStates.getPtrTopDownState(Arg).ClearSequenceProgress();
+          SomeSuccHasSame = false;
+          break;
+        case S_CanRelease:
+          SomeSuccHasSame = true;
+          break;
+        case S_Stop:
+        case S_Release:
+        case S_MovableRelease:
+        case S_Use:
+          AllSuccsHaveSame = false;
+          break;
+        case S_Retain:
+          llvm_unreachable("bottom-up pointer in retain state!");
+        }
+      // If the state at the other end of any of the successor edges
+      // matches the current state, require all edges to match. This
+      // guards against loops in the middle of a sequence.
+      if (SomeSuccHasSame && !AllSuccsHaveSame)
+        MyStates.getPtrTopDownState(Arg).ClearSequenceProgress();
+    }
+    }
+}
+
+bool
+ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
+                          DenseMap<const BasicBlock *, BBState> &BBStates,
+                          MapVector<Value *, RRInfo> &Retains) {
+  bool NestingDetected = false;
+  BBState &MyStates = BBStates[BB];
+
+  // Merge the states from each successor to compute the initial state
+  // for the current block.
+  const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
+  succ_const_iterator SI(TI), SE(TI, false);
+  if (SI == SE)
+    MyStates.SetAsExit();
+  else
+    do {
+      const BasicBlock *Succ = *SI++;
+      if (Succ == BB)
+        continue;
+      DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
+      if (I == BBStates.end())
+        continue;
+      MyStates.InitFromSucc(I->second);
+      while (SI != SE) {
+        Succ = *SI++;
+        if (Succ != BB) {
+          I = BBStates.find(Succ);
+          if (I != BBStates.end())
+            MyStates.MergeSucc(I->second);
+        }
+      }
+      break;
+    } while (SI != SE);
+
+  // Visit all the instructions, bottom-up.
+  for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) {
+    Instruction *Inst = llvm::prior(I);
+    InstructionClass Class = GetInstructionClass(Inst);
+    const Value *Arg = 0;
+
+    switch (Class) {
+    case IC_Release: {
+      Arg = GetObjCArg(Inst);
+
+      PtrState &S = MyStates.getPtrBottomUpState(Arg);
+
+      // If we see two releases in a row on the same pointer. If so, make
+      // a note, and we'll cicle back to revisit it after we've
+      // hopefully eliminated the second release, which may allow us to
+      // eliminate the first release too.
+      // Theoretically we could implement removal of nested retain+release
+      // pairs by making PtrState hold a stack of states, but this is
+      // simple and avoids adding overhead for the non-nested case.
+      if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease)
+        NestingDetected = true;
+
+      S.SetSeqToRelease(Inst->getMetadata(ImpreciseReleaseMDKind));
+      S.RRI.clear();
+      S.RRI.KnownIncremented = S.IsKnownIncremented();
+      S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
+      S.RRI.Calls.insert(Inst);
+
+      S.IncrementRefCount();
+      break;
+    }
+    case IC_RetainBlock:
+    case IC_Retain:
+    case IC_RetainRV: {
+      Arg = GetObjCArg(Inst);
+
+      PtrState &S = MyStates.getPtrBottomUpState(Arg);
+      S.DecrementRefCount();
+
+      switch (S.GetSeq()) {
+      case S_Stop:
+      case S_Release:
+      case S_MovableRelease:
+      case S_Use:
+        S.RRI.ReverseInsertPts.clear();
+        // FALL THROUGH
+      case S_CanRelease:
+        // Don't do retain+release tracking for IC_RetainRV, because it's
+        // better to let it remain as the first instruction after a call.
+        if (Class != IC_RetainRV) {
+          S.RRI.IsRetainBlock = Class == IC_RetainBlock;
+          Retains[Inst] = S.RRI;
+        }
+        S.ClearSequenceProgress();
+        break;
+      case S_None:
+        break;
+      case S_Retain:
+        llvm_unreachable("bottom-up pointer in retain state!");
+      }
+      break;
+    }
+    case IC_AutoreleasepoolPop:
+      // Conservatively, clear MyStates for all known pointers.
+      MyStates.clearBottomUpPointers();
+      continue;
+    case IC_AutoreleasepoolPush:
+    case IC_None:
+      // These are irrelevant.
+      continue;
+    default:
+      break;
+    }
+
+    // Consider any other possible effects of this instruction on each
+    // pointer being tracked.
+    for (BBState::ptr_iterator MI = MyStates.bottom_up_ptr_begin(),
+         ME = MyStates.bottom_up_ptr_end(); MI != ME; ++MI) {
+      const Value *Ptr = MI->first;
+      if (Ptr == Arg)
+        continue; // Handled above.
+      PtrState &S = MI->second;
+      Sequence Seq = S.GetSeq();
+
+      // Check for possible retains and releases.
+      if (CanAlterRefCount(Inst, Ptr, PA, Class)) {
+        // Check for a retain (we're going bottom-up here).
+        S.DecrementRefCount();
+
+        // Check for a release.
+        if (!IsRetain(Class) && Class != IC_RetainBlock)
+          switch (Seq) {
+          case S_Use:
+            S.SetSeq(S_CanRelease);
+            continue;
+          case S_CanRelease:
+          case S_Release:
+          case S_MovableRelease:
+          case S_Stop:
+          case S_None:
+            break;
+          case S_Retain:
+            llvm_unreachable("bottom-up pointer in retain state!");
+          }
+      }
+
+      // Check for possible direct uses.
+      switch (Seq) {
+      case S_Release:
+      case S_MovableRelease:
+        if (CanUse(Inst, Ptr, PA, Class)) {
+          S.RRI.ReverseInsertPts.clear();
+          S.RRI.ReverseInsertPts.insert(Inst);
+          S.SetSeq(S_Use);
+        } else if (Seq == S_Release &&
+                   (Class == IC_User || Class == IC_CallOrUser)) {
+          // Non-movable releases depend on any possible objc pointer use.
+          S.SetSeq(S_Stop);
+          S.RRI.ReverseInsertPts.clear();
+          S.RRI.ReverseInsertPts.insert(Inst);
+        }
+        break;
+      case S_Stop:
+        if (CanUse(Inst, Ptr, PA, Class))
+          S.SetSeq(S_Use);
+        break;
+      case S_CanRelease:
+      case S_Use:
+      case S_None:
+        break;
+      case S_Retain:
+        llvm_unreachable("bottom-up pointer in retain state!");
+      }
+    }
+  }
+
+  return NestingDetected;
+}
+
+bool
+ObjCARCOpt::VisitTopDown(BasicBlock *BB,
+                         DenseMap<const BasicBlock *, BBState> &BBStates,
+                         DenseMap<Value *, RRInfo> &Releases) {
+  bool NestingDetected = false;
+  BBState &MyStates = BBStates[BB];
+
+  // Merge the states from each predecessor to compute the initial state
+  // for the current block.
+  const_pred_iterator PI(BB), PE(BB, false);
+  if (PI == PE)
+    MyStates.SetAsEntry();
+  else
+    do {
+      const BasicBlock *Pred = *PI++;
+      if (Pred == BB)
+        continue;
+      DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred);
+      if (I == BBStates.end())
+        continue;
+      MyStates.InitFromPred(I->second);
+      while (PI != PE) {
+        Pred = *PI++;
+        if (Pred != BB) {
+          I = BBStates.find(Pred);
+          if (I != BBStates.end())
+            MyStates.MergePred(I->second);
+        }
+      }
+      break;
+    } while (PI != PE);
+
+  // Visit all the instructions, top-down.
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+    Instruction *Inst = I;
+    InstructionClass Class = GetInstructionClass(Inst);
+    const Value *Arg = 0;
+
+    switch (Class) {
+    case IC_RetainBlock:
+    case IC_Retain:
+    case IC_RetainRV: {
+      Arg = GetObjCArg(Inst);
+
+      PtrState &S = MyStates.getPtrTopDownState(Arg);
+
+      // Don't do retain+release tracking for IC_RetainRV, because it's
+      // better to let it remain as the first instruction after a call.
+      if (Class != IC_RetainRV) {
+        // If we see two retains in a row on the same pointer. If so, make
+        // a note, and we'll cicle back to revisit it after we've
+        // hopefully eliminated the second retain, which may allow us to
+        // eliminate the first retain too.
+        // Theoretically we could implement removal of nested retain+release
+        // pairs by making PtrState hold a stack of states, but this is
+        // simple and avoids adding overhead for the non-nested case.
+        if (S.GetSeq() == S_Retain)
+          NestingDetected = true;
+
+        S.SetSeq(S_Retain);
+        S.RRI.clear();
+        S.RRI.IsRetainBlock = Class == IC_RetainBlock;
+        S.RRI.KnownIncremented = S.IsKnownIncremented();
+        S.RRI.Calls.insert(Inst);
+      }
+
+      S.IncrementRefCount();
+      break;
+    }
+    case IC_Release: {
+      Arg = GetObjCArg(Inst);
+
+      PtrState &S = MyStates.getPtrTopDownState(Arg);
+      S.DecrementRefCount();
+
+      switch (S.GetSeq()) {
+      case S_Retain:
+      case S_CanRelease:
+        S.RRI.ReverseInsertPts.clear();
+        // FALL THROUGH
+      case S_Use:
+        S.RRI.ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
+        S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
+        Releases[Inst] = S.RRI;
+        S.ClearSequenceProgress();
+        break;
+      case S_None:
+        break;
+      case S_Stop:
+      case S_Release:
+      case S_MovableRelease:
+        llvm_unreachable("top-down pointer in release state!");
+      }
+      break;
+    }
+    case IC_AutoreleasepoolPop:
+      // Conservatively, clear MyStates for all known pointers.
+      MyStates.clearTopDownPointers();
+      continue;
+    case IC_AutoreleasepoolPush:
+    case IC_None:
+      // These are irrelevant.
+      continue;
+    default:
+      break;
+    }
+
+    // Consider any other possible effects of this instruction on each
+    // pointer being tracked.
+    for (BBState::ptr_iterator MI = MyStates.top_down_ptr_begin(),
+         ME = MyStates.top_down_ptr_end(); MI != ME; ++MI) {
+      const Value *Ptr = MI->first;
+      if (Ptr == Arg)
+        continue; // Handled above.
+      PtrState &S = MI->second;
+      Sequence Seq = S.GetSeq();
+
+      // Check for possible releases.
+      if (!IsRetain(Class) && Class != IC_RetainBlock &&
+          CanAlterRefCount(Inst, Ptr, PA, Class)) {
+        // Check for a release.
+        S.DecrementRefCount();
+
+        // Check for a release.
+        switch (Seq) {
+        case S_Retain:
+          S.SetSeq(S_CanRelease);
+          S.RRI.ReverseInsertPts.clear();
+          S.RRI.ReverseInsertPts.insert(Inst);
+
+          // One call can't cause a transition from S_Retain to S_CanRelease
+          // and S_CanRelease to S_Use. If we've made the first transition,
+          // we're done.
+          continue;
+        case S_Use:
+        case S_CanRelease:
+        case S_None:
+          break;
+        case S_Stop:
+        case S_Release:
+        case S_MovableRelease:
+          llvm_unreachable("top-down pointer in release state!");
+        }
+      }
+
+      // Check for possible direct uses.
+      switch (Seq) {
+      case S_CanRelease:
+        if (CanUse(Inst, Ptr, PA, Class))
+          S.SetSeq(S_Use);
+        break;
+      case S_Use:
+      case S_Retain:
+      case S_None:
+        break;
+      case S_Stop:
+      case S_Release:
+      case S_MovableRelease:
+        llvm_unreachable("top-down pointer in release state!");
+      }
+    }
+  }
+
+  CheckForCFGHazards(BB, BBStates, MyStates);
+  return NestingDetected;
+}
+
+// Visit - Visit the function both top-down and bottom-up.
+bool
+ObjCARCOpt::Visit(Function &F,
+                  DenseMap<const BasicBlock *, BBState> &BBStates,
+                  MapVector<Value *, RRInfo> &Retains,
+                  DenseMap<Value *, RRInfo> &Releases) {
+  // Use postorder for bottom-up, and reverse-postorder for top-down, because we
+  // magically know that loops will be well behaved, i.e. they won't repeatedly
+  // call retain on a single pointer without doing a release.
+  bool BottomUpNestingDetected = false;
+  SmallVector<BasicBlock *, 8> PostOrder;
+  for (po_iterator<Function *> I = po_begin(&F), E = po_end(&F); I != E; ++I) {
+    BasicBlock *BB = *I;
+    PostOrder.push_back(BB);
+
+    BottomUpNestingDetected |= VisitBottomUp(BB, BBStates, Retains);
+  }
+
+  // Iterate through the post-order in reverse order, achieving a
+  // reverse-postorder traversal. We don't use the ReversePostOrderTraversal
+  // class here because it works by computing its own full postorder iteration,
+  // recording the sequence, and playing it back in reverse. Since we're already
+  // doing a full iteration above, we can just record the sequence manually and
+  // avoid the cost of having ReversePostOrderTraversal compute it.
+  bool TopDownNestingDetected = false;
+  for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator
+       RI = PostOrder.rbegin(), RE = PostOrder.rend(); RI != RE; ++RI)
+    TopDownNestingDetected |= VisitTopDown(*RI, BBStates, Releases);
+
+  return TopDownNestingDetected && BottomUpNestingDetected;
+}
+
+/// MoveCalls - Move the calls in RetainsToMove and ReleasesToMove.
+void ObjCARCOpt::MoveCalls(Value *Arg,
+                           RRInfo &RetainsToMove,
+                           RRInfo &ReleasesToMove,
+                           MapVector<Value *, RRInfo> &Retains,
+                           DenseMap<Value *, RRInfo> &Releases,
+                           SmallVectorImpl<Instruction *> &DeadInsts) {
+  const Type *ArgTy = Arg->getType();
+  const Type *ParamTy =
+    (RetainRVFunc ? RetainRVFunc :
+     RetainFunc ? RetainFunc :
+     RetainBlockFunc)->arg_begin()->getType();
+
+  // Insert the new retain and release calls.
+  for (SmallPtrSet<Instruction *, 2>::const_iterator
+       PI = ReleasesToMove.ReverseInsertPts.begin(),
+       PE = ReleasesToMove.ReverseInsertPts.end(); PI != PE; ++PI) {
+    Instruction *InsertPt = *PI;
+    Value *MyArg = ArgTy == ParamTy ? Arg :
+                   new BitCastInst(Arg, ParamTy, "", InsertPt);
+    CallInst *Call =
+      CallInst::Create(RetainsToMove.IsRetainBlock ?
+                         RetainBlockFunc : RetainFunc,
+                       MyArg, "", InsertPt);
+    Call->setDoesNotThrow();
+    if (!RetainsToMove.IsRetainBlock)
+      Call->setTailCall();
+  }
+  for (SmallPtrSet<Instruction *, 2>::const_iterator
+       PI = RetainsToMove.ReverseInsertPts.begin(),
+       PE = RetainsToMove.ReverseInsertPts.end(); PI != PE; ++PI) {
+    Instruction *LastUse = *PI;
+    Instruction *InsertPts[] = { 0, 0, 0 };
+    if (InvokeInst *II = dyn_cast<InvokeInst>(LastUse)) {
+      // We can't insert code immediately after an invoke instruction, so
+      // insert code at the beginning of both successor blocks instead.
+      // The invoke's return value isn't available in the unwind block,
+      // but our releases will never depend on it, because they must be
+      // paired with retains from before the invoke.
+      InsertPts[0] = II->getNormalDest()->getFirstNonPHI();
+      InsertPts[1] = II->getUnwindDest()->getFirstNonPHI();
+    } else {
+      // Insert code immediately after the last use.
+      InsertPts[0] = llvm::next(BasicBlock::iterator(LastUse));
+    }
+
+    for (Instruction **I = InsertPts; *I; ++I) {
+      Instruction *InsertPt = *I;
+      Value *MyArg = ArgTy == ParamTy ? Arg :
+                     new BitCastInst(Arg, ParamTy, "", InsertPt);
+      CallInst *Call = CallInst::Create(ReleaseFunc, MyArg, "", InsertPt);
+      // Attach a clang.imprecise_release metadata tag, if appropriate.
+      if (MDNode *M = ReleasesToMove.ReleaseMetadata)
+        Call->setMetadata(ImpreciseReleaseMDKind, M);
+      Call->setDoesNotThrow();
+      if (ReleasesToMove.IsTailCallRelease)
+        Call->setTailCall();
+    }
+  }
+
+  // Delete the original retain and release calls.
+  for (SmallPtrSet<Instruction *, 2>::const_iterator
+       AI = RetainsToMove.Calls.begin(),
+       AE = RetainsToMove.Calls.end(); AI != AE; ++AI) {
+    Instruction *OrigRetain = *AI;
+    Retains.blot(OrigRetain);
+    DeadInsts.push_back(OrigRetain);
+  }
+  for (SmallPtrSet<Instruction *, 2>::const_iterator
+       AI = ReleasesToMove.Calls.begin(),
+       AE = ReleasesToMove.Calls.end(); AI != AE; ++AI) {
+    Instruction *OrigRelease = *AI;
+    Releases.erase(OrigRelease);
+    DeadInsts.push_back(OrigRelease);
+  }
+}
+
+bool
+ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
+                                   &BBStates,
+                                 MapVector<Value *, RRInfo> &Retains,
+                                 DenseMap<Value *, RRInfo> &Releases) {
+  bool AnyPairsCompletelyEliminated = false;
+  RRInfo RetainsToMove;
+  RRInfo ReleasesToMove;
+  SmallVector<Instruction *, 4> NewRetains;
+  SmallVector<Instruction *, 4> NewReleases;
+  SmallVector<Instruction *, 8> DeadInsts;
+
+  for (MapVector<Value *, RRInfo>::const_iterator I = Retains.begin(),
+       E = Retains.end(); I != E; ) {
+    Value *V = (I++)->first;
+    if (!V) continue; // blotted
+
+    Instruction *Retain = cast<Instruction>(V);
+    Value *Arg = GetObjCArg(Retain);
+
+    // If the object being released is in static or stack storage, we know it's
+    // not being managed by ObjC reference counting, so we can delete pairs
+    // regardless of what possible decrements or uses lie between them.
+    bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg);
+
+    // If a pair happens in a region where it is known that the reference count
+    // is already incremented, we can similarly ignore possible decrements.
+    bool KnownIncrementedTD = true, KnownIncrementedBU = true;
+
+    // Connect the dots between the top-down-collected RetainsToMove and
+    // bottom-up-collected ReleasesToMove to form sets of related calls.
+    // This is an iterative process so that we connect multiple releases
+    // to multiple retains if needed.
+    unsigned OldDelta = 0;
+    unsigned NewDelta = 0;
+    unsigned OldCount = 0;
+    unsigned NewCount = 0;
+    bool FirstRelease = true;
+    bool FirstRetain = true;
+    NewRetains.push_back(Retain);
+    for (;;) {
+      for (SmallVectorImpl<Instruction *>::const_iterator
+           NI = NewRetains.begin(), NE = NewRetains.end(); NI != NE; ++NI) {
+        Instruction *NewRetain = *NI;
+        MapVector<Value *, RRInfo>::const_iterator It = Retains.find(NewRetain);
+        assert(It != Retains.end());
+        const RRInfo &NewRetainRRI = It->second;
+        KnownIncrementedTD &= NewRetainRRI.KnownIncremented;
+        for (SmallPtrSet<Instruction *, 2>::const_iterator
+             LI = NewRetainRRI.Calls.begin(),
+             LE = NewRetainRRI.Calls.end(); LI != LE; ++LI) {
+          Instruction *NewRetainRelease = *LI;
+          DenseMap<Value *, RRInfo>::const_iterator Jt =
+            Releases.find(NewRetainRelease);
+          if (Jt == Releases.end())
+            goto next_retain;
+          const RRInfo &NewRetainReleaseRRI = Jt->second;
+          assert(NewRetainReleaseRRI.Calls.count(NewRetain));
+          if (ReleasesToMove.Calls.insert(NewRetainRelease)) {
+            OldDelta -=
+              BBStates[NewRetainRelease->getParent()].GetAllPathCount();
+
+            // Merge the ReleaseMetadata and IsTailCallRelease values.
+            if (FirstRelease) {
+              ReleasesToMove.ReleaseMetadata =
+                NewRetainReleaseRRI.ReleaseMetadata;
+              ReleasesToMove.IsTailCallRelease =
+                NewRetainReleaseRRI.IsTailCallRelease;
+              FirstRelease = false;
+            } else {
+              if (ReleasesToMove.ReleaseMetadata !=
+                    NewRetainReleaseRRI.ReleaseMetadata)
+                ReleasesToMove.ReleaseMetadata = 0;
+              if (ReleasesToMove.IsTailCallRelease !=
+                    NewRetainReleaseRRI.IsTailCallRelease)
+                ReleasesToMove.IsTailCallRelease = false;
+            }
+
+            // Collect the optimal insertion points.
+            if (!KnownSafe)
+              for (SmallPtrSet<Instruction *, 2>::const_iterator
+                   RI = NewRetainReleaseRRI.ReverseInsertPts.begin(),
+                   RE = NewRetainReleaseRRI.ReverseInsertPts.end();
+                   RI != RE; ++RI) {
+                Instruction *RIP = *RI;
+                if (ReleasesToMove.ReverseInsertPts.insert(RIP))
+                  NewDelta -= BBStates[RIP->getParent()].GetAllPathCount();
+              }
+            NewReleases.push_back(NewRetainRelease);
+          }
+        }
+      }
+      NewRetains.clear();
+      if (NewReleases.empty()) break;
+
+      // Back the other way.
+      for (SmallVectorImpl<Instruction *>::const_iterator
+           NI = NewReleases.begin(), NE = NewReleases.end(); NI != NE; ++NI) {
+        Instruction *NewRelease = *NI;
+        DenseMap<Value *, RRInfo>::const_iterator It =
+          Releases.find(NewRelease);
+        assert(It != Releases.end());
+        const RRInfo &NewReleaseRRI = It->second;
+        KnownIncrementedBU &= NewReleaseRRI.KnownIncremented;
+        for (SmallPtrSet<Instruction *, 2>::const_iterator
+             LI = NewReleaseRRI.Calls.begin(),
+             LE = NewReleaseRRI.Calls.end(); LI != LE; ++LI) {
+          Instruction *NewReleaseRetain = *LI;
+          MapVector<Value *, RRInfo>::const_iterator Jt =
+            Retains.find(NewReleaseRetain);
+          if (Jt == Retains.end())
+            goto next_retain;
+          const RRInfo &NewReleaseRetainRRI = Jt->second;
+          assert(NewReleaseRetainRRI.Calls.count(NewRelease));
+          if (RetainsToMove.Calls.insert(NewReleaseRetain)) {
+            unsigned PathCount =
+              BBStates[NewReleaseRetain->getParent()].GetAllPathCount();
+            OldDelta += PathCount;
+            OldCount += PathCount;
+
+            // Merge the IsRetainBlock values.
+            if (FirstRetain) {
+              RetainsToMove.IsRetainBlock = NewReleaseRetainRRI.IsRetainBlock;
+              FirstRetain = false;
+            } else if (ReleasesToMove.IsRetainBlock !=
+                       NewReleaseRetainRRI.IsRetainBlock)
+              // It's not possible to merge the sequences if one uses
+              // objc_retain and the other uses objc_retainBlock.
+              goto next_retain;
+
+            // Collect the optimal insertion points.
+            if (!KnownSafe)
+              for (SmallPtrSet<Instruction *, 2>::const_iterator
+                   RI = NewReleaseRetainRRI.ReverseInsertPts.begin(),
+                   RE = NewReleaseRetainRRI.ReverseInsertPts.end();
+                   RI != RE; ++RI) {
+                Instruction *RIP = *RI;
+                if (RetainsToMove.ReverseInsertPts.insert(RIP)) {
+                  PathCount = BBStates[RIP->getParent()].GetAllPathCount();
+                  NewDelta += PathCount;
+                  NewCount += PathCount;
+                }
+              }
+            NewRetains.push_back(NewReleaseRetain);
+          }
+        }
+      }
+      NewReleases.clear();
+      if (NewRetains.empty()) break;
+    }
+
+    // If the pointer is known incremented, we can safely delete the pair
+    // regardless of what's between them.
+    if (KnownIncrementedTD || KnownIncrementedBU) {
+      RetainsToMove.ReverseInsertPts.clear();
+      ReleasesToMove.ReverseInsertPts.clear();
+      NewCount = 0;
+    }
+
+    // Determine whether the original call points are balanced in the retain and
+    // release calls through the program. If not, conservatively don't touch
+    // them.
+    // TODO: It's theoretically possible to do code motion in this case, as
+    // long as the existing imbalances are maintained.
+    if (OldDelta != 0)
+      goto next_retain;
+
+    // Determine whether the new insertion points we computed preserve the
+    // balance of retain and release calls through the program.
+    // TODO: If the fully aggressive solution isn't valid, try to find a
+    // less aggressive solution which is.
+    if (NewDelta != 0)
+      goto next_retain;
+
+    // Ok, everything checks out and we're all set. Let's move some code!
+    Changed = true;
+    AnyPairsCompletelyEliminated = NewCount == 0;
+    NumRRs += OldCount - NewCount;
+    MoveCalls(Arg, RetainsToMove, ReleasesToMove, Retains, Releases, DeadInsts);
+
+  next_retain:
+    NewReleases.clear();
+    NewRetains.clear();
+    RetainsToMove.clear();
+    ReleasesToMove.clear();
+  }
+
+  // Now that we're done moving everything, we can delete the newly dead
+  // instructions, as we no longer need them as insert points.
+  while (!DeadInsts.empty())
+    EraseInstruction(DeadInsts.pop_back_val());
+
+  return AnyPairsCompletelyEliminated;
+}
+
+/// OptimizeWeakCalls - Weak pointer optimizations.
+void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
+  // First, do memdep-style RLE and S2L optimizations. We can't use memdep
+  // itself because it uses AliasAnalysis and we need to do provenance
+  // queries instead.
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+    Instruction *Inst = &*I++;
+    InstructionClass Class = GetBasicInstructionClass(Inst);
+    if (Class != IC_LoadWeak && Class != IC_LoadWeakRetained)
+      continue;
+
+    // Delete objc_loadWeak calls with no users.
+    if (Class == IC_LoadWeak && Inst->use_empty()) {
+      Inst->eraseFromParent();
+      continue;
+    }
+
+    // TODO: For now, just look for an earlier available version of this value
+    // within the same block. Theoretically, we could do memdep-style non-local
+    // analysis too, but that would want caching. A better approach would be to
+    // use the technique that EarlyCSE uses.
+    inst_iterator Current = llvm::prior(I);
+    BasicBlock *CurrentBB = Current.getBasicBlockIterator();
+    for (BasicBlock::iterator B = CurrentBB->begin(),
+                              J = Current.getInstructionIterator();
+         J != B; --J) {
+      Instruction *EarlierInst = &*llvm::prior(J);
+      InstructionClass EarlierClass = GetInstructionClass(EarlierInst);
+      switch (EarlierClass) {
+      case IC_LoadWeak:
+      case IC_LoadWeakRetained: {
+        // If this is loading from the same pointer, replace this load's value
+        // with that one.
+        CallInst *Call = cast<CallInst>(Inst);
+        CallInst *EarlierCall = cast<CallInst>(EarlierInst);
+        Value *Arg = Call->getArgOperand(0);
+        Value *EarlierArg = EarlierCall->getArgOperand(0);
+        switch (PA.getAA()->alias(Arg, EarlierArg)) {
+        case AliasAnalysis::MustAlias:
+          Changed = true;
+          // If the load has a builtin retain, insert a plain retain for it.
+          if (Class == IC_LoadWeakRetained) {
+            CallInst *CI =
+              CallInst::Create(getRetainCallee(F.getParent()), EarlierCall,
+                               "", Call);
+            CI->setTailCall();
+          }
+          // Zap the fully redundant load.
+          Call->replaceAllUsesWith(EarlierCall);
+          Call->eraseFromParent();
+          goto clobbered;
+        case AliasAnalysis::MayAlias:
+        case AliasAnalysis::PartialAlias:
+          goto clobbered;
+        case AliasAnalysis::NoAlias:
+          break;
+        }
+        break;
+      }
+      case IC_StoreWeak:
+      case IC_InitWeak: {
+        // If this is storing to the same pointer and has the same size etc.
+        // replace this load's value with the stored value.
+        CallInst *Call = cast<CallInst>(Inst);
+        CallInst *EarlierCall = cast<CallInst>(EarlierInst);
+        Value *Arg = Call->getArgOperand(0);
+        Value *EarlierArg = EarlierCall->getArgOperand(0);
+        switch (PA.getAA()->alias(Arg, EarlierArg)) {
+        case AliasAnalysis::MustAlias:
+          Changed = true;
+          // If the load has a builtin retain, insert a plain retain for it.
+          if (Class == IC_LoadWeakRetained) {
+            CallInst *CI =
+              CallInst::Create(getRetainCallee(F.getParent()), EarlierCall,
+                               "", Call);
+            CI->setTailCall();
+          }
+          // Zap the fully redundant load.
+          Call->replaceAllUsesWith(EarlierCall->getArgOperand(1));
+          Call->eraseFromParent();
+          goto clobbered;
+        case AliasAnalysis::MayAlias:
+        case AliasAnalysis::PartialAlias:
+          goto clobbered;
+        case AliasAnalysis::NoAlias:
+          break;
+        }
+        break;
+      }
+      case IC_MoveWeak:
+      case IC_CopyWeak:
+        // TOOD: Grab the copied value.
+        goto clobbered;
+      case IC_AutoreleasepoolPush:
+      case IC_None:
+      case IC_User:
+        // Weak pointers are only modified through the weak entry points
+        // (and arbitrary calls, which could call the weak entry points).
+        break;
+      default:
+        // Anything else could modify the weak pointer.
+        goto clobbered;
+      }
+    }
+  clobbered:;
+  }
+
+  // Then, for each destroyWeak with an alloca operand, check to see if
+  // the alloca and all its users can be zapped.
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+    Instruction *Inst = &*I++;
+    InstructionClass Class = GetBasicInstructionClass(Inst);
+    if (Class != IC_DestroyWeak)
+      continue;
+
+    CallInst *Call = cast<CallInst>(Inst);
+    Value *Arg = Call->getArgOperand(0);
+    if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) {
+      for (Value::use_iterator UI = Alloca->use_begin(),
+           UE = Alloca->use_end(); UI != UE; ++UI) {
+        Instruction *UserInst = cast<Instruction>(*UI);
+        switch (GetBasicInstructionClass(UserInst)) {
+        case IC_InitWeak:
+        case IC_StoreWeak:
+        case IC_DestroyWeak:
+          continue;
+        default:
+          goto done;
+        }
+      }
+      Changed = true;
+      for (Value::use_iterator UI = Alloca->use_begin(),
+           UE = Alloca->use_end(); UI != UE; ) {
+        CallInst *UserInst = cast<CallInst>(*UI++);
+        if (!UserInst->use_empty())
+          UserInst->replaceAllUsesWith(UserInst->getOperand(1));
+        UserInst->eraseFromParent();
+      }
+      Alloca->eraseFromParent();
+    done:;
+    }
+  }
+}
+
+/// OptimizeSequences - Identify program paths which execute sequences of
+/// retains and releases which can be eliminated.
+bool ObjCARCOpt::OptimizeSequences(Function &F) {
+  /// Releases, Retains - These are used to store the results of the main flow
+  /// analysis. These use Value* as the key instead of Instruction* so that the
+  /// map stays valid when we get around to rewriting code and calls get
+  /// replaced by arguments.
+  DenseMap<Value *, RRInfo> Releases;
+  MapVector<Value *, RRInfo> Retains;
+
+  /// BBStates, This is used during the traversal of the function to track the
+  /// states for each identified object at each block.
+  DenseMap<const BasicBlock *, BBState> BBStates;
+
+  // Analyze the CFG of the function, and all instructions.
+  bool NestingDetected = Visit(F, BBStates, Retains, Releases);
+
+  // Transform.
+  return PerformCodePlacement(BBStates, Retains, Releases) && NestingDetected;
+}
+
+/// OptimizeReturns - Look for this pattern:
+///
+///    %call = call i8* @something(...)
+///    %2 = call i8* @objc_retain(i8* %call)
+///    %3 = call i8* @objc_autorelease(i8* %2)
+///    ret i8* %3
+///
+/// And delete the retain and autorelease.
+///
+/// Otherwise if it's just this:
+///
+///    %3 = call i8* @objc_autorelease(i8* %2)
+///    ret i8* %3
+///
+/// convert the autorelease to autoreleaseRV.
+void ObjCARCOpt::OptimizeReturns(Function &F) {
+  if (!F.getReturnType()->isPointerTy())
+    return;
+
+  SmallPtrSet<Instruction *, 4> DependingInstructions;
+  SmallPtrSet<const BasicBlock *, 4> Visited;
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+    BasicBlock *BB = FI;
+    ReturnInst *Ret = dyn_cast<ReturnInst>(&BB->back());
+    if (!Ret) continue;
+
+    const Value *Arg = StripPointerCastsAndObjCCalls(Ret->getOperand(0));
+    FindDependencies(NeedsPositiveRetainCount, Arg,
+                     BB, Ret, DependingInstructions, Visited, PA);
+    if (DependingInstructions.size() != 1)
+      goto next_block;
+
+    {
+      CallInst *Autorelease =
+        dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
+      if (!Autorelease)
+        goto next_block;
+      InstructionClass AutoreleaseClass =
+        GetBasicInstructionClass(Autorelease);
+      if (!IsAutorelease(AutoreleaseClass))
+        goto next_block;
+      if (GetObjCArg(Autorelease) != Arg)
+        goto next_block;
+
+      DependingInstructions.clear();
+      Visited.clear();
+
+      // Check that there is nothing that can affect the reference
+      // count between the autorelease and the retain.
+      FindDependencies(CanChangeRetainCount, Arg,
+                       BB, Autorelease, DependingInstructions, Visited, PA);
+      if (DependingInstructions.size() != 1)
+        goto next_block;
+
+      {
+        CallInst *Retain =
+          dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
+
+        // Check that we found a retain with the same argument.
+        if (!Retain ||
+            !IsRetain(GetBasicInstructionClass(Retain)) ||
+            GetObjCArg(Retain) != Arg)
+          goto next_block;
+
+        DependingInstructions.clear();
+        Visited.clear();
+
+        // Convert the autorelease to an autoreleaseRV, since it's
+        // returning the value.
+        if (AutoreleaseClass == IC_Autorelease) {
+          Autorelease->setCalledFunction(getAutoreleaseRVCallee(F.getParent()));
+          AutoreleaseClass = IC_AutoreleaseRV;
+        }
+
+        // Check that there is nothing that can affect the reference
+        // count between the retain and the call.
+        FindDependencies(CanChangeRetainCount, Arg, BB, Retain,
+                         DependingInstructions, Visited, PA);
+        if (DependingInstructions.size() != 1)
+          goto next_block;
+
+        {
+          CallInst *Call =
+            dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
+
+          // Check that the pointer is the return value of the call.
+          if (!Call || Arg != Call)
+            goto next_block;
+
+          // Check that the call is a regular call.
+          InstructionClass Class = GetBasicInstructionClass(Call);
+          if (Class != IC_CallOrUser && Class != IC_Call)
+            goto next_block;
+
+          // If so, we can zap the retain and autorelease.
+          Changed = true;
+          ++NumRets;
+          EraseInstruction(Retain);
+          EraseInstruction(Autorelease);
+        }
+      }
+    }
+
+  next_block:
+    DependingInstructions.clear();
+    Visited.clear();
+  }
+}
+
+bool ObjCARCOpt::doInitialization(Module &M) {
+  if (!EnableARCOpts)
+    return false;
+
+  // Identify the imprecise release metadata kind.
+  ImpreciseReleaseMDKind =
+    M.getContext().getMDKindID("clang.imprecise_release");
+
+  // Identify the declarations for objc_retain and friends.
+  RetainFunc = M.getFunction("objc_retain");
+  RetainBlockFunc = M.getFunction("objc_retainBlock");
+  RetainRVFunc = M.getFunction("objc_retainAutoreleasedReturnValue");
+  ReleaseFunc = M.getFunction("objc_release");
+  AutoreleaseFunc = M.getFunction("objc_autorelease");
+
+  // Intuitively, objc_retain and others are nocapture, however in practice
+  // they are not, because they return their argument value. And objc_release
+  // calls finalizers.
+
+  // These are initialized lazily.
+  RetainRVCallee = 0;
+  AutoreleaseRVCallee = 0;
+  ReleaseCallee = 0;
+  RetainCallee = 0;
+  AutoreleaseCallee = 0;
+
+  return false;
+}
+
+bool ObjCARCOpt::runOnFunction(Function &F) {
+  if (!EnableARCOpts)
+    return false;
+
+  Changed = false;
+
+  PA.setAA(&getAnalysis<AliasAnalysis>());
+
+  // This pass performs several distinct transformations. As a compile-time aid
+  // when compiling code that isn't ObjC, skip these if the relevant ObjC
+  // library functions aren't declared.
+
+  // Preliminary optimizations. This also computs UsedInThisFunction.
+  OptimizeIndividualCalls(F);
+
+  // Optimizations for weak pointers.
+  if (UsedInThisFunction & ((1 << IC_LoadWeak) |
+                            (1 << IC_LoadWeakRetained) |
+                            (1 << IC_StoreWeak) |
+                            (1 << IC_InitWeak) |
+                            (1 << IC_CopyWeak) |
+                            (1 << IC_MoveWeak) |
+                            (1 << IC_DestroyWeak)))
+    OptimizeWeakCalls(F);
+
+  // Optimizations for retain+release pairs.
+  if (UsedInThisFunction & ((1 << IC_Retain) |
+                            (1 << IC_RetainRV) |
+                            (1 << IC_RetainBlock)))
+    if (UsedInThisFunction & (1 << IC_Release))
+      // Run OptimizeSequences until it either stops making changes or
+      // no retain+release pair nesting is detected.
+      while (OptimizeSequences(F)) {}
+
+  // Optimizations if objc_autorelease is used.
+  if (UsedInThisFunction &
+      ((1 << IC_Autorelease) | (1 << IC_AutoreleaseRV)))
+    OptimizeReturns(F);
+
+  return Changed;
+}
+
+void ObjCARCOpt::releaseMemory() {
+  PA.clear();
+}
+
+//===----------------------------------------------------------------------===//
+// ARC contraction.
+//===----------------------------------------------------------------------===//
+
+// TODO: ObjCARCContract could insert PHI nodes when uses aren't
+// dominated by single calls.
+
+#include "llvm/Operator.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Analysis/Dominators.h"
+
+STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed");
+
+namespace {
+  /// ObjCARCContract - Late ARC optimizations.  These change the IR in a way
+  /// that makes it difficult to be analyzed by ObjCARCOpt, so it's run late.
+  class ObjCARCContract : public FunctionPass {
+    bool Changed;
+    AliasAnalysis *AA;
+    DominatorTree *DT;
+    ProvenanceAnalysis PA;
+
+    /// StoreStrongCallee, etc. - Declarations for ObjC runtime
+    /// functions, for use in creating calls to them. These are initialized
+    /// lazily to avoid cluttering up the Module with unused declarations.
+    Constant *StoreStrongCallee,
+             *RetainAutoreleaseCallee, *RetainAutoreleaseRVCallee;
+
+    /// RetainRVMarker - The inline asm string to insert between calls and
+    /// RetainRV calls to make the optimization work on targets which need it.
+    const MDString *RetainRVMarker;
+
+    Constant *getStoreStrongCallee(Module *M);
+    Constant *getRetainAutoreleaseCallee(Module *M);
+    Constant *getRetainAutoreleaseRVCallee(Module *M);
+
+    bool ContractAutorelease(Function &F, Instruction *Autorelease,
+                             InstructionClass Class,
+                             SmallPtrSet<Instruction *, 4>
+                               &DependingInstructions,
+                             SmallPtrSet<const BasicBlock *, 4>
+                               &Visited);
+
+    void ContractRelease(Instruction *Release,
+                         inst_iterator &Iter);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    virtual bool doInitialization(Module &M);
+    virtual bool runOnFunction(Function &F);
+
+  public:
+    static char ID;
+    ObjCARCContract() : FunctionPass(ID) {
+      initializeObjCARCContractPass(*PassRegistry::getPassRegistry());
+    }
+  };
+}
+
+char ObjCARCContract::ID = 0;
+INITIALIZE_PASS_BEGIN(ObjCARCContract,
+                      "objc-arc-contract", "ObjC ARC contraction", false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_END(ObjCARCContract,
+                    "objc-arc-contract", "ObjC ARC contraction", false, false)
+
+Pass *llvm::createObjCARCContractPass() {
+  return new ObjCARCContract();
+}
+
+void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<AliasAnalysis>();
+  AU.addRequired<DominatorTree>();
+  AU.setPreservesCFG();
+}
+
+Constant *ObjCARCContract::getStoreStrongCallee(Module *M) {
+  if (!StoreStrongCallee) {
+    LLVMContext &C = M->getContext();
+    const Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+    const Type *I8XX = PointerType::getUnqual(I8X);
+    std::vector<const Type *> Params;
+    Params.push_back(I8XX);
+    Params.push_back(I8X);
+
+    AttrListPtr Attributes;
+    Attributes.addAttr(~0u, Attribute::NoUnwind);
+    Attributes.addAttr(1, Attribute::NoCapture);
+
+    StoreStrongCallee =
+      M->getOrInsertFunction(
+        "objc_storeStrong",
+        FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false),
+        Attributes);
+  }
+  return StoreStrongCallee;
+}
+
+Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) {
+  if (!RetainAutoreleaseCallee) {
+    LLVMContext &C = M->getContext();
+    const Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+    std::vector<const Type *> Params;
+    Params.push_back(I8X);
+    const FunctionType *FTy =
+      FunctionType::get(I8X, Params, /*isVarArg=*/false);
+    AttrListPtr Attributes;
+    Attributes.addAttr(~0u, Attribute::NoUnwind);
+    RetainAutoreleaseCallee =
+      M->getOrInsertFunction("objc_retainAutorelease", FTy, Attributes);
+  }
+  return RetainAutoreleaseCallee;
+}
+
+Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) {
+  if (!RetainAutoreleaseRVCallee) {
+    LLVMContext &C = M->getContext();
+    const Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+    std::vector<const Type *> Params;
+    Params.push_back(I8X);
+    const FunctionType *FTy =
+      FunctionType::get(I8X, Params, /*isVarArg=*/false);
+    AttrListPtr Attributes;
+    Attributes.addAttr(~0u, Attribute::NoUnwind);
+    RetainAutoreleaseRVCallee =
+      M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy,
+                             Attributes);
+  }
+  return RetainAutoreleaseRVCallee;
+}
+
+/// ContractAutorelease - Merge an autorelease with a retain into a fused
+/// call.
+bool
+ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease,
+                                     InstructionClass Class,
+                                     SmallPtrSet<Instruction *, 4>
+                                       &DependingInstructions,
+                                     SmallPtrSet<const BasicBlock *, 4>
+                                       &Visited) {
+  const Value *Arg = GetObjCArg(Autorelease);
+
+  // Check that there are no instructions between the retain and the autorelease
+  // (such as an autorelease_pop) which may change the count.
+  CallInst *Retain = 0;
+  if (Class == IC_AutoreleaseRV)
+    FindDependencies(RetainAutoreleaseRVDep, Arg,
+                     Autorelease->getParent(), Autorelease,
+                     DependingInstructions, Visited, PA);
+  else
+    FindDependencies(RetainAutoreleaseDep, Arg,
+                     Autorelease->getParent(), Autorelease,
+                     DependingInstructions, Visited, PA);
+
+  Visited.clear();
+  if (DependingInstructions.size() != 1) {
+    DependingInstructions.clear();
+    return false;
+  }
+
+  Retain = dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
+  DependingInstructions.clear();
+
+  if (!Retain ||
+      GetBasicInstructionClass(Retain) != IC_Retain ||
+      GetObjCArg(Retain) != Arg)
+    return false;
+
+  Changed = true;
+  ++NumPeeps;
+
+  if (Class == IC_AutoreleaseRV)
+    Retain->setCalledFunction(getRetainAutoreleaseRVCallee(F.getParent()));
+  else
+    Retain->setCalledFunction(getRetainAutoreleaseCallee(F.getParent()));
+
+  EraseInstruction(Autorelease);
+  return true;
+}
+
+/// ContractRelease - Attempt to merge an objc_release with a store, load, and
+/// objc_retain to form an objc_storeStrong. This can be a little tricky because
+/// the instructions don't always appear in order, and there may be unrelated
+/// intervening instructions.
+void ObjCARCContract::ContractRelease(Instruction *Release,
+                                      inst_iterator &Iter) {
+  LoadInst *Load = dyn_cast<LoadInst>(GetObjCArg(Release));
+  if (!Load || Load->isVolatile()) return;
+
+  // For now, require everything to be in one basic block.
+  BasicBlock *BB = Release->getParent();
+  if (Load->getParent() != BB) return;
+
+  // Walk down to find the store.
+  BasicBlock::iterator I = Load, End = BB->end();
+  ++I;
+  AliasAnalysis::Location Loc = AA->getLocation(Load);
+  while (I != End &&
+         (&*I == Release ||
+          IsRetain(GetBasicInstructionClass(I)) ||
+          !(AA->getModRefInfo(I, Loc) & AliasAnalysis::Mod)))
+    ++I;
+  StoreInst *Store = dyn_cast<StoreInst>(I);
+  if (!Store || Store->isVolatile()) return;
+  if (Store->getPointerOperand() != Loc.Ptr) return;
+
+  Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand());
+
+  // Walk up to find the retain.
+  I = Store;
+  BasicBlock::iterator Begin = BB->begin();
+  while (I != Begin && GetBasicInstructionClass(I) != IC_Retain)
+    --I;
+  Instruction *Retain = I;
+  if (GetBasicInstructionClass(Retain) != IC_Retain) return;
+  if (GetObjCArg(Retain) != New) return;
+
+  Changed = true;
+  ++NumStoreStrongs;
+
+  LLVMContext &C = Release->getContext();
+  const Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+  const Type *I8XX = PointerType::getUnqual(I8X);
+
+  Value *Args[] = { Load->getPointerOperand(), New };
+  if (Args[0]->getType() != I8XX)
+    Args[0] = new BitCastInst(Args[0], I8XX, "", Store);
+  if (Args[1]->getType() != I8X)
+    Args[1] = new BitCastInst(Args[1], I8X, "", Store);
+  CallInst *StoreStrong =
+    CallInst::Create(getStoreStrongCallee(BB->getParent()->getParent()),
+                     Args, array_endof(Args), "", Store);
+  StoreStrong->setDoesNotThrow();
+  StoreStrong->setDebugLoc(Store->getDebugLoc());
+
+  if (&*Iter == Store) ++Iter;
+  Store->eraseFromParent();
+  Release->eraseFromParent();
+  EraseInstruction(Retain);
+  if (Load->use_empty())
+    Load->eraseFromParent();
+}
+
+bool ObjCARCContract::doInitialization(Module &M) {
+  // These are initialized lazily.
+  StoreStrongCallee = 0;
+  RetainAutoreleaseCallee = 0;
+  RetainAutoreleaseRVCallee = 0;
+
+  // Initialize RetainRVMarker.
+  RetainRVMarker = 0;
+  if (NamedMDNode *NMD =
+        M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker"))
+    if (NMD->getNumOperands() == 1) {
+      const MDNode *N = NMD->getOperand(0);
+      if (N->getNumOperands() == 1)
+        if (const MDString *S = dyn_cast<MDString>(N->getOperand(0)))
+          RetainRVMarker = S;
+    }
+
+  return false;
+}
+
+bool ObjCARCContract::runOnFunction(Function &F) {
+  if (!EnableARCOpts)
+    return false;
+
+  Changed = false;
+  AA = &getAnalysis<AliasAnalysis>();
+  DT = &getAnalysis<DominatorTree>();
+
+  PA.setAA(&getAnalysis<AliasAnalysis>());
+
+  // For ObjC library calls which return their argument, replace uses of the
+  // argument with uses of the call return value, if it dominates the use. This
+  // reduces register pressure.
+  SmallPtrSet<Instruction *, 4> DependingInstructions;
+  SmallPtrSet<const BasicBlock *, 4> Visited;
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+    Instruction *Inst = &*I++;
+
+    // Only these library routines return their argument. In particular,
+    // objc_retainBlock does not necessarily return its argument.
+    InstructionClass Class = GetBasicInstructionClass(Inst);
+    switch (Class) {
+    case IC_Retain:
+    case IC_FusedRetainAutorelease:
+    case IC_FusedRetainAutoreleaseRV:
+      break;
+    case IC_Autorelease:
+    case IC_AutoreleaseRV:
+      if (ContractAutorelease(F, Inst, Class, DependingInstructions, Visited))
+        continue;
+      break;
+    case IC_RetainRV: {
+      // If we're compiling for a target which needs a special inline-asm
+      // marker to do the retainAutoreleasedReturnValue optimization,
+      // insert it now.
+      if (!RetainRVMarker)
+        break;
+      BasicBlock::iterator BBI = Inst;
+      --BBI;
+      while (isNoopInstruction(BBI)) --BBI;
+      if (&*BBI == GetObjCArg(Inst)) {
+        InlineAsm *IA =
+          InlineAsm::get(FunctionType::get(Type::getVoidTy(Inst->getContext()),
+                                           /*isVarArg=*/false),
+                         RetainRVMarker->getString(),
+                         /*Constraints=*/"", /*hasSideEffects=*/true);
+        CallInst::Create(IA, "", Inst);
+      }
+      break;
+    }
+    case IC_InitWeak: {
+      // objc_initWeak(p, null) => *p = null
+      CallInst *CI = cast<CallInst>(Inst);
+      if (isNullOrUndef(CI->getArgOperand(1))) {
+        Value *Null =
+          ConstantPointerNull::get(cast<PointerType>(CI->getType()));
+        Changed = true;
+        new StoreInst(Null, CI->getArgOperand(0), CI);
+        CI->replaceAllUsesWith(Null);
+        CI->eraseFromParent();
+      }
+      continue;
+    }
+    case IC_Release:
+      ContractRelease(Inst, I);
+      continue;
+    default:
+      continue;
+    }
+
+    // Don't use GetObjCArg because we don't want to look through bitcasts
+    // and such; to do the replacement, the argument must have type i8*.
+    const Value *Arg = cast<CallInst>(Inst)->getArgOperand(0);
+    for (;;) {
+      // If we're compiling bugpointed code, don't get in trouble.
+      if (!isa<Instruction>(Arg) && !isa<Argument>(Arg))
+        break;
+      // Look through the uses of the pointer.
+      for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end();
+           UI != UE; ) {
+        Use &U = UI.getUse();
+        unsigned OperandNo = UI.getOperandNo();
+        ++UI; // Increment UI now, because we may unlink its element.
+        if (Instruction *UserInst = dyn_cast<Instruction>(U.getUser()))
+          if (Inst != UserInst && DT->dominates(Inst, UserInst)) {
+            Changed = true;
+            Instruction *Replacement = Inst;
+            const Type *UseTy = U.get()->getType();
+            if (PHINode *PHI = dyn_cast<PHINode>(UserInst)) {
+              // For PHI nodes, insert the bitcast in the predecessor block.
+              unsigned ValNo =
+                PHINode::getIncomingValueNumForOperand(OperandNo);
+              BasicBlock *BB =
+                PHI->getIncomingBlock(ValNo);
+              if (Replacement->getType() != UseTy)
+                Replacement = new BitCastInst(Replacement, UseTy, "",
+                                              &BB->back());
+              for (unsigned i = 0, e = PHI->getNumIncomingValues();
+                   i != e; ++i)
+                if (PHI->getIncomingBlock(i) == BB) {
+                  // Keep the UI iterator valid.
+                  if (&PHI->getOperandUse(
+                        PHINode::getOperandNumForIncomingValue(i)) ==
+                        &UI.getUse())
+                    ++UI;
+                  PHI->setIncomingValue(i, Replacement);
+                }
+            } else {
+              if (Replacement->getType() != UseTy)
+                Replacement = new BitCastInst(Replacement, UseTy, "", UserInst);
+              U.set(Replacement);
+            }
+          }
+      }
+
+      // If Arg is a no-op casted pointer, strip one level of casts and
+      // iterate.
+      if (const BitCastInst *BI = dyn_cast<BitCastInst>(Arg))
+        Arg = BI->getOperand(0);
+      else if (isa<GEPOperator>(Arg) &&
+               cast<GEPOperator>(Arg)->hasAllZeroIndices())
+        Arg = cast<GEPOperator>(Arg)->getPointerOperand();
+      else if (isa<GlobalAlias>(Arg) &&
+               !cast<GlobalAlias>(Arg)->mayBeOverridden())
+        Arg = cast<GlobalAlias>(Arg)->getAliasee();
+      else
+        break;
+    }
+  }
+
+  return Changed;
+}
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index accabb0..c1dfe15 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -75,6 +75,7 @@ namespace {
   class Reassociate : public FunctionPass {
     DenseMap<BasicBlock*, unsigned> RankMap;
     DenseMap<AssertingVH<>, unsigned> ValueRankMap;
+    SmallVector<WeakVH, 8> RedoInsts;
     SmallVector<WeakVH, 8> DeadInsts;
     bool MadeChange;
   public:
@@ -100,7 +101,7 @@ namespace {
     void LinearizeExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops);
     void LinearizeExpr(BinaryOperator *I);
     Value *RemoveFactorFromExpression(Value *V, Value *Factor);
-    void ReassociateBB(BasicBlock *BB);
+    void ReassociateInst(BasicBlock::iterator &BBI);
     
     void RemoveDeadBinaryOp(Value *V);
   };
@@ -216,6 +217,7 @@ static Instruction *LowerNegateToMultiply(Instruction *Neg,
   ValueRankMap.erase(Neg);
   Res->takeName(Neg);
   Neg->replaceAllUsesWith(Res);
+  Res->setDebugLoc(Neg->getDebugLoc());
   Neg->eraseFromParent();
   return Res;
 }
@@ -505,6 +507,7 @@ static Instruction *BreakUpSubtract(Instruction *Sub,
   // Everyone now refers to the add instruction.
   ValueRankMap.erase(Sub);
   Sub->replaceAllUsesWith(New);
+  New->setDebugLoc(Sub->getDebugLoc());
   Sub->eraseFromParent();
 
   DEBUG(dbgs() << "Negated: " << *New << '\n');
@@ -530,6 +533,7 @@ static Instruction *ConvertShiftToMul(Instruction *Shl,
     ValueRankMap.erase(Shl);
     Mul->takeName(Shl);
     Shl->replaceAllUsesWith(Mul);
+    Mul->setDebugLoc(Shl->getDebugLoc());
     Shl->eraseFromParent();
     return Mul;
   }
@@ -734,7 +738,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
       // Now that we have inserted a multiply, optimize it. This allows us to
       // handle cases that require multiple factoring steps, such as this:
       // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6
-      Mul = ReassociateExpression(cast<BinaryOperator>(Mul));
+      RedoInsts.push_back(Mul);
       
       // If every add operand was a duplicate, return the multiply.
       if (Ops.empty())
@@ -962,71 +966,69 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I,
 }
 
 
-/// ReassociateBB - Inspect all of the instructions in this basic block,
-/// reassociating them as we go.
-void Reassociate::ReassociateBB(BasicBlock *BB) {
-  for (BasicBlock::iterator BBI = BB->begin(); BBI != BB->end(); ) {
-    Instruction *BI = BBI++;
-    if (BI->getOpcode() == Instruction::Shl &&
-        isa<ConstantInt>(BI->getOperand(1)))
-      if (Instruction *NI = ConvertShiftToMul(BI, ValueRankMap)) {
-        MadeChange = true;
-        BI = NI;
-      }
+/// ReassociateInst - Inspect and reassociate the instruction at the
+/// given position, post-incrementing the position.
+void Reassociate::ReassociateInst(BasicBlock::iterator &BBI) {
+  Instruction *BI = BBI++;
+  if (BI->getOpcode() == Instruction::Shl &&
+      isa<ConstantInt>(BI->getOperand(1)))
+    if (Instruction *NI = ConvertShiftToMul(BI, ValueRankMap)) {
+      MadeChange = true;
+      BI = NI;
+    }
 
-    // Reject cases where it is pointless to do this.
-    if (!isa<BinaryOperator>(BI) || BI->getType()->isFloatingPointTy() || 
-        BI->getType()->isVectorTy())
-      continue;  // Floating point ops are not associative.
-
-    // Do not reassociate boolean (i1) expressions.  We want to preserve the
-    // original order of evaluation for short-circuited comparisons that
-    // SimplifyCFG has folded to AND/OR expressions.  If the expression
-    // is not further optimized, it is likely to be transformed back to a
-    // short-circuited form for code gen, and the source order may have been
-    // optimized for the most likely conditions.
-    if (BI->getType()->isIntegerTy(1))
-      continue;
+  // Reject cases where it is pointless to do this.
+  if (!isa<BinaryOperator>(BI) || BI->getType()->isFloatingPointTy() || 
+      BI->getType()->isVectorTy())
+    return;  // Floating point ops are not associative.
+
+  // Do not reassociate boolean (i1) expressions.  We want to preserve the
+  // original order of evaluation for short-circuited comparisons that
+  // SimplifyCFG has folded to AND/OR expressions.  If the expression
+  // is not further optimized, it is likely to be transformed back to a
+  // short-circuited form for code gen, and the source order may have been
+  // optimized for the most likely conditions.
+  if (BI->getType()->isIntegerTy(1))
+    return;
 
-    // If this is a subtract instruction which is not already in negate form,
-    // see if we can convert it to X+-Y.
-    if (BI->getOpcode() == Instruction::Sub) {
-      if (ShouldBreakUpSubtract(BI)) {
-        BI = BreakUpSubtract(BI, ValueRankMap);
-        // Reset the BBI iterator in case BreakUpSubtract changed the
-        // instruction it points to.
-        BBI = BI;
-        ++BBI;
+  // If this is a subtract instruction which is not already in negate form,
+  // see if we can convert it to X+-Y.
+  if (BI->getOpcode() == Instruction::Sub) {
+    if (ShouldBreakUpSubtract(BI)) {
+      BI = BreakUpSubtract(BI, ValueRankMap);
+      // Reset the BBI iterator in case BreakUpSubtract changed the
+      // instruction it points to.
+      BBI = BI;
+      ++BBI;
+      MadeChange = true;
+    } else if (BinaryOperator::isNeg(BI)) {
+      // Otherwise, this is a negation.  See if the operand is a multiply tree
+      // and if this is not an inner node of a multiply tree.
+      if (isReassociableOp(BI->getOperand(1), Instruction::Mul) &&
+          (!BI->hasOneUse() ||
+           !isReassociableOp(BI->use_back(), Instruction::Mul))) {
+        BI = LowerNegateToMultiply(BI, ValueRankMap);
         MadeChange = true;
-      } else if (BinaryOperator::isNeg(BI)) {
-        // Otherwise, this is a negation.  See if the operand is a multiply tree
-        // and if this is not an inner node of a multiply tree.
-        if (isReassociableOp(BI->getOperand(1), Instruction::Mul) &&
-            (!BI->hasOneUse() ||
-             !isReassociableOp(BI->use_back(), Instruction::Mul))) {
-          BI = LowerNegateToMultiply(BI, ValueRankMap);
-          MadeChange = true;
-        }
       }
     }
+  }
 
-    // If this instruction is a commutative binary operator, process it.
-    if (!BI->isAssociative()) continue;
-    BinaryOperator *I = cast<BinaryOperator>(BI);
+  // If this instruction is a commutative binary operator, process it.
+  if (!BI->isAssociative()) return;
+  BinaryOperator *I = cast<BinaryOperator>(BI);
 
-    // If this is an interior node of a reassociable tree, ignore it until we
-    // get to the root of the tree, to avoid N^2 analysis.
-    if (I->hasOneUse() && isReassociableOp(I->use_back(), I->getOpcode()))
-      continue;
+  // If this is an interior node of a reassociable tree, ignore it until we
+  // get to the root of the tree, to avoid N^2 analysis.
+  if (I->hasOneUse() && isReassociableOp(I->use_back(), I->getOpcode()))
+    return;
 
-    // If this is an add tree that is used by a sub instruction, ignore it 
-    // until we process the subtract.
-    if (I->hasOneUse() && I->getOpcode() == Instruction::Add &&
-        cast<Instruction>(I->use_back())->getOpcode() == Instruction::Sub)
-      continue;
+  // If this is an add tree that is used by a sub instruction, ignore it 
+  // until we process the subtract.
+  if (I->hasOneUse() && I->getOpcode() == Instruction::Add &&
+      cast<Instruction>(I->use_back())->getOpcode() == Instruction::Sub)
+    return;
 
-    ReassociateExpression(I);
-  }
+  ReassociateExpression(I);
 }
 
 Value *Reassociate::ReassociateExpression(BinaryOperator *I) {
@@ -1053,6 +1055,8 @@ Value *Reassociate::ReassociateExpression(BinaryOperator *I) {
     // eliminate it.
     DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
     I->replaceAllUsesWith(V);
+    if (Instruction *VI = dyn_cast<Instruction>(V))
+      VI->setDebugLoc(I->getDebugLoc());
     RemoveDeadBinaryOp(I);
     ++NumAnnihil;
     return V;
@@ -1076,6 +1080,8 @@ Value *Reassociate::ReassociateExpression(BinaryOperator *I) {
     // This expression tree simplified to something that isn't a tree,
     // eliminate it.
     I->replaceAllUsesWith(Ops[0].Op);
+    if (Instruction *OI = dyn_cast<Instruction>(Ops[0].Op))
+      OI->setDebugLoc(I->getDebugLoc());
     RemoveDeadBinaryOp(I);
     return Ops[0].Op;
   }
@@ -1093,7 +1099,16 @@ bool Reassociate::runOnFunction(Function &F) {
 
   MadeChange = false;
   for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
-    ReassociateBB(FI);
+    for (BasicBlock::iterator BBI = FI->begin(); BBI != FI->end(); )
+      ReassociateInst(BBI);
+
+  // Now that we're done, revisit any instructions which are likely to
+  // have secondary reassociation opportunities.
+  while (!RedoInsts.empty())
+    if (Value *V = RedoInsts.pop_back_val()) {
+      BasicBlock::iterator BBI = cast<Instruction>(V);
+      ReassociateInst(BBI);
+    }
 
   // Now that we're done, delete any instructions which are no longer used.
   while (!DeadInsts.empty())
diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp
index 459bb06..47afc77 100644
--- a/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -9,7 +9,7 @@
 //
 // This file demotes all registers to memory references.  It is intented to be
 // the inverse of PromoteMemoryToRegister.  By converting to loads, the only
-// values live accross basic blocks are allocas and loads before phi nodes.
+// values live across basic blocks are allocas and loads before phi nodes.
 // It is intended that this should make CFG hacking much easier.
 // To make later hacking easier, the entry block is split into two, such that
 // all introduced allocas and nothing else are in the entry block.
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index c82e929..083412e 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -655,7 +655,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
   
   // Just mark all destinations executable!
   // TODO: This could be improved if the operand is a [cast of a] BlockAddress.
-  if (isa<IndirectBrInst>(&TI))
+  if (isa<IndirectBrInst>(TI))
     return true;
   
 #ifndef NDEBUG
@@ -1989,7 +1989,7 @@ bool IPSCCP::runOnModule(Module &M) {
     ReturnsToZap[i]->setOperand(0, UndefValue::get(F->getReturnType()));
   }
     
-  // If we infered constant or undef values for globals variables, we can delete
+  // If we inferred constant or undef values for globals variables, we can delete
   // the global and any stores that remain to it.
   const DenseMap<GlobalVariable*, LatticeVal> &TG = Solver.getTrackedGlobals();
   for (DenseMap<GlobalVariable*, LatticeVal>::const_iterator I = TG.begin(),
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index 5428a5a..158d653 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -17,6 +17,7 @@
 #include "llvm-c/Initialization.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/PassManager.h"
+#include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/Verifier.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Transforms/Scalar.h"
@@ -48,6 +49,10 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeLoopIdiomRecognizePass(Registry);
   initializeLowerAtomicPass(Registry);
   initializeMemCpyOptPass(Registry);
+  initializeObjCARCAliasAnalysisPass(Registry);
+  initializeObjCARCExpandPass(Registry);
+  initializeObjCARCContractPass(Registry);
+  initializeObjCARCOptPass(Registry);
   initializeReassociatePass(Registry);
   initializeRegToMemPass(Registry);
   initializeSCCPPass(Registry);
@@ -173,3 +178,11 @@ void LLVMAddCorrelatedValuePropagationPass(LLVMPassManagerRef PM) {
 void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createEarlyCSEPass());
 }
+
+void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createTypeBasedAliasAnalysisPass());
+}
+
+void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createBasicAliasAnalysisPass());
+}
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index 42246ff..beef127 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -30,6 +30,7 @@
 #include "llvm/LLVMContext.h"
 #include "llvm/Module.h"
 #include "llvm/Pass.h"
+#include "llvm/Analysis/DIBuilder.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -227,16 +228,30 @@ class ConvertToScalarInfo {
   /// which means that mem2reg can't promote it.
   bool IsNotTrivial;
 
+  /// ScalarKind - Tracks the kind of alloca being considered for promotion,
+  /// computed based on the uses of the alloca rather than the LLVM type system.
+  enum {
+    Unknown,
+
+    // Accesses via GEPs that are consistent with element access of a vector
+    // type. This will not be converted into a vector unless there is a later
+    // access using an actual vector type.
+    ImplicitVector,
+
+    // Accesses via vector operations and GEPs that are consistent with the
+    // layout of a vector type.
+    Vector,
+
+    // An integer bag-of-bits with bitwise operations for insertion and
+    // extraction. Any combination of types can be converted into this kind
+    // of scalar.
+    Integer
+  } ScalarKind;
+
   /// VectorTy - This tracks the type that we should promote the vector to if
   /// it is possible to turn it into a vector.  This starts out null, and if it
   /// isn't possible to turn into a vector type, it gets set to VoidTy.
-  const Type *VectorTy;
-
-  /// HadAVector - True if there is at least one vector access to the alloca.
-  /// We don't want to turn random arrays into vectors and use vector element
-  /// insert/extract, but if there are element accesses to something that is
-  /// also declared as a vector, we do want to promote to a vector.
-  bool HadAVector;
+  const VectorType *VectorTy;
 
   /// HadNonMemTransferAccess - True if there is at least one access to the 
   /// alloca that is not a MemTransferInst.  We don't want to turn structs into
@@ -245,14 +260,14 @@ class ConvertToScalarInfo {
 
 public:
   explicit ConvertToScalarInfo(unsigned Size, const TargetData &td)
-    : AllocaSize(Size), TD(td), IsNotTrivial(false), VectorTy(0),
-      HadAVector(false), HadNonMemTransferAccess(false) { }
+    : AllocaSize(Size), TD(td), IsNotTrivial(false), ScalarKind(Unknown),
+      VectorTy(0), HadNonMemTransferAccess(false) { }
 
   AllocaInst *TryConvert(AllocaInst *AI);
 
 private:
   bool CanConvertToScalar(Value *V, uint64_t Offset);
-  void MergeInType(const Type *In, uint64_t Offset, bool IsLoadOrStore);
+  void MergeInTypeForLoadOrStore(const Type *In, uint64_t Offset);
   bool MergeInVectorType(const VectorType *VInTy, uint64_t Offset);
   void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset);
 
@@ -273,6 +288,11 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
   if (!CanConvertToScalar(AI, 0) || !IsNotTrivial)
     return 0;
 
+  // If an alloca has only memset / memcpy uses, it may still have an Unknown
+  // ScalarKind. Treat it as an Integer below.
+  if (ScalarKind == Unknown)
+    ScalarKind = Integer;
+
   // If we were able to find a vector type that can handle this with
   // insert/extract elements, and if there was at least one use that had
   // a vector type, promote this to a vector.  We don't want to promote
@@ -280,14 +300,15 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
   // we just get a lot of insert/extracts.  If at least one vector is
   // involved, then we probably really do have a union of vector/array.
   const Type *NewTy;
-  if (VectorTy && VectorTy->isVectorTy() && HadAVector) {
+  if (ScalarKind == Vector) {
+    assert(VectorTy && "Missing type for vector scalar.");
     DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n  TYPE = "
           << *VectorTy << '\n');
     NewTy = VectorTy;  // Use the vector type.
   } else {
     unsigned BitWidth = AllocaSize * 8;
-    if (!HadAVector && !HadNonMemTransferAccess &&
-        !TD.fitsInLegalInteger(BitWidth))
+    if ((ScalarKind == ImplicitVector || ScalarKind == Integer) &&
+        !HadNonMemTransferAccess && !TD.fitsInLegalInteger(BitWidth))
       return 0;
 
     DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n");
@@ -299,8 +320,9 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
   return NewAI;
 }
 
-/// MergeInType - Add the 'In' type to the accumulated vector type (VectorTy)
-/// so far at the offset specified by Offset (which is specified in bytes).
+/// MergeInTypeForLoadOrStore - Add the 'In' type to the accumulated vector type
+/// (VectorTy) so far at the offset specified by Offset (which is specified in
+/// bytes).
 ///
 /// There are three cases we handle here:
 ///   1) A union of vector types of the same size and potentially its elements.
@@ -315,11 +337,11 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
 ///      large) integer type with extract and insert operations where the loads
 ///      and stores would mutate the memory.  We mark this by setting VectorTy
 ///      to VoidTy.
-void ConvertToScalarInfo::MergeInType(const Type *In, uint64_t Offset,
-                                      bool IsLoadOrStore) {
+void ConvertToScalarInfo::MergeInTypeForLoadOrStore(const Type *In,
+                                                    uint64_t Offset) {
   // If we already decided to turn this into a blob of integer memory, there is
   // nothing to be done.
-  if (VectorTy && VectorTy->isVoidTy())
+  if (ScalarKind == Integer)
     return;
 
   // If this could be contributing to a vector, analyze it.
@@ -335,33 +357,40 @@ void ConvertToScalarInfo::MergeInType(const Type *In, uint64_t Offset,
     // Full width accesses can be ignored, because they can always be turned
     // into bitcasts.
     unsigned EltSize = In->getPrimitiveSizeInBits()/8;
-    if (IsLoadOrStore && EltSize == AllocaSize)
+    if (EltSize == AllocaSize)
       return;
+
     // If we're accessing something that could be an element of a vector, see
     // if the implied vector agrees with what we already have and if Offset is
     // compatible with it.
     if (Offset % EltSize == 0 && AllocaSize % EltSize == 0 &&
-        (VectorTy == 0 ||
-         cast<VectorType>(VectorTy)->getElementType()
-               ->getPrimitiveSizeInBits()/8 == EltSize)) {
-      if (VectorTy == 0)
+        (!VectorTy || Offset * 8 < VectorTy->getPrimitiveSizeInBits())) {
+      if (!VectorTy) {
+        ScalarKind = ImplicitVector;
         VectorTy = VectorType::get(In, AllocaSize/EltSize);
-      return;
+        return;
+      }
+
+      unsigned CurrentEltSize = VectorTy->getElementType()
+                                ->getPrimitiveSizeInBits()/8;
+      if (EltSize == CurrentEltSize)
+        return;
+
+      if (In->isIntegerTy() && isPowerOf2_32(AllocaSize / EltSize))
+        return;
     }
   }
 
   // Otherwise, we have a case that we can't handle with an optimized vector
   // form.  We can still turn this into a large integer.
-  VectorTy = Type::getVoidTy(In->getContext());
+  ScalarKind = Integer;
+  VectorTy = 0;
 }
 
-/// MergeInVectorType - Handles the vector case of MergeInType, returning true
-/// if the type was successfully merged and false otherwise.
+/// MergeInVectorType - Handles the vector case of MergeInTypeForLoadOrStore,
+/// returning true if the type was successfully merged and false otherwise.
 bool ConvertToScalarInfo::MergeInVectorType(const VectorType *VInTy,
                                             uint64_t Offset) {
-  // Remember if we saw a vector type.
-  HadAVector = true;
-
   // TODO: Support nonzero offsets?
   if (Offset != 0)
     return false;
@@ -373,19 +402,22 @@ bool ConvertToScalarInfo::MergeInVectorType(const VectorType *VInTy,
   // If this the first vector we see, remember the type so that we know the
   // element size.
   if (!VectorTy) {
+    ScalarKind = Vector;
     VectorTy = VInTy;
     return true;
   }
 
-  unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
+  unsigned BitWidth = VectorTy->getBitWidth();
   unsigned InBitWidth = VInTy->getBitWidth();
 
   // Vectors of the same size can be converted using a simple bitcast.
-  if (InBitWidth == BitWidth && AllocaSize == (InBitWidth / 8))
+  if (InBitWidth == BitWidth && AllocaSize == (InBitWidth / 8)) {
+    ScalarKind = Vector;
     return true;
+  }
 
-  const Type *ElementTy = cast<VectorType>(VectorTy)->getElementType();
-  const Type *InElementTy = cast<VectorType>(VInTy)->getElementType();
+  const Type *ElementTy = VectorTy->getElementType();
+  const Type *InElementTy = VInTy->getElementType();
 
   // Do not allow mixed integer and floating-point accesses from vectors of
   // different sizes.
@@ -420,6 +452,7 @@ bool ConvertToScalarInfo::MergeInVectorType(const VectorType *VInTy,
   }
 
   // Pick the largest of the two vector types.
+  ScalarKind = Vector;
   if (InBitWidth > BitWidth)
     VectorTy = VInTy;
 
@@ -447,7 +480,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
       if (LI->getType()->isX86_MMXTy())
         return false;
       HadNonMemTransferAccess = true;
-      MergeInType(LI->getType(), Offset, true);
+      MergeInTypeForLoadOrStore(LI->getType(), Offset);
       continue;
     }
 
@@ -458,7 +491,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
       if (SI->getOperand(0)->getType()->isX86_MMXTy())
         return false;
       HadNonMemTransferAccess = true;
-      MergeInType(SI->getOperand(0)->getType(), Offset, true);
+      MergeInTypeForLoadOrStore(SI->getOperand(0)->getType(), Offset);
       continue;
     }
 
@@ -657,23 +690,60 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
 }
 
 /// getScaledElementType - Gets a scaled element type for a partial vector
-/// access of an alloca. The input type must be an integer or float, and
-/// the resulting type must be an integer, float or double.
-static const Type *getScaledElementType(const Type *OldTy,
+/// access of an alloca. The input types must be integer or floating-point
+/// scalar or vector types, and the resulting type is an integer, float or
+/// double.
+static const Type *getScaledElementType(const Type *Ty1, const Type *Ty2,
                                         unsigned NewBitWidth) {
-  assert((OldTy->isIntegerTy() || OldTy->isFloatTy()) && "Partial vector "
-         "accesses must be scaled from integer or float elements.");
-
-  LLVMContext &Context = OldTy->getContext();
+  bool IsFP1 = Ty1->isFloatingPointTy() ||
+               (Ty1->isVectorTy() &&
+                cast<VectorType>(Ty1)->getElementType()->isFloatingPointTy());
+  bool IsFP2 = Ty2->isFloatingPointTy() ||
+               (Ty2->isVectorTy() &&
+                cast<VectorType>(Ty2)->getElementType()->isFloatingPointTy());
+
+  LLVMContext &Context = Ty1->getContext();
+
+  // Prefer floating-point types over integer types, as integer types may have
+  // been created by earlier scalar replacement.
+  if (IsFP1 || IsFP2) {
+    if (NewBitWidth == 32)
+      return Type::getFloatTy(Context);
+    if (NewBitWidth == 64)
+      return Type::getDoubleTy(Context);
+  }
 
-  if (OldTy->isIntegerTy())
-    return Type::getIntNTy(Context, NewBitWidth);
-  if (NewBitWidth == 32)
-    return Type::getFloatTy(Context);
-  if (NewBitWidth == 64)
-    return Type::getDoubleTy(Context);
+  return Type::getIntNTy(Context, NewBitWidth);
+}
 
-  llvm_unreachable("Invalid type for a partial vector access of an alloca!");
+/// CreateShuffleVectorCast - Creates a shuffle vector to convert one vector
+/// to another vector of the same element type which has the same allocation
+/// size but different primitive sizes (e.g. <3 x i32> and <4 x i32>).
+static Value *CreateShuffleVectorCast(Value *FromVal, const Type *ToType,
+                                      IRBuilder<> &Builder) {
+  const Type *FromType = FromVal->getType();
+  const VectorType *FromVTy = cast<VectorType>(FromType);
+  const VectorType *ToVTy = cast<VectorType>(ToType);
+  assert((ToVTy->getElementType() == FromVTy->getElementType()) &&
+         "Vectors must have the same element type");
+   Value *UnV = UndefValue::get(FromType);
+   unsigned numEltsFrom = FromVTy->getNumElements();
+   unsigned numEltsTo = ToVTy->getNumElements();
+
+   SmallVector<Constant*, 3> Args;
+   const Type* Int32Ty = Builder.getInt32Ty();
+   unsigned minNumElts = std::min(numEltsFrom, numEltsTo);
+   unsigned i;
+   for (i=0; i != minNumElts; ++i)
+     Args.push_back(ConstantInt::get(Int32Ty, i));
+
+   if (i < numEltsTo) {
+     Constant* UnC = UndefValue::get(Int32Ty);
+     for (; i != numEltsTo; ++i)
+       Args.push_back(UnC);
+   }
+   Constant *Mask = ConstantVector::get(Args);
+   return Builder.CreateShuffleVector(FromVal, UnV, Mask, "tmpV");
 }
 
 /// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer
@@ -690,34 +760,45 @@ Value *ConvertToScalarInfo::
 ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType,
                            uint64_t Offset, IRBuilder<> &Builder) {
   // If the load is of the whole new alloca, no conversion is needed.
-  if (FromVal->getType() == ToType && Offset == 0)
+  const Type *FromType = FromVal->getType();
+  if (FromType == ToType && Offset == 0)
     return FromVal;
 
   // If the result alloca is a vector type, this is either an element
   // access or a bitcast to another vector type of the same size.
-  if (const VectorType *VTy = dyn_cast<VectorType>(FromVal->getType())) {
+  if (const VectorType *VTy = dyn_cast<VectorType>(FromType)) {
+    unsigned FromTypeSize = TD.getTypeAllocSize(FromType);
     unsigned ToTypeSize = TD.getTypeAllocSize(ToType);
-    if (ToTypeSize == AllocaSize)
-      return Builder.CreateBitCast(FromVal, ToType, "tmp");
-
-    if (ToType->isVectorTy()) {
-      assert(isPowerOf2_64(AllocaSize / ToTypeSize) &&
-             "Partial vector access of an alloca must have a power-of-2 size "
-             "ratio.");
-      assert(Offset == 0 && "Can't extract a value of a smaller vector type "
-                            "from a nonzero offset.");
-
-      const Type *ToElementTy = cast<VectorType>(ToType)->getElementType();
-      const Type *CastElementTy = getScaledElementType(ToElementTy,
+    if (FromTypeSize == ToTypeSize) {
+      // If the two types have the same primitive size, use a bit cast.
+      // Otherwise, it is two vectors with the same element type that has
+      // the same allocation size but different number of elements so use
+      // a shuffle vector.
+      if (FromType->getPrimitiveSizeInBits() ==
+          ToType->getPrimitiveSizeInBits())
+        return Builder.CreateBitCast(FromVal, ToType, "tmp");
+      else
+        return CreateShuffleVectorCast(FromVal, ToType, Builder);
+    }
+
+    if (isPowerOf2_64(FromTypeSize / ToTypeSize)) {
+      assert(!(ToType->isVectorTy() && Offset != 0) && "Can't extract a value "
+             "of a smaller vector type at a nonzero offset.");
+
+      const Type *CastElementTy = getScaledElementType(FromType, ToType,
                                                        ToTypeSize * 8);
-      unsigned NumCastVectorElements = AllocaSize / ToTypeSize;
+      unsigned NumCastVectorElements = FromTypeSize / ToTypeSize;
 
       LLVMContext &Context = FromVal->getContext();
       const Type *CastTy = VectorType::get(CastElementTy,
                                            NumCastVectorElements);
       Value *Cast = Builder.CreateBitCast(FromVal, CastTy, "tmp");
+
+      unsigned EltSize = TD.getTypeAllocSizeInBits(CastElementTy);
+      unsigned Elt = Offset/EltSize;
+      assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
       Value *Extract = Builder.CreateExtractElement(Cast, ConstantInt::get(
-                                        Type::getInt32Ty(Context), 0), "tmp");
+                                        Type::getInt32Ty(Context), Elt), "tmp");
       return Builder.CreateBitCast(Extract, ToType, "tmp");
     }
 
@@ -837,16 +918,24 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
 
     // Changing the whole vector with memset or with an access of a different
     // vector type?
-    if (ValSize == VecSize)
-      return Builder.CreateBitCast(SV, AllocaType, "tmp");
+    if (ValSize == VecSize) {
+      // If the two types have the same primitive size, use a bit cast.
+      // Otherwise, it is two vectors with the same element type that has
+      // the same allocation size but different number of elements so use
+      // a shuffle vector.
+      if (VTy->getPrimitiveSizeInBits() ==
+          SV->getType()->getPrimitiveSizeInBits())
+        return Builder.CreateBitCast(SV, AllocaType, "tmp");
+      else
+        return CreateShuffleVectorCast(SV, VTy, Builder);
+    }
 
-    if (SV->getType()->isVectorTy() && isPowerOf2_64(VecSize / ValSize)) {
-      assert(Offset == 0 && "Can't insert a value of a smaller vector type at "
-                            "a nonzero offset.");
+    if (isPowerOf2_64(VecSize / ValSize)) {
+      assert(!(SV->getType()->isVectorTy() && Offset != 0) && "Can't insert a "
+             "value of a smaller vector type at a nonzero offset.");
 
-      const Type *ToElementTy =
-        cast<VectorType>(SV->getType())->getElementType();
-      const Type *CastElementTy = getScaledElementType(ToElementTy, ValSize);
+      const Type *CastElementTy = getScaledElementType(VTy, SV->getType(),
+                                                       ValSize);
       unsigned NumCastVectorElements = VecSize / ValSize;
 
       LLVMContext &Context = SV->getContext();
@@ -855,24 +944,23 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
       Value *OldCast = Builder.CreateBitCast(Old, OldCastTy, "tmp");
 
       Value *SVCast = Builder.CreateBitCast(SV, CastElementTy, "tmp");
+
+      unsigned EltSize = TD.getTypeAllocSizeInBits(CastElementTy);
+      unsigned Elt = Offset/EltSize;
+      assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
       Value *Insert =
         Builder.CreateInsertElement(OldCast, SVCast, ConstantInt::get(
-                                    Type::getInt32Ty(Context), 0), "tmp");
+                                        Type::getInt32Ty(Context), Elt), "tmp");
       return Builder.CreateBitCast(Insert, AllocaType, "tmp");
     }
 
-    uint64_t EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType());
-
     // Must be an element insertion.
+    assert(SV->getType() == VTy->getElementType());
+    uint64_t EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType());
     unsigned Elt = Offset/EltSize;
-
-    if (SV->getType() != VTy->getElementType())
-      SV = Builder.CreateBitCast(SV, VTy->getElementType(), "tmp");
-
-    SV = Builder.CreateInsertElement(Old, SV,
+    return Builder.CreateInsertElement(Old, SV,
                      ConstantInt::get(Type::getInt32Ty(SV->getContext()), Elt),
                                      "tmp");
-    return SV;
   }
 
   // If SV is a first-class aggregate value, insert each value recursively.
@@ -990,8 +1078,9 @@ namespace {
 class AllocaPromoter : public LoadAndStorePromoter {
   AllocaInst *AI;
 public:
-  AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S)
-    : LoadAndStorePromoter(Insts, S), AI(0) {}
+  AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S,
+                 DbgDeclareInst *DD, DIBuilder *&DB)
+    : LoadAndStorePromoter(Insts, S, DD, DB), AI(0) {}
   
   void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) {
     // Remember which alloca we're promoting (for isInstInList).
@@ -1268,7 +1357,6 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
   return true;
 }
 
-
 bool SROA::performPromotion(Function &F) {
   std::vector<AllocaInst*> Allocas;
   DominatorTree *DT = 0;
@@ -1279,6 +1367,7 @@ bool SROA::performPromotion(Function &F) {
 
   bool Changed = false;
   SmallVector<Instruction*, 64> Insts;
+  DIBuilder *DIB = 0;
   while (1) {
     Allocas.clear();
 
@@ -1302,8 +1391,11 @@ bool SROA::performPromotion(Function &F) {
         for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
              UI != E; ++UI)
           Insts.push_back(cast<Instruction>(*UI));
-        
-        AllocaPromoter(Insts, SSA).run(AI, Insts);
+
+        DbgDeclareInst *DDI = FindAllocaDbgDeclare(AI);
+        if (DDI && !DIB)
+          DIB = new DIBuilder(*AI->getParent()->getParent()->getParent());
+        AllocaPromoter(Insts, SSA, DDI, DIB).run(AI, Insts);
         Insts.clear();
       }
     }
@@ -1311,6 +1403,10 @@ bool SROA::performPromotion(Function &F) {
     Changed = true;
   }
 
+  // FIXME: Is there a better way to handle the lazy initialization of DIB
+  // so that there doesn't need to be an explicit delete?
+  delete DIB;
+
   return Changed;
 }
 
@@ -1770,9 +1866,10 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
         //   %insert = insertvalue { i32, i32 } %insert.0, i32 %load.1, 1
         // (Also works for arrays instead of structs)
         Value *Insert = UndefValue::get(LIType);
+        IRBuilder<> Builder(LI);
         for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
-          Value *Load = new LoadInst(NewElts[i], "load", LI);
-          Insert = InsertValueInst::Create(Insert, Load, i, "insert", LI);
+          Value *Load = Builder.CreateLoad(NewElts[i], "load");
+          Insert = Builder.CreateInsertValue(Insert, Load, i, "insert");
         }
         LI->replaceAllUsesWith(Insert);
         DeadInsts.push_back(LI);
@@ -1797,9 +1894,10 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
         //   %val.1 = extractvalue { i32, i32 } %val, 1
         //   store i32 %val.1, i32* %alloc.1
         // (Also works for arrays instead of structs)
+        IRBuilder<> Builder(SI);
         for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
-          Value *Extract = ExtractValueInst::Create(Val, i, Val->getName(), SI);
-          new StoreInst(Extract, NewElts[i], SI);
+          Value *Extract = Builder.CreateExtractValue(Val, i, Val->getName());
+          Builder.CreateStore(Extract, NewElts[i]);
         }
         DeadInsts.push_back(SI);
       } else if (SIType->isIntegerTy() &&
@@ -2420,19 +2518,22 @@ static bool isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
     }
 
     if (CallSite CS = U) {
-      // If this is a readonly/readnone call site, then we know it is just a
-      // load and we can ignore it.
-      if (CS.onlyReadsMemory())
-        continue;
-
       // If this is the function being called then we treat it like a load and
       // ignore it.
       if (CS.isCallee(UI))
         continue;
 
+      // If this is a readonly/readnone call site, then we know it is just a
+      // load (but one that potentially returns the value itself), so we can
+      // ignore it if we know that the value isn't captured.
+      unsigned ArgNo = CS.getArgumentNo(UI);
+      if (CS.onlyReadsMemory() &&
+          (CS.getInstruction()->use_empty() ||
+           CS.paramHasAttr(ArgNo+1, Attribute::NoCapture)))
+        continue;
+
       // If this is being passed as a byval argument, the caller is making a
       // copy, so it is only a read of the alloca.
-      unsigned ArgNo = CS.getArgumentNo(UI);
       if (CS.paramHasAttr(ArgNo+1, Attribute::ByVal))
         continue;
     }
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 0bcec6b..7e9cc80 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -73,7 +73,8 @@ static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) {
   if (UseLLVMTrap) {
     Function *TrapFn =
       Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap);
-    CallInst::Create(TrapFn, "", I);
+    CallInst *CallTrap = CallInst::Create(TrapFn, "", I);
+    CallTrap->setDebugLoc(I->getDebugLoc());
   }
   new UnreachableInst(I->getContext(), I);
   
@@ -95,6 +96,7 @@ static void ChangeToCall(InvokeInst *II) {
   NewCall->takeName(II);
   NewCall->setCallingConv(II->getCallingConv());
   NewCall->setAttributes(II->getAttributes());
+  NewCall->setDebugLoc(II->getDebugLoc());
   II->replaceAllUsesWith(NewCall);
 
   // Follow the call by a branch to the normal destination.
@@ -162,7 +164,7 @@ static bool MarkAliveBlocks(BasicBlock *BB,
         Changed = true;
       }
 
-    Changed |= ConstantFoldTerminator(BB);
+    Changed |= ConstantFoldTerminator(BB, true);
     for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
       Worklist.push_back(*SI);
   } while (!Worklist.empty());
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 5768ccb..e21eb9d 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -36,7 +36,7 @@
 //     evaluated each time through the tail recursion.  Safely keeping allocas
 //     in the entry block requires analysis to proves that the tail-called
 //     function does not read or write the stack object.
-//  2. Tail recursion is only performed if the call immediately preceeds the
+//  2. Tail recursion is only performed if the call immediately precedes the
 //     return instruction.  It's possible that there could be a jump between
 //     the call and the return.
 //  3. There can be intervening operations between the call and the return that
@@ -59,6 +59,7 @@
 #include "llvm/Function.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
 #include "llvm/Pass.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/InlineCost.h"
@@ -209,10 +210,10 @@ bool TailCallElim::runOnFunction(Function &F) {
     }
   }
 
-  // Finally, if this function contains no non-escaping allocas, mark all calls
-  // in the function as eligible for tail calls (there is no stack memory for
-  // them to access).
-  if (!FunctionContainsEscapingAllocas)
+  // Finally, if this function contains no non-escaping allocas, or calls
+  // setjmp, mark all calls in the function as eligible for tail calls
+  //(there is no stack memory for them to access).
+  if (!FunctionContainsEscapingAllocas && !F.callsFunctionThatReturnsTwice())
     for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
       for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
         if (CallInst *CI = dyn_cast<CallInst>(I)) {
@@ -433,7 +434,7 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
     if (CanMoveAboveCall(BBI, CI)) continue;
     
     // If we can't move the instruction above the call, it might be because it
-    // is an associative and commutative operation that could be tranformed
+    // is an associative and commutative operation that could be transformed
     // using accumulator recursion elimination.  Check to see if this is the
     // case, and if so, remember the initial accumulator value for later.
     if ((AccumulatorRecursionEliminationInitVal =
@@ -573,7 +574,9 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
 
   // Now that all of the PHI nodes are in place, remove the call and
   // ret instructions, replacing them with an unconditional branch.
-  BranchInst::Create(OldEntry, Ret);
+  BranchInst *NewBI = BranchInst::Create(OldEntry, Ret);
+  NewBI->setDebugLoc(CI->getDebugLoc());
+
   BB->getInstList().erase(Ret);  // Remove return.
   BB->getInstList().erase(CI);   // Remove call.
   ++NumEliminated;
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index f8c3326..92464e8 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -538,3 +538,13 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
   UncondBranch->eraseFromParent();
   return cast<ReturnInst>(NewRet);
 }
+
+/// GetFirstDebugLocInBasicBlock - Return first valid DebugLoc entry in a 
+/// given basic block.
+DebugLoc llvm::GetFirstDebugLocInBasicBlock(const BasicBlock *BB) {
+  if (const Instruction *I = BB->getFirstNonPHI())
+    return I->getDebugLoc();
+  // Scanning entire block may be too expensive, if the first instruction
+  // does not have valid location info.
+  return DebugLoc();
+}
diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 14a3c95..d6206a3 100644
--- a/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -56,7 +56,7 @@ char BreakCriticalEdges::ID = 0;
 INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges",
                 "Break critical edges in CFG", false, false)
 
-// Publically exposed interface to pass...
+// Publicly exposed interface to pass...
 char &llvm::BreakCriticalEdgesID = BreakCriticalEdges::ID;
 FunctionPass *llvm::createBreakCriticalEdgesPass() {
   return new BreakCriticalEdges();
@@ -180,7 +180,8 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
   BasicBlock *NewBB = BasicBlock::Create(TI->getContext(),
                       TIBB->getName() + "." + DestBB->getName() + "_crit_edge");
   // Create our unconditional branch.
-  BranchInst::Create(DestBB, NewBB);
+  BranchInst *NewBI = BranchInst::Create(DestBB, NewBB);
+  NewBI->setDebugLoc(TI->getDebugLoc());
 
   // Branch to the new block, breaking the edge.
   TI->setSuccessor(SuccNum, NewBB);
diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp
index 4a90751..14bb17f 100644
--- a/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -362,12 +362,8 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) {
   Function *Callee = CI->getCalledFunction();
   StringRef Name = Callee->getName();
   const FunctionType *FT = Callee->getFunctionType();
-  BasicBlock *BB = CI->getParent();
   LLVMContext &Context = CI->getParent()->getContext();
-  IRBuilder<> B(Context);
-
-  // Set the builder to the instruction after the call.
-  B.SetInsertPoint(BB, CI);
+  IRBuilder<> B(CI);
 
   if (Name == "__memcpy_chk") {
     // Check if this has the right signature.
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index 5b76bb2..2df534f 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -5,7 +5,6 @@ add_llvm_library(LLVMTransformUtils
   BreakCriticalEdges.cpp
   BuildLibCalls.cpp
   CloneFunction.cpp
-  CloneLoop.cpp
   CloneModule.cpp
   CodeExtractor.cpp
   DemoteRegToStack.cpp
diff --git a/lib/Transforms/Utils/CloneLoop.cpp b/lib/Transforms/Utils/CloneLoop.cpp
deleted file mode 100644
index 87dd141..0000000
--- a/lib/Transforms/Utils/CloneLoop.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-//===- CloneLoop.cpp - Clone loop nest ------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the CloneLoop interface which makes a copy of a loop.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/BasicBlock.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/Dominators.h"
-
-
-using namespace llvm;
-
-/// CloneDominatorInfo - Clone a basic block's dominator tree. It is expected
-/// that the basic block is already cloned.
-static void CloneDominatorInfo(BasicBlock *BB, 
-                               ValueToValueMapTy &VMap,
-                               DominatorTree *DT) {
-
-  assert (DT && "DominatorTree is not available");
-  ValueToValueMapTy::iterator BI = VMap.find(BB);
-  assert (BI != VMap.end() && "BasicBlock clone is missing");
-  BasicBlock *NewBB = cast<BasicBlock>(BI->second);
-
-  // NewBB already got dominator info.
-  if (DT->getNode(NewBB))
-    return;
-
-  assert (DT->getNode(BB) && "BasicBlock does not have dominator info");
-  // Entry block is not expected here. Infinite loops are not to cloned.
-  assert (DT->getNode(BB)->getIDom() && "BasicBlock does not have immediate dominator");
-  BasicBlock *BBDom = DT->getNode(BB)->getIDom()->getBlock();
-
-  // NewBB's dominator is either BB's dominator or BB's dominator's clone.
-  BasicBlock *NewBBDom = BBDom;
-  ValueToValueMapTy::iterator BBDomI = VMap.find(BBDom);
-  if (BBDomI != VMap.end()) {
-    NewBBDom = cast<BasicBlock>(BBDomI->second);
-    if (!DT->getNode(NewBBDom))
-      CloneDominatorInfo(BBDom, VMap, DT);
-  }
-  DT->addNewBlock(NewBB, NewBBDom);
-}
-
-/// CloneLoop - Clone Loop. Clone dominator info. Populate VMap
-/// using old blocks to new blocks mapping.
-Loop *llvm::CloneLoop(Loop *OrigL, LPPassManager  *LPM, LoopInfo *LI,
-                      ValueToValueMapTy &VMap, Pass *P) {
-  
-  DominatorTree *DT = NULL;
-  if (P)
-    DT = P->getAnalysisIfAvailable<DominatorTree>();
-
-  SmallVector<BasicBlock *, 16> NewBlocks;
-
-  // Populate loop nest.
-  SmallVector<Loop *, 8> LoopNest;
-  LoopNest.push_back(OrigL);
-
-
-  Loop *NewParentLoop = NULL;
-  do {
-    Loop *L = LoopNest.pop_back_val();
-    Loop *NewLoop = new Loop();
-
-    if (!NewParentLoop)
-      NewParentLoop = NewLoop;
-
-    LPM->insertLoop(NewLoop, L->getParentLoop());
-
-    // Clone Basic Blocks.
-    for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
-         I != E; ++I) {
-      BasicBlock *BB = *I;
-      BasicBlock *NewBB = CloneBasicBlock(BB, VMap, ".clone");
-      VMap[BB] = NewBB;
-      if (P)
-        LPM->cloneBasicBlockSimpleAnalysis(BB, NewBB, L);
-      NewLoop->addBasicBlockToLoop(NewBB, LI->getBase());
-      NewBlocks.push_back(NewBB);
-    }
-
-    // Clone dominator info.
-    if (DT)
-      for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
-           I != E; ++I) {
-        BasicBlock *BB = *I;
-        CloneDominatorInfo(BB, VMap, DT);
-      }
-
-    // Process sub loops
-    for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
-      LoopNest.push_back(*I);
-  } while (!LoopNest.empty());
-
-  // Remap instructions to reference operands from VMap.
-  for(SmallVector<BasicBlock *, 16>::iterator NBItr = NewBlocks.begin(), 
-        NBE = NewBlocks.end();  NBItr != NBE; ++NBItr) {
-    BasicBlock *NB = *NBItr;
-    for(BasicBlock::iterator BI = NB->begin(), BE = NB->end(); 
-        BI != BE; ++BI) {
-      Instruction *Insn = BI;
-      for (unsigned index = 0, num_ops = Insn->getNumOperands(); 
-           index != num_ops; ++index) {
-        Value *Op = Insn->getOperand(index);
-        ValueToValueMapTy::iterator OpItr = VMap.find(Op);
-        if (OpItr != VMap.end())
-          Insn->setOperand(index, OpItr->second);
-      }
-    }
-  }
-
-  BasicBlock *Latch = OrigL->getLoopLatch();
-  Function *F = Latch->getParent();
-  F->getBasicBlockList().insert(OrigL->getHeader(), 
-                                NewBlocks.begin(), NewBlocks.end());
-
-
-  return NewParentLoop;
-}
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 46601b4..8c133ea 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -157,7 +157,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {
         TI->replaceUsesOfWith(OldPred, NewBB);
       }
 
-    // Okay, everthing within the region is now branching to the right block, we
+    // Okay, everything within the region is now branching to the right block, we
     // just have to update the PHI nodes now, inserting PHI nodes into NewBB.
     for (AfterPHIs = OldPred->begin(); isa<PHINode>(AfterPHIs); ++AfterPHIs) {
       PHINode *PN = cast<PHINode>(AfterPHIs);
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index 2cb1d3b..946e62f 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -10,6 +10,13 @@
 // This file implements inlining of a function into a call site, resolving
 // parameters and the return value as appropriate.
 //
+// The code in this file for handling inlines through invoke
+// instructions preserves semantics only under some assumptions about
+// the behavior of unwinders which correspond to gcc-style libUnwind
+// exception personality functions.  Eventually the IR will be
+// improved to make this unnecessary, but until then, this code is
+// marked [LIBUNWIND].
+//
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/Cloning.h"
@@ -28,6 +35,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/CallSite.h"
+#include "llvm/Support/IRBuilder.h"
 using namespace llvm;
 
 bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI) {
@@ -37,6 +45,372 @@ bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI) {
   return InlineFunction(CallSite(II), IFI);
 }
 
+/// [LIBUNWIND] Look for an llvm.eh.exception call in the given block.
+static EHExceptionInst *findExceptionInBlock(BasicBlock *bb) {
+  for (BasicBlock::iterator i = bb->begin(), e = bb->end(); i != e; i++) {
+    EHExceptionInst *exn = dyn_cast<EHExceptionInst>(i);
+    if (exn) return exn;
+  }
+
+  return 0;
+}
+
+/// [LIBUNWIND] Look for the 'best' llvm.eh.selector instruction for
+/// the given llvm.eh.exception call.
+static EHSelectorInst *findSelectorForException(EHExceptionInst *exn) {
+  BasicBlock *exnBlock = exn->getParent();
+
+  EHSelectorInst *outOfBlockSelector = 0;
+  for (Instruction::use_iterator
+         ui = exn->use_begin(), ue = exn->use_end(); ui != ue; ++ui) {
+    EHSelectorInst *sel = dyn_cast<EHSelectorInst>(*ui);
+    if (!sel) continue;
+
+    // Immediately accept an eh.selector in the same block as the
+    // excepton call.
+    if (sel->getParent() == exnBlock) return sel;
+
+    // Otherwise, use the first selector we see.
+    if (!outOfBlockSelector) outOfBlockSelector = sel;
+  }
+
+  return outOfBlockSelector;
+}
+
+/// [LIBUNWIND] Find the (possibly absent) call to @llvm.eh.selector
+/// in the given landing pad.  In principle, llvm.eh.exception is
+/// required to be in the landing pad; in practice, SplitCriticalEdge
+/// can break that invariant, and then inlining can break it further.
+/// There's a real need for a reliable solution here, but until that
+/// happens, we have some fragile workarounds here.
+static EHSelectorInst *findSelectorForLandingPad(BasicBlock *lpad) {
+  // Look for an exception call in the actual landing pad.
+  EHExceptionInst *exn = findExceptionInBlock(lpad);
+  if (exn) return findSelectorForException(exn);
+
+  // Okay, if that failed, look for one in an obvious successor.  If
+  // we find one, we'll fix the IR by moving things back to the
+  // landing pad.
+
+  bool dominates = true; // does the lpad dominate the exn call
+  BasicBlock *nonDominated = 0; // if not, the first non-dominated block
+  BasicBlock *lastDominated = 0; // and the block which branched to it
+
+  BasicBlock *exnBlock = lpad;
+
+  // We need to protect against lpads that lead into infinite loops.
+  SmallPtrSet<BasicBlock*,4> visited;
+  visited.insert(exnBlock);
+
+  do {
+    // We're not going to apply this hack to anything more complicated
+    // than a series of unconditional branches, so if the block
+    // doesn't terminate in an unconditional branch, just fail.  More
+    // complicated cases can arise when, say, sinking a call into a
+    // split unwind edge and then inlining it; but that can do almost
+    // *anything* to the CFG, including leaving the selector
+    // completely unreachable.  The only way to fix that properly is
+    // to (1) prohibit transforms which move the exception or selector
+    // values away from the landing pad, e.g. by producing them with
+    // instructions that are pinned to an edge like a phi, or
+    // producing them with not-really-instructions, and (2) making
+    // transforms which split edges deal with that.
+    BranchInst *branch = dyn_cast<BranchInst>(&exnBlock->back());
+    if (!branch || branch->isConditional()) return 0;
+
+    BasicBlock *successor = branch->getSuccessor(0);
+
+    // Fail if we found an infinite loop.
+    if (!visited.insert(successor)) return 0;
+
+    // If the successor isn't dominated by exnBlock:
+    if (!successor->getSinglePredecessor()) {
+      // We don't want to have to deal with threading the exception
+      // through multiple levels of phi, so give up if we've already
+      // followed a non-dominating edge.
+      if (!dominates) return 0;
+
+      // Otherwise, remember this as a non-dominating edge.
+      dominates = false;
+      nonDominated = successor;
+      lastDominated = exnBlock;
+    }
+
+    exnBlock = successor;
+
+    // Can we stop here?
+    exn = findExceptionInBlock(exnBlock);
+  } while (!exn);
+
+  // Look for a selector call for the exception we found.
+  EHSelectorInst *selector = findSelectorForException(exn);
+  if (!selector) return 0;
+
+  // The easy case is when the landing pad still dominates the
+  // exception call, in which case we can just move both calls back to
+  // the landing pad.
+  if (dominates) {
+    selector->moveBefore(lpad->getFirstNonPHI());
+    exn->moveBefore(selector);
+    return selector;
+  }
+
+  // Otherwise, we have to split at the first non-dominating block.
+  // The CFG looks basically like this:
+  //    lpad:
+  //      phis_0
+  //      insnsAndBranches_1
+  //      br label %nonDominated
+  //    nonDominated:
+  //      phis_2
+  //      insns_3
+  //      %exn = call i8* @llvm.eh.exception()
+  //      insnsAndBranches_4
+  //      %selector = call @llvm.eh.selector(i8* %exn, ...
+  // We need to turn this into:
+  //    lpad:
+  //      phis_0
+  //      %exn0 = call i8* @llvm.eh.exception()
+  //      %selector0 = call @llvm.eh.selector(i8* %exn0, ...
+  //      insnsAndBranches_1
+  //      br label %split // from lastDominated
+  //    nonDominated:
+  //      phis_2 (without edge from lastDominated)
+  //      %exn1 = call i8* @llvm.eh.exception()
+  //      %selector1 = call i8* @llvm.eh.selector(i8* %exn1, ...
+  //      br label %split
+  //    split:
+  //      phis_2 (edge from lastDominated, edge from split)
+  //      %exn = phi ...
+  //      %selector = phi ...
+  //      insns_3
+  //      insnsAndBranches_4
+
+  assert(nonDominated);
+  assert(lastDominated);
+
+  // First, make clones of the intrinsics to go in lpad.
+  EHExceptionInst *lpadExn = cast<EHExceptionInst>(exn->clone());
+  EHSelectorInst *lpadSelector = cast<EHSelectorInst>(selector->clone());
+  lpadSelector->setArgOperand(0, lpadExn);
+  lpadSelector->insertBefore(lpad->getFirstNonPHI());
+  lpadExn->insertBefore(lpadSelector);
+
+  // Split the non-dominated block.
+  BasicBlock *split =
+    nonDominated->splitBasicBlock(nonDominated->getFirstNonPHI(),
+                                  nonDominated->getName() + ".lpad-fix");
+
+  // Redirect the last dominated branch there.
+  cast<BranchInst>(lastDominated->back()).setSuccessor(0, split);
+
+  // Move the existing intrinsics to the end of the old block.
+  selector->moveBefore(&nonDominated->back());
+  exn->moveBefore(selector);
+
+  Instruction *splitIP = &split->front();
+
+  // For all the phis in nonDominated, make a new phi in split to join
+  // that phi with the edge from lastDominated.
+  for (BasicBlock::iterator
+         i = nonDominated->begin(), e = nonDominated->end(); i != e; ++i) {
+    PHINode *phi = dyn_cast<PHINode>(i);
+    if (!phi) break;
+
+    PHINode *splitPhi = PHINode::Create(phi->getType(), 2, phi->getName(),
+                                        splitIP);
+    phi->replaceAllUsesWith(splitPhi);
+    splitPhi->addIncoming(phi, nonDominated);
+    splitPhi->addIncoming(phi->removeIncomingValue(lastDominated),
+                          lastDominated);
+  }
+
+  // Make new phis for the exception and selector.
+  PHINode *exnPhi = PHINode::Create(exn->getType(), 2, "", splitIP);
+  exn->replaceAllUsesWith(exnPhi);
+  selector->setArgOperand(0, exn); // except for this use
+  exnPhi->addIncoming(exn, nonDominated);
+  exnPhi->addIncoming(lpadExn, lastDominated);
+
+  PHINode *selectorPhi = PHINode::Create(selector->getType(), 2, "", splitIP);
+  selector->replaceAllUsesWith(selectorPhi);
+  selectorPhi->addIncoming(selector, nonDominated);
+  selectorPhi->addIncoming(lpadSelector, lastDominated);
+
+  return lpadSelector;
+}
+
+namespace {
+  /// A class for recording information about inlining through an invoke.
+  class InvokeInliningInfo {
+    BasicBlock *OuterUnwindDest;
+    EHSelectorInst *OuterSelector;
+    BasicBlock *InnerUnwindDest;
+    PHINode *InnerExceptionPHI;
+    PHINode *InnerSelectorPHI;
+    SmallVector<Value*, 8> UnwindDestPHIValues;
+
+  public:
+    InvokeInliningInfo(InvokeInst *II) :
+      OuterUnwindDest(II->getUnwindDest()), OuterSelector(0),
+      InnerUnwindDest(0), InnerExceptionPHI(0), InnerSelectorPHI(0) {
+
+      // If there are PHI nodes in the unwind destination block, we
+      // need to keep track of which values came into them from the
+      // invoke before removing the edge from this block.
+      llvm::BasicBlock *invokeBB = II->getParent();
+      for (BasicBlock::iterator I = OuterUnwindDest->begin();
+             isa<PHINode>(I); ++I) {
+        // Save the value to use for this edge.
+        PHINode *phi = cast<PHINode>(I);
+        UnwindDestPHIValues.push_back(phi->getIncomingValueForBlock(invokeBB));
+      }
+    }
+
+    /// The outer unwind destination is the target of unwind edges
+    /// introduced for calls within the inlined function.
+    BasicBlock *getOuterUnwindDest() const {
+      return OuterUnwindDest;
+    }
+
+    EHSelectorInst *getOuterSelector() {
+      if (!OuterSelector)
+        OuterSelector = findSelectorForLandingPad(OuterUnwindDest);
+      return OuterSelector;
+    }
+
+    BasicBlock *getInnerUnwindDest();
+
+    bool forwardEHResume(CallInst *call, BasicBlock *src);
+
+    /// Add incoming-PHI values to the unwind destination block for
+    /// the given basic block, using the values for the original
+    /// invoke's source block.
+    void addIncomingPHIValuesFor(BasicBlock *BB) const {
+      addIncomingPHIValuesForInto(BB, OuterUnwindDest);
+    }
+
+    void addIncomingPHIValuesForInto(BasicBlock *src, BasicBlock *dest) const {
+      BasicBlock::iterator I = dest->begin();
+      for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) {
+        PHINode *phi = cast<PHINode>(I);
+        phi->addIncoming(UnwindDestPHIValues[i], src);
+      }
+    }
+  };
+}
+
+/// Get or create a target for the branch out of rewritten calls to
+/// llvm.eh.resume.
+BasicBlock *InvokeInliningInfo::getInnerUnwindDest() {
+  if (InnerUnwindDest) return InnerUnwindDest;
+
+  // Find and hoist the llvm.eh.exception and llvm.eh.selector calls
+  // in the outer landing pad to immediately following the phis.
+  EHSelectorInst *selector = getOuterSelector();
+  if (!selector) return 0;
+
+  // The call to llvm.eh.exception *must* be in the landing pad.
+  Instruction *exn = cast<Instruction>(selector->getArgOperand(0));
+  assert(exn->getParent() == OuterUnwindDest);
+
+  // TODO: recognize when we've already done this, so that we don't
+  // get a linear number of these when inlining calls into lots of
+  // invokes with the same landing pad.
+
+  // Do the hoisting.
+  Instruction *splitPoint = exn->getParent()->getFirstNonPHI();
+  assert(splitPoint != selector && "selector-on-exception dominance broken!");
+  if (splitPoint == exn) {
+    selector->removeFromParent();
+    selector->insertAfter(exn);
+    splitPoint = selector->getNextNode();
+  } else {
+    exn->moveBefore(splitPoint);
+    selector->moveBefore(splitPoint);
+  }
+
+  // Split the landing pad.
+  InnerUnwindDest = OuterUnwindDest->splitBasicBlock(splitPoint,
+                                        OuterUnwindDest->getName() + ".body");
+
+  // The number of incoming edges we expect to the inner landing pad.
+  const unsigned phiCapacity = 2;
+
+  // Create corresponding new phis for all the phis in the outer landing pad.
+  BasicBlock::iterator insertPoint = InnerUnwindDest->begin();
+  BasicBlock::iterator I = OuterUnwindDest->begin();
+  for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) {
+    PHINode *outerPhi = cast<PHINode>(I);
+    PHINode *innerPhi = PHINode::Create(outerPhi->getType(), phiCapacity,
+                                        outerPhi->getName() + ".lpad-body",
+                                        insertPoint);
+    outerPhi->replaceAllUsesWith(innerPhi);
+    innerPhi->addIncoming(outerPhi, OuterUnwindDest);
+  }
+
+  // Create a phi for the exception value...
+  InnerExceptionPHI = PHINode::Create(exn->getType(), phiCapacity,
+                                      "exn.lpad-body", insertPoint);
+  exn->replaceAllUsesWith(InnerExceptionPHI);
+  selector->setArgOperand(0, exn); // restore this use
+  InnerExceptionPHI->addIncoming(exn, OuterUnwindDest);
+
+  // ...and the selector.
+  InnerSelectorPHI = PHINode::Create(selector->getType(), phiCapacity,
+                                     "selector.lpad-body", insertPoint);
+  selector->replaceAllUsesWith(InnerSelectorPHI);
+  InnerSelectorPHI->addIncoming(selector, OuterUnwindDest);
+
+  // All done.
+  return InnerUnwindDest;
+}
+
+/// [LIBUNWIND] Try to forward the given call, which logically occurs
+/// at the end of the given block, as a branch to the inner unwind
+/// block.  Returns true if the call was forwarded.
+bool InvokeInliningInfo::forwardEHResume(CallInst *call, BasicBlock *src) {
+  // First, check whether this is a call to the intrinsic.
+  Function *fn = dyn_cast<Function>(call->getCalledValue());
+  if (!fn || fn->getName() != "llvm.eh.resume")
+    return false;
+  
+  // At this point, we need to return true on all paths, because
+  // otherwise we'll construct an invoke of the intrinsic, which is
+  // not well-formed.
+
+  // Try to find or make an inner unwind dest, which will fail if we
+  // can't find a selector call for the outer unwind dest.
+  BasicBlock *dest = getInnerUnwindDest();
+  bool hasSelector = (dest != 0);
+
+  // If we failed, just use the outer unwind dest, dropping the
+  // exception and selector on the floor.
+  if (!hasSelector)
+    dest = OuterUnwindDest;
+
+  // Make a branch.
+  BranchInst::Create(dest, src);
+
+  // Update the phis in the destination.  They were inserted in an
+  // order which makes this work.
+  addIncomingPHIValuesForInto(src, dest);
+
+  if (hasSelector) {
+    InnerExceptionPHI->addIncoming(call->getArgOperand(0), src);
+    InnerSelectorPHI->addIncoming(call->getArgOperand(1), src);
+  }
+
+  return true;
+}
+
+/// [LIBUNWIND] Check whether this selector is "only cleanups":
+///   call i32 @llvm.eh.selector(blah, blah, i32 0)
+static bool isCleanupOnlySelector(EHSelectorInst *selector) {
+  if (selector->getNumArgOperands() != 3) return false;
+  ConstantInt *val = dyn_cast<ConstantInt>(selector->getArgOperand(2));
+  return (val && val->isZero());
+}
 
 /// HandleCallsInBlockInlinedThroughInvoke - When we inline a basic block into
 /// an invoke, we have to turn all of the calls that can throw into
@@ -44,9 +418,9 @@ bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI) {
 /// it rewrites them to be invokes that jump to InvokeDest and fills in the PHI
 /// nodes in that block with the values specified in InvokeDestPHIValues.
 ///
-static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,
-                                                   BasicBlock *InvokeDest,
-                           const SmallVectorImpl<Value*> &InvokeDestPHIValues) {
+/// Returns true to indicate that the next block should be skipped.
+static bool HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,
+                                                   InvokeInliningInfo &Invoke) {
   for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
     Instruction *I = BBI++;
     
@@ -54,6 +428,37 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,
     // instructions require no special handling.
     CallInst *CI = dyn_cast<CallInst>(I);
     if (CI == 0) continue;
+
+    // LIBUNWIND: merge selector instructions.
+    if (EHSelectorInst *Inner = dyn_cast<EHSelectorInst>(CI)) {
+      EHSelectorInst *Outer = Invoke.getOuterSelector();
+      if (!Outer) continue;
+
+      bool innerIsOnlyCleanup = isCleanupOnlySelector(Inner);
+      bool outerIsOnlyCleanup = isCleanupOnlySelector(Outer);
+
+      // If both selectors contain only cleanups, we don't need to do
+      // anything.  TODO: this is really just a very specific instance
+      // of a much more general optimization.
+      if (innerIsOnlyCleanup && outerIsOnlyCleanup) continue;
+
+      // Otherwise, we just append the outer selector to the inner selector.
+      SmallVector<Value*, 16> NewSelector;
+      for (unsigned i = 0, e = Inner->getNumArgOperands(); i != e; ++i)
+        NewSelector.push_back(Inner->getArgOperand(i));
+      for (unsigned i = 2, e = Outer->getNumArgOperands(); i != e; ++i)
+        NewSelector.push_back(Outer->getArgOperand(i));
+
+      CallInst *NewInner =
+        IRBuilder<>(Inner).CreateCall(Inner->getCalledValue(),
+                                      NewSelector.begin(),
+                                      NewSelector.end());
+      // No need to copy attributes, calling convention, etc.
+      NewInner->takeName(Inner);
+      Inner->replaceAllUsesWith(NewInner);
+      Inner->eraseFromParent();
+      continue;
+    }
     
     // If this call cannot unwind, don't convert it to an invoke.
     if (CI->doesNotThrow())
@@ -62,37 +467,45 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,
     // Convert this function call into an invoke instruction.
     // First, split the basic block.
     BasicBlock *Split = BB->splitBasicBlock(CI, CI->getName()+".noexc");
-    
-    // Next, create the new invoke instruction, inserting it at the end
-    // of the old basic block.
+
+    // Delete the unconditional branch inserted by splitBasicBlock
+    BB->getInstList().pop_back();
+
+    // LIBUNWIND: If this is a call to @llvm.eh.resume, just branch
+    // directly to the new landing pad.
+    if (Invoke.forwardEHResume(CI, BB)) {
+      // TODO: 'Split' is now unreachable; clean it up.
+
+      // We want to leave the original call intact so that the call
+      // graph and other structures won't get misled.  We also have to
+      // avoid processing the next block, or we'll iterate here forever.
+      return true;
+    }
+
+    // Otherwise, create the new invoke instruction.
     ImmutableCallSite CS(CI);
     SmallVector<Value*, 8> InvokeArgs(CS.arg_begin(), CS.arg_end());
     InvokeInst *II =
-      InvokeInst::Create(CI->getCalledValue(), Split, InvokeDest,
+      InvokeInst::Create(CI->getCalledValue(), Split,
+                         Invoke.getOuterUnwindDest(),
                          InvokeArgs.begin(), InvokeArgs.end(),
-                         CI->getName(), BB->getTerminator());
+                         CI->getName(), BB);
     II->setCallingConv(CI->getCallingConv());
     II->setAttributes(CI->getAttributes());
     
     // Make sure that anything using the call now uses the invoke!  This also
     // updates the CallGraph if present, because it uses a WeakVH.
     CI->replaceAllUsesWith(II);
-    
-    // Delete the unconditional branch inserted by splitBasicBlock
-    BB->getInstList().pop_back();
+
     Split->getInstList().pop_front();  // Delete the original call
-    
+
     // Update any PHI nodes in the exceptional block to indicate that
     // there is now a new entry in them.
-    unsigned i = 0;
-    for (BasicBlock::iterator I = InvokeDest->begin();
-         isa<PHINode>(I); ++I, ++i)
-      cast<PHINode>(I)->addIncoming(InvokeDestPHIValues[i], BB);
-    
-    // This basic block is now complete, the caller will continue scanning the
-    // next one.
-    return;
+    Invoke.addIncomingPHIValuesFor(BB);
+    return false;
   }
+
+  return false;
 }
   
 
@@ -106,17 +519,6 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,
 static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock,
                                 ClonedCodeInfo &InlinedCodeInfo) {
   BasicBlock *InvokeDest = II->getUnwindDest();
-  SmallVector<Value*, 8> InvokeDestPHIValues;
-
-  // If there are PHI nodes in the unwind destination block, we need to
-  // keep track of which values came into them from this invoke, then remove
-  // the entry for this block.
-  BasicBlock *InvokeBlock = II->getParent();
-  for (BasicBlock::iterator I = InvokeDest->begin(); isa<PHINode>(I); ++I) {
-    PHINode *PN = cast<PHINode>(I);
-    // Save the value to use for this edge.
-    InvokeDestPHIValues.push_back(PN->getIncomingValueForBlock(InvokeBlock));
-  }
 
   Function *Caller = FirstNewBlock->getParent();
 
@@ -132,11 +534,17 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock,
     InvokeDest->removePredecessor(II->getParent());
     return;
   }
+
+  InvokeInliningInfo Invoke(II);
   
   for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E; ++BB){
     if (InlinedCodeInfo.ContainsCalls)
-      HandleCallsInBlockInlinedThroughInvoke(BB, InvokeDest,
-                                             InvokeDestPHIValues);
+      if (HandleCallsInBlockInlinedThroughInvoke(BB, Invoke)) {
+        // Honor a request to skip the next block.  We don't need to
+        // consider UnwindInsts in this case either.
+        ++BB;
+        continue;
+      }
 
     if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) {
       // An UnwindInst requires special handling when it gets inlined into an
@@ -150,12 +558,7 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock,
 
       // Update any PHI nodes in the exceptional block to indicate that
       // there is now a new entry in them.
-      unsigned i = 0;
-      for (BasicBlock::iterator I = InvokeDest->begin();
-           isa<PHINode>(I); ++I, ++i) {
-        PHINode *PN = cast<PHINode>(I);
-        PN->addIncoming(InvokeDestPHIValues[i], BB);
-      }
+      Invoke.addIncomingPHIValuesFor(BB);
     }
   }
 
@@ -299,28 +702,55 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
     ConstantInt::get(Type::getInt32Ty(Context), 1),
     ConstantInt::getFalse(Context) // isVolatile
   };
-  CallInst *TheMemCpy =
-    CallInst::Create(MemCpyFn, CallArgs, CallArgs+5, "", TheCall);
-  
-  // If we have a call graph, update it.
-  if (CallGraph *CG = IFI.CG) {
-    CallGraphNode *MemCpyCGN = CG->getOrInsertFunction(MemCpyFn);
-    CallGraphNode *CallerNode = (*CG)[Caller];
-    CallerNode->addCalledFunction(TheMemCpy, MemCpyCGN);
-  }
+  IRBuilder<>(TheCall).CreateCall(MemCpyFn, CallArgs, CallArgs+5);
   
   // Uses of the argument in the function should use our new alloca
   // instead.
   return NewAlloca;
 }
 
+// isUsedByLifetimeMarker - Check whether this Value is used by a lifetime
+// intrinsic.
+static bool isUsedByLifetimeMarker(Value *V) {
+  for (Value::use_iterator UI = V->use_begin(), UE = V->use_end(); UI != UE;
+       ++UI) {
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(*UI)) {
+      switch (II->getIntrinsicID()) {
+      default: break;
+      case Intrinsic::lifetime_start:
+      case Intrinsic::lifetime_end:
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// hasLifetimeMarkers - Check whether the given alloca already has
+// lifetime.start or lifetime.end intrinsics.
+static bool hasLifetimeMarkers(AllocaInst *AI) {
+  const Type *Int8PtrTy = Type::getInt8PtrTy(AI->getType()->getContext());
+  if (AI->getType() == Int8PtrTy)
+    return isUsedByLifetimeMarker(AI);
+
+  // Do a scan to find all the casts to i8*.
+  for (Value::use_iterator I = AI->use_begin(), E = AI->use_end(); I != E;
+       ++I) {
+    if (I->getType() != Int8PtrTy) continue;
+    if (I->stripPointerCasts() != AI) continue;
+    if (isUsedByLifetimeMarker(*I))
+      return true;
+  }
+  return false;
+}
+
 // InlineFunction - This function inlines the called function into the basic
 // block of the caller.  This returns false if it is not possible to inline this
 // call.  The program is still in a well defined state if this occurs though.
 //
 // Note that this only does one level of inlining.  For example, if the
 // instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now
-// exists in the instruction stream.  Similiarly this will inline a recursive
+// exists in the instruction stream.  Similarly this will inline a recursive
 // function by one level.
 //
 bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) {
@@ -460,6 +890,26 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) {
     }
   }
 
+  // Leave lifetime markers for the static alloca's, scoping them to the
+  // function we just inlined.
+  if (!IFI.StaticAllocas.empty()) {
+    IRBuilder<> builder(FirstNewBlock->begin());
+    for (unsigned ai = 0, ae = IFI.StaticAllocas.size(); ai != ae; ++ai) {
+      AllocaInst *AI = IFI.StaticAllocas[ai];
+
+      // If the alloca is already scoped to something smaller than the whole
+      // function then there's no need to add redundant, less accurate markers.
+      if (hasLifetimeMarkers(AI))
+        continue;
+
+      builder.CreateLifetimeStart(AI);
+      for (unsigned ri = 0, re = Returns.size(); ri != re; ++ri) {
+        IRBuilder<> builder(Returns[ri]);
+        builder.CreateLifetimeEnd(AI);
+      }
+    }
+  }
+
   // If the inlined code contained dynamic alloca instructions, wrap the inlined
   // code with llvm.stacksave/llvm.stackrestore intrinsics.
   if (InlinedFunctionInfo.ContainsDynamicAllocas) {
@@ -468,25 +918,14 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) {
     Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave);
     Function *StackRestore=Intrinsic::getDeclaration(M,Intrinsic::stackrestore);
 
-    // If we are preserving the callgraph, add edges to the stacksave/restore
-    // functions for the calls we insert.
-    CallGraphNode *StackSaveCGN = 0, *StackRestoreCGN = 0, *CallerNode = 0;
-    if (CallGraph *CG = IFI.CG) {
-      StackSaveCGN    = CG->getOrInsertFunction(StackSave);
-      StackRestoreCGN = CG->getOrInsertFunction(StackRestore);
-      CallerNode = (*CG)[Caller];
-    }
-
     // Insert the llvm.stacksave.
-    CallInst *SavedPtr = CallInst::Create(StackSave, "savedstack",
-                                          FirstNewBlock->begin());
-    if (IFI.CG) CallerNode->addCalledFunction(SavedPtr, StackSaveCGN);
+    CallInst *SavedPtr = IRBuilder<>(FirstNewBlock, FirstNewBlock->begin())
+      .CreateCall(StackSave, "savedstack");
 
     // Insert a call to llvm.stackrestore before any return instructions in the
     // inlined function.
     for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
-      CallInst *CI = CallInst::Create(StackRestore, SavedPtr, "", Returns[i]);
-      if (IFI.CG) CallerNode->addCalledFunction(CI, StackRestoreCGN);
+      IRBuilder<>(Returns[i]).CreateCall(StackRestore, SavedPtr);
     }
 
     // Count the number of StackRestore calls we insert.
@@ -498,8 +937,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) {
       for (Function::iterator BB = FirstNewBlock, E = Caller->end();
            BB != E; ++BB)
         if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) {
-          CallInst *CI = CallInst::Create(StackRestore, SavedPtr, "", UI);
-          if (IFI.CG) CallerNode->addCalledFunction(CI, StackRestoreCGN);
+          IRBuilder<>(UI).CreateCall(StackRestore, SavedPtr);
           ++NumStackRestores;
         }
     }
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index d7e2336..19c3c72 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -20,6 +20,8 @@
 #include "llvm/Instructions.h"
 #include "llvm/Intrinsics.h"
 #include "llvm/IntrinsicInst.h"
+#include "llvm/Metadata.h"
+#include "llvm/Operator.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Analysis/DebugInfo.h"
@@ -33,6 +35,7 @@
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/IRBuilder.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
@@ -42,12 +45,16 @@ using namespace llvm;
 //  Local constant propagation.
 //
 
-// ConstantFoldTerminator - If a terminator instruction is predicated on a
-// constant value, convert it into an unconditional branch to the constant
-// destination.
-//
-bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
+/// ConstantFoldTerminator - If a terminator instruction is predicated on a
+/// constant value, convert it into an unconditional branch to the constant
+/// destination.  This is a nontrivial operation because the successors of this
+/// basic block must have their PHI nodes updated.
+/// Also calls RecursivelyDeleteTriviallyDeadInstructions() on any branch/switch
+/// conditions and indirectbr addresses this might make dead if
+/// DeleteDeadConditions is true.
+bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) {
   TerminatorInst *T = BB->getTerminator();
+  IRBuilder<> Builder(T);
 
   // Branch - See if we are conditional jumping on constant
   if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
@@ -67,11 +74,10 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
 
       // Let the basic block know that we are letting go of it.  Based on this,
       // it will adjust it's PHI nodes.
-      assert(BI->getParent() && "Terminator not inserted in block!");
-      OldDest->removePredecessor(BI->getParent());
+      OldDest->removePredecessor(BB);
 
       // Replace the conditional branch with an unconditional one.
-      BranchInst::Create(Destination, BI);
+      Builder.CreateBr(Destination);
       BI->eraseFromParent();
       return true;
     }
@@ -86,8 +92,11 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
       Dest1->removePredecessor(BI->getParent());
 
       // Replace the conditional branch with an unconditional one.
-      BranchInst::Create(Dest1, BI);
+      Builder.CreateBr(Dest1);
+      Value *Cond = BI->getCondition();
       BI->eraseFromParent();
+      if (DeleteDeadConditions)
+        RecursivelyDeleteTriviallyDeadInstructions(Cond);
       return true;
     }
     return false;
@@ -136,7 +145,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
     // now.
     if (TheOnlyDest) {
       // Insert the new branch.
-      BranchInst::Create(TheOnlyDest, SI);
+      Builder.CreateBr(TheOnlyDest);
       BasicBlock *BB = SI->getParent();
 
       // Remove entries from PHI nodes which we no longer branch to...
@@ -150,17 +159,21 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
       }
 
       // Delete the old switch.
-      BB->getInstList().erase(SI);
+      Value *Cond = SI->getCondition();
+      SI->eraseFromParent();
+      if (DeleteDeadConditions)
+        RecursivelyDeleteTriviallyDeadInstructions(Cond);
       return true;
     }
     
     if (SI->getNumSuccessors() == 2) {
       // Otherwise, we can fold this switch into a conditional branch
       // instruction if it has only one non-default destination.
-      Value *Cond = new ICmpInst(SI, ICmpInst::ICMP_EQ, SI->getCondition(),
-                                 SI->getSuccessorValue(1), "cond");
+      Value *Cond = Builder.CreateICmpEQ(SI->getCondition(),
+                                         SI->getSuccessorValue(1), "cond");
+
       // Insert the new branch.
-      BranchInst::Create(SI->getSuccessor(1), SI->getSuccessor(0), Cond, SI);
+      Builder.CreateCondBr(Cond, SI->getSuccessor(1), SI->getSuccessor(0));
 
       // Delete the old switch.
       SI->eraseFromParent();
@@ -175,7 +188,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
           dyn_cast<BlockAddress>(IBI->getAddress()->stripPointerCasts())) {
       BasicBlock *TheOnlyDest = BA->getBasicBlock();
       // Insert the new branch.
-      BranchInst::Create(TheOnlyDest, IBI);
+      Builder.CreateBr(TheOnlyDest);
       
       for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
         if (IBI->getDestination(i) == TheOnlyDest)
@@ -183,7 +196,10 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
         else
           IBI->getDestination(i)->removePredecessor(IBI->getParent());
       }
+      Value *Address = IBI->getAddress();
       IBI->eraseFromParent();
+      if (DeleteDeadConditions)
+        RecursivelyDeleteTriviallyDeadInstructions(Address);
       
       // If we didn't find our destination in the IBI successor list, then we
       // have undefined behavior.  Replace the unconditional branch with an
@@ -690,39 +706,15 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
 ///
 static unsigned enforceKnownAlignment(Value *V, unsigned Align,
                                       unsigned PrefAlign) {
+  V = V->stripPointerCasts();
 
-  User *U = dyn_cast<User>(V);
-  if (!U) return Align;
-
-  switch (Operator::getOpcode(U)) {
-  default: break;
-  case Instruction::BitCast:
-    return enforceKnownAlignment(U->getOperand(0), Align, PrefAlign);
-  case Instruction::GetElementPtr: {
-    // If all indexes are zero, it is just the alignment of the base pointer.
-    bool AllZeroOperands = true;
-    for (User::op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e; ++i)
-      if (!isa<Constant>(*i) ||
-          !cast<Constant>(*i)->isNullValue()) {
-        AllZeroOperands = false;
-        break;
-      }
-
-    if (AllZeroOperands) {
-      // Treat this like a bitcast.
-      return enforceKnownAlignment(U->getOperand(0), Align, PrefAlign);
-    }
-    return Align;
-  }
-  case Instruction::Alloca: {
-    AllocaInst *AI = cast<AllocaInst>(V);
+  if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
     // If there is a requested alignment and if this is an alloca, round up.
     if (AI->getAlignment() >= PrefAlign)
       return AI->getAlignment();
     AI->setAlignment(PrefAlign);
     return PrefAlign;
   }
-  }
 
   if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
     // If there is a large requested alignment and we can, bump up the alignment
@@ -785,10 +777,19 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
   if (!DIVar.Verify())
     return false;
 
-  Instruction *DbgVal = 
-    Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0,
-                                    DIVar, SI);
-  
+  Instruction *DbgVal = NULL;
+  // If an argument is zero extended then use argument directly. The ZExt
+  // may be zapped by an optimization pass in future.
+  Argument *ExtendedArg = NULL;
+  if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0)))
+    ExtendedArg = dyn_cast<Argument>(ZExt->getOperand(0));
+  if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
+    ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0));
+  if (ExtendedArg)
+    DbgVal = Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, SI);
+  else
+    DbgVal = Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, SI);
+
   // Propagate any debug metadata from the store onto the dbg.value.
   DebugLoc SIDL = SI->getDebugLoc();
   if (!SIDL.isUnknown())
@@ -838,14 +839,30 @@ bool llvm::LowerDbgDeclare(Function &F) {
          E = Dbgs.end(); I != E; ++I) {
     DbgDeclareInst *DDI = *I;
     if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress())) {
+      bool RemoveDDI = true;
       for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
            UI != E; ++UI)
         if (StoreInst *SI = dyn_cast<StoreInst>(*UI))
           ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
         else if (LoadInst *LI = dyn_cast<LoadInst>(*UI))
           ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
+        else
+          RemoveDDI = false;
+      if (RemoveDDI)
+        DDI->eraseFromParent();
     }
-    DDI->eraseFromParent();
   }
   return true;
 }
+
+/// FindAllocaDbgDeclare - Finds the llvm.dbg.declare intrinsic describing the
+/// alloca 'V', if any.
+DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) {
+  if (MDNode *DebugNode = MDNode::getIfExists(V->getContext(), V))
+    for (Value::use_iterator UI = DebugNode->use_begin(),
+         E = DebugNode->use_end(); UI != E; ++UI)
+      if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI))
+        return DDI;
+
+  return 0;
+}
diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
index 9fe5929..f02ffd2 100644
--- a/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/lib/Transforms/Utils/LoopSimplify.cpp
@@ -115,7 +115,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfo)
 INITIALIZE_PASS_END(LoopSimplify, "loop-simplify",
                 "Canonicalize natural loops", true, false)
 
-// Publically exposed interface to pass...
+// Publicly exposed interface to pass...
 char &llvm::LoopSimplifyID = LoopSimplify::ID;
 Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); }
 
diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
index 914a439..ed733d3 100644
--- a/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/lib/Transforms/Utils/LowerSwitch.cpp
@@ -84,7 +84,7 @@ char LowerSwitch::ID = 0;
 INITIALIZE_PASS(LowerSwitch, "lowerswitch",
                 "Lower SwitchInst's to branches", false, false)
 
-// Publically exposed interface to pass...
+// Publicly exposed interface to pass...
 char &llvm::LowerSwitchID = LowerSwitch::ID;
 // createLowerSwitchPass - Interface to this file...
 FunctionPass *llvm::createLowerSwitchPass() {
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index c96bbad..a1736b9 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -46,7 +46,6 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/CFG.h"
 #include <algorithm>
-#include <map>
 #include <queue>
 using namespace llvm;
 
@@ -101,18 +100,6 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
   return true;
 }
 
-/// FindAllocaDbgDeclare - Finds the llvm.dbg.declare intrinsic describing the
-/// alloca 'V', if any.
-static DbgDeclareInst *FindAllocaDbgDeclare(Value *V) {
-  if (MDNode *DebugNode = MDNode::getIfExists(V->getContext(), &V, 1))
-    for (Value::use_iterator UI = DebugNode->use_begin(),
-         E = DebugNode->use_end(); UI != E; ++UI)
-      if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI))
-        return DDI;
-
-  return 0;
-}
-
 namespace {
   struct AllocaInfo;
 
diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp
index 4f83b7e..b336194 100644
--- a/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/lib/Transforms/Utils/SSAUpdater.cpp
@@ -12,16 +12,22 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "ssaupdater"
+#include "llvm/Constants.h"
 #include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/DIBuilder.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Support/AlignOf.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
+
 using namespace llvm;
 
 typedef DenseMap<BasicBlock*, Value*> AvailableValsTy;
@@ -184,6 +190,9 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
     return V;
   }
 
+  // Set DebugLoc.
+  InsertedPHI->setDebugLoc(GetFirstDebugLocInBasicBlock(BB));
+
   // If the client wants to know about all new instructions, tell it.
   if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
 
@@ -349,7 +358,8 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
 
 LoadAndStorePromoter::
 LoadAndStorePromoter(const SmallVectorImpl<Instruction*> &Insts,
-                     SSAUpdater &S, StringRef BaseName) : SSA(S) {
+                     SSAUpdater &S, DbgDeclareInst *DD, DIBuilder *DB,
+                     StringRef BaseName) : SSA(S), DDI(DD), DIB(DB) {
   if (Insts.empty()) return;
   
   Value *SomeVal;
@@ -396,9 +406,11 @@ run(const SmallVectorImpl<Instruction*> &Insts) const {
     // single user in it, we can rewrite it trivially.
     if (BlockUses.size() == 1) {
       // If it is a store, it is a trivial def of the value in the block.
-      if (StoreInst *SI = dyn_cast<StoreInst>(User))
+      if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+        if (DDI)
+          ConvertDebugDeclareToDebugValue(DDI, SI, *DIB);
         SSA.AddAvailableValue(BB, SI->getOperand(0));
-      else 
+      } else 
         // Otherwise it is a load, queue it to rewrite as a live-in load.
         LiveInLoads.push_back(cast<LoadInst>(User));
       BlockUses.clear();
@@ -447,12 +459,15 @@ run(const SmallVectorImpl<Instruction*> &Insts) const {
         continue;
       }
       
-      if (StoreInst *S = dyn_cast<StoreInst>(II)) {
+      if (StoreInst *SI = dyn_cast<StoreInst>(II)) {
         // If this is a store to an unrelated pointer, ignore it.
-        if (!isInstInList(S, Insts)) continue;
-        
+        if (!isInstInList(SI, Insts)) continue;
+
+        if (DDI)
+          ConvertDebugDeclareToDebugValue(DDI, SI, *DIB);
+
         // Remember that this is the active value in the block.
-        StoredValue = S->getOperand(0);
+        StoredValue = SI->getOperand(0);
       }
     }
     
@@ -507,4 +522,7 @@ run(const SmallVectorImpl<Instruction*> &Insts) const {
     instructionDeleted(User);
     User->eraseFromParent();
   }
+
+  if (DDI)
+    DDI->eraseFromParent();
 }
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index cfc897c..6df846c 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -20,6 +20,7 @@
 #include "llvm/DerivedTypes.h"
 #include "llvm/GlobalVariable.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/ADT/DenseMap.h"
@@ -31,12 +32,18 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ConstantRange.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Support/NoFolder.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <set>
 #include <map>
 using namespace llvm;
 
+static cl::opt<unsigned>
+PHINodeFoldingThreshold("phi-node-folding-threshold", cl::Hidden, cl::init(1),
+   cl::desc("Control the amount of phi node folding to perform (default = 1)"));
+
 static cl::opt<bool>
 DupRet("simplifycfg-dup-ret", cl::Hidden, cl::init(false),
        cl::desc("Duplicate return instructions into unconditional branches"));
@@ -51,16 +58,18 @@ class SimplifyCFGOpt {
   BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI,
     std::vector<std::pair<ConstantInt*, BasicBlock*> > &Cases);
   bool SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
-                                                     BasicBlock *Pred);
-  bool FoldValueComparisonIntoPredecessors(TerminatorInst *TI);
+                                                     BasicBlock *Pred,
+                                                     IRBuilder<> &Builder);
+  bool FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
+                                           IRBuilder<> &Builder);
 
-  bool SimplifyReturn(ReturnInst *RI);
-  bool SimplifyUnwind(UnwindInst *UI);
+  bool SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder);
+  bool SimplifyUnwind(UnwindInst *UI, IRBuilder<> &Builder);
   bool SimplifyUnreachable(UnreachableInst *UI);
-  bool SimplifySwitch(SwitchInst *SI);
+  bool SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder);
   bool SimplifyIndirectBr(IndirectBrInst *IBI);
-  bool SimplifyUncondBranch(BranchInst *BI);
-  bool SimplifyCondBranch(BranchInst *BI);
+  bool SimplifyUncondBranch(BranchInst *BI, IRBuilder <> &Builder);
+  bool SimplifyCondBranch(BranchInst *BI, IRBuilder <>&Builder);
 
 public:
   explicit SimplifyCFGOpt(const TargetData *td) : TD(td) {}
@@ -201,11 +210,20 @@ static Value *GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
 /// which works well enough for us.
 ///
 /// If AggressiveInsts is non-null, and if V does not dominate BB, we check to
-/// see if V (which must be an instruction) is cheap to compute and is
-/// non-trapping.  If both are true, the instruction is inserted into the set
-/// and true is returned.
+/// see if V (which must be an instruction) and its recursive operands
+/// that do not dominate BB have a combined cost lower than CostRemaining and
+/// are non-trapping.  If both are true, the instruction is inserted into the
+/// set and true is returned.
+///
+/// The cost for most non-trapping instructions is defined as 1 except for
+/// Select whose cost is 2.
+///
+/// After this function returns, CostRemaining is decreased by the cost of
+/// V plus its non-dominating operands.  If that cost is greater than
+/// CostRemaining, false is returned and CostRemaining is undefined.
 static bool DominatesMergePoint(Value *V, BasicBlock *BB,
-                                SmallPtrSet<Instruction*, 4> *AggressiveInsts) {
+                                SmallPtrSet<Instruction*, 4> *AggressiveInsts,
+                                unsigned &CostRemaining) {
   Instruction *I = dyn_cast<Instruction>(V);
   if (!I) {
     // Non-instructions all dominate instructions, but not all constantexprs
@@ -232,12 +250,17 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
   // instructions in the 'if region'.
   if (AggressiveInsts == 0) return false;
   
+  // If we have seen this instruction before, don't count it again.
+  if (AggressiveInsts->count(I)) return true;
+
   // Okay, it looks like the instruction IS in the "condition".  Check to
   // see if it's a cheap instruction to unconditionally compute, and if it
   // only uses stuff defined outside of the condition.  If so, hoist it out.
   if (!I->isSafeToSpeculativelyExecute())
     return false;
 
+  unsigned Cost = 0;
+
   switch (I->getOpcode()) {
   default: return false;  // Cannot hoist this out safely.
   case Instruction::Load:
@@ -246,11 +269,13 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
     // predecessor.
     if (PBB->getFirstNonPHIOrDbg() != I)
       return false;
+    Cost = 1;
     break;
   case Instruction::GetElementPtr:
     // GEPs are cheap if all indices are constant.
     if (!cast<GetElementPtrInst>(I)->hasAllConstantIndices())
       return false;
+    Cost = 1;
     break;
   case Instruction::Add:
   case Instruction::Sub:
@@ -261,13 +286,26 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
   case Instruction::LShr:
   case Instruction::AShr:
   case Instruction::ICmp:
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    Cost = 1;
     break;   // These are all cheap and non-trapping instructions.
+
+  case Instruction::Select:
+    Cost = 2;
+    break;
   }
 
-  // Okay, we can only really hoist these out if their operands are not
-  // defined in the conditional region.
+  if (Cost > CostRemaining)
+    return false;
+
+  CostRemaining -= Cost;
+
+  // Okay, we can only really hoist these out if their operands do
+  // not take us over the cost threshold.
   for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i)
-    if (!DominatesMergePoint(*i, BB, 0))
+    if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining))
       return false;
   // Okay, it's safe to do this!  Remember this instruction.
   AggressiveInsts->insert(I);
@@ -508,7 +546,8 @@ ValuesOverlap(std::vector<std::pair<ConstantInt*, BasicBlock*> > &C1,
 /// form of jump threading.
 bool SimplifyCFGOpt::
 SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
-                                              BasicBlock *Pred) {
+                                              BasicBlock *Pred,
+                                              IRBuilder<> &Builder) {
   Value *PredVal = isValueEqualityComparison(Pred->getTerminator());
   if (!PredVal) return false;  // Not a value comparison in predecessor.
 
@@ -541,7 +580,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
       // uncond br.
       assert(ThisCases.size() == 1 && "Branch can only have one case!");
       // Insert the new branch.
-      Instruction *NI = BranchInst::Create(ThisDef, TI);
+      Instruction *NI = Builder.CreateBr(ThisDef);
       (void) NI;
 
       // Remove PHI node entries for the dead edge.
@@ -606,7 +645,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
       CheckEdge = 0;
 
   // Insert the new branch.
-  Instruction *NI = BranchInst::Create(TheRealDest, TI);
+  Instruction *NI = Builder.CreateBr(TheRealDest);
   (void) NI;
 
   DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
@@ -641,7 +680,8 @@ static int ConstantIntSortPredicate(const void *P1, const void *P2) {
 /// equality comparison instruction (either a switch or a branch on "X == c").
 /// See if any of the predecessors of the terminator block are value comparisons
 /// on the same value.  If so, and if safe to do so, fold them together.
-bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI) {
+bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
+                                                         IRBuilder<> &Builder) {
   BasicBlock *BB = TI->getParent();
   Value *CV = isValueEqualityComparison(TI);  // CondVal
   assert(CV && "Not a comparison?");
@@ -734,16 +774,18 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI) {
       for (unsigned i = 0, e = NewSuccessors.size(); i != e; ++i)
         AddPredecessorToBlock(NewSuccessors[i], Pred, BB);
 
+      Builder.SetInsertPoint(PTI);
       // Convert pointer to int before we switch.
       if (CV->getType()->isPointerTy()) {
         assert(TD && "Cannot switch on pointer without TargetData");
-        CV = new PtrToIntInst(CV, TD->getIntPtrType(CV->getContext()),
-                              "magicptr", PTI);
+        CV = Builder.CreatePtrToInt(CV, TD->getIntPtrType(CV->getContext()),
+                                    "magicptr");
       }
 
       // Now that the successors are updated, create the new Switch instruction.
-      SwitchInst *NewSI = SwitchInst::Create(CV, PredDefault,
-                                             PredCases.size(), PTI);
+      SwitchInst *NewSI = Builder.CreateSwitch(CV, PredDefault,
+                                               PredCases.size());
+      NewSI->setDebugLoc(PTI->getDebugLoc());
       for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
         NewSI->addCase(PredCases[i].first, PredCases[i].second);
 
@@ -867,6 +909,7 @@ HoistTerminator:
     NT->takeName(I1);
   }
 
+  IRBuilder<true, NoFolder> Builder(NT);
   // Hoisting one of the terminators from our successor is a great thing.
   // Unfortunately, the successors of the if/else blocks may have PHI nodes in
   // them.  If they do, all PHI entries for BB1/BB2 must agree for all PHI
@@ -883,9 +926,11 @@ HoistTerminator:
       // These values do not agree.  Insert a select instruction before NT
       // that determines the right value.
       SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)];
-      if (SI == 0)
-        SI = SelectInst::Create(BI->getCondition(), BB1V, BB2V,
-                                BB1V->getName()+"."+BB2V->getName(), NT);
+      if (SI == 0) 
+        SI = cast<SelectInst>
+          (Builder.CreateSelect(BI->getCondition(), BB1V, BB2V,
+                                BB1V->getName()+"."+BB2V->getName()));
+
       // Make the PHI node use the select for all incoming values for BB1/BB2
       for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
         if (PN->getIncomingBlock(i) == BB1 || PN->getIncomingBlock(i) == BB2)
@@ -1043,13 +1088,16 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) {
 
   // Create a select whose true value is the speculatively executed value and
   // false value is the previously determined FalseV.
+  IRBuilder<true, NoFolder> Builder(BI);
   SelectInst *SI;
   if (Invert)
-    SI = SelectInst::Create(BrCond, FalseV, HInst,
-                            FalseV->getName() + "." + HInst->getName(), BI);
+    SI = cast<SelectInst>
+      (Builder.CreateSelect(BrCond, FalseV, HInst,
+                            FalseV->getName() + "." + HInst->getName()));
   else
-    SI = SelectInst::Create(BrCond, HInst, FalseV,
-                            HInst->getName() + "." + FalseV->getName(), BI);
+    SI = cast<SelectInst>
+      (Builder.CreateSelect(BrCond, HInst, FalseV,
+                            HInst->getName() + "." + FalseV->getName()));
 
   // Make the PHI node use the select for all incoming values for "then" and
   // "if" blocks.
@@ -1123,6 +1171,8 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const TargetData *TD) {
     BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue());
     
     if (RealDest == BB) continue;  // Skip self loops.
+    // Skip if the predecessor's terminator is an indirect branch.
+    if (isa<IndirectBrInst>(PredBB->getTerminator())) continue;
     
     // The dest block might have PHI nodes, other predecessors and other
     // difficult cases.  Instead of being smart about this, just insert a new
@@ -1178,7 +1228,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const TargetData *TD) {
         BB->removePredecessor(PredBB);
         PredBBTI->setSuccessor(i, EdgeBB);
       }
-    
+
     // Recurse, simplifying any other constants.
     return FoldCondBranchOnPHI(BI, TD) | true;
   }
@@ -1217,6 +1267,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
   // instructions.  While we are at it, keep track of the instructions
   // that need to be moved to the dominating block.
   SmallPtrSet<Instruction*, 4> AggressiveInsts;
+  unsigned MaxCostVal0 = PHINodeFoldingThreshold,
+           MaxCostVal1 = PHINodeFoldingThreshold;
   
   for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) {
     PHINode *PN = cast<PHINode>(II++);
@@ -1226,8 +1278,10 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
       continue;
     }
     
-    if (!DominatesMergePoint(PN->getIncomingValue(0), BB, &AggressiveInsts) ||
-        !DominatesMergePoint(PN->getIncomingValue(1), BB, &AggressiveInsts))
+    if (!DominatesMergePoint(PN->getIncomingValue(0), BB, &AggressiveInsts,
+                             MaxCostVal0) ||
+        !DominatesMergePoint(PN->getIncomingValue(1), BB, &AggressiveInsts,
+                             MaxCostVal1))
       return false;
   }
   
@@ -1283,6 +1337,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
   // If we can still promote the PHI nodes after this gauntlet of tests,
   // do all of the PHI's now.
   Instruction *InsertPt = DomBlock->getTerminator();
+  IRBuilder<true, NoFolder> Builder(InsertPt);
   
   // Move all 'aggressive' instructions, which are defined in the
   // conditional parts of the if's up to the dominating block.
@@ -1300,7 +1355,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
     Value *TrueVal  = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse);
     Value *FalseVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue);
     
-    Value *NV = SelectInst::Create(IfCond, TrueVal, FalseVal, "", InsertPt);
+    SelectInst *NV = 
+      cast<SelectInst>(Builder.CreateSelect(IfCond, TrueVal, FalseVal, ""));
     PN->replaceAllUsesWith(NV);
     NV->takeName(PN);
     PN->eraseFromParent();
@@ -1310,7 +1366,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
   // has been flattened.  Change DomBlock to jump directly to our new block to
   // avoid other simplifycfg's kicking in on the diamond.
   TerminatorInst *OldTI = DomBlock->getTerminator();
-  BranchInst::Create(BB, OldTI);
+  Builder.SetInsertPoint(OldTI);
+  Builder.CreateBr(BB);
   OldTI->eraseFromParent();
   return true;
 }
@@ -1318,7 +1375,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
 /// SimplifyCondBranchToTwoReturns - If we found a conditional branch that goes
 /// to two returning blocks, try to merge them together into one return,
 /// introducing a select if the return values disagree.
-static bool SimplifyCondBranchToTwoReturns(BranchInst *BI) {
+static bool SimplifyCondBranchToTwoReturns(BranchInst *BI, 
+                                           IRBuilder<> &Builder) {
   assert(BI->isConditional() && "Must be a conditional branch");
   BasicBlock *TrueSucc = BI->getSuccessor(0);
   BasicBlock *FalseSucc = BI->getSuccessor(1);
@@ -1333,13 +1391,14 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI) {
   if (!FalseSucc->getFirstNonPHIOrDbg()->isTerminator())
     return false;
 
+  Builder.SetInsertPoint(BI);
   // Okay, we found a branch that is going to two return nodes.  If
   // there is no return value for this function, just change the
   // branch into a return.
   if (FalseRet->getNumOperands() == 0) {
     TrueSucc->removePredecessor(BI->getParent());
     FalseSucc->removePredecessor(BI->getParent());
-    ReturnInst::Create(BI->getContext(), 0, BI);
+    Builder.CreateRetVoid();
     EraseTerminatorInstAndDCECond(BI);
     return true;
   }
@@ -1382,14 +1441,14 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI) {
     } else if (isa<UndefValue>(TrueValue)) {
       TrueValue = FalseValue;
     } else {
-      TrueValue = SelectInst::Create(BrCond, TrueValue,
-                                     FalseValue, "retval", BI);
+      TrueValue = Builder.CreateSelect(BrCond, TrueValue,
+                                       FalseValue, "retval");
     }
   }
 
-  Value *RI = !TrueValue ?
-              ReturnInst::Create(BI->getContext(), BI) :
-              ReturnInst::Create(BI->getContext(), TrueValue, BI);
+  Value *RI = !TrueValue ? 
+    Builder.CreateRetVoid() : Builder.CreateRet(TrueValue);
+
   (void) RI;
       
   DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:"
@@ -1401,27 +1460,24 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI) {
   return true;
 }
 
-/// FoldBranchToCommonDest - If this basic block is ONLY a setcc and a branch,
-/// and if a predecessor branches to us and one of our successors, fold the
-/// setcc into the predecessor and use logical operations to pick the right
-/// destination.
+/// FoldBranchToCommonDest - If this basic block is simple enough, and if a
+/// predecessor branches to us and one of our successors, fold the block into
+/// the predecessor and use logical operations to pick the right destination.
 bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
   BasicBlock *BB = BI->getParent();
+
   Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
   if (Cond == 0 || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) ||
     Cond->getParent() != BB || !Cond->hasOneUse())
   return false;
 
-  SmallVector<DbgInfoIntrinsic *, 8> DbgValues;
   // Only allow this if the condition is a simple instruction that can be
   // executed unconditionally.  It must be in the same block as the branch, and
   // must be at the front of the block.
   BasicBlock::iterator FrontIt = BB->front();
+
   // Ignore dbg intrinsics.
-  while (DbgInfoIntrinsic *DBI = dyn_cast<DbgInfoIntrinsic>(FrontIt)) {
-    DbgValues.push_back(DBI);
-    ++FrontIt;
-  }
+  while (isa<DbgInfoIntrinsic>(FrontIt)) ++FrontIt;
     
   // Allow a single instruction to be hoisted in addition to the compare
   // that feeds the branch.  We later ensure that any values that _it_ uses
@@ -1433,12 +1489,9 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
       FrontIt->isSafeToSpeculativelyExecute()) {
     BonusInst = &*FrontIt;
     ++FrontIt;
-  }
-  
-  // Ignore dbg intrinsics.
-  while (DbgInfoIntrinsic *DBI = dyn_cast<DbgInfoIntrinsic>(FrontIt)) {
-    DbgValues.push_back(DBI);
-    ++FrontIt;
+    
+    // Ignore dbg intrinsics.
+    while (isa<DbgInfoIntrinsic>(FrontIt)) ++FrontIt;
   }
 
   // Only a single bonus inst is allowed.
@@ -1447,15 +1500,12 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
   
   // Make sure the instruction after the condition is the cond branch.
   BasicBlock::iterator CondIt = Cond; ++CondIt;
+
   // Ingore dbg intrinsics.
-  while(DbgInfoIntrinsic *DBI = dyn_cast<DbgInfoIntrinsic>(CondIt)) {
-    DbgValues.push_back(DBI);
-    ++CondIt;
-  }
-  if (&*CondIt != BI) {
-    assert (!isa<DbgInfoIntrinsic>(CondIt) && "Hey do not forget debug info!");
+  while (isa<DbgInfoIntrinsic>(CondIt)) ++CondIt;
+  
+  if (&*CondIt != BI)
     return false;
-  }
 
   // Cond is known to be a compare or binary operator.  Check to make sure that
   // neither operand is a potentially-trapping constant expression.
@@ -1466,7 +1516,6 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
     if (CE->canTrap())
       return false;
   
-  
   // Finally, don't infinitely unroll conditional loops.
   BasicBlock *TrueDest  = BI->getSuccessor(0);
   BasicBlock *FalseDest = BI->getSuccessor(1);
@@ -1480,10 +1529,24 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
     // Check that we have two conditional branches.  If there is a PHI node in
     // the common successor, verify that the same value flows in from both
     // blocks.
-    if (PBI == 0 || PBI->isUnconditional() ||
-        !SafeToMergeTerminators(BI, PBI))
+    if (PBI == 0 || PBI->isUnconditional() || !SafeToMergeTerminators(BI, PBI))
       continue;
     
+    // Determine if the two branches share a common destination.
+    Instruction::BinaryOps Opc;
+    bool InvertPredCond = false;
+    
+    if (PBI->getSuccessor(0) == TrueDest)
+      Opc = Instruction::Or;
+    else if (PBI->getSuccessor(1) == FalseDest)
+      Opc = Instruction::And;
+    else if (PBI->getSuccessor(0) == FalseDest)
+      Opc = Instruction::And, InvertPredCond = true;
+    else if (PBI->getSuccessor(1) == TrueDest)
+      Opc = Instruction::Or, InvertPredCond = true;
+    else
+      continue;
+
     // Ensure that any values used in the bonus instruction are also used
     // by the terminator of the predecessor.  This means that those values
     // must already have been resolved, so we won't be inhibiting the 
@@ -1521,23 +1584,10 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
       
       if (!UsedValues.empty()) return false;
     }
-    
-    Instruction::BinaryOps Opc;
-    bool InvertPredCond = false;
-
-    if (PBI->getSuccessor(0) == TrueDest)
-      Opc = Instruction::Or;
-    else if (PBI->getSuccessor(1) == FalseDest)
-      Opc = Instruction::And;
-    else if (PBI->getSuccessor(0) == FalseDest)
-      Opc = Instruction::And, InvertPredCond = true;
-    else if (PBI->getSuccessor(1) == TrueDest)
-      Opc = Instruction::Or, InvertPredCond = true;
-    else
-      continue;
 
     DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB);
-    
+    IRBuilder<> Builder(PBI);    
+
     // If we need to invert the condition in the pred block to match, do so now.
     if (InvertPredCond) {
       Value *NewCond = PBI->getCondition();
@@ -1546,8 +1596,8 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
         CmpInst *CI = cast<CmpInst>(NewCond);
         CI->setPredicate(CI->getInversePredicate());
       } else {
-        NewCond = BinaryOperator::CreateNot(NewCond,
-                                  PBI->getCondition()->getName()+".not", PBI);
+        NewCond = Builder.CreateNot(NewCond, 
+                                    PBI->getCondition()->getName()+".not");
       }
       
       PBI->setCondition(NewCond);
@@ -1574,8 +1624,9 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
     New->takeName(Cond);
     Cond->setName(New->getName()+".old");
     
-    Value *NewCond = BinaryOperator::Create(Opc, PBI->getCondition(),
-                                            New, "or.cond", PBI);
+    Instruction *NewCond = 
+      cast<Instruction>(Builder.CreateBinOp(Opc, PBI->getCondition(),
+                                            New, "or.cond"));
     PBI->setCondition(NewCond);
     if (PBI->getSuccessor(0) == BB) {
       AddPredecessorToBlock(TrueDest, PredBlock, BB);
@@ -1586,13 +1637,11 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
       PBI->setSuccessor(1, FalseDest);
     }
 
-    // Move dbg value intrinsics in PredBlock.
-    for (SmallVector<DbgInfoIntrinsic *, 8>::iterator DBI = DbgValues.begin(),
-           DBE = DbgValues.end(); DBI != DBE; ++DBI) {
-      DbgInfoIntrinsic *DB = *DBI;
-      DB->removeFromParent();
-      DB->insertBefore(PBI);
-    }
+    // Copy any debug value intrinsics into the end of PredBlock.
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+      if (isa<DbgInfoIntrinsic>(*I))
+        I->clone()->insertBefore(PBI);
+      
     return true;
   }
   return false;
@@ -1720,23 +1769,22 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
   }  
   
   DEBUG(dbgs() << *PBI->getParent()->getParent());
-  
+
   // BI may have other predecessors.  Because of this, we leave
   // it alone, but modify PBI.
   
   // Make sure we get to CommonDest on True&True directions.
   Value *PBICond = PBI->getCondition();
+  IRBuilder<true, NoFolder> Builder(PBI);
   if (PBIOp)
-    PBICond = BinaryOperator::CreateNot(PBICond,
-                                        PBICond->getName()+".not",
-                                        PBI);
+    PBICond = Builder.CreateNot(PBICond, PBICond->getName()+".not");
+
   Value *BICond = BI->getCondition();
   if (BIOp)
-    BICond = BinaryOperator::CreateNot(BICond,
-                                       BICond->getName()+".not",
-                                       PBI);
+    BICond = Builder.CreateNot(BICond, BICond->getName()+".not");
+
   // Merge the conditions.
-  Value *Cond = BinaryOperator::CreateOr(PBICond, BICond, "brmerge", PBI);
+  Value *Cond = Builder.CreateOr(PBICond, BICond, "brmerge");
   
   // Modify PBI to branch on the new condition to the new dests.
   PBI->setCondition(Cond);
@@ -1759,8 +1807,8 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
     Value *PBIV = PN->getIncomingValue(PBBIdx);
     if (BIV != PBIV) {
       // Insert a select in PBI to pick the right value.
-      Value *NV = SelectInst::Create(PBICond, PBIV, BIV,
-                                     PBIV->getName()+".mux", PBI);
+      Value *NV = cast<SelectInst>
+        (Builder.CreateSelect(PBICond, PBIV, BIV, PBIV->getName()+".mux"));
       PN->setIncomingValue(PBBIdx, NV);
     }
   }
@@ -1799,16 +1847,19 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond,
       Succ->removePredecessor(OldTerm->getParent());
   }
 
+  IRBuilder<> Builder(OldTerm);
+  Builder.SetCurrentDebugLocation(OldTerm->getDebugLoc());
+
   // Insert an appropriate new terminator.
   if ((KeepEdge1 == 0) && (KeepEdge2 == 0)) {
     if (TrueBB == FalseBB)
       // We were only looking for one successor, and it was present.
       // Create an unconditional branch to it.
-      BranchInst::Create(TrueBB, OldTerm);
+      Builder.CreateBr(TrueBB);
     else
       // We found both of the successors we were looking for.
       // Create a conditional branch sharing the condition of the select.
-      BranchInst::Create(TrueBB, FalseBB, Cond, OldTerm);
+      Builder.CreateCondBr(Cond, TrueBB, FalseBB);
   } else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) {
     // Neither of the selected blocks were successors, so this
     // terminator must be unreachable.
@@ -1819,10 +1870,10 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond,
     // the edge to the one that wasn't must be unreachable.
     if (KeepEdge1 == 0)
       // Only TrueBB was found.
-      BranchInst::Create(TrueBB, OldTerm);
+      Builder.CreateBr(TrueBB);
     else
       // Only FalseBB was found.
-      BranchInst::Create(FalseBB, OldTerm);
+      Builder.CreateBr(FalseBB);
   }
 
   EraseTerminatorInstAndDCECond(OldTerm);
@@ -1887,8 +1938,10 @@ static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) {
 /// We prefer to split the edge to 'end' so that there is a true/false entry to
 /// the PHI, merging the third icmp into the switch.
 static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
-                                                  const TargetData *TD) {
+                                                  const TargetData *TD,
+                                                  IRBuilder<> &Builder) {
   BasicBlock *BB = ICI->getParent();
+
   // If the block has any PHIs in it or the icmp has multiple uses, it is too
   // complex.
   if (isa<PHINode>(BB->begin()) || !ICI->hasOneUse()) return false;
@@ -1966,7 +2019,9 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
   SI->addCase(Cst, NewBB);
   
   // NewBB branches to the phi block, add the uncond branch and the phi entry.
-  BranchInst::Create(SuccBlock, NewBB);
+  Builder.SetInsertPoint(NewBB);
+  Builder.SetCurrentDebugLocation(SI->getDebugLoc());
+  Builder.CreateBr(SuccBlock);
   PHIUse->addIncoming(NewCst, NewBB);
   return true;
 }
@@ -1974,7 +2029,8 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
 /// SimplifyBranchOnICmpChain - The specified branch is a conditional branch.
 /// Check to see if it is branching on an or/and chain of icmp instructions, and
 /// fold it into a switch instruction if so.
-static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD) {
+static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD,
+                                      IRBuilder<> &Builder) {
   Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
   if (Cond == 0) return false;
   
@@ -2030,11 +2086,12 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD) {
     BasicBlock *NewBB = BB->splitBasicBlock(BI, "switch.early.test");
     // Remove the uncond branch added to the old block.
     TerminatorInst *OldTI = BB->getTerminator();
-    
+    Builder.SetInsertPoint(OldTI);
+
     if (TrueWhenEqual)
-      BranchInst::Create(EdgeBB, NewBB, ExtraCase, OldTI);
+      Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB);
     else
-      BranchInst::Create(NewBB, EdgeBB, ExtraCase, OldTI);
+      Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB);
       
     OldTI->eraseFromParent();
     
@@ -2046,18 +2103,19 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD) {
           << "\nEXTRABB = " << *BB);
     BB = NewBB;
   }
-  
+
+  Builder.SetInsertPoint(BI);
   // Convert pointer to int before we switch.
   if (CompVal->getType()->isPointerTy()) {
     assert(TD && "Cannot switch on pointer without TargetData");
-    CompVal = new PtrToIntInst(CompVal,
-                               TD->getIntPtrType(CompVal->getContext()),
-                               "magicptr", BI);
+    CompVal = Builder.CreatePtrToInt(CompVal,
+                                     TD->getIntPtrType(CompVal->getContext()),
+                                     "magicptr");
   }
   
   // Create the new switch instruction now.
-  SwitchInst *New = SwitchInst::Create(CompVal, DefaultBB, Values.size(), BI);
-  
+  SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
+
   // Add all of the 'cases' to the switch instruction.
   for (unsigned i = 0, e = Values.size(); i != e; ++i)
     New->addCase(Values[i], EdgeBB);
@@ -2080,7 +2138,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD) {
   return true;
 }
 
-bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI) {
+bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
   BasicBlock *BB = RI->getParent();
   if (!BB->getFirstNonPHIOrDbg()->isTerminator()) return false;
   
@@ -2124,13 +2182,13 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI) {
     // Check to see if the non-BB successor is also a return block.
     if (isa<ReturnInst>(BI->getSuccessor(0)->getTerminator()) &&
         isa<ReturnInst>(BI->getSuccessor(1)->getTerminator()) &&
-        SimplifyCondBranchToTwoReturns(BI))
+        SimplifyCondBranchToTwoReturns(BI, Builder))
       return true;
   }
   return false;
 }
 
-bool SimplifyCFGOpt::SimplifyUnwind(UnwindInst *UI) {
+bool SimplifyCFGOpt::SimplifyUnwind(UnwindInst *UI, IRBuilder<> &Builder) {
   // Check to see if the first instruction in this block is just an unwind.
   // If so, replace any invoke instructions which use this as an exception
   // destination with call instructions.
@@ -2145,14 +2203,16 @@ bool SimplifyCFGOpt::SimplifyUnwind(UnwindInst *UI) {
     if (II && II->getUnwindDest() == BB) {
       // Insert a new branch instruction before the invoke, because this
       // is now a fall through.
-      BranchInst *BI = BranchInst::Create(II->getNormalDest(), II);
+      Builder.SetInsertPoint(II);
+      BranchInst *BI = Builder.CreateBr(II->getNormalDest());
       Pred->getInstList().remove(II);   // Take out of symbol table
       
       // Insert the call now.
       SmallVector<Value*,8> Args(II->op_begin(), II->op_end()-3);
-      CallInst *CI = CallInst::Create(II->getCalledValue(),
-                                      Args.begin(), Args.end(),
-                                      II->getName(), BI);
+      Builder.SetInsertPoint(BI);
+      CallInst *CI = Builder.CreateCall(II->getCalledValue(),
+                                        Args.begin(), Args.end(),
+                                        II->getName());
       CI->setCallingConv(II->getCallingConv());
       CI->setAttributes(II->getAttributes());
       // If the invoke produced a value, the Call now does instead.
@@ -2211,7 +2271,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
   SmallVector<BasicBlock*, 8> Preds(pred_begin(BB), pred_end(BB));
   for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
     TerminatorInst *TI = Preds[i]->getTerminator();
-    
+    IRBuilder<> Builder(TI);
     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
       if (BI->isUnconditional()) {
         if (BI->getSuccessor(0) == BB) {
@@ -2221,10 +2281,10 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
         }
       } else {
         if (BI->getSuccessor(0) == BB) {
-          BranchInst::Create(BI->getSuccessor(1), BI);
+          Builder.CreateBr(BI->getSuccessor(1));
           EraseTerminatorInstAndDCECond(BI);
         } else if (BI->getSuccessor(1) == BB) {
-          BranchInst::Create(BI->getSuccessor(0), BI);
+          Builder.CreateBr(BI->getSuccessor(0));
           EraseTerminatorInstAndDCECond(BI);
           Changed = true;
         }
@@ -2288,14 +2348,15 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
       if (II->getUnwindDest() == BB) {
         // Convert the invoke to a call instruction.  This would be a good
         // place to note that the call does not throw though.
-        BranchInst *BI = BranchInst::Create(II->getNormalDest(), II);
+        BranchInst *BI = Builder.CreateBr(II->getNormalDest());
         II->removeFromParent();   // Take out of symbol table
         
         // Insert the call now...
         SmallVector<Value*, 8> Args(II->op_begin(), II->op_end()-3);
-        CallInst *CI = CallInst::Create(II->getCalledValue(),
-                                        Args.begin(), Args.end(),
-                                        II->getName(), BI);
+        Builder.SetInsertPoint(BI);
+        CallInst *CI = Builder.CreateCall(II->getCalledValue(),
+                                          Args.begin(), Args.end(),
+                                          II->getName());
         CI->setCallingConv(II->getCallingConv());
         CI->setAttributes(II->getAttributes());
         // If the invoke produced a value, the call does now instead.
@@ -2319,7 +2380,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
 
 /// TurnSwitchRangeIntoICmp - Turns a switch with that contains only a
 /// integer range comparison into a sub, an icmp and a branch.
-static bool TurnSwitchRangeIntoICmp(SwitchInst *SI) {
+static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
   assert(SI->getNumCases() > 2 && "Degenerate switch?");
 
   // Make sure all cases point to the same destination and gather the values.
@@ -2344,9 +2405,9 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI) {
 
   Value *Sub = SI->getCondition();
   if (!Offset->isNullValue())
-    Sub = BinaryOperator::CreateAdd(Sub, Offset, Sub->getName()+".off", SI);
-  Value *Cmp = new ICmpInst(SI, ICmpInst::ICMP_ULT, Sub, NumCases, "switch");
-  BranchInst::Create(SI->getSuccessor(1), SI->getDefaultDest(), Cmp, SI);
+    Sub = Builder.CreateAdd(Sub, Offset, Sub->getName()+".off");
+  Value *Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch");
+  Builder.CreateCondBr(Cmp, SI->getSuccessor(1), SI->getDefaultDest());
 
   // Prune obsolete incoming values off the successor's PHI nodes.
   for (BasicBlock::iterator BBI = SI->getSuccessor(1)->begin();
@@ -2359,7 +2420,37 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI) {
   return true;
 }
 
-bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI) {
+/// EliminateDeadSwitchCases - Compute masked bits for the condition of a switch
+/// and use it to remove dead cases.
+static bool EliminateDeadSwitchCases(SwitchInst *SI) {
+  Value *Cond = SI->getCondition();
+  unsigned Bits = cast<IntegerType>(Cond->getType())->getBitWidth();
+  APInt KnownZero(Bits, 0), KnownOne(Bits, 0);
+  ComputeMaskedBits(Cond, APInt::getAllOnesValue(Bits), KnownZero, KnownOne);
+
+  // Gather dead cases.
+  SmallVector<ConstantInt*, 8> DeadCases;
+  for (unsigned I = 1, E = SI->getNumCases(); I != E; ++I) {
+    if ((SI->getCaseValue(I)->getValue() & KnownZero) != 0 ||
+        (SI->getCaseValue(I)->getValue() & KnownOne) != KnownOne) {
+      DeadCases.push_back(SI->getCaseValue(I));
+      DEBUG(dbgs() << "SimplifyCFG: switch case '"
+                   << SI->getCaseValue(I)->getValue() << "' is dead.\n");
+    }
+  }
+
+  // Remove dead cases from the switch.
+  for (unsigned I = 0, E = DeadCases.size(); I != E; ++I) {
+    unsigned Case = SI->findCaseValue(DeadCases[I]);
+    // Prune unused values from PHI nodes.
+    SI->getSuccessor(Case)->removePredecessor(SI->getParent());
+    SI->removeCase(Case);
+  }
+
+  return !DeadCases.empty();
+}
+
+bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
   // If this switch is too complex to want to look at, ignore it.
   if (!isValueEqualityComparison(SI))
     return false;
@@ -2369,7 +2460,7 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI) {
   // If we only have one predecessor, and if it is a branch on this value,
   // see if that predecessor totally determines the outcome of this switch.
   if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
-    if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred))
+    if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder))
       return SimplifyCFG(BB) | true;
 
   Value *Cond = SI->getCondition();
@@ -2384,13 +2475,17 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI) {
   while (isa<DbgInfoIntrinsic>(BBI))
     ++BBI;
   if (SI == &*BBI)
-    if (FoldValueComparisonIntoPredecessors(SI))
+    if (FoldValueComparisonIntoPredecessors(SI, Builder))
       return SimplifyCFG(BB) | true;
 
   // Try to transform the switch into an icmp and a branch.
-  if (TurnSwitchRangeIntoICmp(SI))
+  if (TurnSwitchRangeIntoICmp(SI, Builder))
     return SimplifyCFG(BB) | true;
-  
+
+  // Remove unreachable cases.
+  if (EliminateDeadSwitchCases(SI))
+    return SimplifyCFG(BB) | true;
+
   return false;
 }
 
@@ -2431,7 +2526,7 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
   return Changed;
 }
 
-bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI) {
+bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
   BasicBlock *BB = BI->getParent();
   
   // If the Terminator is the only non-phi instruction, simplify the block.
@@ -2446,7 +2541,8 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI) {
     if (ICI->isEquality() && isa<ConstantInt>(ICI->getOperand(1))) {
       for (++I; isa<DbgInfoIntrinsic>(I); ++I)
         ;
-      if (I->isTerminator() && TryToSimplifyUncondBranchWithICmpInIt(ICI, TD))
+      if (I->isTerminator() 
+          && TryToSimplifyUncondBranchWithICmpInIt(ICI, TD, Builder))
         return true;
     }
   
@@ -2454,7 +2550,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI) {
 }
 
 
-bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI) {
+bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
   BasicBlock *BB = BI->getParent();
   
   // Conditional branch
@@ -2463,7 +2559,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI) {
     // see if that predecessor totally determines the outcome of this
     // switch.
     if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
-      if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred))
+      if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder))
         return SimplifyCFG(BB) | true;
     
     // This block must be empty, except for the setcond inst, if it exists.
@@ -2473,20 +2569,20 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI) {
     while (isa<DbgInfoIntrinsic>(I))
       ++I;
     if (&*I == BI) {
-      if (FoldValueComparisonIntoPredecessors(BI))
+      if (FoldValueComparisonIntoPredecessors(BI, Builder))
         return SimplifyCFG(BB) | true;
     } else if (&*I == cast<Instruction>(BI->getCondition())){
       ++I;
       // Ignore dbg intrinsics.
       while (isa<DbgInfoIntrinsic>(I))
         ++I;
-      if (&*I == BI && FoldValueComparisonIntoPredecessors(BI))
+      if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder))
         return SimplifyCFG(BB) | true;
     }
   }
   
   // Try to turn "br (X == 0 | X == 1), T, F" into a switch instruction.
-  if (SimplifyBranchOnICmpChain(BI, TD))
+  if (SimplifyBranchOnICmpChain(BI, TD, Builder))
     return true;
   
   // We have a conditional branch to two blocks that are only reachable
@@ -2557,7 +2653,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
 
   // Check to see if we can constant propagate this terminator instruction
   // away...
-  Changed |= ConstantFoldTerminator(BB);
+  Changed |= ConstantFoldTerminator(BB, true);
 
   // Check for and eliminate duplicate PHI nodes in this block.
   Changed |= EliminateDuplicatePHINodes(BB);
@@ -2569,27 +2665,30 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
   if (MergeBlockIntoPredecessor(BB))
     return true;
   
+  IRBuilder<> Builder(BB);
+
   // If there is a trivial two-entry PHI node in this basic block, and we can
   // eliminate it, do so now.
   if (PHINode *PN = dyn_cast<PHINode>(BB->begin()))
     if (PN->getNumIncomingValues() == 2)
       Changed |= FoldTwoEntryPHINode(PN, TD);
 
+  Builder.SetInsertPoint(BB->getTerminator());
   if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
     if (BI->isUnconditional()) {
-      if (SimplifyUncondBranch(BI)) return true;
+      if (SimplifyUncondBranch(BI, Builder)) return true;
     } else {
-      if (SimplifyCondBranch(BI)) return true;
+      if (SimplifyCondBranch(BI, Builder)) return true;
     }
   } else if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
-    if (SimplifyReturn(RI)) return true;
+    if (SimplifyReturn(RI, Builder)) return true;
   } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
-    if (SimplifySwitch(SI)) return true;
+    if (SimplifySwitch(SI, Builder)) return true;
   } else if (UnreachableInst *UI =
                dyn_cast<UnreachableInst>(BB->getTerminator())) {
     if (SimplifyUnreachable(UI)) return true;
   } else if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) {
-    if (SimplifyUnwind(UI)) return true;
+    if (SimplifyUnwind(UI, Builder)) return true;
   } else if (IndirectBrInst *IBI =
                dyn_cast<IndirectBrInst>(BB->getTerminator())) {
     if (SimplifyIndirectBr(IBI)) return true;
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index f5481d3..a73bf04 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -39,7 +39,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM,
       return VM[V] = const_cast<Value*>(V);
     
     // Create a dummy node in case we have a metadata cycle.
-    MDNode *Dummy = MDNode::getTemporary(V->getContext(), 0, 0);
+    MDNode *Dummy = MDNode::getTemporary(V->getContext(), ArrayRef<Value*>());
     VM[V] = Dummy;
     
     // Check all operands to see if any need to be remapped.
@@ -54,7 +54,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM,
         Value *Op = MD->getOperand(i);
         Elts.push_back(Op ? MapValue(Op, VM, Flags) : 0);
       }
-      MDNode *NewMD = MDNode::get(V->getContext(), Elts.data(), Elts.size());
+      MDNode *NewMD = MDNode::get(V->getContext(), Elts);
       Dummy->replaceAllUsesWith(NewMD);
       VM[V] = NewMD;
       MDNode::deleteTemporary(Dummy);
diff --git a/lib/VMCore/AsmWriter.cpp b/lib/VMCore/AsmWriter.cpp
index ffd367a..cfcffeb 100644
--- a/lib/VMCore/AsmWriter.cpp
+++ b/lib/VMCore/AsmWriter.cpp
@@ -32,6 +32,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/CFG.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -39,9 +40,13 @@
 #include "llvm/Support/FormattedStream.h"
 #include <algorithm>
 #include <cctype>
-#include <map>
 using namespace llvm;
 
+static cl::opt<bool>
+EnableDebugInfoComment("enable-debug-info-comment", cl::Hidden,
+                       cl::desc("Enable debug info comments"));
+
+
 // Make virtual table appear in this compilation unit.
 AssemblyAnnotationWriter::~AssemblyAnnotationWriter() {}
 
@@ -89,7 +94,7 @@ enum PrefixType {
 /// prefixed with % (if the string only contains simple characters) or is
 /// surrounded with ""'s (if it has special chars in it).  Print it out.
 static void PrintLLVMName(raw_ostream &OS, StringRef Name, PrefixType Prefix) {
-  assert(Name.data() && "Cannot get empty name!");
+  assert(!Name.empty() && "Cannot get empty name!");
   switch (Prefix) {
   default: llvm_unreachable("Bad prefix!");
   case NoPrefix: break;
@@ -1075,7 +1080,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
     }
 
     if (CE->hasIndices()) {
-      const SmallVector<unsigned, 4> &Indices = CE->getIndices();
+      ArrayRef<unsigned> Indices = CE->getIndices();
       for (unsigned i = 0, e = Indices.size(); i != e; ++i)
         Out << ", " << Indices[i];
     }
@@ -1396,7 +1401,25 @@ void AssemblyWriter::printModule(const Module *M) {
 }
 
 void AssemblyWriter::printNamedMDNode(const NamedMDNode *NMD) {
-  Out << "!" << NMD->getName() << " = !{";
+  Out << '!';
+  StringRef Name = NMD->getName();
+  if (Name.empty()) {
+    Out << "<empty name> ";
+  } else {
+    if (isalpha(Name[0]) || Name[0] == '-' || Name[0] == '$' ||
+        Name[0] == '.' || Name[0] == '_')
+      Out << Name[0];
+    else
+      Out << '\\' << hexdigit(Name[0] >> 4) << hexdigit(Name[0] & 0x0F);
+    for (unsigned i = 1, e = Name.size(); i != e; ++i) {
+      unsigned char C = Name[i];
+      if (isalnum(C) || C == '-' || C == '$' || C == '.' || C == '_')
+        Out << C;
+      else
+        Out << '\\' << hexdigit(C >> 4) << hexdigit(C & 0x0F);
+    }
+  }
+  Out << " = !{";
   for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
     if (i) Out << ", ";
     int Slot = Machine.getMetadataSlot(NMD->getOperand(i));
@@ -1730,6 +1753,18 @@ void AssemblyWriter::printBasicBlock(const BasicBlock *BB) {
   if (AnnotationWriter) AnnotationWriter->emitBasicBlockEndAnnot(BB, Out);
 }
 
+/// printDebugLoc - Print DebugLoc.
+static void printDebugLoc(const DebugLoc &DL, formatted_raw_ostream &OS) {
+  OS << DL.getLine() << ":" << DL.getCol();
+  if (MDNode *N = DL.getInlinedAt(getGlobalContext())) {
+    DebugLoc IDL = DebugLoc::getFromDILocation(N);
+    if (!IDL.isUnknown()) {
+      OS << "@";
+      printDebugLoc(IDL,OS);
+    }
+  }
+}
+
 /// printInfoComment - Print a little comment after the instruction indicating
 /// which slot it occupies.
 ///
@@ -1737,6 +1772,43 @@ void AssemblyWriter::printInfoComment(const Value &V) {
   if (AnnotationWriter) {
     AnnotationWriter->printInfoComment(V, Out);
     return;
+  } else if (EnableDebugInfoComment) {
+    bool Padded = false;
+    if (const Instruction *I = dyn_cast<Instruction>(&V)) {
+      const DebugLoc &DL = I->getDebugLoc();
+      if (!DL.isUnknown()) {
+        if (!Padded) {
+          Out.PadToColumn(50);
+          Padded = true;
+          Out << ";";
+        }
+        Out << " [debug line = ";
+        printDebugLoc(DL,Out);
+        Out << "]";
+      }
+      if (const DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(I)) {
+        const MDNode *Var = DDI->getVariable();
+        if (!Padded) {
+          Out.PadToColumn(50);
+          Padded = true;
+          Out << ";";
+        }
+        if (Var && Var->getNumOperands() >= 2)
+          if (MDString *MDS = dyn_cast_or_null<MDString>(Var->getOperand(2)))
+            Out << " [debug variable = " << MDS->getString() << "]";
+      }
+      else if (const DbgValueInst *DVI = dyn_cast<DbgValueInst>(I)) {
+        const MDNode *Var = DVI->getVariable();
+        if (!Padded) {
+          Out.PadToColumn(50);
+          Padded = true;
+          Out << ";";
+        }
+        if (Var && Var->getNumOperands() >= 2)
+          if (MDString *MDS = dyn_cast_or_null<MDString>(Var->getOperand(2)))
+            Out << " [debug variable = " << MDS->getString() << "]";
+      }
+    }
   }
 }
 
diff --git a/lib/VMCore/Attributes.cpp b/lib/VMCore/Attributes.cpp
index 92152a3..bf6efa1 100644
--- a/lib/VMCore/Attributes.cpp
+++ b/lib/VMCore/Attributes.cpp
@@ -36,6 +36,8 @@ std::string Attribute::getAsString(Attributes Attrs) {
     Result += "noreturn ";
   if (Attrs & Attribute::NoUnwind)
     Result += "nounwind ";
+  if (Attrs & Attribute::UWTable)
+    Result += "uwtable ";
   if (Attrs & Attribute::InReg)
     Result += "inreg ";
   if (Attrs & Attribute::NoAlias)
@@ -72,6 +74,8 @@ std::string Attribute::getAsString(Attributes Attrs) {
     Result += "naked ";
   if (Attrs & Attribute::Hotpatch)
     Result += "hotpatch ";
+  if (Attrs & Attribute::NonLazyBind)
+    Result += "nonlazybind ";
   if (Attrs & Attribute::StackAlignment) {
     Result += "alignstack(";
     Result += utostr(Attribute::getStackAlignmentFromAttrs(Attrs));
diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp
index 9e551bb..9d4543d 100644
--- a/lib/VMCore/AutoUpgrade.cpp
+++ b/lib/VMCore/AutoUpgrade.cpp
@@ -284,8 +284,58 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
       break;
     }
 
+    //  This upgrades the llvm.prefetch intrinsic to accept one more parameter,
+    //  which is a instruction / data cache identifier. The old version only
+    //  implicitly accepted the data version.
+    if (Name.compare(5,8,"prefetch",8) == 0) {
+      // Don't do anything if it has the correct number of arguments already
+      if (FTy->getNumParams() == 4)
+        break;
+
+      assert(FTy->getNumParams() == 3 && "old prefetch takes 3 args!");
+      //  We first need to change the name of the old (bad) intrinsic, because
+      //  its type is incorrect, but we cannot overload that name. We
+      //  arbitrarily unique it here allowing us to construct a correctly named
+      //  and typed function below.
+      F->setName("");
+      NewFn = cast<Function>(M->getOrInsertFunction(Name,
+                                                    FTy->getReturnType(),
+                                                    FTy->getParamType(0),
+                                                    FTy->getParamType(1),
+                                                    FTy->getParamType(2),
+                                                    FTy->getParamType(2),
+                                                    (Type*)0));
+      return true;
+    }
+
     break;
-  case 'x': 
+  case 'x':
+    // This fixes the poorly named crc32 intrinsics
+    if (Name.compare(5, 13, "x86.sse42.crc", 13) == 0) {
+      const char* NewFnName = NULL;
+      if (Name.compare(18, 2, "32", 2) == 0) {
+        if (Name.compare(20, 2, ".8") == 0 && Name.length() == 22) {
+          NewFnName = "llvm.x86.sse42.crc32.32.8";
+        } else if (Name.compare(20, 3, ".16") == 0 && Name.length() == 23) {
+          NewFnName = "llvm.x86.sse42.crc32.32.16";
+        } else if (Name.compare(20, 3, ".32") == 0 && Name.length() == 23) {
+          NewFnName = "llvm.x86.sse42.crc32.32.32";
+        }
+      }
+      else if (Name.compare(18, 2, "64", 2) == 0) {
+        if (Name.compare(20, 2, ".8") == 0 && Name.length() == 22) {
+          NewFnName = "llvm.x86.sse42.crc32.64.8";
+        } else if (Name.compare(20, 3, ".64") == 0 && Name.length() == 23) {
+          NewFnName = "llvm.x86.sse42.crc32.64.64";
+        }
+      }
+      if (NewFnName) {
+        F->setName(NewFnName);
+        NewFn = F;
+        return true;
+      }
+    }
+
     // This fixes all MMX shift intrinsic instructions to take a
     // x86_mmx instead of a v1i64, v2i32, v4i16, or v8i8.
     if (Name.compare(5, 8, "x86.mmx.", 8) == 0) {
@@ -527,6 +577,19 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
       // or 0.
       NewFn = 0;
       return true;           
+    } else if (Name.compare(5, 16, "x86.sse.loadu.ps", 16) == 0 ||
+               Name.compare(5, 17, "x86.sse2.loadu.dq", 17) == 0 ||
+               Name.compare(5, 17, "x86.sse2.loadu.pd", 17) == 0) {
+      // Calls to these instructions are transformed into unaligned loads.
+      NewFn = 0;
+      return true;
+    } else if (Name.compare(5, 16, "x86.sse.movnt.ps", 16) == 0 ||
+               Name.compare(5, 17, "x86.sse2.movnt.dq", 17) == 0 ||
+               Name.compare(5, 17, "x86.sse2.movnt.pd", 17) == 0 ||
+               Name.compare(5, 17, "x86.sse2.movnt.i", 16) == 0) {
+      // Calls to these instructions are transformed into nontemporal stores.
+      NewFn = 0;
+      return true;
     } else if (Name.compare(5, 17, "x86.ssse3.pshuf.w", 17) == 0) {
       // This is an SSE/MMX instruction.
       const Type *X86_MMXTy = VectorType::getX86_MMXTy(FTy->getContext());
@@ -946,7 +1009,54 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
         
       // Remove upgraded instruction.
       CI->eraseFromParent();
-      
+    
+    } else if (F->getName() == "llvm.x86.sse.loadu.ps" ||
+               F->getName() == "llvm.x86.sse2.loadu.dq" ||
+               F->getName() == "llvm.x86.sse2.loadu.pd") {
+      // Convert to a native, unaligned load.
+      const Type *VecTy = CI->getType();
+      const Type *IntTy = IntegerType::get(C, 128);
+      IRBuilder<> Builder(C);
+      Builder.SetInsertPoint(CI->getParent(), CI);
+
+      Value *BC = Builder.CreateBitCast(CI->getArgOperand(0),
+                                        PointerType::getUnqual(IntTy),
+                                        "cast");
+      LoadInst *LI = Builder.CreateLoad(BC, CI->getName());
+      LI->setAlignment(1);      // Unaligned load.
+      BC = Builder.CreateBitCast(LI, VecTy, "new.cast");
+
+      // Fix up all the uses with our new load.
+      if (!CI->use_empty())
+        CI->replaceAllUsesWith(BC);
+
+      // Remove intrinsic.
+      CI->eraseFromParent();
+    } else if (F->getName() == "llvm.x86.sse.movnt.ps" ||
+               F->getName() == "llvm.x86.sse2.movnt.dq" ||
+               F->getName() == "llvm.x86.sse2.movnt.pd" ||
+               F->getName() == "llvm.x86.sse2.movnt.i") {
+      IRBuilder<> Builder(C);
+      Builder.SetInsertPoint(CI->getParent(), CI);
+
+      Module *M = F->getParent();
+      SmallVector<Value *, 1> Elts;
+      Elts.push_back(ConstantInt::get(Type::getInt32Ty(C), 1));
+      MDNode *Node = MDNode::get(C, Elts);
+
+      Value *Arg0 = CI->getArgOperand(0);
+      Value *Arg1 = CI->getArgOperand(1);
+
+      // Convert the type of the pointer to a pointer to the stored type.
+      Value *BC = Builder.CreateBitCast(Arg0,
+                                        PointerType::getUnqual(Arg1->getType()),
+                                        "cast");
+      StoreInst *SI = Builder.CreateStore(Arg1, BC);
+      SI->setMetadata(M->getMDKindID("nontemporal"), Node);
+      SI->setAlignment(16);
+
+      // Remove intrinsic.
+      CI->eraseFromParent();
     } else {
       llvm_unreachable("Unknown function for CallInst upgrade.");
     }
@@ -1258,6 +1368,29 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     CI->eraseFromParent();
     break;
   }
+  case Intrinsic::prefetch: {
+    IRBuilder<> Builder(C);
+    Builder.SetInsertPoint(CI->getParent(), CI);
+    const llvm::Type *I32Ty = llvm::Type::getInt32Ty(CI->getContext());
+
+    // Add the extra "data cache" argument
+    Value *Operands[4] = { CI->getArgOperand(0), CI->getArgOperand(1),
+                           CI->getArgOperand(2),
+                           llvm::ConstantInt::get(I32Ty, 1) };
+    CallInst *NewCI = CallInst::Create(NewFn, Operands, Operands+4,
+                                       CI->getName(), CI);
+    NewCI->setTailCall(CI->isTailCall());
+    NewCI->setCallingConv(CI->getCallingConv());
+    //  Handle any uses of the old CallInst.
+    if (!CI->use_empty())
+      //  Replace all uses of the old call with the new cast which has the
+      //  correct type.
+      CI->replaceAllUsesWith(NewCI);
+
+    //  Clean up the old call now that it has been completely upgraded.
+    CI->eraseFromParent();
+    break;
+  }
   }
 }
 
diff --git a/lib/VMCore/ConstantFold.cpp b/lib/VMCore/ConstantFold.cpp
index 573efb7..9985ada 100644
--- a/lib/VMCore/ConstantFold.cpp
+++ b/lib/VMCore/ConstantFold.cpp
@@ -24,6 +24,7 @@
 #include "llvm/Function.h"
 #include "llvm/GlobalAlias.h"
 #include "llvm/GlobalVariable.h"
+#include "llvm/Operator.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -1735,7 +1736,7 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
             // with a single zero index, it must be nonzero.
             assert(CE1->getNumOperands() == 2 &&
                    !CE1->getOperand(1)->isNullValue() &&
-                   "Suprising getelementptr!");
+                   "Surprising getelementptr!");
             return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
           } else {
             // If they are different globals, we don't know what the value is,
diff --git a/lib/VMCore/Constants.cpp b/lib/VMCore/Constants.cpp
index 7a4dcf9..15d7793 100644
--- a/lib/VMCore/Constants.cpp
+++ b/lib/VMCore/Constants.cpp
@@ -32,7 +32,6 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include <algorithm>
-#include <map>
 #include <cstdarg>
 using namespace llvm;
 
@@ -771,7 +770,7 @@ bool ConstantExpr::hasIndices() const {
          getOpcode() == Instruction::InsertValue;
 }
 
-const SmallVector<unsigned, 4> &ConstantExpr::getIndices() const {
+ArrayRef<unsigned> ConstantExpr::getIndices() const {
   if (const ExtractValueConstantExpr *EVCE =
         dyn_cast<ExtractValueConstantExpr>(this))
     return EVCE->Indices;
@@ -855,10 +854,10 @@ ConstantExpr::getWithOperandReplaced(unsigned OpNo, Constant *Op) const {
 /// operands replaced with the specified values.  The specified operands must
 /// match count and type with the existing ones.
 Constant *ConstantExpr::
-getWithOperands(Constant *const *Ops, unsigned NumOps) const {
-  assert(NumOps == getNumOperands() && "Operand count mismatch!");
+getWithOperands(ArrayRef<Constant*> Ops) const {
+  assert(Ops.size() == getNumOperands() && "Operand count mismatch!");
   bool AnyChange = false;
-  for (unsigned i = 0; i != NumOps; ++i) {
+  for (unsigned i = 0; i != Ops.size(); ++i) {
     assert(Ops[i]->getType() == getOperand(i)->getType() &&
            "Operand type mismatch!");
     AnyChange |= Ops[i] != getOperand(i);
@@ -890,8 +889,8 @@ getWithOperands(Constant *const *Ops, unsigned NumOps) const {
     return ConstantExpr::getShuffleVector(Ops[0], Ops[1], Ops[2]);
   case Instruction::GetElementPtr:
     return cast<GEPOperator>(this)->isInBounds() ?
-      ConstantExpr::getInBoundsGetElementPtr(Ops[0], &Ops[1], NumOps-1) :
-      ConstantExpr::getGetElementPtr(Ops[0], &Ops[1], NumOps-1);
+      ConstantExpr::getInBoundsGetElementPtr(Ops[0], &Ops[1], Ops.size()-1) :
+      ConstantExpr::getGetElementPtr(Ops[0], &Ops[1], Ops.size()-1);
   case Instruction::ICmp:
   case Instruction::FCmp:
     return ConstantExpr::getCompare(getPredicate(), Ops[0], Ops[1]);
@@ -2151,7 +2150,7 @@ void ConstantExpr::replaceUsesOfWithOnConstant(Value *From, Value *ToV,
     Constant *Agg = getOperand(0);
     if (Agg == From) Agg = To;
     
-    const SmallVector<unsigned, 4> &Indices = getIndices();
+    ArrayRef<unsigned> Indices = getIndices();
     Replacement = ConstantExpr::getExtractValue(Agg,
                                                 &Indices[0], Indices.size());
   } else if (getOpcode() == Instruction::InsertValue) {
@@ -2160,7 +2159,7 @@ void ConstantExpr::replaceUsesOfWithOnConstant(Value *From, Value *ToV,
     if (Agg == From) Agg = To;
     if (Val == From) Val = To;
     
-    const SmallVector<unsigned, 4> &Indices = getIndices();
+    ArrayRef<unsigned> Indices = getIndices();
     Replacement = ConstantExpr::getInsertValue(Agg, Val,
                                                &Indices[0], Indices.size());
   } else if (isCast()) {
diff --git a/lib/VMCore/ConstantsContext.h b/lib/VMCore/ConstantsContext.h
index ffc673f..1395754 100644
--- a/lib/VMCore/ConstantsContext.h
+++ b/lib/VMCore/ConstantsContext.h
@@ -301,20 +301,18 @@ struct OperandTraits<CompareConstantExpr> :
 DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CompareConstantExpr, Value)
 
 struct ExprMapKeyType {
-  typedef SmallVector<unsigned, 4> IndexList;
-
   ExprMapKeyType(unsigned opc,
-      const std::vector<Constant*> &ops,
+      ArrayRef<Constant*> ops,
       unsigned short flags = 0,
       unsigned short optionalflags = 0,
-      const IndexList &inds = IndexList())
+      ArrayRef<unsigned> inds = ArrayRef<unsigned>())
         : opcode(opc), subclassoptionaldata(optionalflags), subclassdata(flags),
-        operands(ops), indices(inds) {}
+        operands(ops.begin(), ops.end()), indices(inds.begin(), inds.end()) {}
   uint8_t opcode;
   uint8_t subclassoptionaldata;
   uint16_t subclassdata;
   std::vector<Constant*> operands;
-  IndexList indices;
+  SmallVector<unsigned, 4> indices;
   bool operator==(const ExprMapKeyType& that) const {
     return this->opcode == that.opcode &&
            this->subclassdata == that.subclassdata &&
@@ -465,7 +463,7 @@ struct ConstantKeyData<ConstantExpr> {
         CE->isCompare() ? CE->getPredicate() : 0,
         CE->getRawSubclassOptionalData(),
         CE->hasIndices() ?
-          CE->getIndices() : SmallVector<unsigned, 4>());
+          CE->getIndices() : ArrayRef<unsigned>());
   }
 };
 
diff --git a/lib/VMCore/Core.cpp b/lib/VMCore/Core.cpp
index 986b403..92f9440 100644
--- a/lib/VMCore/Core.cpp
+++ b/lib/VMCore/Core.cpp
@@ -335,7 +335,7 @@ unsigned LLVMCountStructElementTypes(LLVMTypeRef StructTy) {
 
 void LLVMGetStructElementTypes(LLVMTypeRef StructTy, LLVMTypeRef *Dest) {
   StructType *Ty = unwrap<StructType>(StructTy);
-  for (FunctionType::param_iterator I = Ty->element_begin(),
+  for (StructType::element_iterator I = Ty->element_begin(),
                                     E = Ty->element_end(); I != E; ++I)
     *Dest++ = wrap(*I);
 }
@@ -543,7 +543,8 @@ LLVMValueRef LLVMMDString(const char *Str, unsigned SLen) {
 
 LLVMValueRef LLVMMDNodeInContext(LLVMContextRef C, LLVMValueRef *Vals,
                                  unsigned Count) {
-  return wrap(MDNode::get(*unwrap(C), unwrap<Value>(Vals, Count), Count));
+  return wrap(MDNode::get(*unwrap(C),
+                          ArrayRef<Value*>(unwrap<Value>(Vals, Count), Count)));
 }
 
 LLVMValueRef LLVMMDNode(LLVMValueRef *Vals, unsigned Count) {
diff --git a/lib/VMCore/DebugInfoProbe.cpp b/lib/VMCore/DebugInfoProbe.cpp
index 57d17a6..d1275ff 100644
--- a/lib/VMCore/DebugInfoProbe.cpp
+++ b/lib/VMCore/DebugInfoProbe.cpp
@@ -51,44 +51,28 @@ namespace llvm {
     unsigned NumDbgLineLost, NumDbgValueLost;
     std::string PassName;
     Function *TheFn;
-    std::set<unsigned> LineNos;
     std::set<MDNode *> DbgVariables;
+    std::set<Instruction *> MissingDebugLoc;
   };
 }
 
 //===----------------------------------------------------------------------===//
 // DebugInfoProbeImpl
 
-static void collect(Function &F, std::set<unsigned> &Lines) {
-  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
-    for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); 
-         BI != BE; ++BI) {
-      const DebugLoc &DL = BI->getDebugLoc();
-      unsigned LineNo = 0;
-      if (!DL.isUnknown()) {
-        if (MDNode *N = DL.getInlinedAt(F.getContext()))
-          LineNo = DebugLoc::getFromDILocation(N).getLine();
-        else
-          LineNo = DL.getLine();
-
-        Lines.insert(LineNo);
-      }
-    }
-}
-
 /// initialize - Collect information before running an optimization pass.
 void DebugInfoProbeImpl::initialize(StringRef PName, Function &F) {
   if (!EnableDebugInfoProbe) return;
   PassName = PName;
 
-  LineNos.clear();
   DbgVariables.clear();
+  MissingDebugLoc.clear();
   TheFn = &F;
-  collect(F, LineNos);
 
   for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
     for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); 
          BI != BE; ++BI) {
+      if (!isa<PHINode>(BI) && BI->getDebugLoc().isUnknown())
+        MissingDebugLoc.insert(BI);
       if (!isa<DbgInfoIntrinsic>(BI)) continue;
       Value *Addr = NULL;
       MDNode *Node = NULL;
@@ -127,23 +111,19 @@ void DebugInfoProbeImpl::report() {
 /// must be used after initialization.
 void DebugInfoProbeImpl::finalize(Function &F) {
   if (!EnableDebugInfoProbe) return;
-  std::set<unsigned> LineNos2;
-  collect(F, LineNos2);
   assert (TheFn == &F && "Invalid function to measure!");
 
-  for (std::set<unsigned>::iterator I = LineNos.begin(),
-         E = LineNos.end(); I != E; ++I) {
-    unsigned LineNo = *I;
-    if (LineNos2.count(LineNo) == 0) {
-      DEBUG(dbgs() << "DebugInfoProbe: Losing dbg info intrinsic at line " << LineNo << "\n");
-      ++NumDbgLineLost;
-    }
-  }
-
   std::set<MDNode *>DbgVariables2;
   for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
     for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); 
          BI != BE; ++BI) {
+      if (!isa<PHINode>(BI) && BI->getDebugLoc().isUnknown() &&
+          MissingDebugLoc.count(BI) == 0) {
+        ++NumDbgLineLost;
+        DEBUG(dbgs() << "DebugInfoProbe (" << PassName << "): --- ");
+        DEBUG(BI->print(dbgs()));
+        DEBUG(dbgs() << "\n");
+      }
       if (!isa<DbgInfoIntrinsic>(BI)) continue;
       Value *Addr = NULL;
       MDNode *Node = NULL;
@@ -160,9 +140,17 @@ void DebugInfoProbeImpl::finalize(Function &F) {
 
   for (std::set<MDNode *>::iterator I = DbgVariables.begin(), 
          E = DbgVariables.end(); I != E; ++I) {
-    if (DbgVariables2.count(*I) == 0) {
-      DEBUG(dbgs() << "DebugInfoProbe: Losing dbg info for variable: ");
-      DEBUG((*I)->print(dbgs()));
+    if (DbgVariables2.count(*I) == 0 && (*I)->getNumOperands() >= 2) {
+      DEBUG(dbgs() 
+            << "DebugInfoProbe("
+            << PassName
+            << "): Losing dbg info for variable: ";
+            if (MDString *MDS = dyn_cast_or_null<MDString>(
+                (*I)->getOperand(2)))
+              dbgs() << MDS->getString();
+            else
+              dbgs() << "...";
+            dbgs() << "\n");
       ++NumDbgValueLost;
     }
   }
diff --git a/lib/VMCore/DebugLoc.cpp b/lib/VMCore/DebugLoc.cpp
index 3569162..520333c 100644
--- a/lib/VMCore/DebugLoc.cpp
+++ b/lib/VMCore/DebugLoc.cpp
@@ -109,7 +109,7 @@ MDNode *DebugLoc::getAsMDNode(const LLVMContext &Ctx) const {
     ConstantInt::get(Int32, getLine()), ConstantInt::get(Int32, getCol()),
     Scope, IA
   };
-  return MDNode::get(Ctx2, &Elts[0], 4);
+  return MDNode::get(Ctx2, Elts);
 }
 
 /// getFromDILocation - Translate the DILocation quad into a DebugLoc.
diff --git a/lib/VMCore/Function.cpp b/lib/VMCore/Function.cpp
index 00d1d78..0ae0bdb 100644
--- a/lib/VMCore/Function.cpp
+++ b/lib/VMCore/Function.cpp
@@ -24,6 +24,7 @@
 #include "llvm/Support/Threading.h"
 #include "SymbolTableListTraitsImpl.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 using namespace llvm;
 
@@ -78,6 +79,12 @@ bool Argument::hasByValAttr() const {
   return getParent()->paramHasAttr(getArgNo()+1, Attribute::ByVal);
 }
 
+unsigned Argument::getParamAlignment() const {
+  assert(getType()->isPointerTy() && "Only pointers have alignments");
+  return getParent()->getParamAlignment(getArgNo()+1);
+  
+}
+
 /// hasNestAttr - Return true if this argument has the nest attribute on
 /// it in its containing function.
 bool Argument::hasNestAttr() const {
@@ -328,7 +335,7 @@ unsigned Function::getIntrinsicID() const {
 
 std::string Intrinsic::getName(ID id, const Type **Tys, unsigned numTys) { 
   assert(id < num_intrinsics && "Invalid intrinsic ID!");
-  const char * const Table[] = {
+  static const char * const Table[] = {
     "not_intrinsic",
 #define GET_INTRINSIC_NAME_TABLE
 #include "llvm/Intrinsics.gen"
@@ -363,7 +370,7 @@ const FunctionType *Intrinsic::getType(LLVMContext &Context,
 }
 
 bool Intrinsic::isOverloaded(ID id) {
-  const bool OTable[] = {
+  static const bool OTable[] = {
     false,
 #define GET_INTRINSIC_OVERLOAD_TABLE
 #include "llvm/Intrinsics.gen"
@@ -406,4 +413,36 @@ bool Function::hasAddressTaken(const User* *PutOffender) const {
   return false;
 }
 
+/// callsFunctionThatReturnsTwice - Return true if the function has a call to
+/// setjmp or other function that gcc recognizes as "returning twice".
+///
+/// FIXME: Remove after <rdar://problem/8031714> is fixed.
+/// FIXME: Is the obove FIXME valid?
+bool Function::callsFunctionThatReturnsTwice() const {
+  const Module *M = this->getParent();
+  static const char *ReturnsTwiceFns[] = {
+    "_setjmp",
+    "setjmp",
+    "sigsetjmp",
+    "setjmp_syscall",
+    "savectx",
+    "qsetjmp",
+    "vfork",
+    "getcontext"
+  };
+
+  for (unsigned I = 0; I < array_lengthof(ReturnsTwiceFns); ++I)
+    if (const Function *Callee = M->getFunction(ReturnsTwiceFns[I])) {
+      if (!Callee->use_empty())
+        for (Value::const_use_iterator
+               I = Callee->use_begin(), E = Callee->use_end();
+             I != E; ++I)
+          if (const CallInst *CI = dyn_cast<CallInst>(*I))
+            if (CI->getParent()->getParent() == this)
+              return true;
+    }
+
+  return false;
+}
+
 // vim: sw=2 ai
diff --git a/lib/VMCore/IRBuilder.cpp b/lib/VMCore/IRBuilder.cpp
index 1658d79..f2d469a 100644
--- a/lib/VMCore/IRBuilder.cpp
+++ b/lib/VMCore/IRBuilder.cpp
@@ -23,7 +23,7 @@ using namespace llvm;
 /// has array of i8 type filled in with the nul terminated string value
 /// specified.  If Name is specified, it is the name of the global variable
 /// created.
-Value *IRBuilderBase::CreateGlobalString(const char *Str, const Twine &Name) {
+Value *IRBuilderBase::CreateGlobalString(StringRef Str, const Twine &Name) {
   Constant *StrConstant = ConstantArray::get(Context, Str, true);
   Module &M = *BB->getParent()->getParent();
   GlobalVariable *GV = new GlobalVariable(M, StrConstant->getType(),
@@ -60,7 +60,6 @@ static CallInst *createCallHelper(Value *Callee, Value *const* Ops,
   return CI;  
 }
 
-
 CallInst *IRBuilderBase::
 CreateMemSet(Value *Ptr, Value *Val, Value *Size, unsigned Align,
              bool isVolatile, MDNode *TBAATag) {
@@ -118,3 +117,33 @@ CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align,
   
   return CI;  
 }
+
+CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) {
+  assert(isa<PointerType>(Ptr->getType()) &&
+	 "lifetime.start only applies to pointers.");
+  Ptr = getCastedInt8PtrValue(Ptr);
+  if (!Size)
+    Size = getInt64(-1);
+  else
+    assert(Size->getType() == getInt64Ty() &&
+	   "lifetime.start requires the size to be an i64");
+  Value *Ops[] = { Size, Ptr };
+  Module *M = BB->getParent()->getParent();
+  Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_start);
+  return createCallHelper(TheFn, Ops, 2, this);
+}
+
+CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) {
+  assert(isa<PointerType>(Ptr->getType()) &&
+	 "lifetime.end only applies to pointers.");
+  Ptr = getCastedInt8PtrValue(Ptr);
+  if (!Size)
+    Size = getInt64(-1);
+  else
+    assert(Size->getType() == getInt64Ty() &&
+	   "lifetime.end requires the size to be an i64");
+  Value *Ops[] = { Size, Ptr };
+  Module *M = BB->getParent()->getParent();
+  Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_end);
+  return createCallHelper(TheFn, Ops, 2, this);
+}
diff --git a/lib/VMCore/InlineAsm.cpp b/lib/VMCore/InlineAsm.cpp
index e4f99f0..bd3667d 100644
--- a/lib/VMCore/InlineAsm.cpp
+++ b/lib/VMCore/InlineAsm.cpp
@@ -181,6 +181,11 @@ bool InlineAsm::ConstraintInfo::Parse(StringRef Str,
       multipleAlternativeIndex++;
       pCodes = &multipleAlternatives[multipleAlternativeIndex].Codes;
       ++I;
+    } else if (*I == '^') {
+      // Multi-letter constraint
+      // FIXME: For now assuming these are 2-character constraints.
+      pCodes->push_back(std::string(I+1, I+3));
+      I += 3;
     } else {
       // Single letter constraint.
       pCodes->push_back(std::string(I, I+1));
diff --git a/lib/VMCore/Instructions.cpp b/lib/VMCore/Instructions.cpp
index 33dfcc0..8f4eabe 100644
--- a/lib/VMCore/Instructions.cpp
+++ b/lib/VMCore/Instructions.cpp
@@ -137,7 +137,8 @@ Value *PHINode::removeIncomingValue(unsigned Idx, bool DeletePHIIfEmpty) {
 ///
 void PHINode::growOperands() {
   unsigned e = getNumOperands();
-  unsigned NumOps = e*3/2;
+  // Multiply by 1.5 and round down so the result is still even.
+  unsigned NumOps = e + e / 4 * 2;
   if (NumOps < 4) NumOps = 4;      // 4 op PHI nodes are VERY common.
 
   ReservedSpace = NumOps;
@@ -2075,6 +2076,7 @@ unsigned CastInst::isEliminableCastPair(
 
 CastInst *CastInst::Create(Instruction::CastOps op, Value *S, const Type *Ty, 
   const Twine &Name, Instruction *InsertBefore) {
+  assert(castIsValid(op, S, Ty) && "Invalid cast!");
   // Construct and return the appropriate CastInst subclass
   switch (op) {
     case Trunc:    return new TruncInst    (S, Ty, Name, InsertBefore);
@@ -2097,6 +2099,7 @@ CastInst *CastInst::Create(Instruction::CastOps op, Value *S, const Type *Ty,
 
 CastInst *CastInst::Create(Instruction::CastOps op, Value *S, const Type *Ty,
   const Twine &Name, BasicBlock *InsertAtEnd) {
+  assert(castIsValid(op, S, Ty) && "Invalid cast!");
   // Construct and return the appropriate CastInst subclass
   switch (op) {
     case Trunc:    return new TruncInst    (S, Ty, Name, InsertAtEnd);
@@ -2253,60 +2256,56 @@ bool CastInst::isCastable(const Type *SrcTy, const Type *DestTy) {
   if (SrcTy == DestTy)
     return true;
 
+  if (const VectorType *SrcVecTy = dyn_cast<VectorType>(SrcTy))
+    if (const VectorType *DestVecTy = dyn_cast<VectorType>(DestTy))
+      if (SrcVecTy->getNumElements() == DestVecTy->getNumElements()) {
+        // An element by element cast.  Valid if casting the elements is valid.
+        SrcTy = SrcVecTy->getElementType();
+        DestTy = DestVecTy->getElementType();
+      }
+
   // Get the bit sizes, we'll need these
-  unsigned SrcBits = SrcTy->getScalarSizeInBits();   // 0 for ptr
-  unsigned DestBits = DestTy->getScalarSizeInBits(); // 0 for ptr
+  unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();   // 0 for ptr
+  unsigned DestBits = DestTy->getPrimitiveSizeInBits(); // 0 for ptr
 
   // Run through the possibilities ...
-  if (DestTy->isIntegerTy()) {                   // Casting to integral
-    if (SrcTy->isIntegerTy()) {                  // Casting from integral
+  if (DestTy->isIntegerTy()) {               // Casting to integral
+    if (SrcTy->isIntegerTy()) {                // Casting from integral
         return true;
-    } else if (SrcTy->isFloatingPointTy()) {     // Casting from floating pt
+    } else if (SrcTy->isFloatingPointTy()) {   // Casting from floating pt
       return true;
-    } else if (const VectorType *PTy = dyn_cast<VectorType>(SrcTy)) {
-                                               // Casting from vector
-      return DestBits == PTy->getBitWidth();
+    } else if (SrcTy->isVectorTy()) {          // Casting from vector
+      return DestBits == SrcBits;
     } else {                                   // Casting from something else
       return SrcTy->isPointerTy();
     }
-  } else if (DestTy->isFloatingPointTy()) {      // Casting to floating pt
-    if (SrcTy->isIntegerTy()) {                  // Casting from integral
+  } else if (DestTy->isFloatingPointTy()) {  // Casting to floating pt
+    if (SrcTy->isIntegerTy()) {                // Casting from integral
       return true;
-    } else if (SrcTy->isFloatingPointTy()) {     // Casting from floating pt
+    } else if (SrcTy->isFloatingPointTy()) {   // Casting from floating pt
       return true;
-    } else if (const VectorType *PTy = dyn_cast<VectorType>(SrcTy)) {
-                                               // Casting from vector
-      return DestBits == PTy->getBitWidth();
+    } else if (SrcTy->isVectorTy()) {          // Casting from vector
+      return DestBits == SrcBits;
     } else {                                   // Casting from something else
       return false;
     }
-  } else if (const VectorType *DestPTy = dyn_cast<VectorType>(DestTy)) {
-                                                // Casting to vector
-    if (const VectorType *SrcPTy = dyn_cast<VectorType>(SrcTy)) {
-                                                // Casting from vector
-      return DestPTy->getBitWidth() == SrcPTy->getBitWidth();
-    } else if (DestPTy->getBitWidth() == SrcBits) {
-      return true;                              // float/int -> vector
-    } else if (SrcTy->isX86_MMXTy()) {
-      return DestPTy->getBitWidth() == 64;      // MMX to 64-bit vector
-    } else {
-      return false;
-    }
+  } else if (DestTy->isVectorTy()) {         // Casting to vector
+    return DestBits == SrcBits;
   } else if (DestTy->isPointerTy()) {        // Casting to pointer
-    if (SrcTy->isPointerTy()) {              // Casting from pointer
+    if (SrcTy->isPointerTy()) {                // Casting from pointer
       return true;
-    } else if (SrcTy->isIntegerTy()) {            // Casting from integral
+    } else if (SrcTy->isIntegerTy()) {         // Casting from integral
       return true;
-    } else {                                    // Casting from something else
+    } else {                                   // Casting from something else
       return false;
     }
   } else if (DestTy->isX86_MMXTy()) {
-    if (const VectorType *SrcPTy = dyn_cast<VectorType>(SrcTy)) {
-      return SrcPTy->getBitWidth() == 64;       // 64-bit vector to MMX
+    if (SrcTy->isVectorTy()) {
+      return DestBits == SrcBits;       // 64-bit vector to MMX
     } else {
       return false;
     }
-  } else {                                      // Casting to something else
+  } else {                                   // Casting to something else
     return false;
   }
 }
@@ -2321,14 +2320,27 @@ bool CastInst::isCastable(const Type *SrcTy, const Type *DestTy) {
 Instruction::CastOps
 CastInst::getCastOpcode(
   const Value *Src, bool SrcIsSigned, const Type *DestTy, bool DestIsSigned) {
-  // Get the bit sizes, we'll need these
   const Type *SrcTy = Src->getType();
-  unsigned SrcBits = SrcTy->getScalarSizeInBits();   // 0 for ptr
-  unsigned DestBits = DestTy->getScalarSizeInBits(); // 0 for ptr
 
   assert(SrcTy->isFirstClassType() && DestTy->isFirstClassType() &&
          "Only first class types are castable!");
 
+  if (SrcTy == DestTy)
+    return BitCast;
+
+  if (const VectorType *SrcVecTy = dyn_cast<VectorType>(SrcTy))
+    if (const VectorType *DestVecTy = dyn_cast<VectorType>(DestTy))
+      if (SrcVecTy->getNumElements() == DestVecTy->getNumElements()) {
+        // An element by element cast.  Find the appropriate opcode based on the
+        // element types.
+        SrcTy = SrcVecTy->getElementType();
+        DestTy = DestVecTy->getElementType();
+      }
+
+  // Get the bit sizes, we'll need these
+  unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();   // 0 for ptr
+  unsigned DestBits = DestTy->getPrimitiveSizeInBits(); // 0 for ptr
+
   // Run through the possibilities ...
   if (DestTy->isIntegerTy()) {                      // Casting to integral
     if (SrcTy->isIntegerTy()) {                     // Casting from integral
@@ -2347,10 +2359,9 @@ CastInst::getCastOpcode(
         return FPToSI;                              // FP -> sint
       else
         return FPToUI;                              // FP -> uint 
-    } else if (const VectorType *PTy = dyn_cast<VectorType>(SrcTy)) {
-      assert(DestBits == PTy->getBitWidth() &&
-               "Casting vector to integer of different width");
-      PTy = NULL;
+    } else if (SrcTy->isVectorTy()) {
+      assert(DestBits == SrcBits &&
+             "Casting vector to integer of different width");
       return BitCast;                             // Same size, no-op cast
     } else {
       assert(SrcTy->isPointerTy() &&
@@ -2371,29 +2382,17 @@ CastInst::getCastOpcode(
       } else  {
         return BitCast;                             // same size, no-op cast
       }
-    } else if (const VectorType *PTy = dyn_cast<VectorType>(SrcTy)) {
-      assert(DestBits == PTy->getBitWidth() &&
+    } else if (SrcTy->isVectorTy()) {
+      assert(DestBits == SrcBits &&
              "Casting vector to floating point of different width");
-      PTy = NULL;
       return BitCast;                             // same size, no-op cast
     } else {
       llvm_unreachable("Casting pointer or non-first class to float");
     }
-  } else if (const VectorType *DestPTy = dyn_cast<VectorType>(DestTy)) {
-    if (const VectorType *SrcPTy = dyn_cast<VectorType>(SrcTy)) {
-      assert(DestPTy->getBitWidth() == SrcPTy->getBitWidth() &&
-             "Casting vector to vector of different widths");
-      SrcPTy = NULL;
-      return BitCast;                             // vector -> vector
-    } else if (DestPTy->getBitWidth() == SrcBits) {
-      return BitCast;                               // float/int -> vector
-    } else if (SrcTy->isX86_MMXTy()) {
-      assert(DestPTy->getBitWidth()==64 &&
-             "Casting X86_MMX to vector of wrong width");
-      return BitCast;                             // MMX to 64-bit vector
-    } else {
-      assert(!"Illegal cast to vector (wrong type or size)");
-    }
+  } else if (DestTy->isVectorTy()) {
+    assert(DestBits == SrcBits &&
+           "Illegal cast to vector (wrong type or size)");
+    return BitCast;
   } else if (DestTy->isPointerTy()) {
     if (SrcTy->isPointerTy()) {
       return BitCast;                               // ptr -> ptr
@@ -2403,9 +2402,8 @@ CastInst::getCastOpcode(
       assert(!"Casting pointer to other than pointer or int");
     }
   } else if (DestTy->isX86_MMXTy()) {
-    if (isa<VectorType>(SrcTy)) {
-      assert(cast<VectorType>(SrcTy)->getBitWidth() == 64 &&
-             "Casting vector of wrong width to X86_MMX");
+    if (SrcTy->isVectorTy()) {
+      assert(DestBits == SrcBits && "Casting vector of wrong width to X86_MMX");
       return BitCast;                               // 64-bit vector to MMX
     } else {
       assert(!"Illegal cast to X86_MMX");
@@ -2441,46 +2439,40 @@ CastInst::castIsValid(Instruction::CastOps op, Value *S, const Type *DstTy) {
   unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
   unsigned DstBitSize = DstTy->getScalarSizeInBits();
 
+  // If these are vector types, get the lengths of the vectors (using zero for
+  // scalar types means that checking that vector lengths match also checks that
+  // scalars are not being converted to vectors or vectors to scalars).
+  unsigned SrcLength = SrcTy->isVectorTy() ?
+    cast<VectorType>(SrcTy)->getNumElements() : 0;
+  unsigned DstLength = DstTy->isVectorTy() ?
+    cast<VectorType>(DstTy)->getNumElements() : 0;
+
   // Switch on the opcode provided
   switch (op) {
   default: return false; // This is an input error
   case Instruction::Trunc:
-    return SrcTy->isIntOrIntVectorTy() &&
-           DstTy->isIntOrIntVectorTy()&& SrcBitSize > DstBitSize;
+    return SrcTy->isIntOrIntVectorTy() && DstTy->isIntOrIntVectorTy() &&
+      SrcLength == DstLength && SrcBitSize > DstBitSize;
   case Instruction::ZExt:
-    return SrcTy->isIntOrIntVectorTy() &&
-           DstTy->isIntOrIntVectorTy()&& SrcBitSize < DstBitSize;
+    return SrcTy->isIntOrIntVectorTy() && DstTy->isIntOrIntVectorTy() &&
+      SrcLength == DstLength && SrcBitSize < DstBitSize;
   case Instruction::SExt: 
-    return SrcTy->isIntOrIntVectorTy() &&
-           DstTy->isIntOrIntVectorTy()&& SrcBitSize < DstBitSize;
+    return SrcTy->isIntOrIntVectorTy() && DstTy->isIntOrIntVectorTy() &&
+      SrcLength == DstLength && SrcBitSize < DstBitSize;
   case Instruction::FPTrunc:
-    return SrcTy->isFPOrFPVectorTy() &&
-           DstTy->isFPOrFPVectorTy() && 
-           SrcBitSize > DstBitSize;
+    return SrcTy->isFPOrFPVectorTy() && DstTy->isFPOrFPVectorTy() &&
+      SrcLength == DstLength && SrcBitSize > DstBitSize;
   case Instruction::FPExt:
-    return SrcTy->isFPOrFPVectorTy() &&
-           DstTy->isFPOrFPVectorTy() && 
-           SrcBitSize < DstBitSize;
+    return SrcTy->isFPOrFPVectorTy() && DstTy->isFPOrFPVectorTy() &&
+      SrcLength == DstLength && SrcBitSize < DstBitSize;
   case Instruction::UIToFP:
   case Instruction::SIToFP:
-    if (const VectorType *SVTy = dyn_cast<VectorType>(SrcTy)) {
-      if (const VectorType *DVTy = dyn_cast<VectorType>(DstTy)) {
-        return SVTy->getElementType()->isIntOrIntVectorTy() &&
-               DVTy->getElementType()->isFPOrFPVectorTy() &&
-               SVTy->getNumElements() == DVTy->getNumElements();
-      }
-    }
-    return SrcTy->isIntOrIntVectorTy() && DstTy->isFPOrFPVectorTy();
+    return SrcTy->isIntOrIntVectorTy() && DstTy->isFPOrFPVectorTy() &&
+      SrcLength == DstLength;
   case Instruction::FPToUI:
   case Instruction::FPToSI:
-    if (const VectorType *SVTy = dyn_cast<VectorType>(SrcTy)) {
-      if (const VectorType *DVTy = dyn_cast<VectorType>(DstTy)) {
-        return SVTy->getElementType()->isFPOrFPVectorTy() &&
-               DVTy->getElementType()->isIntOrIntVectorTy() &&
-               SVTy->getNumElements() == DVTy->getNumElements();
-      }
-    }
-    return SrcTy->isFPOrFPVectorTy() && DstTy->isIntOrIntVectorTy();
+    return SrcTy->isFPOrFPVectorTy() && DstTy->isIntOrIntVectorTy() &&
+      SrcLength == DstLength;
   case Instruction::PtrToInt:
     return SrcTy->isPointerTy() && DstTy->isIntegerTy();
   case Instruction::IntToPtr:
diff --git a/lib/VMCore/LLVMContextImpl.h b/lib/VMCore/LLVMContextImpl.h
index 23971aa..6ea4b48e 100644
--- a/lib/VMCore/LLVMContextImpl.h
+++ b/lib/VMCore/LLVMContextImpl.h
@@ -184,7 +184,7 @@ public:
 
   // Concrete/Abstract TypeDescriptions - We lazily calculate type descriptions
   // for types as they are needed.  Because resolution of types must invalidate
-  // all of the abstract type descriptions, we keep them in a seperate map to 
+  // all of the abstract type descriptions, we keep them in a separate map to
   // make this easy.
   TypePrinting ConcreteTypeDescriptions;
   TypePrinting AbstractTypeDescriptions;
diff --git a/lib/VMCore/Metadata.cpp b/lib/VMCore/Metadata.cpp
index 84a0975..eb719e5 100644
--- a/lib/VMCore/Metadata.cpp
+++ b/lib/VMCore/Metadata.cpp
@@ -84,18 +84,18 @@ static MDNodeOperand *getOperandPtr(MDNode *N, unsigned Op) {
   return reinterpret_cast<MDNodeOperand*>(N+1)+Op;
 }
 
-MDNode::MDNode(LLVMContext &C, Value *const *Vals, unsigned NumVals,
-               bool isFunctionLocal)
+MDNode::MDNode(LLVMContext &C, ArrayRef<Value*> Vals, bool isFunctionLocal)
 : Value(Type::getMetadataTy(C), Value::MDNodeVal) {
-  NumOperands = NumVals;
+  NumOperands = Vals.size();
 
   if (isFunctionLocal)
     setValueSubclassData(getSubclassDataFromValue() | FunctionLocalBit);
 
   // Initialize the operand list, which is co-allocated on the end of the node.
+  unsigned i = 0;
   for (MDNodeOperand *Op = getOperandPtr(this, 0), *E = Op+NumOperands;
-       Op != E; ++Op, ++Vals)
-    new (Op) MDNodeOperand(*Vals, this);
+       Op != E; ++Op, ++i)
+    new (Op) MDNodeOperand(Vals[i], this);
 }
 
 
@@ -183,9 +183,8 @@ static bool isFunctionLocalValue(Value *V) {
          (isa<MDNode>(V) && cast<MDNode>(V)->isFunctionLocal());
 }
 
-MDNode *MDNode::getMDNode(LLVMContext &Context, Value *const *Vals,
-                          unsigned NumVals, FunctionLocalness FL,
-                          bool Insert) {
+MDNode *MDNode::getMDNode(LLVMContext &Context, ArrayRef<Value*> Vals,
+                          FunctionLocalness FL, bool Insert) {
   LLVMContextImpl *pImpl = Context.pImpl;
 
   // Add all the operand pointers. Note that we don't have to add the
@@ -193,7 +192,7 @@ MDNode *MDNode::getMDNode(LLVMContext &Context, Value *const *Vals,
   // Note that if the operands are later nulled out, the node will be
   // removed from the uniquing map.
   FoldingSetNodeID ID;
-  for (unsigned i = 0; i != NumVals; ++i)
+  for (unsigned i = 0; i != Vals.size(); ++i)
     ID.AddPointer(Vals[i]);
 
   void *InsertPoint;
@@ -205,7 +204,7 @@ MDNode *MDNode::getMDNode(LLVMContext &Context, Value *const *Vals,
   bool isFunctionLocal = false;
   switch (FL) {
   case FL_Unknown:
-    for (unsigned i = 0; i != NumVals; ++i) {
+    for (unsigned i = 0; i != Vals.size(); ++i) {
       Value *V = Vals[i];
       if (!V) continue;
       if (isFunctionLocalValue(V)) {
@@ -223,8 +222,8 @@ MDNode *MDNode::getMDNode(LLVMContext &Context, Value *const *Vals,
   }
 
   // Coallocate space for the node and Operands together, then placement new.
-  void *Ptr = malloc(sizeof(MDNode)+NumVals*sizeof(MDNodeOperand));
-  N = new (Ptr) MDNode(Context, Vals, NumVals, isFunctionLocal);
+  void *Ptr = malloc(sizeof(MDNode)+Vals.size()*sizeof(MDNodeOperand));
+  N = new (Ptr) MDNode(Context, Vals, isFunctionLocal);
 
   // InsertPoint will have been set by the FindNodeOrInsertPos call.
   pImpl->MDNodeSet.InsertNode(N, InsertPoint);
@@ -233,26 +232,23 @@ MDNode *MDNode::getMDNode(LLVMContext &Context, Value *const *Vals,
 }
 
 MDNode *MDNode::get(LLVMContext &Context, ArrayRef<Value*> Vals) {
-  return getMDNode(Context, Vals.data(), Vals.size(), FL_Unknown);
-}
-MDNode *MDNode::get(LLVMContext &Context, Value*const* Vals, unsigned NumVals) {
-  return getMDNode(Context, Vals, NumVals, FL_Unknown);
+  return getMDNode(Context, Vals, FL_Unknown);
 }
 
-MDNode *MDNode::getWhenValsUnresolved(LLVMContext &Context, Value *const *Vals,
-                                      unsigned NumVals, bool isFunctionLocal) {
-  return getMDNode(Context, Vals, NumVals, isFunctionLocal ? FL_Yes : FL_No);
+MDNode *MDNode::getWhenValsUnresolved(LLVMContext &Context,
+                                      ArrayRef<Value*> Vals,
+                                      bool isFunctionLocal) {
+  return getMDNode(Context, Vals, isFunctionLocal ? FL_Yes : FL_No);
 }
 
-MDNode *MDNode::getIfExists(LLVMContext &Context, Value *const *Vals,
-                            unsigned NumVals) {
-  return getMDNode(Context, Vals, NumVals, FL_Unknown, false);
+MDNode *MDNode::getIfExists(LLVMContext &Context, ArrayRef<Value*> Vals) {
+  return getMDNode(Context, Vals, FL_Unknown, false);
 }
 
-MDNode *MDNode::getTemporary(LLVMContext &Context, Value *const *Vals,
-                             unsigned NumVals) {
-  MDNode *N = (MDNode *)malloc(sizeof(MDNode)+NumVals*sizeof(MDNodeOperand));
-  N = new (N) MDNode(Context, Vals, NumVals, FL_No);
+MDNode *MDNode::getTemporary(LLVMContext &Context, ArrayRef<Value*> Vals) {
+  MDNode *N =
+    (MDNode *)malloc(sizeof(MDNode)+Vals.size()*sizeof(MDNodeOperand));
+  N = new (N) MDNode(Context, Vals, FL_No);
   N->setValueSubclassData(N->getSubclassDataFromValue() |
                           NotUniquedBit);
   LeakDetector::addGarbageObject(N);
diff --git a/lib/VMCore/PassManager.cpp b/lib/VMCore/PassManager.cpp
index ca4455a..5cf2905 100644
--- a/lib/VMCore/PassManager.cpp
+++ b/lib/VMCore/PassManager.cpp
@@ -449,9 +449,9 @@ namespace {
 static DebugInfoProbeInfo *TheDebugProbe;
 static void createDebugInfoProbe() {
   if (TheDebugProbe) return;
-      
-  // Constructed the first time this is called. This guarantees that the 
-  // object will be constructed, if -enable-debug-info-probe is set, 
+
+  // Constructed the first time this is called. This guarantees that the
+  // object will be constructed, if -enable-debug-info-probe is set,
   // before static globals, thus it will be destroyed before them.
   static ManagedStatic<DebugInfoProbeInfo> DIP;
   TheDebugProbe = &*DIP;
@@ -632,6 +632,7 @@ void PMTopLevelManager::schedulePass(Pass *P) {
       Pass *AnalysisPass = findAnalysisPass(*I);
       if (!AnalysisPass) {
         const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
+        assert(PI && "Expected required passes to be initialized");
         AnalysisPass = PI->createPass();
         if (P->getPotentialPassManagerType () ==
             AnalysisPass->getPotentialPassManagerType())
@@ -686,6 +687,7 @@ Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) {
     // If Pass not found then check the interfaces implemented by Immutable Pass
     const PassInfo *PassInf =
       PassRegistry::getPassRegistry()->getPassInfo(PI);
+    assert(PassInf && "Expected all immutable passes to be initialized");
     const std::vector<const PassInfo*> &ImmPI =
       PassInf->getInterfacesImplemented();
     for (std::vector<const PassInfo*>::const_iterator II = ImmPI.begin(),
@@ -727,9 +729,11 @@ void PMTopLevelManager::dumpArguments() const {
   for (SmallVector<ImmutablePass *, 8>::const_iterator I =
        ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
     if (const PassInfo *PI =
-          PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID()))
+        PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID())) {
+      assert(PI && "Expected all immutable passes to be initialized");
       if (!PI->isAnalysisGroup())
         dbgs() << " -" << PI->getPassArgument();
+    }
   for (SmallVector<PMDataManager *, 8>::const_iterator I = PassManagers.begin(),
          E = PassManagers.end(); I != E; ++I)
     (*I)->dumpPassArguments();
@@ -982,7 +986,7 @@ void PMDataManager::add(Pass *P, bool ProcessAnalysis) {
       // Keep track of higher level analysis used by this manager.
       HigherLevelAnalysis.push_back(PRequired);
     } else
-      llvm_unreachable("Unable to accomodate Required Pass");
+      llvm_unreachable("Unable to accommodate Required Pass");
   }
 
   // Set P as P's last user until someone starts using P.
@@ -1183,6 +1187,12 @@ void PMDataManager::dumpAnalysisUsage(StringRef Msg, const Pass *P,
   for (unsigned i = 0; i != Set.size(); ++i) {
     if (i) dbgs() << ',';
     const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(Set[i]);
+    if (!PInf) {
+      // Some preserved passes, such as AliasAnalysis, may not be initialized by
+      // all drivers.
+      dbgs() << " Uninitialized Pass";
+      continue;
+    }
     dbgs() << ' ' << PInf->getPassName();
   }
   dbgs() << '\n';
diff --git a/lib/VMCore/PassRegistry.cpp b/lib/VMCore/PassRegistry.cpp
index c97a170..fa92620 100644
--- a/lib/VMCore/PassRegistry.cpp
+++ b/lib/VMCore/PassRegistry.cpp
@@ -26,7 +26,7 @@ using namespace llvm;
 
 // FIXME: We use ManagedStatic to erase the pass registrar on shutdown.
 // Unfortunately, passes are registered with static ctors, and having
-// llvm_shutdown clear this map prevents successful ressurection after 
+// llvm_shutdown clear this map prevents successful resurrection after
 // llvm_shutdown is run.  Ideally we should find a solution so that we don't
 // leak the map, AND can still resurrect after shutdown.
 static ManagedStatic<PassRegistry> PassRegistryObj;
diff --git a/lib/VMCore/Type.cpp b/lib/VMCore/Type.cpp
index b15304c..566bb28 100644
--- a/lib/VMCore/Type.cpp
+++ b/lib/VMCore/Type.cpp
@@ -12,23 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "LLVMContextImpl.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Constants.h"
-#include "llvm/Assembly/Writer.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Metadata.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/SCCIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/Threading.h"
 #include <algorithm>
 #include <cstdarg>
 using namespace llvm;
@@ -87,7 +71,9 @@ void Type::destroy() const {
     operator delete(const_cast<Type *>(this));
 
     return;
-  } else if (const OpaqueType *opaque_this = dyn_cast<OpaqueType>(this)) {
+  }
+  
+  if (const OpaqueType *opaque_this = dyn_cast<OpaqueType>(this)) {
     LLVMContextImpl *pImpl = this->getContext().pImpl;
     pImpl->OpaqueTypes.erase(opaque_this);
   }
@@ -116,15 +102,6 @@ const Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) {
   }
 }
 
-const Type *Type::getVAArgsPromotedType(LLVMContext &C) const {
-  if (ID == IntegerTyID && getSubclassData() < 32)
-    return Type::getInt32Ty(C);
-  else if (ID == FloatTyID)
-    return Type::getDoubleTy(C);
-  else
-    return this;
-}
-
 /// getScalarType - If this is a vector type, return the element type,
 /// otherwise return this.
 const Type *Type::getScalarType() const {
@@ -197,6 +174,25 @@ bool Type::canLosslesslyBitCastTo(const Type *Ty) const {
   return false;  // Other types have no identity values
 }
 
+bool Type::isEmptyTy() const {
+  const ArrayType *ATy = dyn_cast<ArrayType>(this);
+  if (ATy) {
+    unsigned NumElements = ATy->getNumElements();
+    return NumElements == 0 || ATy->getElementType()->isEmptyTy();
+  }
+
+  const StructType *STy = dyn_cast<StructType>(this);
+  if (STy) {
+    unsigned NumElements = STy->getNumElements();
+    for (unsigned i = 0; i < NumElements; ++i)
+      if (!STy->getElementType(i)->isEmptyTy())
+        return false;
+    return true;
+  }
+
+  return false;
+}
+
 unsigned Type::getPrimitiveSizeInBits() const {
   switch (getTypeID()) {
   case Type::FloatTyID: return 32;
@@ -243,8 +239,8 @@ bool Type::isSizedDerivedType() const {
   if (const ArrayType *ATy = dyn_cast<ArrayType>(this))
     return ATy->getElementType()->isSized();
 
-  if (const VectorType *PTy = dyn_cast<VectorType>(this))
-    return PTy->getElementType()->isSized();
+  if (const VectorType *VTy = dyn_cast<VectorType>(this))
+    return VTy->getElementType()->isSized();
 
   if (!this->isStructTy()) 
     return false;
@@ -463,11 +459,11 @@ bool FunctionType::isValidArgumentType(const Type *ArgTy) {
 FunctionType::FunctionType(const Type *Result,
                            ArrayRef<const Type*> Params,
                            bool IsVarArgs)
-  : DerivedType(Result->getContext(), FunctionTyID), isVarArgs(IsVarArgs) {
+  : DerivedType(Result->getContext(), FunctionTyID) {
   ContainedTys = reinterpret_cast<PATypeHandle*>(this+1);
   NumContainedTys = Params.size() + 1; // + 1 for result type
   assert(isValidReturnType(Result) && "invalid return type for function");
-
+  setSubclassData(IsVarArgs);
 
   bool isAbstract = Result->isAbstract();
   new (&ContainedTys[0]) PATypeHandle(Result, this);
@@ -523,7 +519,7 @@ VectorType::VectorType(const Type *ElType, unsigned NumEl)
 
 PointerType::PointerType(const Type *E, unsigned AddrSpace)
   : SequentialType(PointerTyID, E) {
-  AddressSpace = AddrSpace;
+  setSubclassData(AddrSpace);
   // Calculate whether or not this type is abstract
   setAbstract(E->isAbstract());
 }
@@ -836,6 +832,9 @@ FunctionValType FunctionValType::get(const FunctionType *FT) {
   return FunctionValType(FT->getReturnType(), ParamTypes, FT->isVarArg());
 }
 
+FunctionType *FunctionType::get(const Type *Result, bool isVarArg) {
+  return get(Result, ArrayRef<const Type *>(), isVarArg);
+}
 
 // FunctionType::get - The factory function for the FunctionType class...
 FunctionType *FunctionType::get(const Type *ReturnType,
@@ -912,9 +911,14 @@ bool VectorType::isValidElementType(const Type *ElemTy) {
 }
 
 //===----------------------------------------------------------------------===//
-// Struct Type Factory...
+// Struct Type Factory.
 //
 
+StructType *StructType::get(LLVMContext &Context, bool isPacked) {
+  return get(Context, llvm::ArrayRef<const Type*>(), isPacked);
+}
+
+
 StructType *StructType::get(LLVMContext &Context,
                             ArrayRef<const Type*> ETypes, 
                             bool isPacked) {
diff --git a/lib/VMCore/TypesContext.h b/lib/VMCore/TypesContext.h
index 6fb53be..ad09478 100644
--- a/lib/VMCore/TypesContext.h
+++ b/lib/VMCore/TypesContext.h
@@ -370,7 +370,7 @@ public:
 
         // Remove the old entry form TypesByHash.  If the hash values differ
         // now, remove it from the old place.  Otherwise, continue scanning
-        // withing this hashcode to reduce work.
+        // within this hashcode to reduce work.
         if (NewTypeHash != OldTypeHash) {
           RemoveFromTypesByHash(OldTypeHash, Ty);
         } else {
diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp
index 8b89110..139e035 100644
--- a/lib/VMCore/Verifier.cpp
+++ b/lib/VMCore/Verifier.cpp
@@ -1645,6 +1645,9 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
     Assert1(isa<ConstantInt>(CI.getArgOperand(3)),
             "alignment argument of memory intrinsics must be a constant int",
             &CI);
+    Assert1(isa<ConstantInt>(CI.getArgOperand(4)),
+            "isvolatile argument of memory intrinsics must be a constant int",
+            &CI);
     break;
   case Intrinsic::gcroot:
   case Intrinsic::gcwrite:
diff --git a/projects/sample/autoconf/configure.ac b/projects/sample/autoconf/configure.ac
index 4e61bee..bb75bbd 100644
--- a/projects/sample/autoconf/configure.ac
+++ b/projects/sample/autoconf/configure.ac
@@ -15,7 +15,7 @@ dnl Tell autoconf that this is an LLVM project being configured
 dnl This provides the --with-llvmsrc and --with-llvmobj options
 LLVM_CONFIG_PROJECT($LLVM_ABS_SRC_ROOT,$LLVM_ABS_OBJ_ROOT)
 
-dnl Tell autoconf that the auxilliary files are actually located in
+dnl Tell autoconf that the auxiliary files are actually located in
 dnl the LLVM autoconf directory, not here.
 AC_CONFIG_AUX_DIR($LLVM_SRC/autoconf)
 
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
new file mode 100644
index 0000000..502b91d
--- /dev/null
+++ b/runtime/CMakeLists.txt
@@ -0,0 +1,5 @@
+if( NOT LLVM_BUILD_RUNTIME )
+  set(EXCLUDE_FROM_ALL ON)
+endif()
+
+add_subdirectory(libprofile)
diff --git a/runtime/libprofile/BasicBlockTracing.c b/runtime/libprofile/BasicBlockTracing.c
index dbe81e3..0815e2e 100644
--- a/runtime/libprofile/BasicBlockTracing.c
+++ b/runtime/libprofile/BasicBlockTracing.c
@@ -30,7 +30,7 @@ static void WriteAndFlushBBTraceData () {
 /* BBTraceAtExitHandler - When the program exits, just write out any remaining 
  * data and free the trace buffer.
  */
-static void BBTraceAtExitHandler() {
+static void BBTraceAtExitHandler(void) {
   WriteAndFlushBBTraceData ();
   free (ArrayStart);
 }
diff --git a/runtime/libprofile/CMakeLists.txt b/runtime/libprofile/CMakeLists.txt
new file mode 100644
index 0000000..414ad00
--- /dev/null
+++ b/runtime/libprofile/CMakeLists.txt
@@ -0,0 +1,19 @@
+set(SOURCES
+  BasicBlockTracing.c
+  CommonProfiling.c
+  GCDAProfiling.c
+  PathProfiling.c
+  EdgeProfiling.c
+  OptimalEdgeProfiling.c
+  Profiling.h
+  )
+
+add_llvm_library( profile_rt-static ${SOURCES} )
+set_target_properties( profile_rt-static
+  PROPERTIES
+  OUTPUT_NAME "profile_rt" )
+
+add_llvm_loadable_module( profile_rt-shared ${SOURCES} )
+set_target_properties( profile_rt-shared
+  PROPERTIES
+  OUTPUT_NAME "profile_rt" )
diff --git a/runtime/libprofile/CommonProfiling.c b/runtime/libprofile/CommonProfiling.c
index 1c1771c..210a5e5 100644
--- a/runtime/libprofile/CommonProfiling.c
+++ b/runtime/libprofile/CommonProfiling.c
@@ -19,7 +19,11 @@
 #include <fcntl.h>
 #include <stdio.h>
 #include <string.h>
+#if !defined(_MSC_VER) && !defined(__MINGW32__)
 #include <unistd.h>
+#else
+#include <io.h>
+#endif
 #include <stdlib.h>
 
 static char *SavedArgs = 0;
diff --git a/runtime/libprofile/EdgeProfiling.c b/runtime/libprofile/EdgeProfiling.c
index 4a68a08..f19e188 100644
--- a/runtime/libprofile/EdgeProfiling.c
+++ b/runtime/libprofile/EdgeProfiling.c
@@ -22,7 +22,7 @@ static unsigned NumElements;
 /* EdgeProfAtExitHandler - When the program exits, just write out the profiling
  * data.
  */
-static void EdgeProfAtExitHandler() {
+static void EdgeProfAtExitHandler(void) {
   /* Note that if this were doing something more intelligent with the
    * instrumentation, we could do some computation here to expand what we
    * collected into simple edge profiles.  Since we directly count each edge, we
diff --git a/runtime/libprofile/GCDAProfiling.c b/runtime/libprofile/GCDAProfiling.c
new file mode 100644
index 0000000..09a1aec
--- /dev/null
+++ b/runtime/libprofile/GCDAProfiling.c
@@ -0,0 +1,190 @@
+/*===- GCDAProfiling.c - Support library for GCDA file emission -----------===*\
+|*
+|*                     The LLVM Compiler Infrastructure
+|*
+|* This file is distributed under the University of Illinois Open Source
+|* License. See LICENSE.TXT for details.
+|* 
+|*===----------------------------------------------------------------------===*|
+|* 
+|* This file implements the call back routines for the gcov profiling
+|* instrumentation pass. Link against this library when running code through
+|* the -insert-gcov-profiling LLVM pass.
+|*
+|* We emit files in a corrupt version of GCOV's "gcda" file format. These files
+|* are only close enough that LCOV will happily parse them. Anything that lcov
+|* ignores is missing.
+|*
+|* TODO: gcov is multi-process safe by having each exit open the existing file
+|* and append to it. We'd like to achieve that and be thread-safe too.
+|*
+\*===----------------------------------------------------------------------===*/
+
+#include "llvm/Support/DataTypes.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#ifdef _MSC_VER
+#include <direct.h>
+#endif
+
+/* #define DEBUG_GCDAPROFILING */
+
+/*
+ * --- GCOV file format I/O primitives ---
+ */
+
+static FILE *output_file = NULL;
+
+static void write_int32(uint32_t i) {
+  fwrite(&i, 4, 1, output_file);
+}
+
+static void write_int64(uint64_t i) {
+  uint32_t lo, hi;
+  lo = i >>  0;
+  hi = i >> 32;
+
+  write_int32(lo);
+  write_int32(hi);
+}
+
+static uint32_t length_of_string(const char *s) {
+  return (strlen(s) / 4) + 1;
+}
+
+static void write_string(const char *s) {
+  uint32_t len = length_of_string(s);
+  write_int32(len);
+  fwrite(s, strlen(s), 1, output_file);
+  fwrite("\0\0\0\0", 4 - (strlen(s) % 4), 1, output_file);
+}
+
+static char *mangle_filename(const char *orig_filename) {
+  /* TODO: handle GCOV_PREFIX_STRIP */
+  const char *prefix;
+  char *filename = 0;
+
+  prefix = getenv("GCOV_PREFIX");
+
+  if (!prefix)
+    return strdup(orig_filename);
+
+  filename = malloc(strlen(prefix) + 1 + strlen(orig_filename) + 1);
+  strcpy(filename, prefix);
+  strcat(filename, "/");
+  strcat(filename, orig_filename);
+
+  return filename;
+}
+
+static void recursive_mkdir(const char *filename) {
+  char *pathname;
+  int i, e;
+
+  for (i = 1, e = strlen(filename); i != e; ++i) {
+    if (filename[i] == '/') {
+      pathname = malloc(i + 1);
+      strncpy(pathname, filename, i);
+      pathname[i] = '\0';
+#ifdef _MSC_VER
+      _mkdir(pathname);
+#else
+      mkdir(pathname, 0750);  /* some of these will fail, ignore it. */
+#endif
+      free(pathname);
+    }
+  }
+}
+
+/*
+ * --- LLVM line counter API ---
+ */
+
+/* A file in this case is a translation unit. Each .o file built with line
+ * profiling enabled will emit to a different file. Only one file may be
+ * started at a time.
+ */
+void llvm_gcda_start_file(const char *orig_filename) {
+  char *filename;
+  filename = mangle_filename(orig_filename);
+  recursive_mkdir(filename);
+  output_file = fopen(filename, "wb");
+
+  /* gcda file, version 404*, stamp LLVM. */
+  fwrite("adcg*404MVLL", 12, 1, output_file);
+
+#ifdef DEBUG_GCDAPROFILING
+  printf("llvmgcda: [%s]\n", orig_filename);
+#endif
+
+  free(filename);
+}
+
+/* Given an array of pointers to counters (counters), increment the n-th one,
+ * where we're also given a pointer to n (predecessor).
+ */
+void llvm_gcda_increment_indirect_counter(uint32_t *predecessor,
+                                          uint64_t **counters) {
+  uint64_t *counter;
+  uint32_t pred;
+
+  pred = *predecessor;
+  if (pred == 0xffffffff)
+    return;
+  counter = counters[pred];
+
+  /* Don't crash if the pred# is out of sync. This can happen due to threads,
+     or because of a TODO in GCOVProfiling.cpp buildEdgeLookupTable(). */
+  if (counter)
+    ++*counter;
+#ifdef DEBUG_GCDAPROFILING
+  else
+    printf("llvmgcda: increment_indirect_counter counters=%x, pred=%u\n",
+           state_table_row, *predecessor);
+#endif
+}
+
+void llvm_gcda_emit_function(uint32_t ident, const char *function_name) {
+#ifdef DEBUG_GCDAPROFILING
+  printf("llvmgcda: function id=%x\n", ident);
+#endif
+
+  /* function tag */  
+  fwrite("\0\0\0\1", 4, 1, output_file);
+  write_int32(3 + 1 + length_of_string(function_name));
+  write_int32(ident);
+  write_int32(0);
+  write_int32(0);
+  write_string(function_name);
+}
+
+void llvm_gcda_emit_arcs(uint32_t num_counters, uint64_t *counters) {
+  uint32_t i;
+  /* counter #1 (arcs) tag */
+  fwrite("\0\0\xa1\1", 4, 1, output_file);
+  write_int32(num_counters * 2);
+  for (i = 0; i < num_counters; ++i) {
+    write_int64(counters[i]);
+  }
+
+#ifdef DEBUG_GCDAPROFILING
+  printf("llvmgcda:   %u arcs\n", num_counters);
+  for (i = 0; i < num_counters; ++i) {
+    printf("llvmgcda:   %llu\n", (unsigned long long)counters[i]);
+  }
+#endif
+}
+
+void llvm_gcda_end_file() {
+  /* Write out EOF record. */
+  fwrite("\0\0\0\0\0\0\0\0", 8, 1, output_file);
+  fclose(output_file);
+  output_file = NULL;
+
+#ifdef DEBUG_GCDAPROFILING
+  printf("llvmgcda: -----\n");
+#endif
+}
diff --git a/runtime/libprofile/Makefile b/runtime/libprofile/Makefile
index 4125af6..cf31e46 100644
--- a/runtime/libprofile/Makefile
+++ b/runtime/libprofile/Makefile
@@ -13,10 +13,35 @@ include $(LEVEL)/Makefile.config
 ifneq ($(strip $(LLVMCC)),)
 BYTECODE_LIBRARY = 1
 endif
-SHARED_LIBRARY = 1
-LOADABLE_MODULE = 1
 LIBRARYNAME = profile_rt
+LINK_LIBS_IN_SHARED = 1
+SHARED_LIBRARY = 1
 EXTRA_DIST = libprofile.exports
 EXPORTED_SYMBOL_FILE = $(PROJ_SRC_DIR)/libprofile.exports
 
 include $(LEVEL)/Makefile.common
+
+ifeq ($(HOST_OS),Darwin)
+    # Special hack to allow libprofile_rt to have an offset version number.
+    PROFILE_RT_LIBRARY_VERSION := $(LLVM_SUBMIT_VERSION)
+
+    # Set dylib internal version number to llvmCore submission number.
+    ifdef LLVM_SUBMIT_VERSION
+        LLVMLibsOptions := $(LLVMLibsOptions) -Wl,-current_version \
+                        -Wl,$(PROFILE_RT_LIBRARY_VERSION).$(LLVM_SUBMIT_SUBVERSION) \
+                        -Wl,-compatibility_version -Wl,1
+    endif
+    # Extra options to override libtool defaults.
+    LLVMLibsOptions    := $(LLVMLibsOptions)  \
+                         -Wl,-dead_strip \
+                         -Wl,-seg1addr -Wl,0xE0000000 
+
+    # Mac OS X 10.4 and earlier tools do not allow a second -install_name on
+    # command line.
+    DARWIN_VERS := $(shell echo $(TARGET_TRIPLE) | sed 's/.*darwin\([0-9]*\).*/\1/')
+    ifneq ($(DARWIN_VERS),8)
+       LLVMLibsOptions    := $(LLVMLibsOptions)  \
+                            -Wl,-install_name \
+                            -Wl,"@executable_path/../lib/lib$(LIBRARYNAME)$(SHLIBEXT)"
+    endif
+endif
diff --git a/runtime/libprofile/OptimalEdgeProfiling.c b/runtime/libprofile/OptimalEdgeProfiling.c
index eb7887b..3a7631b 100644
--- a/runtime/libprofile/OptimalEdgeProfiling.c
+++ b/runtime/libprofile/OptimalEdgeProfiling.c
@@ -22,11 +22,11 @@ static unsigned NumElements;
 /* OptEdgeProfAtExitHandler - When the program exits, just write out the
  * profiling data.
  */
-static void OptEdgeProfAtExitHandler() {
+static void OptEdgeProfAtExitHandler(void) {
   /* Note that, although the array has a counter for each edge, not all
    * counters are updated, the ones that are not used are initialised with -1.
    * When loading this information the counters with value -1 have to be
-   * recalculated, it is guranteed that this is possible.
+   * recalculated, it is guaranteed that this is possible.
    */
   write_profiling_data(OptEdgeInfo, ArrayStart, NumElements);
 }
diff --git a/runtime/libprofile/PathProfiling.c b/runtime/libprofile/PathProfiling.c
index 651e63c..2836785 100644
--- a/runtime/libprofile/PathProfiling.c
+++ b/runtime/libprofile/PathProfiling.c
@@ -15,14 +15,22 @@
 
 #include "Profiling.h"
 #include "llvm/Analysis/ProfileInfoTypes.h"
+#include "llvm/Support/DataTypes.h"
 #include <sys/types.h>
+#if !defined(_MSC_VER) && !defined(__MINGW32__)
 #include <unistd.h>
+#else
+#include <io.h>
+#endif
 #include <string.h>
 #include <stdlib.h>
-#include <unistd.h>
-#include <stdint.h>
 #include <stdio.h>
 
+/* Must use __inline in Microsoft C */
+#if defined(_MSC_VER)
+#define inline __inline
+#endif
+
 /* note that this is used for functions with large path counts,
          but it is unlikely those paths will ALL be executed */
 #define ARBITRARY_HASH_BIN_COUNT 100
@@ -104,8 +112,8 @@ void writeArrayTable(uint32_t fNumber, ftEntry_t* ft, uint32_t* funcCount) {
   }
 }
 
-inline uint32_t hash (uint32_t key) {
-  /* this may benifit from a proper hash function */
+static inline uint32_t hash (uint32_t key) {
+  /* this may benefit from a proper hash function */
   return key%ARBITRARY_HASH_BIN_COUNT;
 }
 
@@ -147,7 +155,8 @@ void writeHashTable(uint32_t functionNumber, pathHashTable_t* hashTable) {
 }
 
 /* Return a pointer to this path's specific path counter */
-inline uint32_t* getPathCounter(uint32_t functionNumber, uint32_t pathNumber) {
+static inline uint32_t* getPathCounter(uint32_t functionNumber,
+                                       uint32_t pathNumber) {
   pathHashTable_t* hashTable;
   pathHashEntry_t* hashEntry;
   uint32_t index = hash(pathNumber);
@@ -214,7 +223,7 @@ void llvm_decrement_path_count (uint32_t functionNumber, uint32_t pathNumber) {
  *      +-----------------+-----------------+
  *
  */
-static void pathProfAtExitHandler() {
+static void pathProfAtExitHandler(void) {
   int outFile = getOutFile();
   uint32_t i;
   uint32_t header[2] = { PathInfo, 0 };
diff --git a/runtime/libprofile/libprofile.exports b/runtime/libprofile/libprofile.exports
index b8057c7..2f25be6 100644
--- a/runtime/libprofile/libprofile.exports
+++ b/runtime/libprofile/libprofile.exports
@@ -5,3 +5,8 @@ llvm_start_basic_block_tracing
 llvm_trace_basic_block
 llvm_increment_path_count
 llvm_decrement_path_count
+llvm_gcda_start_file
+llvm_gcda_increment_indirect_counter
+llvm_gcda_emit_function
+llvm_gcda_emit_arcs
+llvm_gcda_end_file
diff --git a/test/Analysis/BasicAA/2005-03-09-BrokenBasicAA.ll b/test/Analysis/BasicAA/2005-03-09-BrokenBasicAA.ll
deleted file mode 100644
index 4564263..0000000
--- a/test/Analysis/BasicAA/2005-03-09-BrokenBasicAA.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: opt < %s -basicaa -gvn -instcombine |\
-; RUN: llvm-dis | grep {load i32\\* %A}
-
-declare double* @useit(i32*)
-
-define i32 @foo(i32 %Amt) {
-	%A = malloc i32, i32 %Amt
-	%P = call double*  @useit(i32* %A)
-
-	%X = load i32* %A
-	store double 0.0, double* %P
-	%Y = load i32* %A
-	%Z = sub i32 %X, %Y
-	ret i32 %Z
-}
diff --git a/test/Analysis/BasicAA/2010-09-15-GEP-SignedArithmetic.ll b/test/Analysis/BasicAA/2010-09-15-GEP-SignedArithmetic.ll
index 2b0cd78..7b5584e 100644
--- a/test/Analysis/BasicAA/2010-09-15-GEP-SignedArithmetic.ll
+++ b/test/Analysis/BasicAA/2010-09-15-GEP-SignedArithmetic.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output |& grep {1 may alias}
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output |& grep {1 partial alias}
 ; PR7959
 
 target datalayout = "e-p:32:32:32"
diff --git a/test/Analysis/BasicAA/dag.ll b/test/Analysis/BasicAA/dag.ll
new file mode 100644
index 0000000..501f4c3
--- /dev/null
+++ b/test/Analysis/BasicAA/dag.ll
@@ -0,0 +1,41 @@
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info |& FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+; BasicAA's guard against use-def cycles shouldn't prevent it from
+; analyzing use-def dags.
+
+; CHECK: MustAlias:  i8* %base, i8* %phi
+; CHECK: MustAlias: i8* %phi, i8* %wwa
+; CHECK: MustAlias: i8* %phi, i8* %wwb
+; CHECK: MustAlias: i16* %bigbase, i8* %phi
+define i8 @foo(i8* %base, i1 %x, i1 %w) {
+entry:
+  br i1 %w, label %wa, label %wb
+wa:
+  %wwa = bitcast i8* %base to i8*
+  br label %wc
+wb:
+  %wwb = bitcast i8* %base to i8*
+  br label %wc
+wc:
+  %first = phi i8* [ %wwa, %wa ], [ %wwb, %wb ]
+  %fc = bitcast i8* %first to i8*
+  br i1 %x, label %xa, label %xb
+xa:
+  %xxa = bitcast i8* %fc to i8*
+  br label %xc
+xb:
+  %xxb = bitcast i8* %fc to i8*
+  br label %xc
+xc:
+  %phi = phi i8* [ %xxa, %xa ], [ %xxb, %xb ]
+
+  store i8 0, i8* %phi
+
+  %bigbase = bitcast i8* %base to i16*
+  store i16 -1, i16* %bigbase
+
+  %loaded = load i8* %phi
+  ret i8 %loaded
+}
diff --git a/test/Analysis/BasicAA/intrinsics.ll b/test/Analysis/BasicAA/intrinsics.ll
new file mode 100644
index 0000000..59725cf
--- /dev/null
+++ b/test/Analysis/BasicAA/intrinsics.ll
@@ -0,0 +1,39 @@
+; RUN: opt -basicaa -gvn -S < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+
+; BasicAA should prove that these calls don't interfere, since they are
+; IntrArgReadMem and have noalias pointers.
+
+; CHECK:      define <8 x i16> @test0(i8* noalias %p, i8* noalias %q, <8 x i16> %y) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
+; CHECK-NEXT:   call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK-NEXT:   %c = add <8 x i16> %a, %a
+define <8 x i16> @test0(i8* noalias %p, i8* noalias %q, <8 x i16> %y) {
+entry:
+  %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
+  call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
+  %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
+  %c = add <8 x i16> %a, %b
+  ret <8 x i16> %c
+}
+
+; CHECK:      define <8 x i16> @test1(i8* %p, <8 x i16> %y) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   %q = getelementptr i8* %p, i64 16
+; CHECK-NEXT:   %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
+; CHECK-NEXT:   call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK-NEXT:   %c = add <8 x i16> %a, %a
+define <8 x i16> @test1(i8* %p, <8 x i16> %y) {
+entry:
+  %q = getelementptr i8* %p, i64 16
+  %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
+  call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
+  %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
+  %c = add <8 x i16> %a, %b
+  ret <8 x i16> %c
+}
+
+declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly
+declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
diff --git a/test/Analysis/BasicAA/modref.ll b/test/Analysis/BasicAA/modref.ll
index ec0c8a7..7a71e3e 100644
--- a/test/Analysis/BasicAA/modref.ll
+++ b/test/Analysis/BasicAA/modref.ll
@@ -102,7 +102,7 @@ define i32 @test4(i8* %P) {
   %sub = sub i32 %tmp2, %tmp
   ret i32 %sub
 ; CHECK: @test4
-; CHECK: load i32* @G
+; CHECK-NOT: load
 ; CHECK: memset.p0i8.i32
 ; CHECK-NOT: load
 ; CHECK: ret i32 0
@@ -117,7 +117,7 @@ define i32 @test5(i8* %P, i32 %Len) {
   %sub = sub i32 %tmp2, %tmp
   ret i32 %sub
 ; CHECK: @test5
-; CHECK: load i32* @G
+; CHECK-NOT: load
 ; CHECK: memcpy.p0i8.p0i8.i32
 ; CHECK-NOT: load
 ; CHECK: ret i32 0
diff --git a/test/Analysis/BasicAA/must-and-partial.ll b/test/Analysis/BasicAA/must-and-partial.ll
new file mode 100644
index 0000000..93b6184
--- /dev/null
+++ b/test/Analysis/BasicAA/must-and-partial.ll
@@ -0,0 +1,39 @@
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info |& FileCheck %s
+
+; When merging MustAlias and PartialAlias, merge to PartialAlias
+; instead of MayAlias.
+
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+; CHECK: PartialAlias:  i16* %bigbase0, i8* %phi
+define i8 @test0(i8* %base, i1 %x) {
+entry:
+  %baseplusone = getelementptr i8* %base, i64 1
+  br i1 %x, label %red, label %green
+red:
+  br label %green
+green:
+  %phi = phi i8* [ %baseplusone, %red ], [ %base, %entry ]
+  store i8 0, i8* %phi
+
+  %bigbase0 = bitcast i8* %base to i16*
+  store i16 -1, i16* %bigbase0
+
+  %loaded = load i8* %phi
+  ret i8 %loaded
+}
+
+; CHECK: PartialAlias:  i16* %bigbase1, i8* %sel
+define i8 @test1(i8* %base, i1 %x) {
+entry:
+  %baseplusone = getelementptr i8* %base, i64 1
+  %sel = select i1 %x, i8* %baseplusone, i8* %base
+  store i8 0, i8* %sel
+
+  %bigbase1 = bitcast i8* %base to i16*
+  store i16 -1, i16* %bigbase1
+
+  %loaded = load i8* %sel
+  ret i8 %loaded
+}
diff --git a/test/Analysis/BasicAA/underlying-value.ll b/test/Analysis/BasicAA/underlying-value.ll
new file mode 100644
index 0000000..0671c82
--- /dev/null
+++ b/test/Analysis/BasicAA/underlying-value.ll
@@ -0,0 +1,25 @@
+; RUN: opt -basicaa -licm -S < %s
+; PR9931
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+define void @func_20() nounwind {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond2, %entry
+  br i1 undef, label %for.cond2, label %for.end22
+
+for.cond2:                                        ; preds = %for.body5, %for.cond
+  br i1 false, label %for.body5, label %for.cond
+
+for.body5:                                        ; preds = %for.cond2
+  %arrayidx = getelementptr inbounds [2 x i64]* undef, i32 0, i64 0
+  %tmp7 = load i64* %arrayidx, align 8
+  %arrayidx9 = getelementptr inbounds [2 x i64]* undef, i32 0, i64 undef
+  %tmp10 = load i64* %arrayidx9, align 8
+  br label %for.cond2
+
+for.end22:                                        ; preds = %for.cond
+  ret void
+}
diff --git a/test/Analysis/CallGraph/no-intrinsics.ll b/test/Analysis/CallGraph/no-intrinsics.ll
new file mode 100644
index 0000000..272a559
--- /dev/null
+++ b/test/Analysis/CallGraph/no-intrinsics.ll
@@ -0,0 +1,13 @@
+; RUN: opt < %s -print-callgraph -disable-output |& FileCheck %s
+
+; Check that intrinsics aren't added to the call graph
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1)
+
+define void @f(i8* %out, i8* %in) {
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %out, i8* %in, i32 100, i32 4, i1 false)
+  ret void
+}
+
+; CHECK: Call graph node for function: 'f'
+; CHECK-NOT: calls function 'llvm.memcpy.p0i8.p0i8.i32'
+\ No newline at end of file
diff --git a/test/Analysis/GlobalsModRef/indirect-global.ll b/test/Analysis/GlobalsModRef/indirect-global.ll
index 1eab0bc..826f55c 100644
--- a/test/Analysis/GlobalsModRef/indirect-global.ll
+++ b/test/Analysis/GlobalsModRef/indirect-global.ll
@@ -3,8 +3,11 @@
 
 @G = internal global i32* null		; <i32**> [#uses=3]
 
+
+declare i8* @malloc(i32)
 define void @test() {
-	%A = malloc i32		; <i32*> [#uses=1]
+	%a = call i8* @malloc(i32 4)
+        %A = bitcast i8* %a to i32*
 	store i32* %A, i32** @G
 	ret void
 }
diff --git a/test/Analysis/RegionInfo/next.ll b/test/Analysis/RegionInfo/next.ll
index d986387..377a84d 100644
--- a/test/Analysis/RegionInfo/next.ll
+++ b/test/Analysis/RegionInfo/next.ll
@@ -32,8 +32,8 @@ __label_000020:                                   ; preds = %__label_002001, %bb
 ; CHECK-NOT: =>
 ; CHECK: [0] entry => <Function Return>
 ; CHECK-NEXT:  [1] __label_002001.outer => __label_000020
-; CHECK-NEXT;      [2] bb197 => bb229
-; CHECK-NEXT;            [3] bb224 => bb229
+; CHECK-NEXT:      [2] bb197 => bb229
+; CHECK-NEXT:            [3] bb224 => bb229
 
 ; STAT: 4 region - The # of regions
 ; STAT: 1 region - The # of simple regions
diff --git a/test/Analysis/ScalarEvolution/2011-04-26-FoldAddRec.ll b/test/Analysis/ScalarEvolution/2011-04-26-FoldAddRec.ll
new file mode 100644
index 0000000..1600d5f
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/2011-04-26-FoldAddRec.ll
@@ -0,0 +1,33 @@
+; RUN: opt < %s -analyze -iv-users
+; PR9633: Tests that SCEV handles the mul.i2 recurrence being folded to
+; constant zero.
+
+define signext i8 @func_14(i8 signext %p_18) nounwind readnone ssp {
+entry:
+  br label %for.inc
+
+for.inc:
+  %p_17.addr.012 = phi i32 [ 0, %entry ], [ %add, %for.inc ]
+  %add = add nsw i32 %p_17.addr.012, 1
+  br i1 false, label %for.inc, label %for.cond
+
+for.cond:
+  %tobool.i = icmp ult i32 %add, 8192
+  %shl.i = select i1 %tobool.i, i32 13, i32 0
+  %shl.left.i = shl i32 %add, %shl.i
+  %conv.i4 = trunc i32 %shl.left.i to i8
+  br i1 undef, label %for.inc9, label %if.then
+
+for.inc9:
+  %p_18.addr.011 = phi i8 [ %add12, %for.inc9 ], [ %p_18, %for.cond ]
+  %add12 = add i8 %p_18.addr.011, 1
+  %mul.i2 = mul i8 %add12, %conv.i4
+  %mul.i2.lobit = lshr i8 %mul.i2, 7
+  %lor.ext.shr.i = select i1 undef, i8 %mul.i2.lobit, i8 %mul.i2
+  %tobool = icmp eq i8 %lor.ext.shr.i, 0
+  br i1 %tobool, label %for.inc9, label %if.then
+
+if.then:
+  ret i8 0
+
+}
+\ No newline at end of file
diff --git a/test/Analysis/TypeBasedAliasAnalysis/dynamic-indices.ll b/test/Analysis/TypeBasedAliasAnalysis/dynamic-indices.ll
new file mode 100644
index 0000000..52e394b
--- /dev/null
+++ b/test/Analysis/TypeBasedAliasAnalysis/dynamic-indices.ll
@@ -0,0 +1,131 @@
+; RUN: opt -tbaa -basicaa -gvn -S < %s | FileCheck %s
+; PR9971
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+%struct.X = type { i32, float }
+%union.vector_t = type { [2 x i64] }
+
+; Don't delete the load after the loop, because it loads values stored
+; inside the loop.
+
+; CHECK: define void @vrlh(
+
+; CHECK: for.end:
+; CHECK:   %arrayidx31 = getelementptr inbounds %union.vector_t* %t, i64 0, i32 0, i64 1
+; CHECK:   %tmp32 = load i64* %arrayidx31, align 8, !tbaa !3
+
+define void @vrlh(%union.vector_t* %va, %union.vector_t* %vb, %union.vector_t* %vd) nounwind {
+entry:
+  %t = alloca %union.vector_t, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %sub = sub nsw i32 7, %i.01
+  %idxprom = sext i32 %sub to i64
+  %half = bitcast %union.vector_t* %vb to [8 x i16]*
+  %arrayidx = getelementptr inbounds [8 x i16]* %half, i64 0, i64 %idxprom
+  %tmp4 = load i16* %arrayidx, align 2, !tbaa !0
+  %conv = zext i16 %tmp4 to i32
+  %and = and i32 %conv, 15
+  %sub6 = sub nsw i32 7, %i.01
+  %idxprom7 = sext i32 %sub6 to i64
+  %half9 = bitcast %union.vector_t* %va to [8 x i16]*
+  %arrayidx10 = getelementptr inbounds [8 x i16]* %half9, i64 0, i64 %idxprom7
+  %tmp11 = load i16* %arrayidx10, align 2, !tbaa !0
+  %conv12 = zext i16 %tmp11 to i32
+  %shl = shl i32 %conv12, %and
+  %sub15 = sub nsw i32 7, %i.01
+  %idxprom16 = sext i32 %sub15 to i64
+  %half18 = bitcast %union.vector_t* %va to [8 x i16]*
+  %arrayidx19 = getelementptr inbounds [8 x i16]* %half18, i64 0, i64 %idxprom16
+  %tmp20 = load i16* %arrayidx19, align 2, !tbaa !0
+  %conv21 = zext i16 %tmp20 to i32
+  %sub23 = sub nsw i32 16, %and
+  %shr = lshr i32 %conv21, %sub23
+  %or = or i32 %shl, %shr
+  %conv24 = trunc i32 %or to i16
+  %sub26 = sub nsw i32 7, %i.01
+  %idxprom27 = sext i32 %sub26 to i64
+  %half28 = bitcast %union.vector_t* %t to [8 x i16]*
+  %arrayidx29 = getelementptr inbounds [8 x i16]* %half28, i64 0, i64 %idxprom27
+  store i16 %conv24, i16* %arrayidx29, align 2, !tbaa !0
+  %inc = add nsw i32 %i.01, 1
+  %cmp = icmp slt i32 %inc, 8
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %arrayidx31 = getelementptr inbounds %union.vector_t* %t, i64 0, i32 0, i64 1
+  %tmp32 = load i64* %arrayidx31, align 8, !tbaa !3
+  %arrayidx35 = getelementptr inbounds %union.vector_t* %vd, i64 0, i32 0, i64 1
+  store i64 %tmp32, i64* %arrayidx35, align 8, !tbaa !3
+  %arrayidx37 = getelementptr inbounds %union.vector_t* %t, i64 0, i32 0, i64 0
+  %tmp38 = load i64* %arrayidx37, align 8, !tbaa !3
+  %arrayidx41 = getelementptr inbounds %union.vector_t* %vd, i64 0, i32 0, i64 0
+  store i64 %tmp38, i64* %arrayidx41, align 8, !tbaa !3
+  ret void
+}
+
+; Do delete the load after the loop.
+
+; CHECK: define i32 @test0(
+
+; CHECK:   ret i32 0
+
+define i32 @test0(%struct.X* %a) nounwind {
+entry:
+  %i = getelementptr inbounds %struct.X* %a, i64 0, i32 0
+  store i32 0, i32* %i, align 4, !tbaa !4
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i2.01 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %f = getelementptr inbounds %struct.X* %a, i64 %i2.01, i32 1
+  %tmp6 = load float* %f, align 4, !tbaa !5
+  %mul = fmul float %tmp6, 0x40019999A0000000
+  store float %mul, float* %f, align 4, !tbaa !5
+  %inc = add nsw i64 %i2.01, 1
+  %cmp = icmp slt i64 %inc, 10000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %i9 = getelementptr inbounds %struct.X* %a, i64 0, i32 0
+  %tmp10 = load i32* %i9, align 4, !tbaa !4
+  ret i32 %tmp10
+}
+
+; Do delete the load after the loop.
+
+; CHECK: define float @test1(
+
+; CHECK:   ret float 0x3FD3333340000000
+
+define float @test1(%struct.X* %a) nounwind {
+entry:
+  %f = getelementptr inbounds %struct.X* %a, i64 0, i32 1
+  store float 0x3FD3333340000000, float* %f, align 4, !tbaa !5
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.01 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %i5 = getelementptr inbounds %struct.X* %a, i64 %i.01, i32 0
+  %tmp6 = load i32* %i5, align 4, !tbaa !4
+  %mul = mul nsw i32 %tmp6, 3
+  store i32 %mul, i32* %i5, align 4, !tbaa !4
+  %inc = add nsw i64 %i.01, 1
+  %cmp = icmp slt i64 %inc, 10000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %f9 = getelementptr inbounds %struct.X* %a, i64 0, i32 1
+  %tmp10 = load float* %f9, align 4, !tbaa !5
+  ret float %tmp10
+}
+
+!0 = metadata !{metadata !"short", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
+!3 = metadata !{metadata !"long long", metadata !1}
+!4 = metadata !{metadata !"int", metadata !1}
+!5 = metadata !{metadata !"float", metadata !1}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll b/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll
new file mode 100644
index 0000000..8f080e2
--- /dev/null
+++ b/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll
@@ -0,0 +1,27 @@
+; RUN: opt -tbaa -basicaa -gvn -S < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+
+; TBAA should prove that these calls don't interfere, since they are
+; IntrArgReadMem and have TBAA metadata.
+
+; CHECK:      define <8 x i16> @test0(i8* %p, i8* %q, <8 x i16> %y) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
+; CHECK-NEXT:   call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK-NEXT:   %c = add <8 x i16> %a, %a
+define <8 x i16> @test0(i8* %p, i8* %q, <8 x i16> %y) {
+entry:
+  %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind, !tbaa !2
+  call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16), !tbaa !1
+  %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind, !tbaa !2
+  %c = add <8 x i16> %a, %b
+  ret <8 x i16> %c
+}
+
+declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly
+declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
+
+!0 = metadata !{metadata !"tbaa root", null}
+!1 = metadata !{metadata !"A", metadata !0}
+!2 = metadata !{metadata !"B", metadata !0}
diff --git a/test/Archive/check_binary_output.ll b/test/Archive/check_binary_output.ll
new file mode 100644
index 0000000..60ab5ca
--- /dev/null
+++ b/test/Archive/check_binary_output.ll
@@ -0,0 +1,4 @@
+; This is not an assembly file, this is just to run the test.
+; The test verifies that llvm-ar produces a binary output.
+
+;RUN: llvm-ar p %p/GNU.a very_long_bytecode_file_name.bc | cmp -s %p/very_long_bytecode_file_name.bc -
diff --git a/test/Assembler/2005-02-09-AsmWriterStoreBug.ll b/test/Assembler/2005-02-09-AsmWriterStoreBug.ll
deleted file mode 100644
index 4ec1796..0000000
--- a/test/Assembler/2005-02-09-AsmWriterStoreBug.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llvm-as < %s | llvm-dis | llvm-as
-
-; Ensure that the asm writer emits types before both operands of the 
-; store, even though they can be the same.
-
-%RecTy = type %RecTy*
-
-define void @foo() {
-        %A = malloc %RecTy              ; <%RecTy> [#uses=1]
-        %B = malloc %RecTy              ; <%RecTy> [#uses=1]
-        store %RecTy %B, %RecTy %A
-        ret void
-}
-
diff --git a/test/Assembler/2006-05-26-VarargsCallEncode.ll b/test/Assembler/2006-05-26-VarargsCallEncode.ll
deleted file mode 100644
index 6dc60c3..0000000
--- a/test/Assembler/2006-05-26-VarargsCallEncode.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; RUN: llvm-as < %s | llvm-dis | grep {tail call void.*sret null}
-
-declare void @foo({  }* sret , ...)
-
-define void @bar() {
-        tail call void ({  }* sret , ...)* @foo( {  }* null sret , i32 0 )
-        ret void
-}
diff --git a/test/Assembler/2007-07-30-AutoUpgradeZextSext.ll b/test/Assembler/2007-07-30-AutoUpgradeZextSext.ll
deleted file mode 100644
index ea2db44..0000000
--- a/test/Assembler/2007-07-30-AutoUpgradeZextSext.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; Test that upgrading zext/sext attributes to zeroext and signext
-; works correctly.
-; PR1553
-; RUN: llvm-as < %s > /dev/null
-
-define i32 @bar() {
-        %t = call i8 @foo( i8 10 sext ) zext
-        %x = zext i8 %t to i32
-        ret i32 %x
-}
-
-declare i8 @foo(i8 signext ) zeroext
diff --git a/test/Assembler/2007-11-27-AutoUpgradeAttributes.ll b/test/Assembler/2007-11-27-AutoUpgradeAttributes.ll
deleted file mode 100644
index ee260ea..0000000
--- a/test/Assembler/2007-11-27-AutoUpgradeAttributes.ll
+++ /dev/null
@@ -1,3 +0,0 @@
-; RUN: llvm-as < %s
-
-@FP = weak global i8 (...) signext * null
diff --git a/test/Assembler/AutoUpgradeIntrinsics.ll b/test/Assembler/AutoUpgradeIntrinsics.ll
index 6752bd8..20beb49 100644
--- a/test/Assembler/AutoUpgradeIntrinsics.ll
+++ b/test/Assembler/AutoUpgradeIntrinsics.ll
@@ -7,7 +7,10 @@
 ; RUN: llvm-as < %s | llvm-dis | \
 ; RUN:   not grep {llvm\\.bswap\\.i\[0-9\]*\\.i\[0-9\]*}
 ; RUN: llvm-as < %s | llvm-dis | \
+; RUN:   not grep {llvm\\.x86\\.sse2\\.loadu}
+; RUN: llvm-as < %s | llvm-dis | \
 ; RUN:   grep {llvm\\.x86\\.mmx\\.ps} | grep {x86_mmx} | count 16
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
 
 declare i32 @llvm.ctpop.i28(i28 %val)
 declare i32 @llvm.cttz.i29(i29 %val)
@@ -79,3 +82,38 @@ define void @sh64(<1 x i64> %A, <2 x i32> %B) {
 	%r2 = call <1 x i64> @llvm.x86.mmx.psrl.q( <1 x i64> %A, <2 x i32> %B )		; <<1 x i64>> [#uses=0]
 	ret void
 }
+
+declare <4 x float> @llvm.x86.sse.loadu.ps(i8*) nounwind readnone
+declare <16 x i8> @llvm.x86.sse2.loadu.dq(i8*) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.loadu.pd(double*) nounwind readnone
+define void @test_loadu(i8* %a, double* %b) {
+  %v0 = call <4 x float> @llvm.x86.sse.loadu.ps(i8* %a)
+  %v1 = call <16 x i8> @llvm.x86.sse2.loadu.dq(i8* %a)
+  %v2 = call <2 x double> @llvm.x86.sse2.loadu.pd(double* %b)
+  ret void
+}
+
+declare void @llvm.x86.sse.movnt.ps(i8*, <4 x float>) nounwind readnone 
+declare void @llvm.x86.sse2.movnt.dq(i8*, <2 x double>) nounwind readnone 
+declare void @llvm.x86.sse2.movnt.pd(i8*, <2 x double>) nounwind readnone 
+declare void @llvm.x86.sse2.movnt.i(i8*, i32) nounwind readnone 
+
+define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D) {
+; CHECK: store{{.*}}nontemporal
+  call void @llvm.x86.sse.movnt.ps(i8* %B, <4 x float> %A)
+; CHECK: store{{.*}}nontemporal
+  call void @llvm.x86.sse2.movnt.dq(i8* %B, <2 x double> %C)
+; CHECK: store{{.*}}nontemporal
+  call void @llvm.x86.sse2.movnt.pd(i8* %B, <2 x double> %C)
+; CHECK: store{{.*}}nontemporal
+  call void @llvm.x86.sse2.movnt.i(i8* %B, i32 %D)
+  ret void
+}
+
+declare void @llvm.prefetch(i8*, i32, i32) nounwind
+
+define void @p(i8* %ptr) {
+; CHECK: llvm.prefetch(i8* %ptr, i32 0, i32 1, i32 1)
+  tail call void @llvm.prefetch(i8* %ptr, i32 0, i32 1)
+  ret void
+}
diff --git a/test/Assembler/invalid_cast.ll b/test/Assembler/invalid_cast.ll
new file mode 100644
index 0000000..c5b082b
--- /dev/null
+++ b/test/Assembler/invalid_cast.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s |& grep {invalid cast opcode}
+
+define <3 x i8> @foo(<4 x i64> %x) {
+  %y = trunc <4 x i64> %x to <3 x i8>
+  ret <3 x i8> %y
+}
diff --git a/test/Assembler/invalid_cast2.ll b/test/Assembler/invalid_cast2.ll
new file mode 100644
index 0000000..f2e7c41
--- /dev/null
+++ b/test/Assembler/invalid_cast2.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s |& grep {invalid cast opcode}
+
+define i8 @foo(<4 x i64> %x) {
+  %y = trunc <4 x i64> %x to i8
+  ret i8 %y
+}
diff --git a/test/Assembler/named-metadata.ll b/test/Assembler/named-metadata.ll
new file mode 100644
index 0000000..db72810
--- /dev/null
+++ b/test/Assembler/named-metadata.ll
@@ -0,0 +1,24 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+!0 = metadata !{metadata !"zero"}
+!1 = metadata !{metadata !"one"}
+!2 = metadata !{metadata !"two"}
+
+!foo = !{!0, !1, !2}
+; CHECK: !foo = !{!0, !1, !2}
+
+!\23pragma = !{!0, !1, !2}
+; CHECK: !\23pragma = !{!0, !1, !2}
+
+; \31 is the digit '1'. On emission, we escape the first character (to avoid
+; conflicting with anonymous metadata), but not the subsequent ones.
+!\31\31\31 = !{!0, !1, !2}
+; CHECK: !\3111 = !{!0, !1, !2}
+
+!\22name\22 = !{!0, !1, !2}
+; CHECK: !\22name\22 = !{!0, !1, !2}
+
+; \x doesn't mean anything, so we parse it literally but escape the \ into \5C
+; when emitting it, followed by xfoo.
+!\xfoo = !{!0, !1, !2}
+; CHECK: !\5Cxfoo = !{!0, !1, !2}
diff --git a/test/Bitcode/2006-12-11-Cast-ConstExpr.ll b/test/Bitcode/2006-12-11-Cast-ConstExpr.ll
index 6df8711..e704627 100644
--- a/test/Bitcode/2006-12-11-Cast-ConstExpr.ll
+++ b/test/Bitcode/2006-12-11-Cast-ConstExpr.ll
@@ -1,7 +1,7 @@
 ; This test ensures that we get a bitcast constant expression in and out,
 ; not a sitofp constant expression. 
-; RUN: llvm-as < %s | llvm-dis | \
-; RUN:   grep {bitcast (}
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; CHECK: bitcast (
 
 @G = external global i32
 
diff --git a/test/Bitcode/AutoUpgradeGlobals.ll b/test/Bitcode/AutoUpgradeGlobals.ll
index 8a87673..a5af2b8 100644
--- a/test/Bitcode/AutoUpgradeGlobals.ll
+++ b/test/Bitcode/AutoUpgradeGlobals.ll
@@ -1,3 +1,4 @@
 ; This isn't really an assembly file. It just runs test on bitcode to ensure
 ; it is auto-upgraded.
-; RUN: llvm-dis < %s.bc | not grep {i32 @\\.llvm\\.eh}
+; RUN: llvm-dis < %s.bc | FileCheck %s 
+; CHECK-NOT: {i32 @\\.llvm\\.eh}
diff --git a/test/Bitcode/AutoUpgradeIntrinsics.ll b/test/Bitcode/AutoUpgradeIntrinsics.ll
index 5f9bcd5..c3e2e9e 100644
--- a/test/Bitcode/AutoUpgradeIntrinsics.ll
+++ b/test/Bitcode/AutoUpgradeIntrinsics.ll
@@ -1,10 +1,8 @@
 ; This isn't really an assembly file. It just runs test on bitcode to ensure
 ; it is auto-upgraded.
-; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.ct}
-; RUN: llvm-dis < %s.bc | \
-; RUN:   not grep {llvm\\.part\\.set\\.i\[0-9\]*\\.i\[0-9\]*\\.i\[0-9\]*}
-; RUN: llvm-dis < %s.bc | \
-; RUN:   not grep {llvm\\.part\\.select\\.i\[0-9\]*\\.i\[0-9\]*}
-; RUN: llvm-dis < %s.bc | \
-; RUN:   not grep {llvm\\.bswap\\.i\[0-9\]*\\.i\[0-9\]*}
+; RUN: llvm-dis < %s.bc | FileCheck %s
+; CHECK-NOT: {i32 @llvm\\.ct}
+; CHECK-NOT: {llvm\\.part\\.set\\.i\[0-9\]*\\.i\[0-9\]*\\.i\[0-9\]*}
+; CHECK-NOT: {llvm\\.part\\.select\\.i\[0-9\]*\\.i\[0-9\]*}
+; CHECK-NOT: {llvm\\.bswap\\.i\[0-9\]*\\.i\[0-9\]*}
 
diff --git a/test/Bitcode/blockaddress.ll b/test/Bitcode/blockaddress.ll
new file mode 100644
index 0000000..b9f3341
--- /dev/null
+++ b/test/Bitcode/blockaddress.ll
@@ -0,0 +1,30 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; PR9857
+
+define void @f(i8** nocapture %ptr1) {
+; CHECK: define void @f
+entry:
+  br label %here.i
+
+here.i:
+  store i8* blockaddress(@doit, %here), i8** %ptr1, align 8
+; CHECK: blockaddress(@doit, %here)
+  br label %doit.exit
+
+doit.exit:
+  ret void
+}
+
+define void @doit(i8** nocapture %pptr) {
+; CHECK: define void @doit
+entry:
+  br label %here
+
+here:
+  store i8* blockaddress(@doit, %here), i8** %pptr, align 8
+; CHECK: blockaddress(@doit, %here)
+  br label %end
+
+end:
+  ret void
+}
diff --git a/test/Bitcode/sse2_loadl_pd.ll b/test/Bitcode/sse2_loadl_pd.ll
index b0bea16..6cb0da5 100644
--- a/test/Bitcode/sse2_loadl_pd.ll
+++ b/test/Bitcode/sse2_loadl_pd.ll
@@ -1,2 +1,3 @@
-; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.loadl.pd}
-; RUN: llvm-dis < %s.bc | grep shufflevector
+; RUN: llvm-dis < %s.bc | FileCheck %s
+; CHECK-NOT: {i32 @llvm\\.loadl.pd} 
+; CHECK: shufflevector
diff --git a/test/Bitcode/sse2_movl_dq.ll b/test/Bitcode/sse2_movl_dq.ll
index 093d8213..2fc0149 100644
--- a/test/Bitcode/sse2_movl_dq.ll
+++ b/test/Bitcode/sse2_movl_dq.ll
@@ -1,2 +1,3 @@
-; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.movl.dq}
-; RUN: llvm-dis < %s.bc | grep shufflevector
+; RUN: llvm-dis < %s.bc | FileCheck %s 
+; CHECK-NOT: {i32 @llvm\\.movl.dq}
+; CHECK: shufflevector
diff --git a/test/Bitcode/sse2_movs_d.ll b/test/Bitcode/sse2_movs_d.ll
index 25a35b6..ab82c43 100644
--- a/test/Bitcode/sse2_movs_d.ll
+++ b/test/Bitcode/sse2_movs_d.ll
@@ -1,2 +1,3 @@
-; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.movs.d}
-; RUN: llvm-dis < %s.bc | grep shufflevector
+; RUN: llvm-dis < %s.bc | FileCheck %s
+; CHECK-NOT: {i32 @llvm\\.movs.d}
+; CHECK: shufflevector
diff --git a/test/Bitcode/sse2_punpck_qdq.ll b/test/Bitcode/sse2_punpck_qdq.ll
index b9d711c..4c68af5 100644
--- a/test/Bitcode/sse2_punpck_qdq.ll
+++ b/test/Bitcode/sse2_punpck_qdq.ll
@@ -1,3 +1,4 @@
-; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.punpckh.qdq}
-; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.punpckl.qdq}
-; RUN: llvm-dis < %s.bc | grep shufflevector
+; RUN: llvm-dis < %s.bc | FileCheck %s
+; CHECK-NOT: {i32 @llvm\\.punpckh.qdq}
+; CHECK-NOT: {i32 @llvm\\.punpckl.qdq}
+; CHECK: shufflevector
diff --git a/test/Bitcode/sse2_shuf_pd.ll b/test/Bitcode/sse2_shuf_pd.ll
index 5829edb..1ba6a1d 100644
--- a/test/Bitcode/sse2_shuf_pd.ll
+++ b/test/Bitcode/sse2_shuf_pd.ll
@@ -1,2 +1,3 @@
-; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.shuf.pd}
-; RUN: llvm-dis < %s.bc | grep shufflevector
+; RUN: llvm-dis < %s.bc | FileCheck %s
+; CHECK-NOT: {i32 @llvm\\.shuf.pd}
+; CHECK: shufflevector
diff --git a/test/Bitcode/sse2_unpck_pd.ll b/test/Bitcode/sse2_unpck_pd.ll
index f4e5d54..99b61b6 100644
--- a/test/Bitcode/sse2_unpck_pd.ll
+++ b/test/Bitcode/sse2_unpck_pd.ll
@@ -1,3 +1,4 @@
-; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.unpckh.pd}
-; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.unpckl.pd}
-; RUN: llvm-dis < %s.bc | grep shufflevector
+; RUN: llvm-dis < %s.bc | FileCheck %s
+; CHECK-NOT: {i32 @llvm\\.unpckh.pd}
+; CHECK-NOT: {i32 @llvm\\.unpckl.pd}
+; CHECK: shufflevector
diff --git a/test/Bitcode/sse41_pmulld.ll b/test/Bitcode/sse41_pmulld.ll
index 6872cc0..752786d 100644
--- a/test/Bitcode/sse41_pmulld.ll
+++ b/test/Bitcode/sse41_pmulld.ll
@@ -1,2 +1,3 @@
-; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.pmulld}
-; RUN: llvm-dis < %s.bc | grep mul
+; RUN: llvm-dis < %s.bc | FileCheck %s
+; CHECK-NOT: {i32 @llvm\\.pmulld}
+; CHECK: mul
diff --git a/test/Bitcode/sse42_crc32.ll b/test/Bitcode/sse42_crc32.ll
new file mode 100644
index 0000000..1c371c3
--- /dev/null
+++ b/test/Bitcode/sse42_crc32.ll
@@ -0,0 +1,28 @@
+; Check to make sure old CRC32 intrinsics are auto-upgraded
+; correctly.
+;
+; Rdar: 9472944
+;
+; RUN: llvm-dis < %s.bc | FileCheck %s
+
+; crc32.8 should upgrade to crc32.32.8
+; CHECK: i32 @llvm.x86.sse42.crc32.32.8(
+; CHECK-NOT: i32 @llvm.x86.sse42.crc32.8(
+
+; crc32.16 should upgrade to crc32.32.16
+; CHECK: i32 @llvm.x86.sse42.crc32.32.16(
+; CHECK-NOT: i32 @llvm.x86.sse42.crc32.16(
+
+; crc32.32 should upgrade to crc32.32.32
+; CHECK: i32 @llvm.x86.sse42.crc32.32.32(
+; CHECK-NOT: i32 @llvm.x86.sse42.crc32.32(
+
+; crc64.8 should upgrade to crc32.64.8
+; CHECK: i64 @llvm.x86.sse42.crc32.64.8(
+; CHECK-NOT: i64 @llvm.x86.sse42.crc64.8(
+
+; crc64.64 should upgrade to crc32.64.64
+; CHECK: i64 @llvm.x86.sse42.crc32.64.64(
+; CHECK-NOT: i64 @llvm.x86.sse42.crc64.64(
+
+
diff --git a/test/Bitcode/sse42_crc32.ll.bc b/test/Bitcode/sse42_crc32.ll.bc
new file mode 100644
index 0000000..d895fad
--- /dev/null
+++ b/test/Bitcode/sse42_crc32.ll.bc
diff --git a/test/Bitcode/ssse3_palignr.ll b/test/Bitcode/ssse3_palignr.ll
index d596dd5..f62ca11 100644
--- a/test/Bitcode/ssse3_palignr.ll
+++ b/test/Bitcode/ssse3_palignr.ll
@@ -1 +1,2 @@
-; RUN: llvm-dis < %s.bc | not grep {@llvm\\.palign}
+; RUN: llvm-dis < %s.bc | FileCheck %s 
+; CHECK-NOT: {@llvm\\.palign}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index b696682..82eac60 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -37,14 +37,32 @@ if(PYTHONINTERP_FOUND)
   foreach(INC_DIR ${INC_DIRS})
     set(IDIRS "${IDIRS} -I${INC_DIR}")
   endforeach()
-  string(REPLACE "<CMAKE_CXX_COMPILER>" "${CMAKE_CXX_COMPILER}" TEST_COMPILE_CXX_CMD ${CMAKE_CXX_COMPILE_OBJECT})
+
+  if( MSVC )
+    # The compiler's path may contain white space. Wrap it:
+    string(REPLACE "<CMAKE_CXX_COMPILER>" "\\\"${CMAKE_CXX_COMPILER}\\\"" TEST_COMPILE_CXX_CMD ${CMAKE_CXX_COMPILE_OBJECT})
+    # Eliminate continuation lines from NMake flow. PR9680
+    string(REPLACE "@<<\n"                " "                     TEST_COMPILE_CXX_CMD ${TEST_COMPILE_CXX_CMD})
+    string(REPLACE "\n<<"                 " "                     TEST_COMPILE_CXX_CMD ${TEST_COMPILE_CXX_CMD})
+  else()
+    string(REPLACE "<CMAKE_CXX_COMPILER>" "${CMAKE_CXX_COMPILER}" TEST_COMPILE_CXX_CMD ${CMAKE_CXX_COMPILE_OBJECT})
+  endif()
+
   string(REPLACE "<DEFINES>"            "${DEFS}"               TEST_COMPILE_CXX_CMD ${TEST_COMPILE_CXX_CMD})
   string(REPLACE "<FLAGS>"              "${CMAKE_CXX_FLAGS}"    TEST_COMPILE_CXX_CMD ${TEST_COMPILE_CXX_CMD})
-  string(REPLACE "-o"                   ""                      TEST_COMPILE_CXX_CMD ${TEST_COMPILE_CXX_CMD})
+  if (MSVC) # PR9680
+    # Eliminate MSVC equivalent of -o
+    string(REPLACE "/Fo<OBJECT>"        ""                      TEST_COMPILE_CXX_CMD ${TEST_COMPILE_CXX_CMD})
+    # Eliminate "how to rename program database" argument
+    string(REPLACE "/Fd<TARGET_PDB>"    ""                      TEST_COMPILE_CXX_CMD ${TEST_COMPILE_CXX_CMD})
+  else()
+    string(REPLACE "-o"                 ""                      TEST_COMPILE_CXX_CMD ${TEST_COMPILE_CXX_CMD})
+  endif(MSVC)
   string(REGEX REPLACE "<[^>]+>"        ""                      TEST_COMPILE_CXX_CMD ${TEST_COMPILE_CXX_CMD})
   set(TEST_COMPILE_CXX_CMD "${TEST_COMPILE_CXX_CMD} ${IDIRS}")
   if(NOT MSVC)
     set(TEST_COMPILE_CXX_CMD "${TEST_COMPILE_CXX_CMD} -x c++")
+    # MSVC already has /TP to indicate a C++ source file
   endif()
   configure_file(
     ${CMAKE_CURRENT_SOURCE_DIR}/site.exp.in
@@ -53,7 +71,7 @@ if(PYTHONINTERP_FOUND)
   MAKE_DIRECTORY(${CMAKE_CURRENT_BINARY_DIR}/Unit)
 
   # Configuration-time: See Unit/lit.site.cfg.in
-  set(LLVM_BUILD_MODE "%(build_mode)s")
+  set(LLVM_BUILD_MODE "${LLVM_BUILD_MODE}")
 
   set(LLVM_SOURCE_DIR ${LLVM_MAIN_SRC_DIR})
   set(LLVM_BINARY_DIR ${LLVM_BINARY_DIR})
diff --git a/test/CodeGen/ARM/2007-03-26-RegScavengerAssert.ll b/test/CodeGen/ARM/2007-03-26-RegScavengerAssert.ll
deleted file mode 100644
index 76fa364..0000000
--- a/test/CodeGen/ARM/2007-03-26-RegScavengerAssert.ll
+++ /dev/null
@@ -1,947 +0,0 @@
-; RUN: llc < %s -march=arm
-; PR1266
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
-target triple = "arm-unknown-linux-gnueabi"
-	%struct.CUMULATIVE_ARGS = type { i32, i32, i32, i32, i32, i32 }
-	%struct.FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct.FILE*, i32, i32, i32, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i32, [52 x i8] }
-	%struct.VEC_edge = type { i32, i32, [1 x %struct.edge_def*] }
-	%struct.VEC_tree = type { i32, i32, [1 x %struct.tree_node*] }
-	%struct._IO_marker = type { %struct._IO_marker*, %struct.FILE*, i32 }
-	%struct._obstack_chunk = type { i8*, %struct._obstack_chunk*, [4 x i8] }
-	%struct.addr_diff_vec_flags = type { i8, i8, i8, i8 }
-	%struct.arm_stack_offsets = type { i32, i32, i32, i32, i32 }
-	%struct.attribute_spec = type { i8*, i32, i32, i8, i8, i8, %struct.tree_node* (%struct.tree_node**, %struct.tree_node*, %struct.tree_node*, i32, i8*)* }
-	%struct.basic_block_def = type { %struct.rtx_def*, %struct.rtx_def*, %struct.tree_node*, %struct.VEC_edge*, %struct.VEC_edge*, %struct.bitmap_head_def*, %struct.bitmap_head_def*, i8*, %struct.loop*, [2 x %struct.et_node*], %struct.basic_block_def*, %struct.basic_block_def*, %struct.reorder_block_def*, %struct.bb_ann_d*, i64, i32, i32, i32, i32 }
-	%struct.bb_ann_d = type { %struct.tree_node*, i8, %struct.edge_prediction* }
-	%struct.bitmap_element_def = type { %struct.bitmap_element_def*, %struct.bitmap_element_def*, i32, [4 x i32] }
-	%struct.bitmap_head_def = type { %struct.bitmap_element_def*, %struct.bitmap_element_def*, i32, %struct.bitmap_obstack* }
-	%struct.bitmap_obstack = type { %struct.bitmap_element_def*, %struct.bitmap_head_def*, %struct.obstack }
-	%struct.cgraph_edge = type { %struct.cgraph_node*, %struct.cgraph_node*, %struct.cgraph_edge*, %struct.cgraph_edge*, %struct.cgraph_edge*, %struct.cgraph_edge*, %struct.tree_node*, i8*, i8* }
-	%struct.cgraph_global_info = type { %struct.cgraph_node*, i32, i8 }
-	%struct.cgraph_local_info = type { i32, i8, i8, i8, i8, i8, i8, i8 }
-	%struct.cgraph_node = type { %struct.tree_node*, %struct.cgraph_edge*, %struct.cgraph_edge*, %struct.cgraph_node*, %struct.cgraph_node*, %struct.cgraph_node*, %struct.cgraph_node*, %struct.cgraph_node*, %struct.cgraph_node*, %struct.cgraph_node*, i8*, %struct.cgraph_local_info, %struct.cgraph_global_info, %struct.cgraph_rtl_info, i32, i8, i8, i8, i8, i8 }
-	%struct.cgraph_rtl_info = type { i32, i8, i8 }
-	%struct.cl_perfunc_opts = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
-	%struct.cselib_val_struct = type opaque
-	%struct.dataflow_d = type { %struct.varray_head_tag*, [2 x %struct.tree_node*] }
-	%struct.def_operand_ptr = type { %struct.tree_node** }
-	%struct.def_optype_d = type { i32, [1 x %struct.def_operand_ptr] }
-	%struct.diagnostic_context = type { %struct.pretty_printer*, [8 x i32], i8, i8, i8, void (%struct.diagnostic_context*, %struct.diagnostic_info*)*, void (%struct.diagnostic_context*, %struct.diagnostic_info*)*, void (i8*, i8**)*, %struct.tree_node*, i32, i32 }
-	%struct.diagnostic_info = type { %struct.text_info, %struct.location_t, i32 }
-	%struct.die_struct = type opaque
-	%struct.edge_def = type { %struct.basic_block_def*, %struct.basic_block_def*, %struct.edge_def_insns, i8*, %struct.location_t*, i32, i32, i64, i32 }
-	%struct.edge_def_insns = type { %struct.rtx_def* }
-	%struct.edge_prediction = type { %struct.edge_prediction*, %struct.edge_def*, i32, i32 }
-	%struct.eh_status = type opaque
-	%struct.elt_list = type opaque
-	%struct.elt_t = type { %struct.tree_node*, %struct.tree_node* }
-	%struct.emit_status = type { i32, i32, %struct.rtx_def*, %struct.rtx_def*, %struct.sequence_stack*, i32, %struct.location_t, i32, i8*, %struct.rtx_def** }
-	%struct.et_node = type opaque
-	%struct.expr_status = type { i32, i32, i32, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def* }
-	%struct.function = type { %struct.eh_status*, %struct.expr_status*, %struct.emit_status*, %struct.varasm_status*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.function*, i32, i32, i32, i32, %struct.rtx_def*, %struct.CUMULATIVE_ARGS, %struct.rtx_def*, %struct.rtx_def*, %struct.initial_value_struct*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, i8, i32, i64, %struct.tree_node*, %struct.tree_node*, %struct.rtx_def*, %struct.varray_head_tag*, %struct.temp_slot*, i32, %struct.var_refs_queue*, i32, i32, %struct.rtvec_def*, %struct.tree_node*, i32, i32, i32, %struct.machine_function*, i32, i32, i8, i8, %struct.language_function*, %struct.rtx_def*, i32, i32, i32, i32, %struct.location_t, %struct.varray_head_tag*, %struct.tree_node*, i8, i8, i8 }
-	%struct.ggc_root_tab = type { i8*, i32, i32, void (i8*)*, void (i8*)* }
-	%struct.gimplify_ctx = type { %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.varray_head_tag*, %struct.htab*, i32, i8, i8 }
-	%struct.gimplify_init_ctor_preeval_data = type { %struct.tree_node*, i32 }
-	%struct.ht_identifier = type { i8*, i32, i32 }
-	%struct.htab = type { i32 (i8*)*, i32 (i8*, i8*)*, void (i8*)*, i8**, i32, i32, i32, i32, i32, i8* (i32, i32)*, void (i8*)*, i8*, i8* (i8*, i32, i32)*, void (i8*, i8*)*, i32 }
-	%struct.initial_value_struct = type opaque
-	%struct.lang_decl = type opaque
-	%struct.lang_hooks = type { i8*, i32, i32 (i32)*, i32 (i32, i8**)*, void (%struct.diagnostic_context*)*, i32 (i32, i8*, i32)*, i8 (i8*, i32) zeroext *, i8 (i8**) zeroext *, i8 () zeroext *, void ()*, void ()*, void (i32)*, void ()*, i64 (%struct.tree_node*)*, %struct.tree_node* (%struct.tree_node*)*, %struct.rtx_def* (%struct.tree_node*, %struct.rtx_def*, i32, i32, %struct.rtx_def**)*, i32 (%struct.tree_node*)*, %struct.tree_node* (%struct.tree_node*)*, i32 (%struct.rtx_def*, %struct.tree_node*)*, void (%struct.tree_node*)*, i8 (%struct.tree_node*) zeroext *, %struct.tree_node* (%struct.tree_node*)*, void (%struct.tree_node*)*, void (%struct.tree_node*)*, i8 () zeroext *, i8, i8, void ()*, void (%struct.FILE*, %struct.tree_node*, i32)*, void (%struct.FILE*, %struct.tree_node*, i32)*, void (%struct.FILE*, %struct.tree_node*, i32)*, void (%struct.FILE*, %struct.tree_node*, i32)*, i8* (%struct.tree_node*, i32)*, i32 (%struct.tree_node*, %struct.tree_node*)*, %struct.tree_node* (%struct.tree_node*)*, void (%struct.diagnostic_context*, i8*)*, %struct.tree_node* (%struct.tree_node*)*, i64 (i64)*, %struct.attribute_spec*, %struct.attribute_spec*, %struct.attribute_spec*, i32 (%struct.tree_node*)*, %struct.lang_hooks_for_functions, %struct.lang_hooks_for_tree_inlining, %struct.lang_hooks_for_callgraph, %struct.lang_hooks_for_tree_dump, %struct.lang_hooks_for_decls, %struct.lang_hooks_for_types, i32 (%struct.tree_node**, %struct.tree_node**, %struct.tree_node**)*, %struct.tree_node* (%struct.tree_node*, %struct.tree_node*)*, %struct.tree_node* (i8*, %struct.tree_node*, i32, i32, i8*, %struct.tree_node*)* }
-	%struct.lang_hooks_for_callgraph = type { %struct.tree_node* (%struct.tree_node**, i32*, %struct.tree_node*)*, void (%struct.tree_node*)* }
-	%struct.lang_hooks_for_decls = type { i32 ()*, void (%struct.tree_node*)*, %struct.tree_node* (%struct.tree_node*)*, %struct.tree_node* ()*, i8 (%struct.tree_node*) zeroext *, void ()*, void (%struct.tree_node*)*, i8 (%struct.tree_node*) zeroext *, i8* (%struct.tree_node*)* }
-	%struct.lang_hooks_for_functions = type { void (%struct.function*)*, void (%struct.function*)*, void (%struct.function*)*, void (%struct.function*)*, i8 (%struct.tree_node*) zeroext * }
-	%struct.lang_hooks_for_tree_dump = type { i8 (i8*, %struct.tree_node*) zeroext *, i32 (%struct.tree_node*)* }
-	%struct.lang_hooks_for_tree_inlining = type { %struct.tree_node* (%struct.tree_node**, i32*, %struct.tree_node* (%struct.tree_node**, i32*, i8*)*, i8*, %struct.pointer_set_t*)*, i32 (%struct.tree_node**)*, i32 (%struct.tree_node*)*, %struct.tree_node* (i8*, %struct.tree_node*)*, i32 (%struct.tree_node*, %struct.tree_node*)*, i32 (%struct.tree_node*)*, i8 (%struct.tree_node*, %struct.tree_node*) zeroext *, i32 (%struct.tree_node*)*, void (%struct.tree_node*)*, %struct.tree_node* (%struct.tree_node*, %struct.tree_node*, %struct.tree_node*, i32)* }
-	%struct.lang_hooks_for_types = type { %struct.tree_node* (i32)*, %struct.tree_node* (i32, i32)*, %struct.tree_node* (i32, i32)*, %struct.tree_node* (%struct.tree_node*)*, %struct.tree_node* (%struct.tree_node*)*, %struct.tree_node* (i32, %struct.tree_node*)*, %struct.tree_node* (%struct.tree_node*)*, void (%struct.tree_node*, i8*)*, void (%struct.tree_node*, %struct.tree_node*)*, %struct.tree_node* (%struct.tree_node*)*, i8 }
-	%struct.lang_type = type opaque
-	%struct.language_function = type opaque
-	%struct.location_t = type { i8*, i32 }
-	%struct.loop = type opaque
-	%struct.machine_function = type { %struct.rtx_def*, i32, i32, i32, %struct.arm_stack_offsets, i32, i32, i32, [14 x %struct.rtx_def*] }
-	%struct.mem_attrs = type { i64, %struct.tree_node*, %struct.rtx_def*, %struct.rtx_def*, i32 }
-	%struct.obstack = type { i32, %struct._obstack_chunk*, i8*, i8*, i8*, i32, i32, %struct._obstack_chunk* (i8*, i32)*, void (i8*, %struct._obstack_chunk*)*, i8*, i8 }
-	%struct.output_buffer = type { %struct.obstack, %struct.FILE*, i32, [128 x i8] }
-	%struct.phi_arg_d = type { %struct.tree_node*, i8 }
-	%struct.pointer_set_t = type opaque
-	%struct.pretty_printer = type { %struct.output_buffer*, i8*, i32, i32, i32, i32, i32, i8 (%struct.pretty_printer*, %struct.text_info*) zeroext *, i8, i8 }
-	%struct.ptr_info_def = type { i8, %struct.bitmap_head_def*, %struct.tree_node* }
-	%struct.real_value = type { i8, [3 x i8], [5 x i32] }
-	%struct.reg_attrs = type { %struct.tree_node*, i64 }
-	%struct.reg_info_def = type opaque
-	%struct.reorder_block_def = type { %struct.rtx_def*, %struct.rtx_def*, %struct.basic_block_def*, %struct.basic_block_def*, %struct.basic_block_def*, i32, i32, i32 }
-	%struct.rtunion = type { i32 }
-	%struct.rtvec_def = type { i32, [1 x %struct.rtx_def*] }
-	%struct.rtx_def = type { i16, i8, i8, %struct.u }
-	%struct.sequence_stack = type { %struct.rtx_def*, %struct.rtx_def*, %struct.sequence_stack* }
-	%struct.stmt_ann_d = type { %struct.tree_ann_common_d, i8, %struct.basic_block_def*, %struct.stmt_operands_d, %struct.dataflow_d*, %struct.bitmap_head_def*, i32 }
-	%struct.stmt_operands_d = type { %struct.def_optype_d*, %struct.def_optype_d*, %struct.v_may_def_optype_d*, %struct.vuse_optype_d*, %struct.v_may_def_optype_d* }
-	%struct.temp_slot = type opaque
-	%struct.text_info = type { i8*, i8**, i32 }
-	%struct.tree_ann_common_d = type { i32, i8*, %struct.tree_node* }
-	%struct.tree_ann_d = type { %struct.stmt_ann_d }
-	%struct.tree_binfo = type { %struct.tree_common, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.VEC_tree*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.VEC_tree }
-	%struct.tree_block = type { %struct.tree_common, i8, [3 x i8], %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node* }
-	%struct.tree_common = type { %struct.tree_node*, %struct.tree_node*, %struct.tree_ann_d*, i8, i8, i8, i8, i8 }
-	%struct.tree_complex = type { %struct.tree_common, %struct.tree_node*, %struct.tree_node* }
-	%struct.tree_decl = type { %struct.tree_common, %struct.location_t, i32, %struct.tree_node*, i8, i8, i8, i8, i8, i8, i8, i8, i32, %struct.tree_decl_u1, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.rtx_def*, i32, %struct.tree_decl_u2, %struct.tree_node*, %struct.tree_node*, i64, %struct.lang_decl* }
-	%struct.tree_decl_u1 = type { i64 }
-	%struct.tree_decl_u1_a = type { i32 }
-	%struct.tree_decl_u2 = type { %struct.function* }
-	%struct.tree_exp = type { %struct.tree_common, %struct.location_t*, i32, %struct.tree_node*, [1 x %struct.tree_node*] }
-	%struct.tree_identifier = type { %struct.tree_common, %struct.ht_identifier }
-	%struct.tree_int_cst = type { %struct.tree_common, %struct.tree_int_cst_lowhi }
-	%struct.tree_int_cst_lowhi = type { i64, i64 }
-	%struct.tree_list = type { %struct.tree_common, %struct.tree_node*, %struct.tree_node* }
-	%struct.tree_node = type { %struct.tree_decl }
-	%struct.tree_phi_node = type { %struct.tree_common, %struct.tree_node*, i32, i32, i32, %struct.basic_block_def*, %struct.dataflow_d*, [1 x %struct.phi_arg_d] }
-	%struct.tree_real_cst = type { %struct.tree_common, %struct.real_value* }
-	%struct.tree_ssa_name = type { %struct.tree_common, %struct.tree_node*, i32, %struct.ptr_info_def*, %struct.tree_node*, i8* }
-	%struct.tree_statement_list = type { %struct.tree_common, %struct.tree_statement_list_node*, %struct.tree_statement_list_node* }
-	%struct.tree_statement_list_node = type { %struct.tree_statement_list_node*, %struct.tree_statement_list_node*, %struct.tree_node* }
-	%struct.tree_stmt_iterator = type { %struct.tree_statement_list_node*, %struct.tree_node* }
-	%struct.tree_string = type { %struct.tree_common, i32, [1 x i8] }
-	%struct.tree_type = type { %struct.tree_common, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, i32, i16, i8, i8, i32, %struct.tree_node*, %struct.tree_node*, %struct.rtunion, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, i64, %struct.lang_type* }
-	%struct.tree_type_symtab = type { i32 }
-	%struct.tree_value_handle = type { %struct.tree_common, %struct.value_set*, i32 }
-	%struct.tree_vec = type { %struct.tree_common, i32, [1 x %struct.tree_node*] }
-	%struct.tree_vector = type { %struct.tree_common, %struct.tree_node* }
-	%struct.u = type { [1 x i64] }
-	%struct.use_operand_ptr = type { %struct.tree_node** }
-	%struct.use_optype_d = type { i32, [1 x %struct.def_operand_ptr] }
-	%struct.v_def_use_operand_type_t = type { %struct.tree_node*, %struct.tree_node* }
-	%struct.v_may_def_optype_d = type { i32, [1 x %struct.elt_t] }
-	%struct.v_must_def_optype_d = type { i32, [1 x %struct.elt_t] }
-	%struct.value_set = type opaque
-	%struct.var_ann_d = type { %struct.tree_ann_common_d, i8, i8, %struct.tree_node*, %struct.varray_head_tag*, i32, i32, i32, %struct.tree_node*, %struct.tree_node* }
-	%struct.var_refs_queue = type { %struct.rtx_def*, i32, i32, %struct.var_refs_queue* }
-	%struct.varasm_status = type opaque
-	%struct.varray_data = type { [1 x i64] }
-	%struct.varray_head_tag = type { i32, i32, i32, i8*, %struct.u }
-	%struct.vuse_optype_d = type { i32, [1 x %struct.tree_node*] }
-@gt_pch_rs_gt_gimplify_h = external global [2 x %struct.ggc_root_tab]		; <[2 x %struct.ggc_root_tab]*> [#uses=0]
-@tmp_var_id_num = external global i32		; <i32*> [#uses=0]
-@gt_ggc_r_gt_gimplify_h = external global [1 x %struct.ggc_root_tab]		; <[1 x %struct.ggc_root_tab]*> [#uses=0]
-@__FUNCTION__.19956 = external global [15 x i8]		; <[15 x i8]*> [#uses=0]
-@str = external global [42 x i8]		; <[42 x i8]*> [#uses=1]
-@__FUNCTION__.19974 = external global [22 x i8]		; <[22 x i8]*> [#uses=0]
-@gimplify_ctxp = external global %struct.gimplify_ctx*		; <%struct.gimplify_ctx**> [#uses=0]
-@cl_pf_opts = external global %struct.cl_perfunc_opts		; <%struct.cl_perfunc_opts*> [#uses=0]
-@__FUNCTION__.20030 = external global [22 x i8]		; <[22 x i8]*> [#uses=0]
-@__FUNCTION__.20099 = external global [24 x i8]		; <[24 x i8]*> [#uses=0]
-@global_trees = external global [47 x %struct.tree_node*]		; <[47 x %struct.tree_node*]*> [#uses=0]
-@tree_code_type = external global [0 x i32]		; <[0 x i32]*> [#uses=2]
-@current_function_decl = external global %struct.tree_node*		; <%struct.tree_node**> [#uses=0]
-@str1 = external global [2 x i8]		; <[2 x i8]*> [#uses=0]
-@str2 = external global [7 x i8]		; <[7 x i8]*> [#uses=0]
-@__FUNCTION__.20151 = external global [19 x i8]		; <[19 x i8]*> [#uses=0]
-@__FUNCTION__.20221 = external global [9 x i8]		; <[9 x i8]*> [#uses=0]
-@tree_code_length = external global [0 x i8]		; <[0 x i8]*> [#uses=0]
-@__FUNCTION__.20435 = external global [17 x i8]		; <[17 x i8]*> [#uses=0]
-@__FUNCTION__.20496 = external global [19 x i8]		; <[19 x i8]*> [#uses=0]
-@cfun = external global %struct.function*		; <%struct.function**> [#uses=0]
-@__FUNCTION__.20194 = external global [15 x i8]		; <[15 x i8]*> [#uses=0]
-@__FUNCTION__.19987 = external global [21 x i8]		; <[21 x i8]*> [#uses=0]
-@__FUNCTION__.20532 = external global [21 x i8]		; <[21 x i8]*> [#uses=0]
-@__FUNCTION__.20583 = external global [19 x i8]		; <[19 x i8]*> [#uses=0]
-@__FUNCTION__.20606 = external global [22 x i8]		; <[22 x i8]*> [#uses=0]
-@__FUNCTION__.20644 = external global [17 x i8]		; <[17 x i8]*> [#uses=0]
-@__FUNCTION__.20681 = external global [13 x i8]		; <[13 x i8]*> [#uses=0]
-@__FUNCTION__.20700 = external global [13 x i8]		; <[13 x i8]*> [#uses=0]
-@__FUNCTION__.21426 = external global [20 x i8]		; <[20 x i8]*> [#uses=0]
-@__FUNCTION__.21471 = external global [17 x i8]		; <[17 x i8]*> [#uses=0]
-@__FUNCTION__.21962 = external global [27 x i8]		; <[27 x i8]*> [#uses=0]
-@__FUNCTION__.22992 = external global [21 x i8]		; <[21 x i8]*> [#uses=0]
-@__FUNCTION__.23735 = external global [15 x i8]		; <[15 x i8]*> [#uses=0]
-@lang_hooks = external global %struct.lang_hooks		; <%struct.lang_hooks*> [#uses=0]
-@__FUNCTION__.27383 = external global [22 x i8]		; <[22 x i8]*> [#uses=0]
-@__FUNCTION__.20776 = external global [21 x i8]		; <[21 x i8]*> [#uses=0]
-@__FUNCTION__.10672 = external global [9 x i8]		; <[9 x i8]*> [#uses=0]
-@str3 = external global [47 x i8]		; <[47 x i8]*> [#uses=0]
-@str4 = external global [7 x i8]		; <[7 x i8]*> [#uses=0]
-@__FUNCTION__.20065 = external global [25 x i8]		; <[25 x i8]*> [#uses=0]
-@__FUNCTION__.23256 = external global [16 x i8]		; <[16 x i8]*> [#uses=0]
-@__FUNCTION__.23393 = external global [19 x i8]		; <[19 x i8]*> [#uses=0]
-@__FUNCTION__.20043 = external global [21 x i8]		; <[21 x i8]*> [#uses=0]
-@__FUNCTION__.20729 = external global [23 x i8]		; <[23 x i8]*> [#uses=0]
-@__FUNCTION__.20563 = external global [24 x i8]		; <[24 x i8]*> [#uses=0]
-@__FUNCTION__.10663 = external global [10 x i8]		; <[10 x i8]*> [#uses=0]
-@__FUNCTION__.20367 = external global [21 x i8]		; <[21 x i8]*> [#uses=0]
-@__FUNCTION__.20342 = external global [15 x i8]		; <[15 x i8]*> [#uses=0]
-@input_location = external global %struct.location_t		; <%struct.location_t*> [#uses=0]
-@__FUNCTION__.24510 = external global [27 x i8]		; <[27 x i8]*> [#uses=0]
-@__FUNCTION__.25097 = external global [25 x i8]		; <[25 x i8]*> [#uses=0]
-@__FUNCTION__.24705 = external global [26 x i8]		; <[26 x i8]*> [#uses=0]
-@str5 = external global [2 x i8]		; <[2 x i8]*> [#uses=0]
-@__FUNCTION__.25136 = external global [21 x i8]		; <[21 x i8]*> [#uses=0]
-@__FUNCTION__.24450 = external global [31 x i8]		; <[31 x i8]*> [#uses=0]
-@implicit_built_in_decls = external global [471 x %struct.tree_node*]		; <[471 x %struct.tree_node*]*> [#uses=0]
-@__FUNCTION__.24398 = external global [31 x i8]		; <[31 x i8]*> [#uses=0]
-@__FUNCTION__.26156 = external global [14 x i8]		; <[14 x i8]*> [#uses=1]
-@unknown_location = external global %struct.location_t		; <%struct.location_t*> [#uses=0]
-@__FUNCTION__.23038 = external global [19 x i8]		; <[19 x i8]*> [#uses=0]
-@str6 = external global [43 x i8]		; <[43 x i8]*> [#uses=0]
-@__FUNCTION__.25476 = external global [19 x i8]		; <[19 x i8]*> [#uses=0]
-@__FUNCTION__.22136 = external global [20 x i8]		; <[20 x i8]*> [#uses=1]
-@__FUNCTION__.21997 = external global [23 x i8]		; <[23 x i8]*> [#uses=0]
-@__FUNCTION__.21247 = external global [19 x i8]		; <[19 x i8]*> [#uses=0]
-@built_in_decls = external global [471 x %struct.tree_node*]		; <[471 x %struct.tree_node*]*> [#uses=0]
-@__FUNCTION__.21924 = external global [19 x i8]		; <[19 x i8]*> [#uses=0]
-@__FUNCTION__.21861 = external global [25 x i8]		; <[25 x i8]*> [#uses=0]
-@global_dc = external global %struct.diagnostic_context*		; <%struct.diagnostic_context**> [#uses=0]
-@__FUNCTION__.25246 = external global [32 x i8]		; <[32 x i8]*> [#uses=0]
-@str7 = external global [4 x i8]		; <[4 x i8]*> [#uses=0]
-@stderr = external global %struct.FILE*		; <%struct.FILE**> [#uses=0]
-@str8 = external global [24 x i8]		; <[24 x i8]*> [#uses=0]
-@str9 = external global [22 x i8]		; <[22 x i8]*> [#uses=0]
-@__FUNCTION__.27653 = external global [21 x i8]		; <[21 x i8]*> [#uses=0]
-@__FUNCTION__.27322 = external global [21 x i8]		; <[21 x i8]*> [#uses=0]
-@__FUNCTION__.27139 = external global [20 x i8]		; <[20 x i8]*> [#uses=0]
-@__FUNCTION__.22462 = external global [23 x i8]		; <[23 x i8]*> [#uses=0]
-@str10 = external global [6 x i8]		; <[6 x i8]*> [#uses=0]
-@__FUNCTION__.25389 = external global [19 x i8]		; <[19 x i8]*> [#uses=0]
-@__FUNCTION__.25650 = external global [18 x i8]		; <[18 x i8]*> [#uses=0]
-@str11 = external global [32 x i8]		; <[32 x i8]*> [#uses=0]
-@str12 = external global [3 x i8]		; <[3 x i8]*> [#uses=0]
-@str13 = external global [44 x i8]		; <[44 x i8]*> [#uses=0]
-@__FUNCTION__.27444 = external global [14 x i8]		; <[14 x i8]*> [#uses=0]
-@timevar_enable = external global i8		; <i8*> [#uses=0]
-@__FUNCTION__.27533 = external global [23 x i8]		; <[23 x i8]*> [#uses=0]
-@flag_instrument_function_entry_exit = external global i32		; <i32*> [#uses=0]
-@__FUNCTION__.25331 = external global [23 x i8]		; <[23 x i8]*> [#uses=0]
-@__FUNCTION__.20965 = external global [19 x i8]		; <[19 x i8]*> [#uses=0]
-@str14 = external global [12 x i8]		; <[12 x i8]*> [#uses=0]
-@__FUNCTION__.26053 = external global [21 x i8]		; <[21 x i8]*> [#uses=0]
-@__FUNCTION__.26004 = external global [20 x i8]		; <[20 x i8]*> [#uses=0]
-@str15 = external global [8 x i8]		; <[8 x i8]*> [#uses=0]
-@__FUNCTION__.21584 = external global [21 x i8]		; <[21 x i8]*> [#uses=0]
-@str16 = external global [12 x i8]		; <[12 x i8]*> [#uses=0]
-@__FUNCTION__.25903 = external global [28 x i8]		; <[28 x i8]*> [#uses=0]
-@__FUNCTION__.22930 = external global [23 x i8]		; <[23 x i8]*> [#uses=0]
-@__FUNCTION__.23832 = external global [19 x i8]		; <[19 x i8]*> [#uses=0]
-@str17 = external global [6 x i8]		; <[6 x i8]*> [#uses=0]
-@__FUNCTION__.24620 = external global [24 x i8]		; <[24 x i8]*> [#uses=0]
-@__FUNCTION__.24582 = external global [30 x i8]		; <[30 x i8]*> [#uses=0]
-@__FUNCTION__.21382 = external global [19 x i8]		; <[19 x i8]*> [#uses=0]
-@__FUNCTION__.21117 = external global [21 x i8]		; <[21 x i8]*> [#uses=0]
-
-
-declare void @push_gimplify_context()
-
-declare i32 @gimple_tree_hash(i8*)
-
-declare i32 @iterative_hash_expr(%struct.tree_node*, i32)
-
-declare i32 @gimple_tree_eq(i8*, i8*)
-
-declare i32 @operand_equal_p(%struct.tree_node*, %struct.tree_node*, i32)
-
-declare void @fancy_abort(i8*, i32, i8*)
-
-declare i8* @xcalloc(i32, i32)
-
-declare %struct.htab* @htab_create(i32, i32 (i8*)*, i32 (i8*, i8*)*, void (i8*)*)
-
-declare void @free(i8*)
-
-declare void @gimple_push_bind_expr(%struct.tree_node*)
-
-declare void @gimple_pop_bind_expr()
-
-declare %struct.tree_node* @gimple_current_bind_expr()
-
-declare fastcc void @gimple_push_condition()
-
-declare %struct.tree_node* @create_artificial_label()
-
-declare %struct.tree_node* @build_decl_stat(i32, %struct.tree_node*, %struct.tree_node*)
-
-declare void @tree_class_check_failed(%struct.tree_node*, i32, i8*, i32, i8*)
-
-declare %struct.tree_node* @create_tmp_var_name(i8*)
-
-declare i32 @strlen(i8*)
-
-declare void @llvm.memcpy.i32(i8*, i8*, i32, i32)
-
-declare i32 @sprintf(i8*, i8*, ...)
-
-declare %struct.tree_node* @get_identifier(i8*)
-
-declare %struct.tree_node* @create_tmp_var_raw(%struct.tree_node*, i8*)
-
-declare %struct.tree_node* @build_qualified_type(%struct.tree_node*, i32)
-
-declare i8* @get_name(%struct.tree_node*)
-
-declare void @tree_operand_check_failed(i32, i32, i8*, i32, i8*)
-
-declare void @tree_check_failed(%struct.tree_node*, i8*, i32, i8*, ...)
-
-declare void @declare_tmp_vars(%struct.tree_node*, %struct.tree_node*)
-
-declare %struct.tree_node* @nreverse(%struct.tree_node*)
-
-declare void @gimple_add_tmp_var(%struct.tree_node*)
-
-declare void @record_vars(%struct.tree_node*)
-
-declare %struct.tree_node* @create_tmp_var(%struct.tree_node*, i8*)
-
-declare void @pop_gimplify_context(%struct.tree_node*)
-
-declare void @htab_delete(%struct.htab*)
-
-declare fastcc void @annotate_one_with_locus(%struct.tree_node*, i32, i32)
-
-declare void @annotate_with_locus(%struct.tree_node*, i32, i32)
-
-declare %struct.tree_node* @mostly_copy_tree_r(%struct.tree_node**, i32*, i8*)
-
-declare %struct.tree_node* @copy_tree_r(%struct.tree_node**, i32*, i8*)
-
-declare %struct.tree_node* @mark_decls_volatile_r(%struct.tree_node**, i32*, i8*)
-
-declare %struct.tree_node* @copy_if_shared_r(%struct.tree_node**, i32*, i8*)
-
-declare %struct.tree_node* @walk_tree(%struct.tree_node**, %struct.tree_node* (%struct.tree_node**, i32*, i8*)*, i8*, %struct.pointer_set_t*)
-
-declare %struct.tree_node* @unmark_visited_r(%struct.tree_node**, i32*, i8*)
-
-declare fastcc void @unshare_body(%struct.tree_node**, %struct.tree_node*)
-
-declare %struct.cgraph_node* @cgraph_node(%struct.tree_node*)
-
-declare fastcc void @unvisit_body(%struct.tree_node**, %struct.tree_node*)
-
-declare void @unshare_all_trees(%struct.tree_node*)
-
-declare %struct.tree_node* @unshare_expr(%struct.tree_node*)
-
-declare %struct.tree_node* @build_and_jump(%struct.tree_node**)
-
-declare %struct.tree_node* @build1_stat(i32, %struct.tree_node*, %struct.tree_node*)
-
-declare i32 @compare_case_labels(i8*, i8*)
-
-declare i32 @tree_int_cst_compare(%struct.tree_node*, %struct.tree_node*)
-
-declare void @sort_case_labels(%struct.tree_node*)
-
-declare void @tree_vec_elt_check_failed(i32, i32, i8*, i32, i8*)
-
-declare void @qsort(i8*, i32, i32, i32 (i8*, i8*)*)
-
-declare %struct.tree_node* @force_labels_r(%struct.tree_node**, i32*, i8*)
-
-declare fastcc void @canonicalize_component_ref(%struct.tree_node**)
-
-declare %struct.tree_node* @get_unwidened(%struct.tree_node*, %struct.tree_node*)
-
-declare fastcc void @maybe_with_size_expr(%struct.tree_node**)
-
-declare %struct.tree_node* @substitute_placeholder_in_expr(%struct.tree_node*, %struct.tree_node*)
-
-declare %struct.tree_node* @build2_stat(i32, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*)
-
-declare fastcc %struct.tree_node* @gimple_boolify(%struct.tree_node*)
-
-declare %struct.tree_node* @convert(%struct.tree_node*, %struct.tree_node*)
-
-declare %struct.tree_node* @gimplify_init_ctor_preeval_1(%struct.tree_node**, i32*, i8*)
-
-declare i64 @get_alias_set(%struct.tree_node*)
-
-declare i32 @alias_sets_conflict_p(i64, i64)
-
-declare fastcc i8 @cpt_same_type(%struct.tree_node*, %struct.tree_node*) zeroext
-
-declare %struct.tree_node* @check_pointer_types_r(%struct.tree_node**, i32*, i8*)
-
-declare %struct.tree_node* @voidify_wrapper_expr(%struct.tree_node*, %struct.tree_node*)
-
-declare i32 @integer_zerop(%struct.tree_node*)
-
-declare fastcc void @append_to_statement_list_1(%struct.tree_node*, %struct.tree_node**)
-
-declare %struct.tree_node* @alloc_stmt_list()
-
-declare void @tsi_link_after(%struct.tree_stmt_iterator*, %struct.tree_node*, i32)
-
-declare void @append_to_statement_list_force(%struct.tree_node*, %struct.tree_node**)
-
-declare void @append_to_statement_list(%struct.tree_node*, %struct.tree_node**)
-
-declare fastcc %struct.tree_node* @shortcut_cond_r(%struct.tree_node*, %struct.tree_node**, %struct.tree_node**)
-
-declare %struct.tree_node* @build3_stat(i32, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*)
-
-declare fastcc %struct.tree_node* @shortcut_cond_expr(%struct.tree_node*)
-
-declare %struct.tree_node* @expr_last(%struct.tree_node*)
-
-declare i8 @block_may_fallthru(%struct.tree_node*) zeroext 
-
-declare fastcc void @gimple_pop_condition(%struct.tree_node**)
-
-declare %struct.tree_node* @gimple_build_eh_filter(%struct.tree_node*, %struct.tree_node*, %struct.tree_node*)
-
-declare void @annotate_all_with_locus(%struct.tree_node**, i32, i32)
-
-declare fastcc %struct.tree_node* @internal_get_tmp_var(%struct.tree_node*, %struct.tree_node**, %struct.tree_node**, i8 zeroext )
-
-define i32 @gimplify_expr(%struct.tree_node** %expr_p, %struct.tree_node** %pre_p, %struct.tree_node** %post_p, i8 (%struct.tree_node*) zeroext * %gimple_test_f, i32 %fallback) {
-entry:
-	%internal_post = alloca %struct.tree_node*, align 4		; <%struct.tree_node**> [#uses=2]
-	%pre_p_addr.0 = select i1 false, %struct.tree_node** null, %struct.tree_node** %pre_p		; <%struct.tree_node**> [#uses=7]
-	%post_p_addr.0 = select i1 false, %struct.tree_node** %internal_post, %struct.tree_node** %post_p		; <%struct.tree_node**> [#uses=7]
-	br i1 false, label %bb277, label %bb191
-
-bb191:		; preds = %entry
-	ret i32 0
-
-bb277:		; preds = %entry
-	%tmp283 = call i32 null( %struct.tree_node** %expr_p, %struct.tree_node** %pre_p_addr.0, %struct.tree_node** %post_p_addr.0 )		; <i32> [#uses=1]
-	switch i32 %tmp283, label %bb7478 [
-		 i32 0, label %cond_next289
-		 i32 -1, label %cond_next298
-	]
-
-cond_next289:		; preds = %bb277
-	ret i32 0
-
-cond_next298:		; preds = %bb277
-	switch i32 0, label %bb7444 [
-		 i32 24, label %bb7463
-		 i32 25, label %bb7463
-		 i32 26, label %bb7463
-		 i32 27, label %bb7463
-		 i32 28, label %bb7463
-		 i32 33, label %bb4503
-		 i32 39, label %bb397
-		 i32 40, label %bb5650
-		 i32 41, label %bb4339
-		 i32 42, label %bb4350
-		 i32 43, label %bb4350
-		 i32 44, label %bb319
-		 i32 45, label %bb397
-		 i32 46, label %bb6124
-		 i32 47, label %bb7463
-		 i32 49, label %bb5524
-		 i32 50, label %bb1283
-		 i32 51, label %bb1289
-		 i32 52, label %bb1289
-		 i32 53, label %bb5969
-		 i32 54, label %bb408
-		 i32 56, label %bb5079
-		 i32 57, label %bb428
-		 i32 59, label %bb5965
-		 i32 74, label %bb4275
-		 i32 75, label %bb4275
-		 i32 76, label %bb4275
-		 i32 77, label %bb4275
-		 i32 91, label %bb1296
-		 i32 92, label %bb1296
-		 i32 96, label %bb1322
-		 i32 112, label %bb2548
-		 i32 113, label %bb2548
-		 i32 115, label %bb397
-		 i32 116, label %bb5645
-		 i32 117, label %bb1504
-		 i32 121, label %bb397
-		 i32 122, label %bb397
-		 i32 123, label %bb313
-		 i32 124, label %bb313
-		 i32 125, label %bb313
-		 i32 126, label %bb313
-		 i32 127, label %bb2141
-		 i32 128, label %cond_next5873
-		 i32 129, label %cond_next5873
-		 i32 130, label %bb4536
-		 i32 131, label %bb5300
-		 i32 132, label %bb5170
-		 i32 133, label %bb5519
-		 i32 134, label %bb5091
-		 i32 135, label %bb5083
-		 i32 136, label %bb5087
-		 i32 137, label %bb5382
-		 i32 139, label %bb7463
-		 i32 140, label %bb7463
-		 i32 142, label %bb5974
-		 i32 143, label %bb6049
-		 i32 147, label %bb6296
-		 i32 151, label %cond_next6474
-	]
-
-bb313:		; preds = %cond_next298, %cond_next298, %cond_next298, %cond_next298
-	ret i32 0
-
-bb319:		; preds = %cond_next298
-	ret i32 0
-
-bb397:		; preds = %cond_next298, %cond_next298, %cond_next298, %cond_next298, %cond_next298
-	ret i32 0
-
-bb408:		; preds = %cond_next298
-	%tmp413 = call fastcc i32 @gimplify_cond_expr( %struct.tree_node** %expr_p, %struct.tree_node** %pre_p_addr.0, %struct.tree_node** %post_p_addr.0, %struct.tree_node* null, i32 %fallback )		; <i32> [#uses=0]
-	ret i32 0
-
-bb428:		; preds = %cond_next298
-	ret i32 0
-
-bb1283:		; preds = %cond_next298
-	ret i32 0
-
-bb1289:		; preds = %cond_next298, %cond_next298
-	ret i32 0
-
-bb1296:		; preds = %cond_next298, %cond_next298
-	ret i32 0
-
-bb1322:		; preds = %cond_next298
-	ret i32 0
-
-bb1504:		; preds = %cond_next298
-	ret i32 0
-
-bb2141:		; preds = %cond_next298
-	ret i32 0
-
-bb2548:		; preds = %cond_next298, %cond_next298
-	%tmp2554 = load %struct.tree_node** %expr_p		; <%struct.tree_node*> [#uses=2]
-	%tmp2562 = and i32 0, 255		; <i32> [#uses=1]
-	%tmp2569 = add i8 0, -4		; <i8> [#uses=1]
-	icmp ugt i8 %tmp2569, 5		; <i1>:0 [#uses=2]
-	%tmp2587 = load i8* null		; <i8> [#uses=1]
-	icmp eq i8 %tmp2587, 0		; <i1>:1 [#uses=2]
-	%tmp2607 = load %struct.tree_node** null		; <%struct.tree_node*> [#uses=2]
-	br i1 false, label %bb2754, label %cond_next2617
-
-cond_next2617:		; preds = %bb2548
-	ret i32 0
-
-bb2754:		; preds = %bb2548
-	br i1 %0, label %cond_true2780, label %cond_next2783
-
-cond_true2780:		; preds = %bb2754
-	call void @tree_class_check_failed( %struct.tree_node* %tmp2554, i32 9, i8* getelementptr ([42 x i8]* @str, i32 0, i32 0), i32 1415, i8* getelementptr ([20 x i8]* @__FUNCTION__.22136, i32 0, i32 0) )
-	unreachable
-
-cond_next2783:		; preds = %bb2754
-	%tmp2825 = and i32 0, 255		; <i32> [#uses=1]
-	%tmp2829 = load i32* null		; <i32> [#uses=1]
-	%tmp28292830 = trunc i32 %tmp2829 to i8		; <i8> [#uses=1]
-	%tmp2832 = add i8 %tmp28292830, -4		; <i8> [#uses=1]
-	icmp ugt i8 %tmp2832, 5		; <i1>:2 [#uses=1]
-	icmp eq i8 0, 0		; <i1>:3 [#uses=1]
-	%tmp28652866 = bitcast %struct.tree_node* %tmp2607 to %struct.tree_exp*		; <%struct.tree_exp*> [#uses=1]
-	%tmp2868 = getelementptr %struct.tree_exp* %tmp28652866, i32 0, i32 4, i32 0		; <%struct.tree_node**> [#uses=1]
-	%tmp2870 = load %struct.tree_node** %tmp2868		; <%struct.tree_node*> [#uses=1]
-	br i1 %1, label %cond_true2915, label %cond_next2927
-
-cond_true2915:		; preds = %cond_next2783
-	unreachable
-
-cond_next2927:		; preds = %cond_next2783
-	%tmp2938 = load %struct.tree_node** null		; <%struct.tree_node*> [#uses=1]
-	%tmp2944 = load i32* null		; <i32> [#uses=1]
-	%tmp2946 = and i32 %tmp2944, 255		; <i32> [#uses=1]
-	%tmp2949 = getelementptr [0 x i32]* @tree_code_type, i32 0, i32 %tmp2946		; <i32*> [#uses=1]
-	%tmp2950 = load i32* %tmp2949		; <i32> [#uses=1]
-	icmp eq i32 %tmp2950, 2		; <i1>:4 [#uses=1]
-	br i1 %4, label %cond_next2954, label %cond_true2951
-
-cond_true2951:		; preds = %cond_next2927
-	call void @tree_class_check_failed( %struct.tree_node* %tmp2938, i32 2, i8* getelementptr ([42 x i8]* @str, i32 0, i32 0), i32 1415, i8* getelementptr ([20 x i8]* @__FUNCTION__.22136, i32 0, i32 0) )
-	unreachable
-
-cond_next2954:		; preds = %cond_next2927
-	br i1 %0, label %cond_true2991, label %cond_next2994
-
-cond_true2991:		; preds = %cond_next2954
-	unreachable
-
-cond_next2994:		; preds = %cond_next2954
-	br i1 %1, label %cond_true3009, label %cond_next3021
-
-cond_true3009:		; preds = %cond_next2994
-	call void @tree_operand_check_failed( i32 0, i32 %tmp2562, i8* getelementptr ([42 x i8]* @str, i32 0, i32 0), i32 1415, i8* getelementptr ([20 x i8]* @__FUNCTION__.22136, i32 0, i32 0) )
-	unreachable
-
-cond_next3021:		; preds = %cond_next2994
-	br i1 %2, label %cond_true3044, label %cond_next3047
-
-cond_true3044:		; preds = %cond_next3021
-	call void @tree_class_check_failed( %struct.tree_node* %tmp2607, i32 9, i8* getelementptr ([42 x i8]* @str, i32 0, i32 0), i32 1415, i8* getelementptr ([20 x i8]* @__FUNCTION__.22136, i32 0, i32 0) )
-	unreachable
-
-cond_next3047:		; preds = %cond_next3021
-	br i1 %3, label %cond_true3062, label %cond_next3074
-
-cond_true3062:		; preds = %cond_next3047
-	call void @tree_operand_check_failed( i32 0, i32 %tmp2825, i8* getelementptr ([42 x i8]* @str, i32 0, i32 0), i32 1415, i8* getelementptr ([20 x i8]* @__FUNCTION__.22136, i32 0, i32 0) )
-	unreachable
-
-cond_next3074:		; preds = %cond_next3047
-	%tmp3084 = getelementptr %struct.tree_node* %tmp2870, i32 0, i32 0, i32 0, i32 1		; <%struct.tree_node**> [#uses=1]
-	%tmp3085 = load %struct.tree_node** %tmp3084		; <%struct.tree_node*> [#uses=1]
-	%tmp31043105 = bitcast %struct.tree_node* %tmp3085 to %struct.tree_type*		; <%struct.tree_type*> [#uses=1]
-	%tmp3106 = getelementptr %struct.tree_type* %tmp31043105, i32 0, i32 6		; <i16*> [#uses=1]
-	%tmp31063107 = bitcast i16* %tmp3106 to i32*		; <i32*> [#uses=1]
-	%tmp3108 = load i32* %tmp31063107		; <i32> [#uses=1]
-	xor i32 %tmp3108, 0		; <i32>:5 [#uses=1]
-	%tmp81008368 = and i32 %5, 65024		; <i32> [#uses=1]
-	icmp eq i32 %tmp81008368, 0		; <i1>:6 [#uses=1]
-	br i1 %6, label %cond_next3113, label %bb3351
-
-cond_next3113:		; preds = %cond_next3074
-	ret i32 0
-
-bb3351:		; preds = %cond_next3074
-	%tmp3354 = call i8 @tree_ssa_useless_type_conversion( %struct.tree_node* %tmp2554 ) zeroext 		; <i8> [#uses=1]
-	icmp eq i8 %tmp3354, 0		; <i1>:7 [#uses=1]
-	%tmp3424 = load i32* null		; <i32> [#uses=1]
-	br i1 %7, label %cond_next3417, label %cond_true3356
-
-cond_true3356:		; preds = %bb3351
-	ret i32 0
-
-cond_next3417:		; preds = %bb3351
-	br i1 false, label %cond_true3429, label %cond_next4266
-
-cond_true3429:		; preds = %cond_next3417
-	%tmp3443 = and i32 %tmp3424, 255		; <i32> [#uses=0]
-	ret i32 0
-
-cond_next4266:		; preds = %cond_next3417
-	%tmp4268 = load %struct.tree_node** %expr_p		; <%struct.tree_node*> [#uses=1]
-	icmp eq %struct.tree_node* %tmp4268, null		; <i1>:8 [#uses=1]
-	br i1 %8, label %bb4275, label %bb7463
-
-bb4275:		; preds = %cond_next4266, %cond_next298, %cond_next298, %cond_next298, %cond_next298
-	%tmp4289 = and i32 0, 255		; <i32> [#uses=2]
-	%tmp4292 = getelementptr [0 x i32]* @tree_code_type, i32 0, i32 %tmp4289		; <i32*> [#uses=1]
-	%tmp4293 = load i32* %tmp4292		; <i32> [#uses=1]
-	%tmp42934294 = trunc i32 %tmp4293 to i8		; <i8> [#uses=1]
-	%tmp4296 = add i8 %tmp42934294, -4		; <i8> [#uses=1]
-	icmp ugt i8 %tmp4296, 5		; <i1>:9 [#uses=1]
-	br i1 %9, label %cond_true4297, label %cond_next4300
-
-cond_true4297:		; preds = %bb4275
-	unreachable
-
-cond_next4300:		; preds = %bb4275
-	%tmp4314 = load i8* null		; <i8> [#uses=1]
-	icmp eq i8 %tmp4314, 0		; <i1>:10 [#uses=1]
-	br i1 %10, label %cond_true4315, label %cond_next4327
-
-cond_true4315:		; preds = %cond_next4300
-	call void @tree_operand_check_failed( i32 0, i32 %tmp4289, i8* getelementptr ([42 x i8]* @str, i32 0, i32 0), i32 3997, i8* getelementptr ([14 x i8]* @__FUNCTION__.26156, i32 0, i32 0) )
-	unreachable
-
-cond_next4327:		; preds = %cond_next4300
-	%tmp4336 = call i32 @gimplify_expr( %struct.tree_node** null, %struct.tree_node** %pre_p_addr.0, %struct.tree_node** %post_p_addr.0, i8 (%struct.tree_node*) zeroext * @is_gimple_val, i32 1 )		; <i32> [#uses=0]
-	ret i32 0
-
-bb4339:		; preds = %cond_next298
-	ret i32 0
-
-bb4350:		; preds = %cond_next298, %cond_next298
-	ret i32 0
-
-bb4503:		; preds = %cond_next298
-	ret i32 0
-
-bb4536:		; preds = %cond_next298
-	ret i32 0
-
-bb5079:		; preds = %cond_next298
-	ret i32 0
-
-bb5083:		; preds = %cond_next298
-	ret i32 0
-
-bb5087:		; preds = %cond_next298
-	ret i32 0
-
-bb5091:		; preds = %cond_next298
-	ret i32 0
-
-bb5170:		; preds = %cond_next298
-	ret i32 0
-
-bb5300:		; preds = %cond_next298
-	ret i32 0
-
-bb5382:		; preds = %cond_next298
-	ret i32 0
-
-bb5519:		; preds = %cond_next298
-	ret i32 0
-
-bb5524:		; preds = %cond_next298
-	ret i32 0
-
-bb5645:		; preds = %cond_next298
-	ret i32 0
-
-bb5650:		; preds = %cond_next298
-	ret i32 0
-
-cond_next5873:		; preds = %cond_next298, %cond_next298
-	ret i32 0
-
-bb5965:		; preds = %cond_next298
-	%tmp5968 = call fastcc i32 @gimplify_cleanup_point_expr( %struct.tree_node** %expr_p, %struct.tree_node** %pre_p_addr.0 )		; <i32> [#uses=0]
-	ret i32 0
-
-bb5969:		; preds = %cond_next298
-	%tmp5973 = call fastcc i32 @gimplify_target_expr( %struct.tree_node** %expr_p, %struct.tree_node** %pre_p_addr.0, %struct.tree_node** %post_p_addr.0 )		; <i32> [#uses=0]
-	ret i32 0
-
-bb5974:		; preds = %cond_next298
-	ret i32 0
-
-bb6049:		; preds = %cond_next298
-	ret i32 0
-
-bb6124:		; preds = %cond_next298
-	ret i32 0
-
-bb6296:		; preds = %cond_next298
-	ret i32 0
-
-cond_next6474:		; preds = %cond_next298
-	icmp eq %struct.tree_node** %internal_post, %post_p_addr.0		; <i1>:11 [#uses=1]
-	%iftmp.381.0 = select i1 %11, %struct.tree_node** null, %struct.tree_node** %post_p_addr.0		; <%struct.tree_node**> [#uses=1]
-	%tmp6490 = call i32 @gimplify_expr( %struct.tree_node** null, %struct.tree_node** %pre_p_addr.0, %struct.tree_node** %iftmp.381.0, i8 (%struct.tree_node*) zeroext * %gimple_test_f, i32 %fallback )		; <i32> [#uses=0]
-	%tmp6551 = call i32 @gimplify_expr( %struct.tree_node** null, %struct.tree_node** %pre_p_addr.0, %struct.tree_node** %post_p_addr.0, i8 (%struct.tree_node*) zeroext * @is_gimple_val, i32 1 )		; <i32> [#uses=0]
-	ret i32 0
-
-bb7444:		; preds = %cond_next298
-	ret i32 0
-
-bb7463:		; preds = %cond_next4266, %cond_next298, %cond_next298, %cond_next298, %cond_next298, %cond_next298, %cond_next298, %cond_next298, %cond_next298
-	ret i32 0
-
-bb7478:		; preds = %bb277
-	ret i32 0
-}
-
-declare i8 @is_gimple_formal_tmp_rhs(%struct.tree_node*) zeroext 
-
-declare void @gimplify_and_add(%struct.tree_node*, %struct.tree_node**)
-
-declare %struct.tree_node* @get_initialized_tmp_var(%struct.tree_node*, %struct.tree_node**, %struct.tree_node**)
-
-declare %struct.tree_node* @get_formal_tmp_var(%struct.tree_node*, %struct.tree_node**)
-
-declare fastcc void @gimplify_init_ctor_preeval(%struct.tree_node**, %struct.tree_node**, %struct.tree_node**, %struct.gimplify_init_ctor_preeval_data*)
-
-declare i8 @type_contains_placeholder_p(%struct.tree_node*) zeroext 
-
-declare i8 @is_gimple_mem_rhs(%struct.tree_node*) zeroext 
-
-declare fastcc i32 @gimplify_modify_expr_rhs(%struct.tree_node**, %struct.tree_node**, %struct.tree_node**, %struct.tree_node**, %struct.tree_node**, i8 zeroext )
-
-declare %struct.tree_node* @fold_indirect_ref(%struct.tree_node*)
-
-declare fastcc i32 @gimplify_compound_expr(%struct.tree_node**, %struct.tree_node**, i8 zeroext )
-
-declare i8 @is_gimple_lvalue(%struct.tree_node*) zeroext 
-
-declare void @categorize_ctor_elements(%struct.tree_node*, i64*, i64*, i64*, i8*)
-
-declare void @lhd_set_decl_assembler_name(%struct.tree_node*)
-
-declare i64 @int_size_in_bytes(%struct.tree_node*)
-
-declare i32 @can_move_by_pieces(i64, i32)
-
-declare i64 @count_type_elements(%struct.tree_node*)
-
-declare void @gimplify_stmt(%struct.tree_node**)
-
-declare %struct.tree_node* @get_base_address(%struct.tree_node*)
-
-declare fastcc void @gimplify_init_ctor_eval(%struct.tree_node*, %struct.tree_node*, %struct.tree_node**, i8 zeroext )
-
-declare %struct.tree_node* @build_complex(%struct.tree_node*, %struct.tree_node*, %struct.tree_node*)
-
-declare i8 (%struct.tree_node*) zeroext * @rhs_predicate_for(%struct.tree_node*)
-
-declare %struct.tree_node* @build_vector(%struct.tree_node*, %struct.tree_node*)
-
-declare i8 @is_gimple_val(%struct.tree_node*) zeroext 
-
-declare i8 @is_gimple_reg_type(%struct.tree_node*) zeroext 
-
-declare fastcc i32 @gimplify_cond_expr(%struct.tree_node**, %struct.tree_node**, %struct.tree_node**, %struct.tree_node*, i32)
-
-declare fastcc i32 @gimplify_modify_expr(%struct.tree_node**, %struct.tree_node**, %struct.tree_node**, i8 zeroext )
-
-declare %struct.tree_node* @tree_cons_stat(%struct.tree_node*, %struct.tree_node*, %struct.tree_node*)
-
-declare %struct.tree_node* @build_fold_addr_expr(%struct.tree_node*)
-
-declare %struct.tree_node* @build_function_call_expr(%struct.tree_node*, %struct.tree_node*)
-
-declare i8 @is_gimple_addressable(%struct.tree_node*) zeroext 
-
-declare i8 @is_gimple_reg(%struct.tree_node*) zeroext 
-
-declare %struct.tree_node* @make_ssa_name(%struct.tree_node*, %struct.tree_node*)
-
-declare i8 @tree_ssa_useless_type_conversion(%struct.tree_node*) zeroext 
-
-declare fastcc i32 @gimplify_self_mod_expr(%struct.tree_node**, %struct.tree_node**, %struct.tree_node**, i8 zeroext )
-
-declare fastcc i32 @gimplify_compound_lval(%struct.tree_node**, %struct.tree_node**, %struct.tree_node**, i32)
-
-declare %struct.tree_node* @get_callee_fndecl(%struct.tree_node*)
-
-declare %struct.tree_node* @fold_builtin(%struct.tree_node*, i8 zeroext )
-
-declare void @error(i8*, ...)
-
-declare %struct.tree_node* @build_empty_stmt()
-
-declare i8 @fold_builtin_next_arg(%struct.tree_node*) zeroext 
-
-declare fastcc i32 @gimplify_arg(%struct.tree_node**, %struct.tree_node**)
-
-declare i8 @is_gimple_call_addr(%struct.tree_node*) zeroext 
-
-declare i32 @call_expr_flags(%struct.tree_node*)
-
-declare void @recalculate_side_effects(%struct.tree_node*)
-
-declare %struct.tree_node* @fold_convert(%struct.tree_node*, %struct.tree_node*)
-
-declare void @recompute_tree_invarant_for_addr_expr(%struct.tree_node*)
-
-declare i32 @gimplify_va_arg_expr(%struct.tree_node**, %struct.tree_node**, %struct.tree_node**)
-
-declare %struct.tree_node* @size_int_kind(i64, i32)
-
-declare %struct.tree_node* @size_binop(i32, %struct.tree_node*, %struct.tree_node*)
-
-declare %struct.tree_node* @build4_stat(i32, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*)
-
-declare void @gimplify_type_sizes(%struct.tree_node*, %struct.tree_node**)
-
-declare void @gimplify_one_sizepos(%struct.tree_node**, %struct.tree_node**)
-
-declare %struct.tree_node* @build_pointer_type(%struct.tree_node*)
-
-declare %struct.tree_node* @build_fold_indirect_ref(%struct.tree_node*)
-
-declare fastcc i32 @gimplify_bind_expr(%struct.tree_node**, %struct.tree_node*, %struct.tree_node**)
-
-declare fastcc void @gimplify_loop_expr(%struct.tree_node**, %struct.tree_node**)
-
-declare fastcc i32 @gimplify_switch_expr(%struct.tree_node**, %struct.tree_node**)
-
-declare %struct.tree_node* @decl_function_context(%struct.tree_node*)
-
-declare %struct.varray_head_tag* @varray_grow(%struct.varray_head_tag*, i32)
-
-declare fastcc void @gimplify_return_expr(%struct.tree_node*, %struct.tree_node**)
-
-declare fastcc i32 @gimplify_save_expr(%struct.tree_node**, %struct.tree_node**, %struct.tree_node**)
-
-declare fastcc i32 @gimplify_asm_expr(%struct.tree_node**, %struct.tree_node**, %struct.tree_node**)
-
-declare void @gimplify_to_stmt_list(%struct.tree_node**)
-
-declare fastcc i32 @gimplify_cleanup_point_expr(%struct.tree_node**, %struct.tree_node**)
-
-declare fastcc i32 @gimplify_target_expr(%struct.tree_node**, %struct.tree_node**, %struct.tree_node**)
-
-declare void @tsi_delink(%struct.tree_stmt_iterator*)
-
-declare void @tsi_link_before(%struct.tree_stmt_iterator*, %struct.tree_node*, i32)
-
-declare i8 @is_gimple_stmt(%struct.tree_node*) zeroext 
-
-declare void @print_generic_expr(%struct.FILE*, %struct.tree_node*, i32)
-
-declare void @debug_tree(%struct.tree_node*)
-
-declare void @internal_error(i8*, ...)
-
-declare %struct.tree_node* @force_gimple_operand(%struct.tree_node*, %struct.tree_node**, i8 zeroext , %struct.tree_node*)
-
-declare i8 @is_gimple_reg_rhs(%struct.tree_node*) zeroext 
-
-declare void @add_referenced_tmp_var(%struct.tree_node*)
-
-declare i8 @contains_placeholder_p(%struct.tree_node*) zeroext 
-
-declare %struct.varray_head_tag* @varray_init(i32, i32, i8*)
-
-declare i32 @handled_component_p(%struct.tree_node*)
-
-declare void @varray_check_failed(%struct.varray_head_tag*, i32, i8*, i32, i8*)
-
-declare %struct.tree_node* @array_ref_low_bound(%struct.tree_node*)
-
-declare i8 @is_gimple_min_invariant(%struct.tree_node*) zeroext 
-
-declare i8 @is_gimple_formal_tmp_reg(%struct.tree_node*) zeroext 
-
-declare %struct.tree_node* @array_ref_element_size(%struct.tree_node*)
-
-declare %struct.tree_node* @component_ref_field_offset(%struct.tree_node*)
-
-declare i8 @is_gimple_min_lval(%struct.tree_node*) zeroext 
-
-declare void @varray_underflow(%struct.varray_head_tag*, i8*, i32, i8*)
-
-declare i32 @list_length(%struct.tree_node*)
-
-declare i8 @parse_output_constraint(i8**, i32, i32, i32, i8*, i8*, i8*) zeroext 
-
-declare i8* @xstrdup(i8*)
-
-declare %struct.tree_node* @build_string(i32, i8*)
-
-declare i8* @strchr(i8*, i32)
-
-declare %struct.tree_node* @build_tree_list_stat(%struct.tree_node*, %struct.tree_node*)
-
-declare %struct.tree_node* @chainon(%struct.tree_node*, %struct.tree_node*)
-
-declare i8 @parse_input_constraint(i8**, i32, i32, i32, i32, i8**, i8*, i8*) zeroext 
-
-declare i8 @is_gimple_asm_val(%struct.tree_node*) zeroext 
-
-declare void @gimplify_body(%struct.tree_node**, %struct.tree_node*, i8 zeroext )
-
-declare void @timevar_push_1(i32)
-
-declare %struct.tree_node* @gimplify_parameters()
-
-declare %struct.tree_node* @expr_only(%struct.tree_node*)
-
-declare void @timevar_pop_1(i32)
-
-declare void @gimplify_function_tree(%struct.tree_node*)
-
-declare void @allocate_struct_function(%struct.tree_node*)
-
-declare %struct.tree_node* @make_tree_vec_stat(i32)
-
-declare %struct.tree_node* @tsi_split_statement_list_after(%struct.tree_stmt_iterator*)
-
-declare i8 @is_gimple_condexpr(%struct.tree_node*) zeroext 
-
-declare %struct.tree_node* @invert_truthvalue(%struct.tree_node*)
-
-declare i8 @initializer_zerop(%struct.tree_node*) zeroext 
-
-declare i32 @simple_cst_equal(%struct.tree_node*, %struct.tree_node*)
-
-declare i32 @aggregate_value_p(%struct.tree_node*, %struct.tree_node*)
-
-declare i32 @fwrite(i8*, i32, i32, %struct.FILE*)
diff --git a/test/CodeGen/ARM/2007-05-03-BadPostIndexedLd.ll b/test/CodeGen/ARM/2007-05-03-BadPostIndexedLd.ll
index c73b679..25ac52e 100644
--- a/test/CodeGen/ARM/2007-05-03-BadPostIndexedLd.ll
+++ b/test/CodeGen/ARM/2007-05-03-BadPostIndexedLd.ll
@@ -2,7 +2,7 @@
 
 	%struct.Connection = type { i32, [10 x i8], i32 }
 	%struct.IntChunk = type { %struct.cppobjtype, i32, i32*, i32 }
-	%struct.Point = type { i8*, %struct.cppobjtype, i16 (%struct.Point*) signext *, i16 (%struct.Point*) signext *, double (%struct.Point*)*, double (%struct.Point*)* }
+	%struct.Point = type { i8*, %struct.cppobjtype, i16 (%struct.Point*)  *, i16 (%struct.Point*)  *, double (%struct.Point*)*, double (%struct.Point*)* }
 	%struct.RefPoint = type { %struct.Point*, %struct.cppobjtype }
 	%struct.ShortArray = type { %struct.cppobjtype, i32, i16* }
 	%struct.TestObj = type { i8*, %struct.cppobjtype, i8, [32 x i8], i8*, i8**, i16, i16, i32, i32, i32, i32, float, double, %struct.cppobjtype, i32, i16*, i16**, i8**, i32, %struct.XyPoint, [3 x %struct.Connection], %struct.Point*, %struct.XyPoint*, i32, i8*, i8*, i16*, %struct.ShortArray, %struct.IntChunk, %struct.cppobjtype, %struct.cppobjtype, %struct.RefPoint, i32, %struct.cppobjtype, %struct.cppobjtype }
diff --git a/test/CodeGen/ARM/2008-04-04-ScavengerAssert.ll b/test/CodeGen/ARM/2008-04-04-ScavengerAssert.ll
index 234c7b6..6b39a76 100644
--- a/test/CodeGen/ARM/2008-04-04-ScavengerAssert.ll
+++ b/test/CodeGen/ARM/2008-04-04-ScavengerAssert.ll
@@ -46,7 +46,8 @@ bb17.i:		; preds = %cond_next119.i
 cond_true53.i:		; preds = %bb17.i
 	ret { i16, %struct.rnode* }* null
 cond_false99.i:		; preds = %bb17.i
-	%tmp106.i = malloc %struct.ch_set		; <%struct.ch_set*> [#uses=1]
+        %malloccall = tail call i8* @malloc(i32 trunc (i64 mul nuw (i64 ptrtoint (i1** getelementptr (i1** null, i32 1) to i64), i64 2) to i32))
+        %tmp106.i = bitcast i8* %malloccall to %struct.ch_set*
 	br i1 false, label %bb126.i, label %cond_next119.i
 cond_next119.i:		; preds = %cond_false99.i, %bb42
 	%curr_ptr.0.reg2mem.0.i = phi %struct.ch_set* [ %tmp106.i, %cond_false99.i ], [ null, %bb42 ]		; <%struct.ch_set*> [#uses=2]
@@ -58,3 +59,5 @@ bb126.i:		; preds = %cond_next119.i, %cond_false99.i
 bb78:		; preds = %entry
 	ret { i16, %struct.rnode* }* null
 }
+
+declare noalias i8* @malloc(i32)
diff --git a/test/CodeGen/ARM/2008-09-14-CoalescerBug.ll b/test/CodeGen/ARM/2008-09-14-CoalescerBug.ll
deleted file mode 100644
index 5f9d9ae..0000000
--- a/test/CodeGen/ARM/2008-09-14-CoalescerBug.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: llc < %s -mtriple=arm-apple-darwin
-
-@"\01LC1" = external constant [288 x i8]		; <[288 x i8]*> [#uses=1]
-
-declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind
-
-define i32 @main(i32 %argc, i8** %argv) nounwind {
-entry:
-	br label %bb.i
-
-bb.i:		; preds = %bb.i, %entry
-	%i.01.i = phi i32 [ 0, %entry ], [ %indvar.next52, %bb.i ]		; <i32> [#uses=1]
-	%indvar.next52 = add i32 %i.01.i, 1		; <i32> [#uses=2]
-	%exitcond53 = icmp eq i32 %indvar.next52, 15		; <i1> [#uses=1]
-	br i1 %exitcond53, label %bb.i33.loopexit, label %bb.i
-
-bb.i33.loopexit:		; preds = %bb.i
-	%0 = malloc [347 x i8]		; <[347 x i8]*> [#uses=2]
-	%.sub = getelementptr [347 x i8]* %0, i32 0, i32 0		; <i8*> [#uses=1]
-	call void @llvm.memcpy.i32( i8* %.sub, i8* getelementptr ([288 x i8]* @"\01LC1", i32 0, i32 0), i32 287, i32 1 ) nounwind
-	br label %bb.i28
-
-bb.i28:		; preds = %bb.i28, %bb.i33.loopexit
-	br i1 false, label %repeat_fasta.exit, label %bb.i28
-
-repeat_fasta.exit:		; preds = %bb.i28
-	free [347 x i8]* %0
-	unreachable
-}
diff --git a/test/CodeGen/ARM/2009-08-21-PostRAKill3.ll b/test/CodeGen/ARM/2009-08-21-PostRAKill3.ll
index 90a4a42..382038e 100644
--- a/test/CodeGen/ARM/2009-08-21-PostRAKill3.ll
+++ b/test/CodeGen/ARM/2009-08-21-PostRAKill3.ll
@@ -14,7 +14,8 @@ entry:
   br i1 %p, label %bb8, label %bb1
 
 bb1:                                              ; preds = %entry
-  %0 = malloc %struct.Village                     ; <%struct.Village*> [#uses=3]
+  %malloccall = tail call i8* @malloc(i32 ptrtoint (%struct.Village* getelementptr (%struct.Village* null, i32 1) to i32))
+  %0 = bitcast i8* %malloccall to %struct.Village*
   %exp2 = call double @ldexp(double 1.000000e+00, i32 %level) nounwind ; <double> [#uses=1]
   %.c = fptosi double %exp2 to i32                ; <i32> [#uses=1]
   store i32 %.c, i32* null
@@ -29,3 +30,4 @@ bb8:                                              ; preds = %entry
 }
 
 declare double @ldexp(double, i32)
+declare noalias i8* @malloc(i32)
diff --git a/test/CodeGen/ARM/2009-09-09-fpcmp-ole.ll b/test/CodeGen/ARM/2009-09-09-fpcmp-ole.ll
index 3909c6a..0a157c9 100644
--- a/test/CodeGen/ARM/2009-09-09-fpcmp-ole.ll
+++ b/test/CodeGen/ARM/2009-09-09-fpcmp-ole.ll
@@ -1,16 +1,16 @@
-; RUN: llc -O1 -march=arm -mattr=+vfp2 < %s | FileCheck %s
+; RUN: llc -O1 -march=arm -mattr=+vfp2 -mtriple=arm-linux-gnueabi < %s | FileCheck %s
 ; pr4939
 
 define void @test(double* %x, double* %y) nounwind {
-  %1 = load double* %x, align 4
-  %2 = load double* %y, align 4
+  %1 = load double* %x
+  %2 = load double* %y
   %3 = fsub double -0.000000e+00, %1
   %4 = fcmp ugt double %2, %3
   br i1 %4, label %bb1, label %bb2
 
 bb1:
 ;CHECK: vstrhi.64
-  store double %1, double* %y, align 4
+  store double %1, double* %y
   br label %bb2
 
 bb2:
diff --git a/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll b/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll
index 5ad1c09..df9dbca 100644
--- a/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll
+++ b/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll
@@ -7,13 +7,13 @@
 define zeroext i8 @t(%struct.foo* %this) noreturn optsize {
 entry:
 ; ARM:       t:
-; ARM:       str r0, [r1], r0
+; ARM:       str r2, [r1], r0
 
 ; THUMB:     t:
 ; THUMB-NOT: str r0, [r1], r0
-; THUMB:     str r0, [r1]
+; THUMB:     str r2, [r1]
   %0 = getelementptr inbounds %struct.foo* %this, i32 0, i32 1 ; <i64*> [#uses=1]
-  store i32 undef, i32* inttoptr (i32 8 to i32*), align 8
+  store i32 0, i32* inttoptr (i32 8 to i32*), align 8
   br i1 undef, label %bb.nph96, label %bb3
 
 bb3:                                              ; preds = %entry
diff --git a/test/CodeGen/ARM/2010-08-04-StackVariable.ll b/test/CodeGen/ARM/2010-08-04-StackVariable.ll
index f077d04..6aeaa26 100644
--- a/test/CodeGen/ARM/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/ARM/2010-08-04-StackVariable.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -mtriple=arm-apple-darwin < %s | grep DW_OP_fbreg
-; Use DW_OP_fbreg in variable's location expression if the variable is in a stack slot.
+; RUN: llc -O0 -mtriple=arm-apple-darwin < %s | grep DW_OP_breg
+; Use DW_OP_breg in variable's location expression if the variable is in a stack slot.
 
 %struct.SVal = type { i8*, i32 }
 
@@ -31,7 +31,7 @@ return:                                           ; preds = %bb2
   ret i32 %.0, !dbg !29
 }
 
-define linkonce_odr void @_ZN4SValC1Ev(%struct.SVal* %this) nounwind ssp align 2 {
+define linkonce_odr void @_ZN4SValC1Ev(%struct.SVal* %this) nounwind ssp align 2  {
 entry:
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
   call void @llvm.dbg.value(metadata !{%struct.SVal* %this}, i64 0, metadata !31), !dbg !34
diff --git a/test/CodeGen/ARM/2010-12-13-reloc-pic.ll b/test/CodeGen/ARM/2010-12-13-reloc-pic.ll
deleted file mode 100644
index d5aefbe..0000000
--- a/test/CodeGen/ARM/2010-12-13-reloc-pic.ll
+++ /dev/null
@@ -1,100 +0,0 @@
-; RUN: llc  %s -mtriple=armv7-linux-gnueabi -relocation-model=pic -filetype=obj -o - | \
-; RUN:    elf-dump --dump-section-data | FileCheck  -check-prefix=PIC01 %s
-
-;; FIXME: Reduce this test further, or even better,
-;; redo as .s -> .o test once ARM AsmParser is working better
-
-; ModuleID = 'large2.pnacl.bc'
-target triple = "armv7-none-linux-gnueabi"
-
-%struct._Bigint = type { %struct._Bigint*, i32, i32, i32, i32, [1 x i32] }
-%struct.__FILE = type { i8*, i32, i32, i16, i16, %struct.__sbuf, i32, %struct._reent*, i8*, i32 (%struct._reent*, i8*, i8*, i32)*, i32 (%struct._reent*, i8*, i8*, i32)*, i32 (%struct._reent*, i8*, i32, i32)*, i32 (%struct._reent*, i8*)*, %struct.__sbuf, i8*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i32, %struct._flock_t, %struct._mbstate_t, i32 }
-%struct.__sbuf = type { i8*, i32 }
-%struct.__tm = type { i32, i32, i32, i32, i32, i32, i32, i32, i32 }
-%struct._atexit = type { %struct._atexit*, i32, [32 x void ()*], %struct._on_exit_args* }
-%struct._flock_t = type { i32, i32, i32, i32, i32 }
-%struct._glue = type { %struct._glue*, i32, %struct.__FILE* }
-%struct._mbstate_t = type { i32, %union.anon }
-%struct._misc_reent = type { i8*, %struct._mbstate_t, %struct._mbstate_t, %struct._mbstate_t, [8 x i8], i32, %struct._mbstate_t, %struct._mbstate_t, %struct._mbstate_t, %struct._mbstate_t, %struct._mbstate_t }
-%struct._mprec = type { %struct._Bigint*, i32, %struct._Bigint*, %struct._Bigint** }
-%struct._on_exit_args = type { [32 x i8*], [32 x i8*], i32, i32 }
-%struct._rand48 = type { [3 x i16], [3 x i16], i16, i64 }
-%struct._reent = type { %struct.__FILE*, %struct.__FILE*, %struct.__FILE*, i32, i32, i8*, i32, i32, i8*, %struct._mprec*, void (%struct._reent*)*, i32, i32, i8*, %struct._rand48*, %struct.__tm*, i8*, void (i32)**, %struct._atexit*, %struct._atexit, %struct._glue, %struct.__FILE*, %struct._misc_reent*, i8* }
-%union.anon = type { i32 }
-
-@buf = constant [2 x i8] c"x\00", align 4
-@_impure_ptr = external thread_local global %struct._reent*
-@.str = private constant [22 x i8] c"This should fault...\0A\00", align 4
-@.str1 = private constant [40 x i8] c"We're still running. This is not good.\0A\00", align 4
-
-define i32 @main() nounwind {
-entry:
-  %0 = load %struct._reent** @_impure_ptr, align 4
-  %1 = getelementptr inbounds %struct._reent* %0, i32 0, i32 1
-  %2 = load %struct.__FILE** %1, align 4
-  %3 = bitcast %struct.__FILE* %2 to i8*
-  %4 = tail call i32 @fwrite(i8* getelementptr inbounds ([22 x i8]* @.str, i32 0, i32 0), i32 1, i32 21, i8* %3) nounwind
-  %5 = load %struct._reent** @_impure_ptr, align 4
-  %6 = getelementptr inbounds %struct._reent* %5, i32 0, i32 1
-  %7 = load %struct.__FILE** %6, align 4
-  %8 = tail call i32 @fflush(%struct.__FILE* %7) nounwind
-  store i8 121, i8* getelementptr inbounds ([2 x i8]* @buf, i32 0, i32 0), align 4
-  %9 = load %struct._reent** @_impure_ptr, align 4
-  %10 = getelementptr inbounds %struct._reent* %9, i32 0, i32 1
-  %11 = load %struct.__FILE** %10, align 4
-  %12 = bitcast %struct.__FILE* %11 to i8*
-  %13 = tail call i32 @fwrite(i8* getelementptr inbounds ([40 x i8]* @.str1, i32 0, i32 0), i32 1, i32 39, i8* %12) nounwind
-  ret i32 1
-}
-
-
-; PIC01:             Relocation 0x00000000
-; PIC01-NEXT:        'r_offset', 0x0000001c
-; PIC01-NEXT:          'r_sym'
-; PIC01-NEXT:          'r_type', 0x0000001b
-
-
-; PIC01:             Relocation 0x00000001
-; PIC01-NEXT:      'r_offset', 0x00000038
-; PIC01-NEXT:        'r_sym'
-; PIC01-NEXT:        'r_type', 0x0000001b
-
-; PIC01:              Relocation 0x00000002
-; PIC01-NEXT:      'r_offset', 0x00000044
-; PIC01-NEXT:        'r_sym'
-; PIC01-NEXT:        'r_type', 0x0000001b
-
-; PIC01:              Relocation 0x00000003
-; PIC01-NEXT:      'r_offset', 0x00000070
-; PIC01-NEXT:        'r_sym'
-; PIC01-NEXT:        'r_type', 0x0000001b
-
-; PIC01:              Relocation 0x00000004
-; PIC01-NEXT:      'r_offset', 0x0000007c
-; PIC01-NEXT:        'r_sym'
-; PIC01-NEXT:        'r_type', 0x00000019
-
-
-; PIC01:              Relocation 0x00000005
-; PIC01-NEXT:      'r_offset', 0x00000080
-; PIC01-NEXT:        'r_sym'
-; PIC01-NEXT:        'r_type', 0x00000018
-
-; PIC01:              Relocation 0x00000006
-; PIC01-NEXT:      'r_offset', 0x00000084
-; PIC01-NEXT:        'r_sym'
-; PIC01-NEXT:        'r_type', 0x00000068
-
-; PIC01:              Relocation 0x00000007
-; PIC01-NEXT:      'r_offset', 0x00000088
-; PIC01-NEXT:        'r_sym'
-; PIC01-NEXT:        'r_type', 0x0000001a
-
-; PIC01:              Relocation 0x00000008
-; PIC01-NEXT:      'r_offset', 0x0000008c
-; PIC01-NEXT:        'r_sym'
-; PIC01-NEXT:        'r_type', 0x00000018
-
-declare i32 @fwrite(i8* nocapture, i32, i32, i8* nocapture) nounwind
-
-declare i32 @fflush(%struct.__FILE* nocapture) nounwind
diff --git a/test/CodeGen/ARM/2010-12-15-elf-lcomm.ll b/test/CodeGen/ARM/2010-12-15-elf-lcomm.ll
index 7642dc4..69d4a14 100644
--- a/test/CodeGen/ARM/2010-12-15-elf-lcomm.ll
+++ b/test/CodeGen/ARM/2010-12-15-elf-lcomm.ll
@@ -10,7 +10,7 @@
 @STRIDE = internal global i32 8
 
 ; ASM:          .type   array00,%object         @ @array00
-; ASM-NEXT:     .lcomm  array00,80              @ @array00
+; ASM-NEXT:     .lcomm  array00,80
 ; ASM-NEXT:     .type   _MergedGlobals,%object  @ @_MergedGlobals
 
 
diff --git a/test/CodeGen/ARM/2011-04-07-schediv.ll b/test/CodeGen/ARM/2011-04-07-schediv.ll
index a61908f..19f756f 100644
--- a/test/CodeGen/ARM/2011-04-07-schediv.ll
+++ b/test/CodeGen/ARM/2011-04-07-schediv.ll
@@ -13,6 +13,7 @@ entry:
 ; Make sure the scheduler schedules all uses of the preincrement
 ; induction variable before defining the postincrement value.
 ; CHECK: t:
+; CHECK: %bb
 ; CHECK-NOT: mov
 bb:                                               ; preds = %entry, %bb
   %j.05 = phi i32 [ %2, %bb ], [ 0, %entry ]
diff --git a/test/CodeGen/ARM/2011-04-11-MachineLICMBug.ll b/test/CodeGen/ARM/2011-04-11-MachineLICMBug.ll
new file mode 100644
index 0000000..568718c
--- /dev/null
+++ b/test/CodeGen/ARM/2011-04-11-MachineLICMBug.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 | FileCheck %s
+
+; Overly aggressive LICM simply adds copies of constants
+; rdar://9266679
+
+define zeroext i1 @t(i32* nocapture %A, i32 %size, i32 %value) nounwind readonly ssp {
+; CHECK: t:
+entry:
+  br label %for.cond
+
+for.cond:
+  %0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp ult i32 %0, %size
+  br i1 %cmp, label %for.body, label %return
+
+for.body:
+; CHECK: %for.
+; CHECK: movs r{{[0-9]+}}, #{{[01]}}
+  %arrayidx = getelementptr i32* %A, i32 %0
+  %tmp4 = load i32* %arrayidx, align 4
+  %cmp6 = icmp eq i32 %tmp4, %value
+  br i1 %cmp6, label %return, label %for.inc
+
+; CHECK: %for.
+; CHECK: movs r{{[0-9]+}}, #{{[01]}}
+
+for.inc:
+  %inc = add i32 %0, 1
+  br label %for.cond
+
+return:
+  %retval.0 = phi i1 [ true, %for.body ], [ false, %for.cond ]
+  ret i1 %retval.0
+}
diff --git a/test/CodeGen/ARM/2011-04-12-AlignBug.ll b/test/CodeGen/ARM/2011-04-12-AlignBug.ll
new file mode 100644
index 0000000..317be94
--- /dev/null
+++ b/test/CodeGen/ARM/2011-04-12-AlignBug.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-darwin10.0.0"
+
+; CHECK: align 3
+@.v = linker_private unnamed_addr constant <4 x i32> <i32 1, i32 2, i32 3, i32 4>, align 8
+; CHECK: align 2
+@.strA = linker_private unnamed_addr constant [4 x i8] c"bar\00"
+; CHECK-NOT: align
+@.strB = linker_private unnamed_addr constant [4 x i8] c"foo\00", align 1
+@.strC = linker_private unnamed_addr constant [4 x i8] c"baz\00", section "__TEXT,__cstring,cstring_literals", align 1
diff --git a/test/CodeGen/ARM/2011-04-12-FastRegAlloc.ll b/test/CodeGen/ARM/2011-04-12-FastRegAlloc.ll
new file mode 100644
index 0000000..eb23de0
--- /dev/null
+++ b/test/CodeGen/ARM/2011-04-12-FastRegAlloc.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -regalloc=fast
+; Previously we'd crash as out of registers on this input by clobbering all of
+; the aliases.
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-darwin10.0.0"
+
+define void @_Z8TestCasev() nounwind ssp {
+entry:
+  %a = alloca float, align 4
+  %tmp = load float* %a, align 4
+  call void asm sideeffect "", "w,~{s0},~{s16}"(float %tmp) nounwind, !srcloc !0
+  ret void
+}
+
+!0 = metadata !{i32 109}
diff --git a/test/CodeGen/ARM/2011-04-15-AndVFlagPeepholeBug.ll b/test/CodeGen/ARM/2011-04-15-AndVFlagPeepholeBug.ll
new file mode 100644
index 0000000..e712e08
--- /dev/null
+++ b/test/CodeGen/ARM/2011-04-15-AndVFlagPeepholeBug.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 | FileCheck %s
+
+; CHECK: _f
+; CHECK-NOT: ands
+; CHECK: cmp
+; CHECK: blxle _g
+
+define i32 @f(i32 %a, i32 %b) nounwind ssp {
+entry:
+  %and = and i32 %b, %a
+  %cmp = icmp slt i32 %and, 1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  tail call void (...)* @g(i32 %a, i32 %b) nounwind
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret i32 %and
+}
+
+declare void @g(...)
diff --git a/test/CodeGen/ARM/2011-04-15-RegisterCmpPeephole.ll b/test/CodeGen/ARM/2011-04-15-RegisterCmpPeephole.ll
new file mode 100644
index 0000000..5404cf5
--- /dev/null
+++ b/test/CodeGen/ARM/2011-04-15-RegisterCmpPeephole.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 | FileCheck %s
+
+; CHECK: _f
+; CHECK: adds
+; CHECK-NOT: cmp
+; CHECK: blxeq _h
+
+define i32 @f(i32 %a, i32 %b) nounwind ssp {
+entry:
+  %add = add nsw i32 %b, %a
+  %cmp = icmp eq i32 %add, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  tail call void (...)* @h(i32 %a, i32 %b) nounwind
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret i32 %add
+}
+
+; CHECK: _g
+; CHECK: orrs
+; CHECK-NOT: cmp
+; CHECK: blxeq _h
+
+define i32 @g(i32 %a, i32 %b) nounwind ssp {
+entry:
+  %add = or i32 %b, %a
+  %cmp = icmp eq i32 %add, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  tail call void (...)* @h(i32 %a, i32 %b) nounwind
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret i32 %add
+}
+
+declare void @h(...)
diff --git a/test/CodeGen/ARM/2011-04-26-SchedTweak.ll b/test/CodeGen/ARM/2011-04-26-SchedTweak.ll
new file mode 100644
index 0000000..ed7dd03
--- /dev/null
+++ b/test/CodeGen/ARM/2011-04-26-SchedTweak.ll
@@ -0,0 +1,70 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -relocation-model=pic -mcpu=cortex-a8 | FileCheck %s
+
+; Do not move the umull above previous call which would require use of
+; more callee-saved registers and introduce copies.
+; rdar://9329627
+
+%struct.FF = type { i32 (i32*)*, i32 (i32*, i32*, i32, i32, i32, i32)*, i32 (i32, i32, i8*)*, void ()*, i32 (i32, i8*, i32*)*, i32 ()* }
+%struct.BD = type { %struct.BD*, i32, i32, i32, i32, i64, i32 (%struct.BD*, i8*, i64, i32)*, i32 (%struct.BD*, i8*, i32, i32)*, i32 (%struct.BD*, i8*, i64, i32)*, i32 (%struct.BD*, i8*, i32, i32)*, i32 (%struct.BD*, i64, i32)*, [16 x i8], i64, i64 }
+
+@FuncPtr = external hidden unnamed_addr global %struct.FF*
+@.str1 = external hidden unnamed_addr constant [6 x i8], align 4
+@G = external unnamed_addr global i32
+@.str2 = external hidden unnamed_addr constant [58 x i8], align 4
+@.str3 = external hidden unnamed_addr constant [58 x i8], align 4
+
+define i32 @test() nounwind optsize ssp {
+entry:
+; CHECK: test:
+; CHECK: push
+; CHECK-NOT: push
+  %block_size = alloca i32, align 4
+  %block_count = alloca i32, align 4
+  %index_cache = alloca i32, align 4
+  store i32 0, i32* %index_cache, align 4
+  %tmp = load i32* @G, align 4
+  %tmp1 = call i32 @bar(i32 0, i32 0, i32 %tmp) nounwind
+  switch i32 %tmp1, label %bb8 [
+    i32 0, label %bb
+    i32 536870913, label %bb4
+    i32 536870914, label %bb6
+  ]
+
+bb:
+  %tmp2 = load i32* @G, align 4
+  %tmp4 = icmp eq i32 %tmp2, 0
+  br i1 %tmp4, label %bb1, label %bb8
+
+bb1:
+; CHECK: %bb1
+; CHECK-NOT: umull
+; CHECK: blx _Get
+; CHECK: umull
+; CHECK: blx _foo
+  %tmp5 = load i32* %block_size, align 4
+  %tmp6 = load i32* %block_count, align 4
+  %tmp7 = call %struct.FF* @Get() nounwind
+  store %struct.FF* %tmp7, %struct.FF** @FuncPtr, align 4
+  %tmp10 = zext i32 %tmp6 to i64
+  %tmp11 = zext i32 %tmp5 to i64
+  %tmp12 = mul nsw i64 %tmp10, %tmp11
+  %tmp13 = call i32 @foo(i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0), i64 %tmp12, i32 %tmp5) nounwind
+  br label %bb8
+
+bb4:
+  ret i32 0
+
+bb6:
+  ret i32 1
+
+bb8:
+  ret i32 -1
+}
+
+declare i32 @printf(i8*, ...)
+
+declare %struct.FF* @Get()
+
+declare i32 @foo(i8*, i64, i32)
+
+declare i32 @bar(i32, i32, i32)
diff --git a/test/CodeGen/ARM/2011-04-27-IfCvtBug.ll b/test/CodeGen/ARM/2011-04-27-IfCvtBug.ll
new file mode 100644
index 0000000..0741049
--- /dev/null
+++ b/test/CodeGen/ARM/2011-04-27-IfCvtBug.ll
@@ -0,0 +1,59 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-ios
+
+; If converter was being too cute. It look for root BBs (which don't have
+; successors) and use inverse depth first search to traverse the BBs. However
+; that doesn't work when the CFG has infinite loops. Simply do a linear
+; traversal of all BBs work just fine.
+
+; rdar://9344645
+
+%struct.hc = type { i32, i32, i32, i32 }
+
+define i32 @t(i32 %type) optsize {
+entry:
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  unreachable
+
+if.else:
+  br i1 undef, label %if.then15, label %if.else18
+
+if.then15:
+  unreachable
+
+if.else18:
+  switch i32 %type, label %if.else173 [
+    i32 3, label %if.then115
+    i32 1, label %if.then102
+  ]
+
+if.then102:
+  br i1 undef, label %cond.true10.i, label %t.exit
+
+cond.true10.i:
+  br label %t.exit
+
+t.exit:
+  unreachable
+
+if.then115:
+  br i1 undef, label %if.else163, label %if.else145
+
+if.else145:
+  %call150 = call fastcc %struct.hc* @foo(%struct.hc* undef, i32 34865152) optsize
+  br label %while.body172
+
+if.else163:
+  %call168 = call fastcc %struct.hc* @foo(%struct.hc* undef, i32 34078720) optsize
+  br label %while.body172
+
+while.body172:
+  br label %while.body172
+
+if.else173:
+  ret i32 -1
+}
+
+declare hidden fastcc %struct.hc* @foo(%struct.hc* nocapture, i32) nounwind optsize
+
diff --git a/test/CodeGen/ARM/2011-05-04-MultipleLandingPadSuccs.ll b/test/CodeGen/ARM/2011-05-04-MultipleLandingPadSuccs.ll
new file mode 100644
index 0000000..0b5f962
--- /dev/null
+++ b/test/CodeGen/ARM/2011-05-04-MultipleLandingPadSuccs.ll
@@ -0,0 +1,93 @@
+; RUN: llc < %s -verify-machineinstrs
+; <rdar://problem/9187612>
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32"
+target triple = "thumbv7-apple-darwin"
+
+define void @func() unnamed_addr align 2 {
+entry:
+  br label %for.cond
+
+for.cond:
+  %tmp2 = phi i32 [ 0, %entry ], [ %add, %for.cond.backedge ]
+  %cmp = icmp ult i32 %tmp2, 14
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  %add = add i32 %tmp2, 1
+  switch i32 %tmp2, label %sw.default [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb
+    i32 4, label %sw.bb
+    i32 5, label %sw.bb
+    i32 10, label %sw.bb
+  ]
+
+sw.bb:
+  invoke void @foo()
+          to label %invoke.cont17 unwind label %lpad
+
+invoke.cont17:
+  invoke void @foo()
+          to label %for.cond.backedge unwind label %lpad26
+
+for.cond.backedge:
+  br label %for.cond
+
+lpad:
+  %exn = tail call i8* @llvm.eh.exception() nounwind
+  %eh.selector = tail call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn, i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*), i8* null) nounwind
+  invoke void @foo()
+          to label %eh.resume unwind label %terminate.lpad
+
+lpad26:
+  %exn27 = tail call i8* @llvm.eh.exception() nounwind
+  %eh.selector28 = tail call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn27, i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*), i8* null) nounwind
+  invoke void @foo()
+          to label %eh.resume unwind label %terminate.lpad
+
+sw.default:
+  br label %for.cond.backedge
+
+for.end:
+  invoke void @foo()
+          to label %call8.i.i.i.noexc unwind label %lpad44
+
+call8.i.i.i.noexc:
+  ret void
+
+lpad44:
+  %exn45 = tail call i8* @llvm.eh.exception() nounwind
+  %eh.selector46 = tail call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn45, i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*), i8* null) nounwind
+  invoke void @foo()
+          to label %eh.resume unwind label %terminate.lpad
+
+eh.resume:
+  %exn.slot.0 = phi i8* [ %exn27, %lpad26 ], [ %exn, %lpad ], [ %exn45, %lpad44 ]
+  tail call void @_Unwind_SjLj_Resume_or_Rethrow(i8* %exn.slot.0) noreturn
+  unreachable
+
+terminate.lpad:
+  %exn51 = tail call i8* @llvm.eh.exception() nounwind
+  %eh.selector52 = tail call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn51, i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*), i8* null) nounwind
+  tail call void @_ZSt9terminatev() noreturn nounwind
+  unreachable
+}
+
+declare void @foo()
+
+declare i8* @llvm.eh.exception() nounwind readonly
+
+declare i32 @__gxx_personality_sj0(...)
+
+declare i32 @llvm.eh.selector(i8*, i8*, ...) nounwind
+
+declare void @_Unwind_SjLj_Resume_or_Rethrow(i8*)
+
+declare void @_ZSt9terminatev()
+
+!0 = metadata !{metadata !"any pointer", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
+!3 = metadata !{metadata !"bool", metadata !1}
+!4 = metadata !{metadata !"int", metadata !1}
diff --git a/test/CodeGen/ARM/2011-06-09-TailCallByVal.ll b/test/CodeGen/ARM/2011-06-09-TailCallByVal.ll
new file mode 100644
index 0000000..4db3acf
--- /dev/null
+++ b/test/CodeGen/ARM/2011-06-09-TailCallByVal.ll
@@ -0,0 +1,39 @@
+; RUN: llc < %s -relocation-model=pic -mcpu=cortex-a8 -arm-tail-calls=1 | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-darwin10"
+
+%struct._RuneCharClass = type { [14 x i8], i32 }
+%struct._RuneEntry = type { i32, i32, i32, i32* }
+%struct._RuneLocale = type { [8 x i8], [32 x i8], i32 (i8*, i32, i8**)*, i32 (i32, i8*, i32, i8**)*, i32, [256 x i32], [256 x i32], [256 x i32], %struct._RuneRange, %struct._RuneRange, %struct._RuneRange, i8*, i32, i32, %struct._RuneCharClass* }
+%struct._RuneRange = type { i32, %struct._RuneEntry* }
+%struct.__collate_st_chain_pri = type { [10 x i32], [2 x i32] }
+%struct.__collate_st_char_pri = type { [2 x i32] }
+%struct.__collate_st_info = type { [2 x i8], i8, i8, [2 x i32], [2 x i32], i32, i32 }
+%struct.__collate_st_large_char_pri = type { i32, %struct.__collate_st_char_pri }
+%struct.__collate_st_subst = type { i32, [10 x i32] }
+%struct.__xlocale_st_collate = type { i32, void (i8*)*, [32 x i8], %struct.__collate_st_info, [2 x %struct.__collate_st_subst*], %struct.__collate_st_chain_pri*, %struct.__collate_st_large_char_pri*, [256 x %struct.__collate_st_char_pri] }
+%struct.__xlocale_st_messages = type { i32, void (i8*)*, i8*, %struct.lc_messages_T }
+%struct.__xlocale_st_monetary = type { i32, void (i8*)*, i8*, %struct.lc_monetary_T }
+%struct.__xlocale_st_numeric = type { i32, void (i8*)*, i8*, %struct.lc_numeric_T }
+%struct.__xlocale_st_runelocale = type { i32, void (i8*)*, [32 x i8], i32, i32, i32 (i32*, i8*, i32, %union.__mbstate_t*, %struct._xlocale*)*, i32 (%union.__mbstate_t*, %struct._xlocale*)*, i32 (i32*, i8**, i32, i32, %union.__mbstate_t*, %struct._xlocale*)*, i32 (i8*, i32, %union.__mbstate_t*, %struct._xlocale*)*, i32 (i8*, i32**, i32, i32, %union.__mbstate_t*, %struct._xlocale*)*, i32, %struct._RuneLocale }
+%struct.__xlocale_st_time = type { i32, void (i8*)*, i8*, %struct.lc_time_T }
+%struct._xlocale = type { i32, void (i8*)*, %union.__mbstate_t, %union.__mbstate_t, %union.__mbstate_t, %union.__mbstate_t, %union.__mbstate_t, %union.__mbstate_t, %union.__mbstate_t, %union.__mbstate_t, %union.__mbstate_t, %union.__mbstate_t, i32, i64, i8, i8, i8, i8, i8, i8, i8, i8, i8, %struct.__xlocale_st_collate*, %struct.__xlocale_st_runelocale*, %struct.__xlocale_st_messages*, %struct.__xlocale_st_monetary*, %struct.__xlocale_st_numeric*, %struct._xlocale*, %struct.__xlocale_st_time*, %struct.lconv }
+%struct.lc_messages_T = type { i8*, i8*, i8*, i8* }
+%struct.lc_monetary_T = type { i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8* }
+%struct.lc_numeric_T = type { i8*, i8*, i8* }
+%struct.lc_time_T = type { [12 x i8*], [12 x i8*], [7 x i8*], [7 x i8*], i8*, i8*, i8*, i8*, i8*, i8*, [12 x i8*], i8*, i8* }
+%struct.lconv = type { i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 }
+%union.__mbstate_t = type { i64, [120 x i8] }
+
+@"\01_fnmatch.initial" = external constant %union.__mbstate_t, align 4
+
+; CHECK: _fnmatch
+; CHECK: blx _fnmatch1
+
+define i32 @"\01_fnmatch"(i8* %pattern, i8* %string, i32 %flags) nounwind optsize {
+entry:
+  %call4 = tail call i32 @fnmatch1(i8* %pattern, i8* %string, i8* %string, i32 %flags, %union.__mbstate_t* byval @"\01_fnmatch.initial", %union.__mbstate_t* byval @"\01_fnmatch.initial", %struct._xlocale* undef, i32 64) optsize
+  ret i32 %call4
+}
+
+declare i32 @fnmatch1(i8*, i8*, i8*, i32, %union.__mbstate_t* byval, %union.__mbstate_t* byval, %struct._xlocale*, i32) nounwind optsize
diff --git a/test/CodeGen/ARM/2011-06-16-TailCallByVal.ll b/test/CodeGen/ARM/2011-06-16-TailCallByVal.ll
new file mode 100644
index 0000000..7baacfe
--- /dev/null
+++ b/test/CodeGen/ARM/2011-06-16-TailCallByVal.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -arm-tail-calls=1 | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-darwin10"
+
+%struct.A = type <{ i16, i16, i32, i16, i16, i32, i16, [8 x %struct.B], [418 x i8], %struct.C }>
+%struct.B = type <{ i32, i16, i16 }>
+%struct.C = type { i16, i32, i16, i16 }
+
+; CHECK: f
+; CHECK: push {r1, r2, r3}
+; CHECK: add sp, #12
+; CHECK: b.w _puts
+
+define void @f(i8* %s, %struct.A* nocapture byval %a) nounwind optsize {
+entry:
+  %puts = tail call i32 @puts(i8* %s)
+  ret void
+}
+
+declare i32 @puts(i8* nocapture) nounwind
diff --git a/test/CodeGen/ARM/align.ll b/test/CodeGen/ARM/align.ll
index d57c159..9589e72 100644
--- a/test/CodeGen/ARM/align.ll
+++ b/test/CodeGen/ARM/align.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=arm-linux-gnueabi | FileCheck %s -check-prefix=ELF
-; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s -check-prefix=DARWIN
+; RUN: llc < %s -mtriple=arm-apple-darwin10 | FileCheck %s -check-prefix=DARWIN
 
 @a = global i1 true
 ; no alignment
diff --git a/test/CodeGen/ARM/arm-and-tst-peephole.ll b/test/CodeGen/ARM/arm-and-tst-peephole.ll
index 444dce7..0762070 100644
--- a/test/CodeGen/ARM/arm-and-tst-peephole.ll
+++ b/test/CodeGen/ARM/arm-and-tst-peephole.ll
@@ -23,7 +23,7 @@ tailrecurse:                                      ; preds = %sw.bb, %entry
   %tmp2 = load i8** %scevgep5
   %0 = ptrtoint i8* %tmp2 to i32
 
-; ARM:      ands r12, r12, #3
+; ARM:      ands {{r[0-9]+}}, {{r[0-9]+}}, #3
 ; ARM-NEXT: beq
 
 ; THUMB:      movs r[[R0:[0-9]+]], #3
@@ -31,7 +31,7 @@ tailrecurse:                                      ; preds = %sw.bb, %entry
 ; THUMB-NEXT: cmp r[[R0]], #0
 ; THUMB-NEXT: beq
 
-; T2:      ands r12, r12, #3
+; T2:      ands {{r[0-9]+}}, {{r[0-9]+}}, #3
 ; T2-NEXT: beq
 
   %and = and i32 %0, 3
diff --git a/test/CodeGen/ARM/arm-modifier.ll b/test/CodeGen/ARM/arm-modifier.ll
new file mode 100644
index 0000000..0a7bb6c
--- /dev/null
+++ b/test/CodeGen/ARM/arm-modifier.ll
@@ -0,0 +1,59 @@
+; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s
+
+define i32 @foo(float %scale, float %scale2) nounwind {
+entry:
+  %scale.addr = alloca float, align 4
+  %scale2.addr = alloca float, align 4
+  store float %scale, float* %scale.addr, align 4
+  store float %scale2, float* %scale2.addr, align 4
+  %tmp = load float* %scale.addr, align 4
+  %tmp1 = load float* %scale2.addr, align 4
+  call void asm sideeffect "vmul.f32    q0, q0, ${0:y} \0A\09vmul.f32    q1, q1, ${0:y} \0A\09vmul.f32    q1, q0, ${1:y} \0A\09", "w,w,~{q0},~{q1}"(float %tmp, float %tmp1) nounwind
+  ret i32 0
+}
+
+define void @f0() nounwind {
+entry:
+; CHECK: f0
+; CHECK: .word -1
+call void asm sideeffect ".word ${0:B} \0A\09", "i"(i32 0) nounwind
+ret void
+}
+
+define void @f1() nounwind {
+entry:
+; CHECK: f1
+; CHECK: .word 65535
+call void asm sideeffect ".word ${0:L} \0A\09", "i"(i32 -1) nounwind
+ret void
+}
+
+@f2_ptr = internal global i32* @f2_var, align 4
+@f2_var = external global i32
+
+define void @f2() nounwind {
+entry:
+; CHECK: f2
+; CHECK: ldr r0, [r{{[0-9]+}}]
+call void asm sideeffect "ldr r0, [${0:m}]\0A\09", "*m,~{r0}"(i32** @f2_ptr) nounwind
+ret void
+}
+
+@f3_ptr = internal global i64* @f3_var, align 4
+@f3_var = external global i64
+@f3_var2 = external global i64
+
+define void @f3() nounwind {
+entry:
+; CHECK: f3
+; CHECK: stm r{{[0-9]+}}, {[[REG1:(r[0-9]+)]], r{{[0-9]+}}}
+; CHECK: adds lr, [[REG1]]
+; CHECK: ldm r{{[0-9]+}}, {r{{[0-9]+}}, r{{[0-9]+}}}
+%tmp = load i64* @f3_var, align 4
+%tmp1 = load i64* @f3_var2, align 4
+%0 = call i64 asm sideeffect "stm ${0:m}, ${1:M}\0A\09adds $3, $1\0A\09", "=*m,=r,1,r"(i64** @f3_ptr, i64 %tmp, i64 %tmp1) nounwind
+store i64 %0, i64* @f3_var, align 4
+%1 = call i64 asm sideeffect "ldm ${1:m}, ${0:M}\0A\09", "=r,*m"(i64** @f3_ptr) nounwind
+store i64 %1, i64* @f3_var, align 4
+ret void
+}
diff --git a/test/CodeGen/ARM/atomic-op.ll b/test/CodeGen/ARM/atomic-op.ll
new file mode 100644
index 0000000..03940e3
--- /dev/null
+++ b/test/CodeGen/ARM/atomic-op.ll
@@ -0,0 +1,103 @@
+; RUN: llc < %s -mtriple=armv7-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 | FileCheck %s
+
+define void @func(i32 %argc, i8** %argv) nounwind {
+entry:
+	%argc.addr = alloca i32		; <i32*> [#uses=1]
+	%argv.addr = alloca i8**		; <i8***> [#uses=1]
+	%val1 = alloca i32		; <i32*> [#uses=2]
+	%val2 = alloca i32		; <i32*> [#uses=15]
+	%andt = alloca i32		; <i32*> [#uses=2]
+	%ort = alloca i32		; <i32*> [#uses=2]
+	%xort = alloca i32		; <i32*> [#uses=2]
+	%old = alloca i32		; <i32*> [#uses=18]
+	%temp = alloca i32		; <i32*> [#uses=2]
+	store i32 %argc, i32* %argc.addr
+	store i8** %argv, i8*** %argv.addr
+	store i32 0, i32* %val1
+	store i32 31, i32* %val2
+	store i32 3855, i32* %andt
+	store i32 3855, i32* %ort
+	store i32 3855, i32* %xort
+	store i32 4, i32* %temp
+	%tmp = load i32* %temp
+  ; CHECK: ldrex
+  ; CHECK: add
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.add.i32.p0i32( i32* %val1, i32 %tmp )		; <i32>:0 [#uses=1]
+	store i32 %0, i32* %old
+  ; CHECK: ldrex
+  ; CHECK: sub
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.sub.i32.p0i32( i32* %val2, i32 30 )		; <i32>:1 [#uses=1]
+	store i32 %1, i32* %old
+  ; CHECK: ldrex
+  ; CHECK: add
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.add.i32.p0i32( i32* %val2, i32 1 )		; <i32>:2 [#uses=1]
+	store i32 %2, i32* %old
+  ; CHECK: ldrex
+  ; CHECK: sub
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.sub.i32.p0i32( i32* %val2, i32 1 )		; <i32>:3 [#uses=1]
+	store i32 %3, i32* %old
+  ; CHECK: ldrex
+  ; CHECK: and
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.and.i32.p0i32( i32* %andt, i32 4080 )		; <i32>:4 [#uses=1]
+	store i32 %4, i32* %old
+  ; CHECK: ldrex
+  ; CHECK: or
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.or.i32.p0i32( i32* %ort, i32 4080 )		; <i32>:5 [#uses=1]
+	store i32 %5, i32* %old
+  ; CHECK: ldrex
+  ; CHECK: eor
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.xor.i32.p0i32( i32* %xort, i32 4080 )		; <i32>:6 [#uses=1]
+	store i32 %6, i32* %old
+  ; CHECK: ldrex
+  ; CHECK: cmp
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.min.i32.p0i32( i32* %val2, i32 16 )		; <i32>:7 [#uses=1]
+	store i32 %7, i32* %old
+	%neg = sub i32 0, 1		; <i32> [#uses=1]
+  ; CHECK: ldrex
+  ; CHECK: cmp
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.min.i32.p0i32( i32* %val2, i32 %neg )		; <i32>:8 [#uses=1]
+	store i32 %8, i32* %old
+  ; CHECK: ldrex
+  ; CHECK: cmp
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.max.i32.p0i32( i32* %val2, i32 1 )		; <i32>:9 [#uses=1]
+	store i32 %9, i32* %old
+  ; CHECK: ldrex
+  ; CHECK: cmp
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.max.i32.p0i32( i32* %val2, i32 0 )		; <i32>:10 [#uses=1]
+	store i32 %10, i32* %old
+	ret void
+}
+
+declare i32 @llvm.atomic.load.add.i32.p0i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.sub.i32.p0i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.and.i32.p0i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.or.i32.p0i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.xor.i32.p0i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.min.i32.p0i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.max.i32.p0i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.umax.i32.p0i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.umin.i32.p0i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.swap.i32.p0i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.cmp.swap.i32.p0i32(i32*, i32, i32) nounwind 
diff --git a/test/CodeGen/ARM/avoid-cpsr-rmw.ll b/test/CodeGen/ARM/avoid-cpsr-rmw.ll
new file mode 100644
index 0000000..d0c4f3a
--- /dev/null
+++ b/test/CodeGen/ARM/avoid-cpsr-rmw.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s
+; Avoid some 's' 16-bit instruction which partially update CPSR (and add false
+; dependency) when it isn't dependent on last CPSR defining instruction.
+; rdar://8928208
+
+define i32 @t(i32 %a, i32 %b, i32 %c, i32 %d) nounwind readnone {
+ entry:
+; CHECK: t:
+; CHECK: muls r2, r3, r2
+; CHECK-NEXT: mul  r0, r0, r1
+; CHECK-NEXT: muls r0, r2, r0
+  %0 = mul nsw i32 %a, %b
+  %1 = mul nsw i32 %c, %d
+  %2 = mul nsw i32 %0, %1
+  ret i32 %2
+}
diff --git a/test/CodeGen/ARM/bfi.ll b/test/CodeGen/ARM/bfi.ll
index 946db19..84f3813 100644
--- a/test/CodeGen/ARM/bfi.ll
+++ b/test/CodeGen/ARM/bfi.ll
@@ -31,8 +31,7 @@ define i32 @f3(i32 %A, i32 %B) nounwind {
 entry:
 ; CHECK: f3
 ; CHECK: lsr{{.*}} #7
-; CHECK: mov r0, r1
-; CHECK: bfi r0, r2, #7, #16
+; CHECK: bfi {{.*}}, #7, #16
   %and = and i32 %A, 8388480                      ; <i32> [#uses=1]
   %and2 = and i32 %B, -8388481                    ; <i32> [#uses=1]
   %or = or i32 %and2, %and                        ; <i32> [#uses=1]
@@ -42,8 +41,8 @@ entry:
 ; rdar://8752056
 define i32 @f4(i32 %a) nounwind {
 ; CHECK: f4
-; CHECK: movw r1, #3137
-; CHECK: bfi r1, r0, #15, #5
+; CHECK: movw [[R1:r[0-9]+]], #3137
+; CHECK: bfi [[R1]], {{r[0-9]+}}, #15, #5
   %1 = shl i32 %a, 15
   %ins7 = and i32 %1, 1015808
   %ins12 = or i32 %ins7, 3137
@@ -62,3 +61,16 @@ entry:
   %3 = or i32 %2, %0
   ret i32 %3
 }
+
+; rdar://9609030
+define i32 @f6(i32 %a, i32 %b) nounwind readnone {
+entry:
+; CHECK: f6:
+; CHECK-NOT: bic
+; CHECK: bfi r0, r1, #8, #9
+  %and = and i32 %a, -130817
+  %and2 = shl i32 %b, 8
+  %shl = and i32 %and2, 130816
+  %or = or i32 %shl, %and
+  ret i32 %or
+}
diff --git a/test/CodeGen/ARM/call-tc.ll b/test/CodeGen/ARM/call-tc.ll
index 4dc37aa..c460f7a 100644
--- a/test/CodeGen/ARM/call-tc.ll
+++ b/test/CodeGen/ARM/call-tc.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=armv6-apple-darwin -mattr=+vfp2 -arm-tail-calls | FileCheck %s -check-prefix=CHECKV6
 ; RUN: llc < %s -mtriple=armv6-linux-gnueabi -relocation-model=pic -mattr=+vfp2 -arm-tail-calls | FileCheck %s -check-prefix=CHECKELF
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -arm-tail-calls | FileCheck %s -check-prefix=CHECKT2
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -arm-tail-calls | FileCheck %s -check-prefix=CHECKT2D
 
 @t = weak global i32 ()* null           ; <i32 ()**> [#uses=1]
 
@@ -16,6 +16,10 @@ define void @t1() {
 define void @t2() {
 ; CHECKV6: t2:
 ; CHECKV6: bx r0 @ TAILCALL
+; CHECKT2D: t2:
+; CHECKT2D: ldr
+; CHECKT2D-NEXT: ldr
+; CHECKT2D-NEXT: bx r0 @ TAILCALL
         %tmp = load i32 ()** @t         ; <i32 ()*> [#uses=1]
         %tmp.upgrd.2 = tail call i32 %tmp( )            ; <i32> [#uses=0]
         ret void
@@ -26,6 +30,9 @@ define void @t3() {
 ; CHECKV6: b _t2  @ TAILCALL
 ; CHECKELF: t3:
 ; CHECKELF: b t2(PLT) @ TAILCALL
+; CHECKT2D: t3:
+; CHECKT2D: b.w _t2  @ TAILCALL
+
         tail call void @t2( )            ; <i32> [#uses=0]
         ret void
 }
@@ -71,10 +78,10 @@ declare void @foo() nounwind
 
 define void @t7() nounwind {
 entry:
-; CHECKT2: t7:
-; CHECKT2: blxeq _foo
-; CHECKT2-NEXT: pop.w
-; CHECKT2-NEXT: b _foo
+; CHECKT2D: t7:
+; CHECKT2D: blxeq _foo
+; CHECKT2D-NEXT: pop.w
+; CHECKT2D-NEXT: b.w _foo
   br i1 undef, label %bb, label %bb1.lr.ph
 
 bb1.lr.ph:
diff --git a/test/CodeGen/ARM/carry.ll b/test/CodeGen/ARM/carry.ll
index a6a7ed6..9b90408 100644
--- a/test/CodeGen/ARM/carry.ll
+++ b/test/CodeGen/ARM/carry.ll
@@ -19,3 +19,20 @@ entry:
 	%tmp2 = sub i64 %tmp1, %b
 	ret i64 %tmp2
 }
+
+; add with live carry
+define i64 @f3(i32 %al, i32 %bl) {
+; CHECK: f3:
+; CHECK: adds r
+; CHECK: adcs r
+; CHECK: adc r
+entry:
+        ; unsigned wide add
+        %aw = zext i32 %al to i64
+        %bw = zext i32 %bl to i64
+        %cw = add i64 %aw, %bw
+        ; ch == carry bit
+        %ch = lshr i64 %cw, 32
+	%dw = add i64 %ch, %bw
+	ret i64 %dw
+}
diff --git a/test/CodeGen/ARM/crash-greedy.ll b/test/CodeGen/ARM/crash-greedy.ll
index 3a94110..8a865e2 100644
--- a/test/CodeGen/ARM/crash-greedy.ll
+++ b/test/CodeGen/ARM/crash-greedy.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -regalloc=greedy -mcpu=cortex-a8 -relocation-model=pic -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -regalloc=greedy -mcpu=cortex-a8 -relocation-model=pic -disable-fp-elim -verify-machineinstrs | FileCheck %s
 ;
 ; ARM tests that crash or fail with the greedy register allocator.
 
@@ -6,7 +6,7 @@ target triple = "thumbv7-apple-darwin"
 
 declare double @exp(double)
 
-; CHECK remat_subreg
+; CHECK: remat_subreg
 define void @remat_subreg(float* nocapture %x, i32* %y, i32 %n, i32 %z, float %c, float %lambda, float* nocapture %ret_f, float* nocapture %ret_df) nounwind {
 entry:
   %conv16 = fpext float %lambda to double
@@ -59,3 +59,26 @@ for.end:                                          ; preds = %cond.end
   ret void
 }
 
+; CHECK: insert_elem
+; This test has a sub-register copy with a kill flag:
+;   %vreg6:ssub_3<def> = COPY %vreg6:ssub_2<kill>; QPR_VFP2:%vreg6
+; The rewriter must do something sensible with that, or the scavenger crashes.
+define void @insert_elem() nounwind {
+entry:
+  br i1 undef, label %if.end251, label %if.then84
+
+if.then84:                                        ; preds = %entry
+  br i1 undef, label %if.end251, label %if.then195
+
+if.then195:                                       ; preds = %if.then84
+  %div = fdiv float 1.000000e+00, undef
+  %vecinit207 = insertelement <4 x float> undef, float %div, i32 1
+  %vecinit208 = insertelement <4 x float> %vecinit207, float 1.000000e+00, i32 2
+  %vecinit209 = insertelement <4 x float> %vecinit208, float 1.000000e+00, i32 3
+  %mul216 = fmul <4 x float> zeroinitializer, %vecinit209
+  store <4 x float> %mul216, <4 x float>* undef, align 16
+  br label %if.end251
+
+if.end251:                                        ; preds = %if.then195, %if.then84, %entry
+  ret void
+}
diff --git a/test/CodeGen/ARM/debug-info-branch-folding.ll b/test/CodeGen/ARM/debug-info-branch-folding.ll
new file mode 100644
index 0000000..9bdae43
--- /dev/null
+++ b/test/CodeGen/ARM/debug-info-branch-folding.ll
@@ -0,0 +1,94 @@
+; RUN: llc < %s - | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-macosx10.6.7"
+
+;CHECK: 	vadd.f32	q4, q8, q8
+;CHECK-NEXT: Ltmp
+;CHECK-NEXT: 	@DEBUG_VALUE: y <- Q4+0
+;CHECK-NEXT:    @DEBUG_VALUE: x <- Q4+0
+
+
+@.str = external constant [13 x i8]
+
+declare <4 x float> @test0001(float) nounwind readnone ssp
+
+define i32 @main(i32 %argc, i8** nocapture %argv) nounwind ssp {
+entry:
+  br label %for.body9
+
+for.body9:                                        ; preds = %for.body9, %entry
+  %add19 = fadd <4 x float> undef, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, !dbg !39
+  tail call void @llvm.dbg.value(metadata !{<4 x float> %add19}, i64 0, metadata !27), !dbg !39
+  %add20 = fadd <4 x float> undef, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, !dbg !39
+  tail call void @llvm.dbg.value(metadata !{<4 x float> %add20}, i64 0, metadata !28), !dbg !39
+  br i1 undef, label %for.end54, label %for.body9, !dbg !44
+
+for.end54:                                        ; preds = %for.body9
+  %tmp115 = extractelement <4 x float> %add19, i32 1
+  %conv6.i75 = fpext float %tmp115 to double, !dbg !45
+  %call.i82 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str, i32 0, i32 0), double undef, double %conv6.i75, double undef, double undef) nounwind, !dbg !45
+  %tmp116 = extractelement <4 x float> %add20, i32 1
+  %conv6.i76 = fpext float %tmp116 to double, !dbg !45
+  %call.i83 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str, i32 0, i32 0), double undef, double %conv6.i76, double undef, double undef) nounwind, !dbg !45
+  ret i32 0, !dbg !49
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.sp = !{!0, !10, !14}
+!llvm.dbg.lv.test0001 = !{!18}
+!llvm.dbg.lv.main = !{!19, !20, !24, !26, !27, !28, !29}
+!llvm.dbg.lv.printFV = !{!30}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata !"test0001", metadata !"test0001", metadata !"", metadata !1, i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, <4 x float> (float)* @test0001, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589865, metadata !"build2.c", metadata !"/private/tmp", metadata !2} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 589841, i32 0, i32 12, metadata !"build2.c", metadata !"/private/tmp", metadata !"clang version 3.0 (trunk 129915)", i1 true, i1 true, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 589846, metadata !2, metadata !"v4f32", metadata !1, i32 14, i64 0, i64 0, i64 0, i32 0, metadata !6} ; [ DW_TAG_typedef ]
+!6 = metadata !{i32 590083, metadata !2, metadata !"", metadata !2, i32 0, i64 128, i64 128, i32 0, i32 0, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_vector_type ]
+!7 = metadata !{i32 589860, metadata !2, metadata !"float", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
+!8 = metadata !{metadata !9}
+!9 = metadata !{i32 589857, i64 0, i64 3}         ; [ DW_TAG_subrange_type ]
+!10 = metadata !{i32 589870, i32 0, metadata !1, metadata !"main", metadata !"main", metadata !"", metadata !1, i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i8**)* @main, null} ; [ DW_TAG_subprogram ]
+!11 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!12 = metadata !{metadata !13}
+!13 = metadata !{i32 589860, metadata !2, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!14 = metadata !{i32 589870, i32 0, metadata !15, metadata !"printFV", metadata !"printFV", metadata !"", metadata !15, i32 41, metadata !16, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, null, null} ; [ DW_TAG_subprogram ]
+!15 = metadata !{i32 589865, metadata !"/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/UnitTests/Vector/helpers.h", metadata !"/private/tmp", metadata !2} ; [ DW_TAG_file_type ]
+!16 = metadata !{i32 589845, metadata !15, metadata !"", metadata !15, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !17, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!17 = metadata !{null}
+!18 = metadata !{i32 590081, metadata !0, metadata !"a", metadata !1, i32 16777219, metadata !7, i32 0} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{i32 590081, metadata !10, metadata !"argc", metadata !1, i32 16777275, metadata !13, i32 0} ; [ DW_TAG_arg_variable ]
+!20 = metadata !{i32 590081, metadata !10, metadata !"argv", metadata !1, i32 33554491, metadata !21, i32 0} ; [ DW_TAG_arg_variable ]
+!21 = metadata !{i32 589839, metadata !2, metadata !"", null, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !22} ; [ DW_TAG_pointer_type ]
+!22 = metadata !{i32 589839, metadata !2, metadata !"", null, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ]
+!23 = metadata !{i32 589860, metadata !2, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
+!24 = metadata !{i32 590080, metadata !25, metadata !"i", metadata !1, i32 60, metadata !13, i32 0} ; [ DW_TAG_auto_variable ]
+!25 = metadata !{i32 589835, metadata !10, i32 59, i32 33, metadata !1, i32 14} ; [ DW_TAG_lexical_block ]
+!26 = metadata !{i32 590080, metadata !25, metadata !"j", metadata !1, i32 60, metadata !13, i32 0} ; [ DW_TAG_auto_variable ]
+!27 = metadata !{i32 590080, metadata !25, metadata !"x", metadata !1, i32 61, metadata !5, i32 0} ; [ DW_TAG_auto_variable ]
+!28 = metadata !{i32 590080, metadata !25, metadata !"y", metadata !1, i32 62, metadata !5, i32 0} ; [ DW_TAG_auto_variable ]
+!29 = metadata !{i32 590080, metadata !25, metadata !"z", metadata !1, i32 63, metadata !5, i32 0} ; [ DW_TAG_auto_variable ]
+!30 = metadata !{i32 590081, metadata !14, metadata !"F", metadata !15, i32 16777257, metadata !31, i32 0} ; [ DW_TAG_arg_variable ]
+!31 = metadata !{i32 589839, metadata !2, metadata !"", null, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !32} ; [ DW_TAG_pointer_type ]
+!32 = metadata !{i32 589846, metadata !2, metadata !"FV", metadata !15, i32 25, i64 0, i64 0, i64 0, i32 0, metadata !33} ; [ DW_TAG_typedef ]
+!33 = metadata !{i32 589847, metadata !2, metadata !"", metadata !15, i32 22, i64 128, i64 128, i64 0, i32 0, i32 0, metadata !34, i32 0, i32 0} ; [ DW_TAG_union_type ]
+!34 = metadata !{metadata !35, metadata !37}
+!35 = metadata !{i32 589837, metadata !15, metadata !"V", metadata !15, i32 23, i64 128, i64 128, i64 0, i32 0, metadata !36} ; [ DW_TAG_member ]
+!36 = metadata !{i32 589846, metadata !2, metadata !"v4sf", metadata !15, i32 3, i64 0, i64 0, i64 0, i32 0, metadata !6} ; [ DW_TAG_typedef ]
+!37 = metadata !{i32 589837, metadata !15, metadata !"A", metadata !15, i32 24, i64 128, i64 32, i64 0, i32 0, metadata !38} ; [ DW_TAG_member ]
+!38 = metadata !{i32 589825, metadata !2, metadata !"", metadata !2, i32 0, i64 128, i64 32, i32 0, i32 0, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_array_type ]
+!39 = metadata !{i32 79, i32 7, metadata !40, null}
+!40 = metadata !{i32 589835, metadata !41, i32 75, i32 35, metadata !1, i32 18} ; [ DW_TAG_lexical_block ]
+!41 = metadata !{i32 589835, metadata !42, i32 75, i32 5, metadata !1, i32 17} ; [ DW_TAG_lexical_block ]
+!42 = metadata !{i32 589835, metadata !43, i32 71, i32 32, metadata !1, i32 16} ; [ DW_TAG_lexical_block ]
+!43 = metadata !{i32 589835, metadata !25, i32 71, i32 3, metadata !1, i32 15} ; [ DW_TAG_lexical_block ]
+!44 = metadata !{i32 75, i32 5, metadata !42, null}
+!45 = metadata !{i32 42, i32 2, metadata !46, metadata !48}
+!46 = metadata !{i32 589835, metadata !47, i32 42, i32 2, metadata !15, i32 20} ; [ DW_TAG_lexical_block ]
+!47 = metadata !{i32 589835, metadata !14, i32 41, i32 28, metadata !15, i32 19} ; [ DW_TAG_lexical_block ]
+!48 = metadata !{i32 95, i32 3, metadata !25, null}
+!49 = metadata !{i32 99, i32 3, metadata !25, null}
diff --git a/test/CodeGen/ARM/debug-info-d16-reg.ll b/test/CodeGen/ARM/debug-info-d16-reg.ll
new file mode 100644
index 0000000..8c9095e
--- /dev/null
+++ b/test/CodeGen/ARM/debug-info-d16-reg.ll
@@ -0,0 +1,105 @@
+; RUN: llc < %s - | FileCheck %s
+; Radar 9309221
+; Test dwarf reg no for d16
+;CHECK: DW_OP_regx
+;CHECK-NEXT: 272
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-darwin10"
+
+@.str = private unnamed_addr constant [11 x i8] c"%p %lf %c\0A\00", align 4
+@.str1 = private unnamed_addr constant [6 x i8] c"point\00", align 4
+
+define i32 @inlineprinter(i8* %ptr, double %val, i8 zeroext %c) nounwind optsize {
+entry:
+  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !19), !dbg !26
+  tail call void @llvm.dbg.value(metadata !{double %val}, i64 0, metadata !20), !dbg !26
+  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !21), !dbg !26
+  %0 = zext i8 %c to i32, !dbg !27
+  %1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %val, i32 %0) nounwind, !dbg !27
+  ret i32 0, !dbg !29
+}
+
+define i32 @printer(i8* %ptr, double %val, i8 zeroext %c) nounwind optsize noinline {
+entry:
+  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !16), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{double %val}, i64 0, metadata !17), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !18), !dbg !30
+  %0 = zext i8 %c to i32, !dbg !31
+  %1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %val, i32 %0) nounwind, !dbg !31
+  ret i32 0, !dbg !33
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+define i32 @main(i32 %argc, i8** nocapture %argv) nounwind optsize {
+entry:
+  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !22), !dbg !34
+  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !23), !dbg !34
+  %0 = sitofp i32 %argc to double, !dbg !35
+  %1 = fadd double %0, 5.555552e+05, !dbg !35
+  tail call void @llvm.dbg.value(metadata !{double %1}, i64 0, metadata !24), !dbg !35
+  %2 = tail call i32 @puts(i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0)) nounwind, !dbg !36
+  %3 = getelementptr inbounds i8* bitcast (i32 (i32, i8**)* @main to i8*), i32 %argc, !dbg !37
+  %4 = trunc i32 %argc to i8, !dbg !37
+  %5 = add i8 %4, 97, !dbg !37
+  tail call void @llvm.dbg.value(metadata !{i8* %3}, i64 0, metadata !19) nounwind, !dbg !38
+  tail call void @llvm.dbg.value(metadata !{double %1}, i64 0, metadata !20) nounwind, !dbg !38
+  tail call void @llvm.dbg.value(metadata !{i8 %5}, i64 0, metadata !21) nounwind, !dbg !38
+  %6 = zext i8 %5 to i32, !dbg !39
+  %7 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %3, double %1, i32 %6) nounwind, !dbg !39
+  %8 = tail call i32 @printer(i8* %3, double %1, i8 zeroext %5) nounwind, !dbg !40
+  ret i32 0, !dbg !41
+}
+
+declare i32 @puts(i8* nocapture) nounwind
+
+!llvm.dbg.sp = !{!0, !9, !10}
+!llvm.dbg.lv.printer = !{!16, !17, !18}
+!llvm.dbg.lv.inlineprinter = !{!19, !20, !21}
+!llvm.dbg.lv.main = !{!22, !23, !24}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata !"printer", metadata !"printer", metadata !"printer", metadata !1, i32 12, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i8*, double, i8)* @printer} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589865, metadata !"a.c", metadata !"/tmp/", metadata !2} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 589841, i32 0, i32 1, metadata !"/tmp/a.c", metadata !"/tmp", metadata !"(LLVM build 00)", i1 true, i1 true, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{metadata !5, metadata !6, metadata !7, metadata !8}
+!5 = metadata !{i32 589860, metadata !1, metadata !"int", metadata !1, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{i32 589839, metadata !1, metadata !"", metadata !1, i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
+!7 = metadata !{i32 589860, metadata !1, metadata !"double", metadata !1, i32 0, i64 64, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
+!8 = metadata !{i32 589860, metadata !1, metadata !"unsigned char", metadata !1, i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ]
+!9 = metadata !{i32 589870, i32 0, metadata !1, metadata !"inlineprinter", metadata !"inlineprinter", metadata !"inlineprinter", metadata !1, i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i8*, double, i8)* @inlineprinter} ; [ DW_TAG_subprogram ]
+!10 = metadata !{i32 589870, i32 0, metadata !1, metadata !"main", metadata !"main", metadata !"main", metadata !1, i32 18, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main} ; [ DW_TAG_subprogram ]
+!11 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!12 = metadata !{metadata !5, metadata !5, metadata !13}
+!13 = metadata !{i32 589839, metadata !1, metadata !"", metadata !1, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ]
+!14 = metadata !{i32 589839, metadata !1, metadata !"", metadata !1, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !15} ; [ DW_TAG_pointer_type ]
+!15 = metadata !{i32 589860, metadata !1, metadata !"char", metadata !1, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
+!16 = metadata !{i32 590081, metadata !0, metadata !"ptr", metadata !1, i32 11, metadata !6, i32 0} ; [ DW_TAG_arg_variable ]
+!17 = metadata !{i32 590081, metadata !0, metadata !"val", metadata !1, i32 11, metadata !7, i32 0} ; [ DW_TAG_arg_variable ]
+!18 = metadata !{i32 590081, metadata !0, metadata !"c", metadata !1, i32 11, metadata !8, i32 0} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{i32 590081, metadata !9, metadata !"ptr", metadata !1, i32 4, metadata !6, i32 0} ; [ DW_TAG_arg_variable ]
+!20 = metadata !{i32 590081, metadata !9, metadata !"val", metadata !1, i32 4, metadata !7, i32 0} ; [ DW_TAG_arg_variable ]
+!21 = metadata !{i32 590081, metadata !9, metadata !"c", metadata !1, i32 4, metadata !8, i32 0} ; [ DW_TAG_arg_variable ]
+!22 = metadata !{i32 590081, metadata !10, metadata !"argc", metadata !1, i32 17, metadata !5, i32 0} ; [ DW_TAG_arg_variable ]
+!23 = metadata !{i32 590081, metadata !10, metadata !"argv", metadata !1, i32 17, metadata !13, i32 0} ; [ DW_TAG_arg_variable ]
+!24 = metadata !{i32 590080, metadata !25, metadata !"dval", metadata !1, i32 19, metadata !7, i32 0} ; [ DW_TAG_auto_variable ]
+!25 = metadata !{i32 589835, metadata !10, i32 18, i32 0, metadata !1, i32 2} ; [ DW_TAG_lexical_block ]
+!26 = metadata !{i32 4, i32 0, metadata !9, null}
+!27 = metadata !{i32 6, i32 0, metadata !28, null}
+!28 = metadata !{i32 589835, metadata !9, i32 5, i32 0, metadata !1, i32 1} ; [ DW_TAG_lexical_block ]
+!29 = metadata !{i32 7, i32 0, metadata !28, null}
+!30 = metadata !{i32 11, i32 0, metadata !0, null}
+!31 = metadata !{i32 13, i32 0, metadata !32, null}
+!32 = metadata !{i32 589835, metadata !0, i32 12, i32 0, metadata !1, i32 0} ; [ DW_TAG_lexical_block ]
+!33 = metadata !{i32 14, i32 0, metadata !32, null}
+!34 = metadata !{i32 17, i32 0, metadata !10, null}
+!35 = metadata !{i32 19, i32 0, metadata !25, null}
+!36 = metadata !{i32 20, i32 0, metadata !25, null}
+!37 = metadata !{i32 21, i32 0, metadata !25, null}
+!38 = metadata !{i32 4, i32 0, metadata !9, metadata !37}
+!39 = metadata !{i32 6, i32 0, metadata !28, metadata !37}
+!40 = metadata !{i32 22, i32 0, metadata !25, null}
+!41 = metadata !{i32 23, i32 0, metadata !25, null}
diff --git a/test/CodeGen/ARM/debug-info-qreg.ll b/test/CodeGen/ARM/debug-info-qreg.ll
new file mode 100644
index 0000000..e83a83d
--- /dev/null
+++ b/test/CodeGen/ARM/debug-info-qreg.ll
@@ -0,0 +1,94 @@
+; RUN: llc < %s - | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-macosx10.6.7"
+
+;CHECK: DW_OP_regx for Q register: D1
+;CHECK-NEXT: byte
+;CHECK-NEXT: byte
+;CHECK-NEXT: DW_OP_piece 8
+;CHECK-NEXT: byte   8
+;CHECK-NEXT: DW_OP_regx for Q register: D2
+;CHECK-NEXT: byte
+;CHECK-NEXT: byte
+;CHECK-NEXT: DW_OP_piece 8
+;CHECK-NEXT: byte   8
+
+@.str = external constant [13 x i8]
+
+declare <4 x float> @test0001(float) nounwind readnone ssp
+
+define i32 @main(i32 %argc, i8** nocapture %argv) nounwind ssp {
+entry:
+  br label %for.body9
+
+for.body9:                                        ; preds = %for.body9, %entry
+  %add19 = fadd <4 x float> undef, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, !dbg !39
+  br i1 undef, label %for.end54, label %for.body9, !dbg !44
+
+for.end54:                                        ; preds = %for.body9
+  tail call void @llvm.dbg.value(metadata !{<4 x float> %add19}, i64 0, metadata !27), !dbg !39
+  %tmp115 = extractelement <4 x float> %add19, i32 1
+  %conv6.i75 = fpext float %tmp115 to double, !dbg !45
+  %call.i82 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str, i32 0, i32 0), double undef, double %conv6.i75, double undef, double undef) nounwind, !dbg !45
+  ret i32 0, !dbg !49
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.sp = !{!0, !10, !14}
+!llvm.dbg.lv.test0001 = !{!18}
+!llvm.dbg.lv.main = !{!19, !20, !24, !26, !27, !28, !29}
+!llvm.dbg.lv.printFV = !{!30}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata !"test0001", metadata !"test0001", metadata !"", metadata !1, i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, <4 x float> (float)* @test0001, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589865, metadata !"build2.c", metadata !"/private/tmp", metadata !2} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 589841, i32 0, i32 12, metadata !"build2.c", metadata !"/private/tmp", metadata !"clang version 3.0 (trunk 129915)", i1 true, i1 true, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 589846, metadata !2, metadata !"v4f32", metadata !1, i32 14, i64 0, i64 0, i64 0, i32 0, metadata !6} ; [ DW_TAG_typedef ]
+!6 = metadata !{i32 590083, metadata !2, metadata !"", metadata !2, i32 0, i64 128, i64 128, i32 0, i32 0, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_vector_type ]
+!7 = metadata !{i32 589860, metadata !2, metadata !"float", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
+!8 = metadata !{metadata !9}
+!9 = metadata !{i32 589857, i64 0, i64 3}         ; [ DW_TAG_subrange_type ]
+!10 = metadata !{i32 589870, i32 0, metadata !1, metadata !"main", metadata !"main", metadata !"", metadata !1, i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i8**)* @main, null} ; [ DW_TAG_subprogram ]
+!11 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!12 = metadata !{metadata !13}
+!13 = metadata !{i32 589860, metadata !2, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!14 = metadata !{i32 589870, i32 0, metadata !15, metadata !"printFV", metadata !"printFV", metadata !"", metadata !15, i32 41, metadata !16, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, null, null} ; [ DW_TAG_subprogram ]
+!15 = metadata !{i32 589865, metadata !"/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/UnitTests/Vector/helpers.h", metadata !"/private/tmp", metadata !2} ; [ DW_TAG_file_type ]
+!16 = metadata !{i32 589845, metadata !15, metadata !"", metadata !15, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !17, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!17 = metadata !{null}
+!18 = metadata !{i32 590081, metadata !0, metadata !"a", metadata !1, i32 16777219, metadata !7, i32 0} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{i32 590081, metadata !10, metadata !"argc", metadata !1, i32 16777275, metadata !13, i32 0} ; [ DW_TAG_arg_variable ]
+!20 = metadata !{i32 590081, metadata !10, metadata !"argv", metadata !1, i32 33554491, metadata !21, i32 0} ; [ DW_TAG_arg_variable ]
+!21 = metadata !{i32 589839, metadata !2, metadata !"", null, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !22} ; [ DW_TAG_pointer_type ]
+!22 = metadata !{i32 589839, metadata !2, metadata !"", null, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ]
+!23 = metadata !{i32 589860, metadata !2, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
+!24 = metadata !{i32 590080, metadata !25, metadata !"i", metadata !1, i32 60, metadata !13, i32 0} ; [ DW_TAG_auto_variable ]
+!25 = metadata !{i32 589835, metadata !10, i32 59, i32 33, metadata !1, i32 14} ; [ DW_TAG_lexical_block ]
+!26 = metadata !{i32 590080, metadata !25, metadata !"j", metadata !1, i32 60, metadata !13, i32 0} ; [ DW_TAG_auto_variable ]
+!27 = metadata !{i32 590080, metadata !25, metadata !"x", metadata !1, i32 61, metadata !5, i32 0} ; [ DW_TAG_auto_variable ]
+!28 = metadata !{i32 590080, metadata !25, metadata !"y", metadata !1, i32 62, metadata !5, i32 0} ; [ DW_TAG_auto_variable ]
+!29 = metadata !{i32 590080, metadata !25, metadata !"z", metadata !1, i32 63, metadata !5, i32 0} ; [ DW_TAG_auto_variable ]
+!30 = metadata !{i32 590081, metadata !14, metadata !"F", metadata !15, i32 16777257, metadata !31, i32 0} ; [ DW_TAG_arg_variable ]
+!31 = metadata !{i32 589839, metadata !2, metadata !"", null, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !32} ; [ DW_TAG_pointer_type ]
+!32 = metadata !{i32 589846, metadata !2, metadata !"FV", metadata !15, i32 25, i64 0, i64 0, i64 0, i32 0, metadata !33} ; [ DW_TAG_typedef ]
+!33 = metadata !{i32 589847, metadata !2, metadata !"", metadata !15, i32 22, i64 128, i64 128, i64 0, i32 0, i32 0, metadata !34, i32 0, i32 0} ; [ DW_TAG_union_type ]
+!34 = metadata !{metadata !35, metadata !37}
+!35 = metadata !{i32 589837, metadata !15, metadata !"V", metadata !15, i32 23, i64 128, i64 128, i64 0, i32 0, metadata !36} ; [ DW_TAG_member ]
+!36 = metadata !{i32 589846, metadata !2, metadata !"v4sf", metadata !15, i32 3, i64 0, i64 0, i64 0, i32 0, metadata !6} ; [ DW_TAG_typedef ]
+!37 = metadata !{i32 589837, metadata !15, metadata !"A", metadata !15, i32 24, i64 128, i64 32, i64 0, i32 0, metadata !38} ; [ DW_TAG_member ]
+!38 = metadata !{i32 589825, metadata !2, metadata !"", metadata !2, i32 0, i64 128, i64 32, i32 0, i32 0, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_array_type ]
+!39 = metadata !{i32 79, i32 7, metadata !40, null}
+!40 = metadata !{i32 589835, metadata !41, i32 75, i32 35, metadata !1, i32 18} ; [ DW_TAG_lexical_block ]
+!41 = metadata !{i32 589835, metadata !42, i32 75, i32 5, metadata !1, i32 17} ; [ DW_TAG_lexical_block ]
+!42 = metadata !{i32 589835, metadata !43, i32 71, i32 32, metadata !1, i32 16} ; [ DW_TAG_lexical_block ]
+!43 = metadata !{i32 589835, metadata !25, i32 71, i32 3, metadata !1, i32 15} ; [ DW_TAG_lexical_block ]
+!44 = metadata !{i32 75, i32 5, metadata !42, null}
+!45 = metadata !{i32 42, i32 2, metadata !46, metadata !48}
+!46 = metadata !{i32 589835, metadata !47, i32 42, i32 2, metadata !15, i32 20} ; [ DW_TAG_lexical_block ]
+!47 = metadata !{i32 589835, metadata !14, i32 41, i32 28, metadata !15, i32 19} ; [ DW_TAG_lexical_block ]
+!48 = metadata !{i32 95, i32 3, metadata !25, null}
+!49 = metadata !{i32 99, i32 3, metadata !25, null}
diff --git a/test/CodeGen/ARM/debug-info-s16-reg.ll b/test/CodeGen/ARM/debug-info-s16-reg.ll
new file mode 100644
index 0000000..548c9bd
--- /dev/null
+++ b/test/CodeGen/ARM/debug-info-s16-reg.ll
@@ -0,0 +1,116 @@
+; RUN: llc < %s - | FileCheck %s
+; Radar 9309221
+; Test dwarf reg no for s16
+;CHECK: DW_OP_regx for S register
+;CHECK-NEXT: byte
+;CHECK-NEXT: byte
+;CHECK-NEXT: DW_OP_bit_piece 32 0
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-macosx10.6.7"
+
+@.str = private unnamed_addr constant [11 x i8] c"%p %lf %c\0A\00"
+@.str1 = private unnamed_addr constant [6 x i8] c"point\00"
+
+define i32 @inlineprinter(i8* %ptr, float %val, i8 zeroext %c) nounwind optsize ssp {
+entry:
+  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !8), !dbg !24
+  tail call void @llvm.dbg.value(metadata !{float %val}, i64 0, metadata !10), !dbg !25
+  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !12), !dbg !26
+  %conv = fpext float %val to double, !dbg !27
+  %conv3 = zext i8 %c to i32, !dbg !27
+  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %conv, i32 %conv3) nounwind optsize, !dbg !27
+  ret i32 0, !dbg !29
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind optsize
+
+define i32 @printer(i8* %ptr, float %val, i8 zeroext %c) nounwind optsize noinline ssp {
+entry:
+  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !14), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{float %val}, i64 0, metadata !15), !dbg !31
+  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !16), !dbg !32
+  %conv = fpext float %val to double, !dbg !33
+  %conv3 = zext i8 %c to i32, !dbg !33
+  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %conv, i32 %conv3) nounwind optsize, !dbg !33
+  ret i32 0, !dbg !35
+}
+
+define i32 @main(i32 %argc, i8** nocapture %argv) nounwind optsize ssp {
+entry:
+  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !17), !dbg !36
+  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !18), !dbg !37
+  %conv = sitofp i32 %argc to double, !dbg !38
+  %add = fadd double %conv, 5.555552e+05, !dbg !38
+  %conv1 = fptrunc double %add to float, !dbg !38
+  tail call void @llvm.dbg.value(metadata !{float %conv1}, i64 0, metadata !22), !dbg !38
+  %call = tail call i32 @puts(i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0)) nounwind optsize, !dbg !39
+  %add.ptr = getelementptr i8* bitcast (i32 (i32, i8**)* @main to i8*), i32 %argc, !dbg !40
+  %add5 = add nsw i32 %argc, 97, !dbg !40
+  %conv6 = trunc i32 %add5 to i8, !dbg !40
+  tail call void @llvm.dbg.value(metadata !{i8* %add.ptr}, i64 0, metadata !8) nounwind, !dbg !41
+  tail call void @llvm.dbg.value(metadata !{float %conv1}, i64 0, metadata !10) nounwind, !dbg !42
+  tail call void @llvm.dbg.value(metadata !{i8 %conv6}, i64 0, metadata !12) nounwind, !dbg !43
+  %conv.i = fpext float %conv1 to double, !dbg !44
+  %conv3.i = and i32 %add5, 255, !dbg !44
+  %call.i = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %add.ptr, double %conv.i, i32 %conv3.i) nounwind optsize, !dbg !44
+  %call14 = tail call i32 @printer(i8* %add.ptr, float %conv1, i8 zeroext %conv6) optsize, !dbg !45
+  ret i32 0, !dbg !46
+}
+
+declare i32 @puts(i8* nocapture) nounwind optsize
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.sp = !{!0, !6, !7}
+!llvm.dbg.lv.inlineprinter = !{!8, !10, !12}
+!llvm.dbg.lv.printer = !{!14, !15, !16}
+!llvm.dbg.lv.main = !{!17, !18, !22}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata !"inlineprinter", metadata !"inlineprinter", metadata !"", metadata !1, i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i8*, float, i8)* @inlineprinter, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589865, metadata !"a.c", metadata !"/private/tmp", metadata !2} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 589841, i32 0, i32 12, metadata !"a.c", metadata !"/private/tmp", metadata !"clang version 3.0 (trunk 129915)", i1 true, i1 true, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 589860, metadata !2, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{i32 589870, i32 0, metadata !1, metadata !"printer", metadata !"printer", metadata !"", metadata !1, i32 12, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i8*, float, i8)* @printer, null} ; [ DW_TAG_subprogram ]
+!7 = metadata !{i32 589870, i32 0, metadata !1, metadata !"main", metadata !"main", metadata !"", metadata !1, i32 18, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i8**)* @main, null} ; [ DW_TAG_subprogram ]
+!8 = metadata !{i32 590081, metadata !0, metadata !"ptr", metadata !1, i32 16777220, metadata !9, i32 0} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{i32 589839, metadata !2, metadata !"", null, i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
+!10 = metadata !{i32 590081, metadata !0, metadata !"val", metadata !1, i32 33554436, metadata !11, i32 0} ; [ DW_TAG_arg_variable ]
+!11 = metadata !{i32 589860, metadata !2, metadata !"float", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
+!12 = metadata !{i32 590081, metadata !0, metadata !"c", metadata !1, i32 50331652, metadata !13, i32 0} ; [ DW_TAG_arg_variable ]
+!13 = metadata !{i32 589860, metadata !2, metadata !"unsigned char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ]
+!14 = metadata !{i32 590081, metadata !6, metadata !"ptr", metadata !1, i32 16777227, metadata !9, i32 0} ; [ DW_TAG_arg_variable ]
+!15 = metadata !{i32 590081, metadata !6, metadata !"val", metadata !1, i32 33554443, metadata !11, i32 0} ; [ DW_TAG_arg_variable ]
+!16 = metadata !{i32 590081, metadata !6, metadata !"c", metadata !1, i32 50331659, metadata !13, i32 0} ; [ DW_TAG_arg_variable ]
+!17 = metadata !{i32 590081, metadata !7, metadata !"argc", metadata !1, i32 16777233, metadata !5, i32 0} ; [ DW_TAG_arg_variable ]
+!18 = metadata !{i32 590081, metadata !7, metadata !"argv", metadata !1, i32 33554449, metadata !19, i32 0} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{i32 589839, metadata !2, metadata !"", null, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !20} ; [ DW_TAG_pointer_type ]
+!20 = metadata !{i32 589839, metadata !2, metadata !"", null, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !21} ; [ DW_TAG_pointer_type ]
+!21 = metadata !{i32 589860, metadata !2, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
+!22 = metadata !{i32 590080, metadata !23, metadata !"dval", metadata !1, i32 19, metadata !11, i32 0} ; [ DW_TAG_auto_variable ]
+!23 = metadata !{i32 589835, metadata !7, i32 18, i32 1, metadata !1, i32 2} ; [ DW_TAG_lexical_block ]
+!24 = metadata !{i32 4, i32 22, metadata !0, null}
+!25 = metadata !{i32 4, i32 33, metadata !0, null}
+!26 = metadata !{i32 4, i32 52, metadata !0, null}
+!27 = metadata !{i32 6, i32 3, metadata !28, null}
+!28 = metadata !{i32 589835, metadata !0, i32 5, i32 1, metadata !1, i32 0} ; [ DW_TAG_lexical_block ]
+!29 = metadata !{i32 7, i32 3, metadata !28, null}
+!30 = metadata !{i32 11, i32 42, metadata !6, null}
+!31 = metadata !{i32 11, i32 53, metadata !6, null}
+!32 = metadata !{i32 11, i32 72, metadata !6, null}
+!33 = metadata !{i32 13, i32 3, metadata !34, null}
+!34 = metadata !{i32 589835, metadata !6, i32 12, i32 1, metadata !1, i32 1} ; [ DW_TAG_lexical_block ]
+!35 = metadata !{i32 14, i32 3, metadata !34, null}
+!36 = metadata !{i32 17, i32 15, metadata !7, null}
+!37 = metadata !{i32 17, i32 28, metadata !7, null}
+!38 = metadata !{i32 19, i32 31, metadata !23, null}
+!39 = metadata !{i32 20, i32 3, metadata !23, null}
+!40 = metadata !{i32 21, i32 3, metadata !23, null}
+!41 = metadata !{i32 4, i32 22, metadata !0, metadata !40}
+!42 = metadata !{i32 4, i32 33, metadata !0, metadata !40}
+!43 = metadata !{i32 4, i32 52, metadata !0, metadata !40}
+!44 = metadata !{i32 6, i32 3, metadata !28, metadata !40}
+!45 = metadata !{i32 22, i32 3, metadata !23, null}
+!46 = metadata !{i32 23, i32 1, metadata !23, null}
diff --git a/test/CodeGen/ARM/debug-info-sreg2.ll b/test/CodeGen/ARM/debug-info-sreg2.ll
new file mode 100644
index 0000000..16aeab3
--- /dev/null
+++ b/test/CodeGen/ARM/debug-info-sreg2.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s - | FileCheck %s
+; Radar 9376013
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-macosx10.6.7"
+
+;CHECK: Ldebug_loc0:
+;CHECK-NEXT:        .long   Ltmp1
+;CHECK-NEXT:        .long   Ltmp3
+;CHECK-NEXT: Lset9 = Ltmp10-Ltmp9                    @ Loc expr size
+;CHECK-NEXT:        .short  Lset9
+;CHECK-NEXT: Ltmp9:
+;CHECK-NEXT:        .byte   144                     @ DW_OP_regx for S register
+
+define void @_Z3foov() optsize ssp {
+entry:
+  %call = tail call float @_Z3barv() optsize, !dbg !11
+  tail call void @llvm.dbg.value(metadata !{float %call}, i64 0, metadata !5), !dbg !11
+  %call16 = tail call float @_Z2f2v() optsize, !dbg !12
+  %cmp7 = fcmp olt float %call, %call16, !dbg !12
+  br i1 %cmp7, label %for.body, label %for.end, !dbg !12
+
+for.body:                                         ; preds = %entry, %for.body
+  %k.08 = phi float [ %inc, %for.body ], [ %call, %entry ]
+  %call4 = tail call float @_Z2f3f(float %k.08) optsize, !dbg !13
+  %inc = fadd float %k.08, 1.000000e+00, !dbg !14
+  %call1 = tail call float @_Z2f2v() optsize, !dbg !12
+  %cmp = fcmp olt float %inc, %call1, !dbg !12
+  br i1 %cmp, label %for.body, label %for.end, !dbg !12
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void, !dbg !15
+}
+
+declare float @_Z3barv() optsize
+
+declare float @_Z2f2v() optsize
+
+declare float @_Z2f3f(float) optsize
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+!llvm.dbg.sp = !{!1}
+!llvm.dbg.lv._Z3foov = !{!5, !8}
+
+!0 = metadata !{i32 589841, i32 0, i32 4, metadata !"k.cc", metadata !"/private/tmp", metadata !"clang version 3.0 (trunk 130845)", i1 true, i1 true, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{i32 589870, i32 0, metadata !2, metadata !"foo", metadata !"foo", metadata !"_Z3foov", metadata !2, i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, void ()* @_Z3foov, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{i32 589865, metadata !"k.cc", metadata !"/private/tmp", metadata !0} ; [ DW_TAG_file_type ]
+!3 = metadata !{i32 589845, metadata !2, metadata !"", metadata !2, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{null}
+!5 = metadata !{i32 590080, metadata !6, metadata !"k", metadata !2, i32 6, metadata !7, i32 0} ; [ DW_TAG_auto_variable ]
+!6 = metadata !{i32 589835, metadata !1, i32 5, i32 12, metadata !2, i32 0} ; [ DW_TAG_lexical_block ]
+!7 = metadata !{i32 589860, metadata !0, metadata !"float", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
+!8 = metadata !{i32 590080, metadata !9, metadata !"y", metadata !2, i32 8, metadata !7, i32 0} ; [ DW_TAG_auto_variable ]
+!9 = metadata !{i32 589835, metadata !10, i32 7, i32 25, metadata !2, i32 2} ; [ DW_TAG_lexical_block ]
+!10 = metadata !{i32 589835, metadata !6, i32 7, i32 3, metadata !2, i32 1} ; [ DW_TAG_lexical_block ]
+!11 = metadata !{i32 6, i32 18, metadata !6, null}
+!12 = metadata !{i32 7, i32 3, metadata !6, null}
+!13 = metadata !{i32 8, i32 20, metadata !9, null}
+!14 = metadata !{i32 7, i32 20, metadata !10, null}
+!15 = metadata !{i32 10, i32 1, metadata !6, null}
diff --git a/test/CodeGen/ARM/divmod.ll b/test/CodeGen/ARM/divmod.ll
deleted file mode 100644
index 04b8fbf..0000000
--- a/test/CodeGen/ARM/divmod.ll
+++ /dev/null
@@ -1,27 +0,0 @@
-; RUN: llc < %s -mtriple=arm-apple-darwin -use-divmod-libcall | FileCheck %s
-
-define void @foo(i32 %x, i32 %y, i32* nocapture %P) nounwind ssp {
-entry:
-; CHECK: foo:
-; CHECK: bl ___divmodsi4
-; CHECK-NOT: bl ___divmodsi4
-  %div = sdiv i32 %x, %y
-  store i32 %div, i32* %P, align 4
-  %rem = srem i32 %x, %y
-  %arrayidx6 = getelementptr inbounds i32* %P, i32 1
-  store i32 %rem, i32* %arrayidx6, align 4
-  ret void
-}
-
-define void @bar(i32 %x, i32 %y, i32* nocapture %P) nounwind ssp {
-entry:
-; CHECK: bar:
-; CHECK: bl ___udivmodsi4
-; CHECK-NOT: bl ___udivmodsi4
-  %div = udiv i32 %x, %y
-  store i32 %div, i32* %P, align 4
-  %rem = urem i32 %x, %y
-  %arrayidx6 = getelementptr inbounds i32* %P, i32 1
-  store i32 %rem, i32* %arrayidx6, align 4
-  ret void
-}
diff --git a/test/CodeGen/ARM/eh-resume-darwin.ll b/test/CodeGen/ARM/eh-resume-darwin.ll
new file mode 100644
index 0000000..e475508
--- /dev/null
+++ b/test/CodeGen/ARM/eh-resume-darwin.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -march=arm | FileCheck %s
+target triple = "armv6-apple-macosx10.6"
+
+declare void @func()
+
+declare i8* @llvm.eh.exception() nounwind readonly
+
+declare i32 @llvm.eh.selector(i8*, i8*, ...) nounwind
+
+declare void @llvm.eh.resume(i8*, i32)
+
+declare i32 @__gxx_personality_sj0(...)
+
+define void @test0() {
+entry:
+  invoke void @func()
+    to label %cont unwind label %lpad
+
+cont:
+  ret void
+
+lpad:
+  %exn = call i8* @llvm.eh.exception()
+  %sel = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn, i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*), i32 0)
+  call void @llvm.eh.resume(i8* %exn, i32 %sel) noreturn
+  unreachable
+}
+
+; CHECK: __Unwind_SjLj_Resume
diff --git a/test/CodeGen/ARM/fabss.ll b/test/CodeGen/ARM/fabss.ll
index f03282b..51efe51 100644
--- a/test/CodeGen/ARM/fabss.ll
+++ b/test/CodeGen/ARM/fabss.ll
@@ -24,4 +24,4 @@ declare float @fabsf(float)
 ; CORTEXA8: test:
 ; CORTEXA8: 	vabs.f32	d1, d1
 ; CORTEXA9: test:
-; CORTEXA9: 	vabs.f32	s1, s1
+; CORTEXA9: 	vabs.f32	s{{.}}, s{{.}}
diff --git a/test/CodeGen/ARM/fadds.ll b/test/CodeGen/ARM/fadds.ll
index 749690e..e35103c 100644
--- a/test/CodeGen/ARM/fadds.ll
+++ b/test/CodeGen/ARM/fadds.ll
@@ -20,4 +20,4 @@ entry:
 ; CORTEXA8: test:
 ; CORTEXA8: 	vadd.f32	d0, d1, d0
 ; CORTEXA9: test:
-; CORTEXA9: 	vadd.f32	s0, s1, s0
+; CORTEXA9: 	vadd.f32	s{{.}}, s{{.}}, s{{.}}
diff --git a/test/CodeGen/ARM/fast-isel-crash2.ll b/test/CodeGen/ARM/fast-isel-crash2.ll
new file mode 100644
index 0000000..aa06299
--- /dev/null
+++ b/test/CodeGen/ARM/fast-isel-crash2.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -O0 -mtriple=thumbv7-apple-darwin
+; rdar://9515076
+; (Make sure this doesn't crash.)
+
+define i32 @test(i32 %i) {
+  %t = trunc i32 %i to i4
+  %r = sext i4 %t to i32
+  ret i32 %r
+}
diff --git a/test/CodeGen/ARM/fast-isel-redefinition.ll b/test/CodeGen/ARM/fast-isel-redefinition.ll
new file mode 100644
index 0000000..08dcc64
--- /dev/null
+++ b/test/CodeGen/ARM/fast-isel-redefinition.ll
@@ -0,0 +1,11 @@
+; RUN: llc -O0 -regalloc=linearscan < %s
+; This isn't exactly a useful set of command-line options, but check that it
+; doesn't crash.  (It was crashing because a register was getting redefined.)
+
+target triple = "thumbv7-apple-macosx10.6.7"
+
+define i32 @f(i32* %x) nounwind ssp {
+  %y = getelementptr inbounds i32* %x, i32 5000
+  %tmp103 = load i32* %y, align 4
+  ret i32 %tmp103
+}
diff --git a/test/CodeGen/ARM/fast-isel-static.ll b/test/CodeGen/ARM/fast-isel-static.ll
index 2d79674..a86e325 100644
--- a/test/CodeGen/ARM/fast-isel-static.ll
+++ b/test/CodeGen/ARM/fast-isel-static.ll
@@ -23,7 +23,7 @@ entry:
   %z = alloca float, align 4
   store float 0.000000e+00, float* %ztot, align 4
   store float 1.000000e+00, float* %z, align 4
-; CHECK-LONG: blx     r2
+; CHECK-LONG: blx     r
 ; CHECK-NORM: bl      _myadd
   call void @myadd(float* %ztot, float* %z)
   ret i32 0
diff --git a/test/CodeGen/ARM/fast-isel.ll b/test/CodeGen/ARM/fast-isel.ll
index dd806ec..499c97f 100644
--- a/test/CodeGen/ARM/fast-isel.ll
+++ b/test/CodeGen/ARM/fast-isel.ll
@@ -1,8 +1,7 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv7-apple-darwin
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=thumbv7-apple-darwin
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv7-apple-darwin | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=thumbv7-apple-darwin | FileCheck %s --check-prefix=THUMB
 
 ; Very basic fast-isel functionality.
-
 define i32 @add(i32 %a, i32 %b) nounwind {
 entry:
   %a.addr = alloca i32, align 4
@@ -13,4 +12,149 @@ entry:
   %tmp1 = load i32* %b.addr
   %add = add nsw i32 %tmp, %tmp1
   ret i32 %add
-}
-\ No newline at end of file
+}
+
+; Check truncate to bool
+define void @test1(i32 %tmp) nounwind {
+entry:
+%tobool = trunc i32 %tmp to i1
+br i1 %tobool, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+call void @test1(i32 0)
+br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+ret void
+; ARM: test1:
+; ARM: tst r0, #1
+; THUMB: test1:
+; THUMB: tst.w r0, #1
+}
+
+; Check some simple operations with immediates
+define void @test2(i32 %tmp, i32* %ptr) nounwind {
+; THUMB: test2:
+; ARM: test2:
+
+b1:
+  %a = add i32 %tmp, 4096
+  store i32 %a, i32* %ptr
+  br label %b2
+
+; THUMB: add.w {{.*}} #4096
+; ARM: add {{.*}} #1, #20
+
+b2:
+  %b = add i32 %tmp, 4095
+  store i32 %b, i32* %ptr
+  br label %b3
+; THUMB: addw {{.*}} #4095
+; ARM: movw {{.*}} #4095
+; ARM: add
+
+b3:
+  %c = or i32 %tmp, 4
+  store i32 %c, i32* %ptr
+  ret void
+
+; THUMB: orr {{.*}} #4
+; ARM: orr {{.*}} #4
+}
+
+define void @test3(i32 %tmp, i32* %ptr1, i16* %ptr2, i8* %ptr3) nounwind {
+; THUMB: test3:
+; ARM: test3:
+
+bb1:
+  %a1 = trunc i32 %tmp to i16
+  %a2 = trunc i16 %a1 to i8
+  %a3 = trunc i8 %a2 to i1
+  %a4 = zext i1 %a3 to i8
+  store i8 %a4, i8* %ptr3
+  %a5 = zext i8 %a4 to i16
+  store i16 %a5, i16* %ptr2
+  %a6 = zext i16 %a5 to i32
+  store i32 %a6, i32* %ptr1
+  br label %bb2
+
+; THUMB: and
+; THUMB: strb
+; THUMB: uxtb
+; THUMB: strh
+; THUMB: uxth
+; ARM: and
+; ARM: strb
+; ARM: uxtb
+; ARM: strh
+; ARM: uxth
+
+bb2:
+  %b1 = trunc i32 %tmp to i16
+  %b2 = trunc i16 %b1 to i8
+  store i8 %b2, i8* %ptr3
+  %b3 = sext i8 %b2 to i16
+  store i16 %b3, i16* %ptr2
+  %b4 = sext i16 %b3 to i32
+  store i32 %b4, i32* %ptr1
+  br label %bb3
+
+; THUMB: strb
+; THUMB: sxtb
+; THUMB: strh
+; THUMB: sxth
+; ARM: strb
+; ARM: sxtb
+; ARM: strh
+; ARM: sxth
+
+bb3:
+  %c1 = load i8* %ptr3
+  %c2 = load i16* %ptr2
+  %c3 = load i32* %ptr1
+  %c4 = zext i8 %c1 to i32
+  %c5 = sext i16 %c2 to i32
+  %c6 = add i32 %c4, %c5
+  %c7 = sub i32 %c3, %c6
+  store i32 %c7, i32* %ptr1
+  ret void
+
+; THUMB: ldrb
+; THUMB: ldrh
+; THUMB: uxtb
+; THUMB: sxth
+; THUMB: add
+; THUMB: sub
+; ARM: ldrb
+; ARM: ldrh
+; ARM: uxtb
+; ARM: sxth
+; ARM: add
+; ARM: sub
+}
+
+; Check loads/stores with globals
+@test4g = external global i32
+
+define void @test4() {
+  %a = load i32* @test4g
+  %b = add i32 %a, 1
+  store i32 %b, i32* @test4g
+  ret void
+
+; THUMB: ldr.n r0, LCPI4_1
+; THUMB: ldr r0, [r0]
+; THUMB: ldr r0, [r0]
+; THUMB: adds r0, #1
+; THUMB: ldr.n r1, LCPI4_0
+; THUMB: ldr r1, [r1]
+; THUMB: str r0, [r1]
+
+; ARM: ldr r0, LCPI4_1
+; ARM: ldr r0, [r0]
+; ARM: ldr r0, [r0]
+; ARM: add r0, r0, #1
+; ARM: ldr r1, LCPI4_0
+; ARM: ldr r1, [r1]
+; ARM: str r0, [r1]
+}
diff --git a/test/CodeGen/ARM/fcopysign.ll b/test/CodeGen/ARM/fcopysign.ll
index 9e94e39..c4dbeb9 100644
--- a/test/CodeGen/ARM/fcopysign.ll
+++ b/test/CodeGen/ARM/fcopysign.ll
@@ -10,7 +10,7 @@ entry:
 
 ; HARD: test1:
 ; HARD: vmov.i32 [[REG1:(d[0-9]+)]], #0x80000000
-; HARD: vbsl [[REG1]], d2, d0
+; HARD: vbsl [[REG1]], d
   %0 = tail call float @copysignf(float %x, float %y) nounwind
   ret float %0
 }
@@ -44,15 +44,33 @@ entry:
 define i32 @test4() ssp {
 entry:
 ; SOFT: test4:
-; SOFT: vcvt.f32.f64 s0, 
-; SOFT: vmov.i32 [[REG4:(d[0-9]+)]], #0x80000000
-; SOFT: vbic [[REG5:(d[0-9]+)]], d0, [[REG4]]
-; SOFT: vorr d0, [[REG4]], [[REG5]]
+; SOFT: vmov.f64 [[REG4:(d[0-9]+)]], #1.000000e+00
+; This S-reg must be the first sub-reg of the last D-reg on vbsl.
+; SOFT: vcvt.f32.f64 {{s1?[02468]}}, [[REG4]]
+; SOFT: vshr.u64 [[REG4]], [[REG4]], #32
+; SOFT: vmov.i32 [[REG5:(d[0-9]+)]], #0x80000000
+; SOFT: vbsl [[REG5]], [[REG4]], {{d[0-9]+}}
   %call80 = tail call double @copysign(double 1.000000e+00, double undef)
   %conv81 = fptrunc double %call80 to float
   %tmp88 = bitcast float %conv81 to i32
   ret i32 %tmp88
 }
 
+; rdar://9287902
+define float @test5() nounwind {
+entry:
+; SOFT: test5:
+; SOFT: vmov.i32 [[REG6:(d[0-9]+)]], #0x80000000
+; SOFT: vmov [[REG7:(d[0-9]+)]], r0, r1
+; SOFT: vshr.u64 [[REG7]], [[REG7]], #32
+; SOFT: vbsl [[REG6]], [[REG7]], 
+  %0 = tail call double (...)* @bar() nounwind
+  %1 = fptrunc double %0 to float
+  %2 = tail call float @copysignf(float 5.000000e-01, float %1) nounwind readnone
+  %3 = fadd float %1, %2
+  ret float %3
+}
+
+declare double @bar(...)
 declare double @copysign(double, double) nounwind
 declare float @copysignf(float, float) nounwind
diff --git a/test/CodeGen/ARM/fdivs.ll b/test/CodeGen/ARM/fdivs.ll
index 0c31495..31c1ca9 100644
--- a/test/CodeGen/ARM/fdivs.ll
+++ b/test/CodeGen/ARM/fdivs.ll
@@ -20,4 +20,4 @@ entry:
 ; CORTEXA8: test:
 ; CORTEXA8: 	vdiv.f32	s0, s1, s0
 ; CORTEXA9: test:
-; CORTEXA9: 	vdiv.f32	s0, s1, s0
+; CORTEXA9: 	vdiv.f32	s{{.}}, s{{.}}, s{{.}}
diff --git a/test/CodeGen/ARM/fmacs.ll b/test/CodeGen/ARM/fmacs.ll
index fb83ef6..b63f609 100644
--- a/test/CodeGen/ARM/fmacs.ll
+++ b/test/CodeGen/ARM/fmacs.ll
@@ -1,6 +1,8 @@
 ; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
 ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NEON
 ; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
+; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=A9
+; RUN: llc < %s -mtriple=arm-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard | FileCheck %s -check-prefix=HARD
 
 define float @t1(float %acc, float %a, float %b) {
 entry:
@@ -49,3 +51,54 @@ entry:
         %1 = fadd float %0, %acc
 	ret float %1
 }
+
+; It's possible to make use of fp vmla / vmls on Cortex-A9.
+; rdar://8659675
+define void @t4(float %acc1, float %a, float %b, float %acc2, float %c, float* %P1, float* %P2) {
+entry:
+; A8: t4:
+; A8: vmul.f32
+; A8: vmul.f32
+; A8: vadd.f32
+; A8: vadd.f32
+
+; Two vmla with now RAW hazard
+; A9: t4:
+; A9: vmla.f32
+; A9: vmla.f32
+
+; HARD: t4:
+; HARD: vmla.f32 s0, s1, s2
+; HARD: vmla.f32 s3, s1, s4
+  %0 = fmul float %a, %b
+  %1 = fadd float %acc1, %0
+  %2 = fmul float %a, %c
+  %3 = fadd float %acc2, %2
+  store float %1, float* %P1
+  store float %3, float* %P2
+  ret void
+}
+
+define float @t5(float %a, float %b, float %c, float %d, float %e) {
+entry:
+; A8: t5:
+; A8: vmul.f32
+; A8: vmul.f32
+; A8: vadd.f32
+; A8: vadd.f32
+
+; A9: t5:
+; A9: vmla.f32
+; A9: vmul.f32
+; A9: vadd.f32
+
+; HARD: t5:
+; HARD: vmla.f32 s4, s0, s1
+; HARD: vmul.f32 s0, s2, s3
+; HARD: vadd.f32 s0, s4, s0
+  %0 = fmul float %a, %b
+  %1 = fadd float %e, %0
+  %2 = fmul float %c, %d
+  %3 = fadd float %1, %2
+  ret float %3
+}
diff --git a/test/CodeGen/ARM/fmuls.ll b/test/CodeGen/ARM/fmuls.ll
index ef4e3e5..bc118b8 100644
--- a/test/CodeGen/ARM/fmuls.ll
+++ b/test/CodeGen/ARM/fmuls.ll
@@ -20,4 +20,4 @@ entry:
 ; CORTEXA8: test:
 ; CORTEXA8: 	vmul.f32	d0, d1, d0
 ; CORTEXA9: test:
-; CORTEXA9: 	vmul.f32	s0, s1, s0
+; CORTEXA9: 	vmul.f32	s{{.}}, s{{.}}, s{{.}}
diff --git a/test/CodeGen/ARM/fnmscs.ll b/test/CodeGen/ARM/fnmscs.ll
index 9facf20..6081712 100644
--- a/test/CodeGen/ARM/fnmscs.ll
+++ b/test/CodeGen/ARM/fnmscs.ll
@@ -29,7 +29,7 @@ entry:
 ; NEON: vnmla.f32
 
 ; A8: t2:
-; A8: vnmul.f32 s{{[0123]}}, s{{[0123]}}, s{{[0123]}}
+; A8: vnmul.f32 s{{[01234]}}, s{{[01234]}}, s{{[01234]}}
 ; A8: vsub.f32 d{{[0-9]}}, d{{[0-9]}}, d{{[0-9]}}
 	%0 = fmul float %a, %b
 	%1 = fmul float -1.0, %0
diff --git a/test/CodeGen/ARM/fp-arg-shuffle.ll b/test/CodeGen/ARM/fp-arg-shuffle.ll
index 59303ac..ae02b79 100644
--- a/test/CodeGen/ARM/fp-arg-shuffle.ll
+++ b/test/CodeGen/ARM/fp-arg-shuffle.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=arm -mattr=+neon -float-abi=soft | FileCheck %s
 
 ; CHECK: function1
-; CHECK-NOT: vmov r
+; CHECK-NOT: vmov
 define double @function1(double %a, double %b, double %c, double %d, double %e, double %f) nounwind noinline ssp {
 entry:
   %call = tail call double @function2(double %f, double %e, double %d, double %c, double %b, double %a) nounwind
diff --git a/test/CodeGen/ARM/fp_convert.ll b/test/CodeGen/ARM/fp_convert.ll
index 1ef9f7f..86c06f1 100644
--- a/test/CodeGen/ARM/fp_convert.ll
+++ b/test/CodeGen/ARM/fp_convert.ll
@@ -5,7 +5,7 @@
 
 define i32 @test1(float %a, float %b) {
 ; VFP2: test1:
-; VFP2: vcvt.s32.f32 s0, s0
+; VFP2: vcvt.s32.f32 s{{.}}, s{{.}}
 ; NEON: test1:
 ; NEON: vcvt.s32.f32 d0, d0
 entry:
@@ -16,7 +16,7 @@ entry:
 
 define i32 @test2(float %a, float %b) {
 ; VFP2: test2:
-; VFP2: vcvt.u32.f32 s0, s0
+; VFP2: vcvt.u32.f32 s{{.}}, s{{.}}
 ; NEON: test2:
 ; NEON: vcvt.u32.f32 d0, d0
 entry:
@@ -27,7 +27,7 @@ entry:
 
 define float @test3(i32 %a, i32 %b) {
 ; VFP2: test3:
-; VFP2: vcvt.f32.u32 s0, s0
+; VFP2: vcvt.f32.u32 s{{.}}, s{{.}}
 ; NEON: test3:
 ; NEON: vcvt.f32.u32 d0, d0
 entry:
@@ -38,7 +38,7 @@ entry:
 
 define float @test4(i32 %a, i32 %b) {
 ; VFP2: test4:
-; VFP2: vcvt.f32.s32 s0, s0
+; VFP2: vcvt.f32.s32 s{{.}}, s{{.}}
 ; NEON: test4:
 ; NEON: vcvt.f32.s32 d0, d0
 entry:
diff --git a/test/CodeGen/ARM/indirectbr.ll b/test/CodeGen/ARM/indirectbr.ll
index 19dad3a..f0ab9dd 100644
--- a/test/CodeGen/ARM/indirectbr.ll
+++ b/test/CodeGen/ARM/indirectbr.ll
@@ -42,20 +42,23 @@ L3:                                               ; preds = %L4, %bb2
   br label %L2
 
 L2:                                               ; preds = %L3, %bb2
+; THUMB: muls
   %res.2 = phi i32 [ %res.1, %L3 ], [ 1, %bb2 ]   ; <i32> [#uses=1]
   %phitmp = mul i32 %res.2, 6                     ; <i32> [#uses=1]
   br label %L1
 
 L1:                                               ; preds = %L2, %bb2
   %res.3 = phi i32 [ %phitmp, %L2 ], [ 2, %bb2 ]  ; <i32> [#uses=1]
-; ARM: ldr r1, LCPI
-; ARM: add r1, pc, r1
-; ARM: str r1
-; THUMB: ldr.n r2, LCPI
-; THUMB: add r2, pc
-; THUMB: str r2
-; THUMB2: ldr.n r2, LCPI
-; THUMB2-NEXT: str r2
+; ARM: ldr [[R1:r[0-9]+]], LCPI
+; ARM: add [[R1b:r[0-9]+]], pc, [[R1]]
+; ARM: str [[R1b]]
+; THUMB: ldr.n
+; THUMB: add
+; THUMB: ldr.n [[R2:r[0-9]+]], LCPI
+; THUMB: add [[R2]], pc
+; THUMB: str [[R2]]
+; THUMB2: ldr.n [[R2:r[0-9]+]], LCPI
+; THUMB2-NEXT: str{{(.w)?}} [[R2]]
   store i8* blockaddress(@foo, %L5), i8** @nextaddr, align 4
   ret i32 %res.3
 }
diff --git a/test/CodeGen/ARM/inlineasm3.ll b/test/CodeGen/ARM/inlineasm3.ll
index 9f77ad1..58687b9 100644
--- a/test/CodeGen/ARM/inlineasm3.ll
+++ b/test/CodeGen/ARM/inlineasm3.ll
@@ -6,7 +6,7 @@
 define void @t() nounwind {
 entry:
 ; CHECK: vmov.I64 q15, #0
-; CHECK: vmov.32 d30[0], r0
+; CHECK: vmov.32 d30[0],
 ; CHECK: vmov q8, q15
   %tmp = alloca %struct.int32x4_t, align 16
   call void asm sideeffect "vmov.I64 q15, #0\0Avmov.32 d30[0], $1\0Avmov ${0:q}, q15\0A", "=*w,r,~{d31},~{d30}"(%struct.int32x4_t* %tmp, i32 8192) nounwind
@@ -23,3 +23,38 @@ entry:
   %asmtmp2 = tail call i32 asm sideeffect "vmov d30, $1\0Avmov.32 $0, d30[0]\0A", "=r,w,~{d30}"(<2 x i32> undef) nounwind
   ret void
 }
+
+; Radar 9306086
+
+%0 = type { <8 x i8>, <16 x i8>* }
+
+define hidden void @conv4_8_E() nounwind {
+entry:
+%asmtmp31 = call %0 asm "vld1.u8  {$0}, [$1, :128]!\0A", "=w,=r,1"(<16 x i8>* undef) nounwind
+unreachable
+}
+
+; Radar 9037836 & 9119939
+
+define i32 @t3() nounwind {
+entry:
+tail call void asm sideeffect "flds s15, $0 \0A", "^Uv|m,~{s15}"(float 1.000000e+00) nounwind
+ret i32 0
+}
+
+; Radar 9037836 & 9119939
+
+@k.2126 = internal unnamed_addr global float 1.000000e+00
+define i32 @t4() nounwind {
+entry:
+call void asm sideeffect "flds s15, $0 \0A", "*^Uv,~{s15}"(float* @k.2126) nounwind
+ret i32 0
+}
+
+; Radar 9037836 & 9119939
+
+define i32 @t5() nounwind {
+entry:
+call void asm sideeffect "flds s15, $0 \0A", "*^Uvm,~{s15}"(float* @k.2126) nounwind
+ret i32 0
+}
diff --git a/test/CodeGen/ARM/intrinsics.ll b/test/CodeGen/ARM/intrinsics.ll
new file mode 100644
index 0000000..54cc3e0
--- /dev/null
+++ b/test/CodeGen/ARM/intrinsics.ll
@@ -0,0 +1,39 @@
+; RUN: llc < %s -mtriple=armv7-eabi -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -march=thumb -mtriple=thumbv7-eabi -mcpu=cortex-a8 | FileCheck %s
+
+define void @coproc() nounwind {
+entry:
+  ; CHECK: mrc
+  %0 = tail call i32 @llvm.arm.mrc(i32 7, i32 1, i32 1, i32 1, i32 4) nounwind
+  ; CHECK: mcr
+  tail call void @llvm.arm.mcr(i32 7, i32 1, i32 %0, i32 1, i32 1, i32 4) nounwind
+  ; CHECK: mrc2
+  %1 = tail call i32 @llvm.arm.mrc2(i32 7, i32 1, i32 1, i32 1, i32 4) nounwind
+  ; CHECK: mcr2
+  tail call void @llvm.arm.mcr2(i32 7, i32 1, i32 %1, i32 1, i32 1, i32 4) nounwind
+  ; CHECK: mcrr
+  tail call void @llvm.arm.mcrr(i32 7, i32 1, i32 %0, i32 %1, i32 1) nounwind
+  ; CHECK: mcrr2
+  tail call void @llvm.arm.mcrr2(i32 7, i32 1, i32 %0, i32 %1, i32 1) nounwind
+  ; CHECK: cdp
+  tail call void @llvm.arm.cdp(i32 7, i32 3, i32 1, i32 1, i32 1, i32 5) nounwind
+  ; CHECK: cdp2
+  tail call void @llvm.arm.cdp2(i32 7, i32 3, i32 1, i32 1, i32 1, i32 5) nounwind
+  ret void
+}
+
+declare void @llvm.arm.cdp2(i32, i32, i32, i32, i32, i32) nounwind
+
+declare void @llvm.arm.cdp(i32, i32, i32, i32, i32, i32) nounwind
+
+declare void @llvm.arm.mcrr2(i32, i32, i32, i32, i32) nounwind
+
+declare void @llvm.arm.mcrr(i32, i32, i32, i32, i32) nounwind
+
+declare void @llvm.arm.mcr2(i32, i32, i32, i32, i32, i32) nounwind
+
+declare i32 @llvm.arm.mrc2(i32, i32, i32, i32, i32) nounwind
+
+declare void @llvm.arm.mcr(i32, i32, i32, i32, i32, i32) nounwind
+
+declare i32 @llvm.arm.mrc(i32, i32, i32, i32, i32) nounwind
diff --git a/test/CodeGen/ARM/jumptable-label.ll b/test/CodeGen/ARM/jumptable-label.ll
new file mode 100644
index 0000000..49d6986
--- /dev/null
+++ b/test/CodeGen/ARM/jumptable-label.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -mtriple thumbv6-apple-macosx10.6.0 | FileCheck %s
+
+; test that we print the label of a bb that is only used in a jump table.
+
+; CHECK:	.long	LBB0_2
+; CHECK: LBB0_2:
+
+define i32 @calculate()  {
+entry:
+  switch i32 undef, label %return [
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb6
+    i32 3, label %sw.bb13
+    i32 4, label %sw.bb20
+  ]
+
+sw.bb:                                            ; preds = %entry
+  br label %return
+
+sw.bb6:                                           ; preds = %entry
+  br label %return
+
+sw.bb13:                                          ; preds = %entry
+  br label %return
+
+sw.bb20:                                          ; preds = %entry
+  %div = sdiv i32 undef, undef
+  br label %return
+
+return:                                           ; preds = %sw.bb20, %sw.bb13, %sw.bb6, %sw.bb, %entry
+  %retval.0 = phi i32 [ %div, %sw.bb20 ], [ undef, %sw.bb13 ], [ undef, %sw.bb6 ], [ undef, %sw.bb ], [ 0, %entry ]
+  ret i32 %retval.0
+}
diff --git a/test/CodeGen/ARM/ldrd.ll b/test/CodeGen/ARM/ldrd.ll
index 3856944..8010f20 100644
--- a/test/CodeGen/ARM/ldrd.ll
+++ b/test/CodeGen/ARM/ldrd.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=armv6-apple-darwin -regalloc=linearscan | FileCheck %s -check-prefix=V6
-; RUN: llc < %s -mtriple=armv5-apple-darwin | FileCheck %s -check-prefix=V5
-; RUN: llc < %s -mtriple=armv6-eabi | FileCheck %s -check-prefix=EABI
+; RUN: llc < %s -mtriple=armv5-apple-darwin -regalloc=linearscan | FileCheck %s -check-prefix=V5
+; RUN: llc < %s -mtriple=armv6-eabi -regalloc=linearscan | FileCheck %s -check-prefix=EABI
 ; rdar://r6949835
 
 ; Magic ARM pair hints works best with linearscan.
diff --git a/test/CodeGen/ARM/ldst-f32-2-i32.ll b/test/CodeGen/ARM/ldst-f32-2-i32.ll
index 2d016f6..1c69e15 100644
--- a/test/CodeGen/ARM/ldst-f32-2-i32.ll
+++ b/test/CodeGen/ARM/ldst-f32-2-i32.ll
@@ -10,8 +10,8 @@ entry:
   br i1 %0, label %return, label %bb
 
 bb:
-; CHECK: ldr [[REGISTER:(r[0-9]+)]], [r1], r3
-; CHECK: str [[REGISTER]], [r2], #4
+; CHECK: ldr [[REGISTER:(r[0-9]+)]], [{{r[0-9]+}}], {{r[0-9]+}}
+; CHECK: str [[REGISTER]], [{{r[0-9]+}}], #4
   %j.05 = phi i32 [ %2, %bb ], [ 0, %entry ]
   %tmp = mul i32 %j.05, %index
   %uglygep = getelementptr i8* %src6, i32 %tmp
diff --git a/test/CodeGen/ARM/ldstrexd.ll b/test/CodeGen/ARM/ldstrexd.ll
new file mode 100644
index 0000000..0c0911a
--- /dev/null
+++ b/test/CodeGen/ARM/ldstrexd.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -mtriple=armv7-apple-darwin   | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin | FileCheck %s
+
+%0 = type { i32, i32 }
+
+; CHECK: f0:
+; CHECK: ldrexd
+define i64 @f0(i8* %p) nounwind readonly {
+entry:
+  %ldrexd = tail call %0 @llvm.arm.ldrexd(i8* %p)
+  %0 = extractvalue %0 %ldrexd, 1
+  %1 = extractvalue %0 %ldrexd, 0
+  %2 = zext i32 %0 to i64
+  %3 = zext i32 %1 to i64
+  %shl = shl nuw i64 %2, 32
+  %4 = or i64 %shl, %3
+  ret i64 %4
+}
+
+; CHECK: f1:
+; CHECK: strexd
+define i32 @f1(i8* %ptr, i64 %val) nounwind {
+entry:
+  %tmp4 = trunc i64 %val to i32
+  %tmp6 = lshr i64 %val, 32
+  %tmp7 = trunc i64 %tmp6 to i32
+  %strexd = tail call i32 @llvm.arm.strexd(i32 %tmp4, i32 %tmp7, i8* %ptr)
+  ret i32 %strexd
+}
+
+declare %0 @llvm.arm.ldrexd(i8*) nounwind readonly
+declare i32 @llvm.arm.strexd(i32, i32, i8*) nounwind
+
diff --git a/test/CodeGen/ARM/lsr-code-insertion.ll b/test/CodeGen/ARM/lsr-code-insertion.ll
index 1bbb96d..153fd8f 100644
--- a/test/CodeGen/ARM/lsr-code-insertion.ll
+++ b/test/CodeGen/ARM/lsr-code-insertion.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -stats |& grep {39.*Number of machine instrs printed}
-; RUN: llc < %s -stats |& not grep {.*Number of re-materialization}
+; RUN: llc < %s | FileCheck %s
 ; This test really wants to check that the resultant "cond_true" block only 
 ; has a single store in it, and that cond_true55 only has code to materialize 
 ; the constant and do a store.  We do *not* want something like this:
@@ -8,6 +7,11 @@
 ;        add r8, r0, r6
 ;        str r10, [r8, #+4]
 ;
+; CHECK: ldr [[R6:r[0-9*]+]], LCP
+; CHECK: cmp {{.*}}, [[R6]]
+; CHECK: ldrle
+; CHECK-NEXT: strle
+
 target triple = "arm-apple-darwin8"
 
 define void @foo(i32* %mc, i32* %mpp, i32* %ip, i32* %dpp, i32* %tpmm, i32 %M, i32* %tpim, i32* %tpdm, i32* %bp, i32* %ms, i32 %xmb) {
diff --git a/test/CodeGen/ARM/lsr-on-unrolled-loops.ll b/test/CodeGen/ARM/lsr-on-unrolled-loops.ll
index 9882690..c1318ec 100644
--- a/test/CodeGen/ARM/lsr-on-unrolled-loops.ll
+++ b/test/CodeGen/ARM/lsr-on-unrolled-loops.ll
@@ -4,11 +4,6 @@
 ; constant offset addressing, so that each of the following stores
 ; uses the same register.
 
-; CHECK: vstr.32 s{{.*}}, [{{(r[0-9]+)|(lr)}}, #-128]
-; CHECK: vstr.32 s{{.*}}, [{{(r[0-9]+)|(lr)}}, #-96]
-; CHECK: vstr.32 s{{.*}}, [{{(r[0-9]+)|(lr)}}, #-64]
-; CHECK: vstr.32 s{{.*}}, [{{(r[0-9]+)|(lr)}}, #-32]
-; CHECK: vstr.32 s{{.*}}, [{{(r[0-9]+)|(lr)}}]
 ; CHECK: vstr.32 s{{.*}}, [{{(r[0-9]+)|(lr)}}, #32]
 ; CHECK: vstr.32 s{{.*}}, [{{(r[0-9]+)|(lr)}}, #64]
 ; CHECK: vstr.32 s{{.*}}, [{{(r[0-9]+)|(lr)}}, #96]
diff --git a/test/CodeGen/ARM/lsr-unfolded-offset.ll b/test/CodeGen/ARM/lsr-unfolded-offset.ll
new file mode 100644
index 0000000..e3e6eae
--- /dev/null
+++ b/test/CodeGen/ARM/lsr-unfolded-offset.ll
@@ -0,0 +1,80 @@
+; RUN: llc -regalloc=greedy < %s | FileCheck %s
+
+; LSR shouldn't introduce more induction variables than needed, increasing
+; register pressure and therefore spilling. There is more room for improvement
+; here.
+
+; CHECK: sub sp, #{{32|24}}
+
+; CHECK:      ldr r{{.*}}, [sp, #4]
+; CHECK-NEXT: ldr r{{.*}}, [sp, #16]
+; CHECK-NEXT: ldr r{{.*}}, [sp, #12]
+; CHECK-NEXT: adds
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-macosx10.7.0"
+
+%struct.partition_entry = type { i32, i32, i64, i64 }
+
+define i32 @partition_overlap_check(%struct.partition_entry* nocapture %part, i32 %num_entries) nounwind readonly optsize ssp {
+entry:
+  %cmp79 = icmp sgt i32 %num_entries, 0
+  br i1 %cmp79, label %outer.loop, label %for.end72
+
+outer.loop:                                 ; preds = %for.inc69, %entry
+  %overlap.081 = phi i32 [ %overlap.4, %for.inc69 ], [ 0, %entry ]
+  %0 = phi i32 [ %inc71, %for.inc69 ], [ 0, %entry ]
+  %offset = getelementptr %struct.partition_entry* %part, i32 %0, i32 2
+  %len = getelementptr %struct.partition_entry* %part, i32 %0, i32 3
+  %tmp5 = load i64* %offset, align 4, !tbaa !0
+  %tmp15 = load i64* %len, align 4, !tbaa !0
+  %add = add nsw i64 %tmp15, %tmp5
+  br label %inner.loop
+
+inner.loop:                                       ; preds = %for.inc, %outer.loop
+  %overlap.178 = phi i32 [ %overlap.081, %outer.loop ], [ %overlap.4, %for.inc ]
+  %1 = phi i32 [ 0, %outer.loop ], [ %inc, %for.inc ]
+  %cmp23 = icmp eq i32 %0, %1
+  br i1 %cmp23, label %for.inc, label %if.end
+
+if.end:                                           ; preds = %inner.loop
+  %len39 = getelementptr %struct.partition_entry* %part, i32 %1, i32 3
+  %offset28 = getelementptr %struct.partition_entry* %part, i32 %1, i32 2
+  %tmp29 = load i64* %offset28, align 4, !tbaa !0
+  %tmp40 = load i64* %len39, align 4, !tbaa !0
+  %add41 = add nsw i64 %tmp40, %tmp29
+  %cmp44 = icmp sge i64 %tmp29, %tmp5
+  %cmp47 = icmp slt i64 %tmp29, %add
+  %or.cond = and i1 %cmp44, %cmp47
+  %overlap.2 = select i1 %or.cond, i32 1, i32 %overlap.178
+  %cmp52 = icmp sle i64 %add41, %add
+  %cmp56 = icmp sgt i64 %add41, %tmp5
+  %or.cond74 = and i1 %cmp52, %cmp56
+  %overlap.3 = select i1 %or.cond74, i32 1, i32 %overlap.2
+  %cmp61 = icmp sgt i64 %tmp29, %tmp5
+  %cmp65 = icmp slt i64 %add41, %add
+  %or.cond75 = or i1 %cmp61, %cmp65
+  br i1 %or.cond75, label %for.inc, label %if.then66
+
+if.then66:                                        ; preds = %if.end
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end, %if.then66, %inner.loop
+  %overlap.4 = phi i32 [ %overlap.178, %inner.loop ], [ 1, %if.then66 ], [ %overlap.3, %if.end ]
+  %inc = add nsw i32 %1, 1
+  %exitcond = icmp eq i32 %inc, %num_entries
+  br i1 %exitcond, label %for.inc69, label %inner.loop
+
+for.inc69:                                        ; preds = %for.inc
+  %inc71 = add nsw i32 %0, 1
+  %exitcond83 = icmp eq i32 %inc71, %num_entries
+  br i1 %exitcond83, label %for.end72, label %outer.loop
+
+for.end72:                                        ; preds = %for.inc69, %entry
+  %overlap.0.lcssa = phi i32 [ 0, %entry ], [ %overlap.4, %for.inc69 ]
+  ret i32 %overlap.0.lcssa
+}
+
+!0 = metadata !{metadata !"long long", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll
index e8a2a3b..5bae037 100644
--- a/test/CodeGen/ARM/memcpy-inline.ll
+++ b/test/CodeGen/ARM/memcpy-inline.ll
@@ -1,10 +1,8 @@
-; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=linearscan -disable-post-ra | FileCheck %s
-; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=basic -disable-post-ra | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -regalloc=linearscan -disable-post-ra | FileCheck %s
 
 ; The ARM magic hinting works best with linear scan.
-; CHECK: ldmia
-; CHECK: stmia
-; CHECK: ldrh
+; CHECK: ldrd
+; CHECK: strd
 ; CHECK: ldrb
 
 %struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 }
diff --git a/test/CodeGen/ARM/memfunc.ll b/test/CodeGen/ARM/memfunc.ll
index 41d5944..757364b 100644
--- a/test/CodeGen/ARM/memfunc.ll
+++ b/test/CodeGen/ARM/memfunc.ll
@@ -1,10 +1,26 @@
-; RUN: llc < %s -march=arm
+; RUN: llc < %s -mtriple=armv7-apple-ios -o - | FileCheck %s
+; RUN: llc < %s -mtriple=arm-none-eabi -o - | FileCheck --check-prefix=EABI %s
+
+@from = common global [500 x i32] zeroinitializer, align 4
+@to = common global [500 x i32] zeroinitializer, align 4
 
 define void @f() {
 entry:
-        call void @llvm.memmove.i32( i8* null, i8* null, i32 64, i32 0 )
-        call void @llvm.memcpy.i32( i8* null, i8* null, i32 64, i32 0 )
-        call void @llvm.memset.i32( i8* null, i8 64, i32 0, i32 0 )
+
+        ; CHECK: memmove
+        ; EABI: __aeabi_memmove
+        call void @llvm.memmove.i32( i8* bitcast ([500 x i32]* @from to i8*), i8* bitcast ([500 x i32]* @to to i8*), i32 500, i32 0 )
+
+        ; CHECK: memcpy
+        ; EABI: __aeabi_memcpy
+        call void @llvm.memcpy.i32( i8* bitcast ([500 x i32]* @from to i8*), i8* bitcast ([500 x i32]* @to to i8*), i32 500, i32 0 )
+
+        ; EABI memset swaps arguments
+        ; CHECK: mov r1, #0
+        ; CHECK: memset
+        ; EABI: mov r2, #0
+        ; EABI: __aeabi_memset
+        call void @llvm.memset.i32( i8* bitcast ([500 x i32]* @from to i8*), i8 0, i32 500, i32 0 )
         unreachable
 }
 
diff --git a/test/CodeGen/ARM/movt-movw-global.ll b/test/CodeGen/ARM/movt-movw-global.ll
index 886ff3f..991d728 100644
--- a/test/CodeGen/ARM/movt-movw-global.ll
+++ b/test/CodeGen/ARM/movt-movw-global.ll
@@ -1,20 +1,39 @@
-; RUN: llc < %s | FileCheck %s
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
-target triple = "armv7-eabi"
+; RUN: llc < %s -mtriple=armv7-eabi      | FileCheck %s -check-prefix=EABI
+; RUN: llc < %s -mtriple=armv7-apple-ios -relocation-model=dynamic-no-pic | FileCheck %s -check-prefix=IOS
+; RUN: llc < %s -mtriple=armv7-apple-ios -relocation-model=pic            | FileCheck %s -check-prefix=IOS-PIC
+; RUN: llc < %s -mtriple=armv7-apple-ios -relocation-model=static         | FileCheck %s -check-prefix=IOS-STATIC
 
-@foo = common global i32 0                        ; <i32*> [#uses=1]
+@foo = common global i32 0
 
-define arm_aapcs_vfpcc i32* @bar1() nounwind readnone {
+define i32* @bar1() nounwind readnone {
 entry:
-; CHECK:      movw    r0, :lower16:foo
-; CHECK-NEXT: movt    r0, :upper16:foo
+; EABI:      movw    r0, :lower16:foo
+; EABI-NEXT: movt    r0, :upper16:foo
+
+; IOS:      movw    r0, :lower16:L_foo$non_lazy_ptr
+; IOS-NEXT: movt    r0, :upper16:L_foo$non_lazy_ptr
+
+; IOS-PIC:      movw    r0, :lower16:(L_foo$non_lazy_ptr-(LPC0_0+8))
+; IOS-PIC-NEXT: movt    r0, :upper16:(L_foo$non_lazy_ptr-(LPC0_0+8))
+
+; IOS-STATIC-NOT:      movw    r0, :lower16:_foo
+; IOS-STATIC-NOT:       movt    r0, :upper16:_foo
   ret i32* @foo
 }
 
-define arm_aapcs_vfpcc void @bar2(i32 %baz) nounwind {
+define void @bar2(i32 %baz) nounwind {
 entry:
-; CHECK:      movw    r1, :lower16:foo
-; CHECK-NEXT: movt    r1, :upper16:foo
+; EABI:      movw    r1, :lower16:foo
+; EABI-NEXT: movt    r1, :upper16:foo
+
+; IOS:      movw    r1, :lower16:L_foo$non_lazy_ptr
+; IOS-NEXT: movt    r1, :upper16:L_foo$non_lazy_ptr
+
+; IOS-PIC:      movw    r1, :lower16:(L_foo$non_lazy_ptr-(LPC1_0+8))
+; IOS-PIC-NEXT: movt    r1, :upper16:(L_foo$non_lazy_ptr-(LPC1_0+8))
+
+; IOS-STATIC-NOT:      movw    r1, :lower16:_foo
+; IOS-STATIC-NOT:      movt    r1, :upper16:_foo
   store i32 %baz, i32* @foo, align 4
   ret void
 }
diff --git a/test/CodeGen/ARM/neon_div.ll b/test/CodeGen/ARM/neon_div.ll
index e337970..de48fee 100644
--- a/test/CodeGen/ARM/neon_div.ll
+++ b/test/CodeGen/ARM/neon_div.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc < %s -march=arm -mattr=+neon -pre-RA-sched=source | FileCheck %s
 
 define <8 x i8> @sdivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vrecpe.f32
diff --git a/test/CodeGen/ARM/prefetch.ll b/test/CodeGen/ARM/prefetch.ll
index 895b27b..250a34e 100644
--- a/test/CodeGen/ARM/prefetch.ll
+++ b/test/CodeGen/ARM/prefetch.ll
@@ -1,10 +1,15 @@
 ; RUN: llc < %s -march=thumb -mattr=-thumb2 | not grep pld
-; RUN: llc < %s -march=thumb -mattr=+v7a     | FileCheck %s -check-prefix=THUMB2
-; RUN: llc < %s -march=arm   -mattr=+v7a,+mp | FileCheck %s -check-prefix=ARM-MP
+; RUN: llc < %s -march=thumb -mattr=+v7a        | FileCheck %s -check-prefix=THUMB2
+; RUN: llc < %s -march=arm   -mattr=+v7a        | FileCheck %s -check-prefix=ARM
+; RUN: llc < %s -march=arm   -mcpu=cortex-a9-mp | FileCheck %s -check-prefix=ARM-MP
 ; rdar://8601536
 
 define void @t1(i8* %ptr) nounwind  {
 entry:
+; ARM: t1:
+; ARM-NOT: pldw [r0]
+; ARM: pld [r0]
+
 ; ARM-MP: t1:
 ; ARM-MP: pldw [r0]
 ; ARM-MP: pld [r0]
@@ -12,27 +17,27 @@ entry:
 ; THUMB2: t1:
 ; THUMB2-NOT: pldw [r0]
 ; THUMB2: pld [r0]
-  tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 3 )
-  tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3 )
+  tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 3, i32 1 )
+  tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 1 )
   ret void
 }
 
 define void @t2(i8* %ptr) nounwind  {
 entry:
-; ARM-MP: t2:
-; ARM-MP: pld [r0, #1023]
+; ARM: t2:
+; ARM: pld [r0, #1023]
 
 ; THUMB2: t2:
 ; THUMB2: pld [r0, #1023]
   %tmp = getelementptr i8* %ptr, i32 1023
-  tail call void @llvm.prefetch( i8* %tmp, i32 0, i32 3 )
+  tail call void @llvm.prefetch( i8* %tmp, i32 0, i32 3, i32 1 )
   ret void
 }
 
 define void @t3(i32 %base, i32 %offset) nounwind  {
 entry:
-; ARM-MP: t3:
-; ARM-MP: pld [r0, r1, lsr #2]
+; ARM: t3:
+; ARM: pld [r0, r1, lsr #2]
 
 ; THUMB2: t3:
 ; THUMB2: lsrs r1, r1, #2
@@ -40,22 +45,33 @@ entry:
   %tmp1 = lshr i32 %offset, 2
   %tmp2 = add i32 %base, %tmp1
   %tmp3 = inttoptr i32 %tmp2 to i8*
-  tail call void @llvm.prefetch( i8* %tmp3, i32 0, i32 3 )
+  tail call void @llvm.prefetch( i8* %tmp3, i32 0, i32 3, i32 1 )
   ret void
 }
 
 define void @t4(i32 %base, i32 %offset) nounwind  {
 entry:
-; ARM-MP: t4:
-; ARM-MP: pld [r0, r1, lsl #2]
+; ARM: t4:
+; ARM: pld [r0, r1, lsl #2]
 
 ; THUMB2: t4:
 ; THUMB2: pld [r0, r1, lsl #2]
   %tmp1 = shl i32 %offset, 2
   %tmp2 = add i32 %base, %tmp1
   %tmp3 = inttoptr i32 %tmp2 to i8*
-  tail call void @llvm.prefetch( i8* %tmp3, i32 0, i32 3 )
+  tail call void @llvm.prefetch( i8* %tmp3, i32 0, i32 3, i32 1 )
   ret void
 }
 
-declare void @llvm.prefetch(i8*, i32, i32) nounwind 
+declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind
+
+define void @t5(i8* %ptr) nounwind  {
+entry:
+; ARM: t5:
+; ARM: pli [r0]
+
+; THUMB2: t5:
+; THUMB2: pli [r0]
+  tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 0 )
+  ret void
+}
diff --git a/test/CodeGen/ARM/rev.ll b/test/CodeGen/ARM/rev.ll
index 4170ff3..5739086 100644
--- a/test/CodeGen/ARM/rev.ll
+++ b/test/CodeGen/ARM/rev.ll
@@ -54,3 +54,16 @@ entry:
   %conv8 = ashr exact i32 %sext, 16
   ret i32 %conv8
 }
+
+; rdar://9609059
+define i32 @test5(i32 %i) nounwind readnone {
+entry:
+; CHECK: test5
+; CHECK: revsh r0, r0
+  %shl = shl i32 %i, 24
+  %shr = ashr exact i32 %shl, 16
+  %shr23 = lshr i32 %i, 8
+  %and = and i32 %shr23, 255
+  %or = or i32 %shr, %and
+  ret i32 %or
+}
diff --git a/test/CodeGen/ARM/select-imm.ll b/test/CodeGen/ARM/select-imm.ll
index 82ed018..43f8a66 100644
--- a/test/CodeGen/ARM/select-imm.ll
+++ b/test/CodeGen/ARM/select-imm.ll
@@ -5,8 +5,8 @@
 define i32 @t1(i32 %c) nounwind readnone {
 entry:
 ; ARM: t1:
-; ARM: mov r1, #101
-; ARM: orr r1, r1, #1, #24
+; ARM: mov [[R1:r[0-9]+]], #101
+; ARM: orr [[R1b:r[0-9]+]], [[R1]], #1, #24
 ; ARM: movgt r0, #123
 
 ; ARMT2: t1:
@@ -34,7 +34,7 @@ entry:
 ; ARMT2: movwgt r0, #357
 
 ; THUMB2: t2:
-; THUMB2: mov.w r0, #123
+; THUMB2: mov{{(s|\.w)}} r0, #123
 ; THUMB2: movwgt r0, #357
 
   %0 = icmp sgt i32 %c, 1
@@ -53,7 +53,7 @@ entry:
 ; ARMT2: moveq r0, #1
 
 ; THUMB2: t3:
-; THUMB2: mov.w r0, #0
+; THUMB2: mov{{(s|\.w)}} r0, #0
 ; THUMB2: moveq r0, #1
   %0 = icmp eq i32 %a, 160
   %1 = zext i1 %0 to i32
@@ -67,11 +67,11 @@ entry:
 ; ARM: movlt
 
 ; ARMT2: t4:
-; ARMT2: movwlt r0, #65365
-; ARMT2: movtlt r0, #65365
+; ARMT2: movwlt [[R0:r[0-9]+]], #65365
+; ARMT2: movtlt [[R0]], #65365
 
 ; THUMB2: t4:
-; THUMB2: mvnlt.w r0, #11141290
+; THUMB2: mvnlt.w [[R0:r[0-9]+]], #11141290
   %0 = icmp slt i32 %a, %b
   %1 = select i1 %0, i32 4283826005, i32 %x
   ret i32 %1
diff --git a/test/CodeGen/ARM/shifter_operand.ll b/test/CodeGen/ARM/shifter_operand.ll
index be891bc..f0e2d10 100644
--- a/test/CodeGen/ARM/shifter_operand.ll
+++ b/test/CodeGen/ARM/shifter_operand.ll
@@ -58,7 +58,7 @@ entry:
 ; A8: str r2, [r0, r1, lsl #2]
 
 ; A9: test4:
-; A9: add r0, r0, r4, lsl #2
+; A9: add r0, r0, r{{[0-9]+}}, lsl #2
 ; A9: ldr r1, [r0]
 ; A9: str r1, [r0]
   %0 = tail call i8* (...)* @malloc(i32 undef) nounwind
diff --git a/test/CodeGen/ARM/stm.ll b/test/CodeGen/ARM/stm.ll
index 2f5fadb..82dc14d 100644
--- a/test/CodeGen/ARM/stm.ll
+++ b/test/CodeGen/ARM/stm.ll
@@ -9,7 +9,7 @@ define i32 @main() nounwind {
 entry:
 ; CHECK: main
 ; CHECK: push
-; CHECK: stmib
+; CHECK: stm
 	%0 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([26 x i8]* @"\01LC1", i32 0, i32 0), i32 -2, i32 -3, i32 2, i32 -6) nounwind		; <i32> [#uses=0]
 	%1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([32 x i8]* @"\01LC", i32 0, i32 0), i32 0, i32 1, i32 0, i32 1, i32 0, i32 1) nounwind		; <i32> [#uses=0]
 	ret i32 0
diff --git a/test/CodeGen/ARM/sxt_rot.ll b/test/CodeGen/ARM/sxt_rot.ll
index 4752f17..355fee3 100644
--- a/test/CodeGen/ARM/sxt_rot.ll
+++ b/test/CodeGen/ARM/sxt_rot.ll
@@ -10,7 +10,7 @@ define i32 @test0(i8 %A) {
 	ret i32 %B
 }
 
-define i8 @test1(i32 %A) signext {
+define signext i8 @test1(i32 %A) {
 	%B = lshr i32 %A, 8
 	%C = shl i32 %A, 24
 	%D = or i32 %B, %C
@@ -18,7 +18,7 @@ define i8 @test1(i32 %A) signext {
 	ret i8 %E
 }
 
-define i32 @test2(i32 %A, i32 %X) signext {
+define signext i32 @test2(i32 %A, i32 %X) {
 	%B = lshr i32 %A, 8
 	%C = shl i32 %A, 24
 	%D = or i32 %B, %C
diff --git a/test/CodeGen/ARM/trap.ll b/test/CodeGen/ARM/trap.ll
index 189bc8c..38842a9 100644
--- a/test/CodeGen/ARM/trap.ll
+++ b/test/CodeGen/ARM/trap.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s -check-prefix=INSTR
-; RUN: llc < %s -mtriple=arm-apple-darwin -arm-trap-func=_trap | FileCheck %s -check-prefix=FUNC
+; RUN: llc < %s -mtriple=arm-apple-darwin -trap-func=_trap | FileCheck %s -check-prefix=FUNC
 ; rdar://7961298
 ; rdar://9249183
 
diff --git a/test/CodeGen/ARM/umulo-32.ll b/test/CodeGen/ARM/umulo-32.ll
index aa7d28a..fa5c016 100644
--- a/test/CodeGen/ARM/umulo-32.ll
+++ b/test/CodeGen/ARM/umulo-32.ll
@@ -12,3 +12,30 @@ define i32 @func(i32 %a) nounwind {
 }
 
 declare %umul.ty @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone
+
+define i32 @f(i32 %argc, i8** %argv) ssp {
+; CHECK: func
+; CHECK: str     r0
+; CHECK: movs    r2
+; CHECK: mov     r1
+; CHECK: mov     r3
+; CHECK: muldi3
+%1 = alloca i32, align 4
+%2 = alloca i32, align 4
+%3 = alloca i8**, align 4
+%m_degree = alloca i32, align 4
+store i32 0, i32* %1
+store i32 %argc, i32* %2, align 4
+store i8** %argv, i8*** %3, align 4
+store i32 10, i32* %m_degree, align 4
+%4 = load i32* %m_degree, align 4
+%5 = call %umul.ty @llvm.umul.with.overflow.i32(i32 %4, i32 8)
+%6 = extractvalue %umul.ty %5, 1
+%7 = extractvalue %umul.ty %5, 0
+%8 = select i1 %6, i32 -1, i32 %7
+%9 = call noalias i8* @_Znam(i32 %8)
+%10 = bitcast i8* %9 to double*
+ret i32 0
+}
+
+declare noalias i8* @_Znam(i32)
diff --git a/test/CodeGen/ARM/unaligned_load_store.ll b/test/CodeGen/ARM/unaligned_load_store.ll
index b42e11f..a8237c6 100644
--- a/test/CodeGen/ARM/unaligned_load_store.ll
+++ b/test/CodeGen/ARM/unaligned_load_store.ll
@@ -8,14 +8,14 @@
 define void @t(i8* nocapture %a, i8* nocapture %b) nounwind {
 entry:
 ; GENERIC: t:
-; GENERIC: ldrb r2
-; GENERIC: ldrb r3
-; GENERIC: ldrb r12
-; GENERIC: ldrb r1
-; GENERIC: strb r1
-; GENERIC: strb r12
-; GENERIC: strb r3
-; GENERIC: strb r2
+; GENERIC: ldrb [[R2:r[0-9]+]]
+; GENERIC: ldrb [[R3:r[0-9]+]]
+; GENERIC: ldrb [[R12:r[0-9]+]]
+; GENERIC: ldrb [[R1:r[0-9]+]]
+; GENERIC: strb [[R1]]
+; GENERIC: strb [[R12]]
+; GENERIC: strb [[R3]]
+; GENERIC: strb [[R2]]
 
 ; DARWIN_V6: t:
 ; DARWIN_V6: ldr r1
diff --git a/test/CodeGen/ARM/uxt_rot.ll b/test/CodeGen/ARM/uxt_rot.ll
index 6307795..628c079 100644
--- a/test/CodeGen/ARM/uxt_rot.ll
+++ b/test/CodeGen/ARM/uxt_rot.ll
@@ -2,19 +2,19 @@
 ; RUN: llc < %s -march=arm -mattr=+v6 | grep uxtab | count 1
 ; RUN: llc < %s -march=arm -mattr=+v6 | grep uxth | count 1
 
-define i8 @test1(i32 %A.u) zeroext {
+define zeroext i8 @test1(i32 %A.u) {
     %B.u = trunc i32 %A.u to i8
     ret i8 %B.u
 }
 
-define i32 @test2(i32 %A.u, i32 %B.u) zeroext {
+define zeroext i32 @test2(i32 %A.u, i32 %B.u) {
     %C.u = trunc i32 %B.u to i8
     %D.u = zext i8 %C.u to i32
     %E.u = add i32 %A.u, %D.u
     ret i32 %E.u
 }
 
-define i32 @test3(i32 %A.u) zeroext {
+define zeroext i32 @test3(i32 %A.u) {
     %B.u = lshr i32 %A.u, 8
     %C.u = shl i32 %A.u, 24
     %D.u = or i32 %B.u, %C.u
diff --git a/test/CodeGen/ARM/va_arg.ll b/test/CodeGen/ARM/va_arg.ll
index 7cb9762..bb40453 100644
--- a/test/CodeGen/ARM/va_arg.ll
+++ b/test/CodeGen/ARM/va_arg.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi | FileCheck %s
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -pre-RA-sched=source | FileCheck %s
 ; Test that we correctly align elements when using va_arg
 
 ; CHECK: test1:
 ; CHECK-NOT: bfc
-; CHECK: add	r0, r0, #7
-; CHECK: bfc	r0, #0, #3
+; CHECK: add	[[REG:(r[0-9]+)|(lr)]], {{(r[0-9]+)|(lr)}}, #7
+; CHECK: bfc	[[REG]], #0, #3
 ; CHECK-NOT: bfc
 
 define i64 @test1(i32 %i, ...) nounwind optsize {
@@ -19,8 +19,8 @@ entry:
 
 ; CHECK: test2:
 ; CHECK-NOT: bfc
-; CHECK: add	r0, r0, #7
-; CHECK: bfc	r0, #0, #3
+; CHECK: add	[[REG:(r[0-9]+)|(lr)]], {{(r[0-9]+)|(lr)}}, #7
+; CHECK: bfc	[[REG]], #0, #3
 ; CHECK-NOT:	bfc
 ; CHECK: bx	lr
 
diff --git a/test/CodeGen/ARM/vbsl-constant.ll b/test/CodeGen/ARM/vbsl-constant.ll
index 9a9cc5b..14e668e 100644
--- a/test/CodeGen/ARM/vbsl-constant.ll
+++ b/test/CodeGen/ARM/vbsl-constant.ll
@@ -2,6 +2,8 @@
 
 define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
 ;CHECK: v_bsli8:
+;CHECK: vldr.64
+;CHECK: vldr.64
 ;CHECK: vbsl
 	%tmp1 = load <8 x i8>* %A
 	%tmp2 = load <8 x i8>* %B
@@ -14,6 +16,8 @@ define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
 
 define <4 x i16> @v_bsli16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
 ;CHECK: v_bsli16:
+;CHECK: vldr.64
+;CHECK: vldr.64
 ;CHECK: vbsl
 	%tmp1 = load <4 x i16>* %A
 	%tmp2 = load <4 x i16>* %B
@@ -26,6 +30,8 @@ define <4 x i16> @v_bsli16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind
 
 define <2 x i32> @v_bsli32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
 ;CHECK: v_bsli32:
+;CHECK: vldr.64
+;CHECK: vldr.64
 ;CHECK: vbsl
 	%tmp1 = load <2 x i32>* %A
 	%tmp2 = load <2 x i32>* %B
@@ -38,6 +44,9 @@ define <2 x i32> @v_bsli32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind
 
 define <1 x i64> @v_bsli64(<1 x i64>* %A, <1 x i64>* %B, <1 x i64>* %C) nounwind {
 ;CHECK: v_bsli64:
+;CHECK: vldr.64
+;CHECK: vldr.64
+;CHECK: vldr.64
 ;CHECK: vbsl
 	%tmp1 = load <1 x i64>* %A
 	%tmp2 = load <1 x i64>* %B
@@ -50,6 +59,8 @@ define <1 x i64> @v_bsli64(<1 x i64>* %A, <1 x i64>* %B, <1 x i64>* %C) nounwind
 
 define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
 ;CHECK: v_bslQi8:
+;CHECK: vldmia
+;CHECK: vldmia
 ;CHECK: vbsl
 	%tmp1 = load <16 x i8>* %A
 	%tmp2 = load <16 x i8>* %B
@@ -62,6 +73,8 @@ define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind
 
 define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
 ;CHECK: v_bslQi16:
+;CHECK: vldmia
+;CHECK: vldmia
 ;CHECK: vbsl
 	%tmp1 = load <8 x i16>* %A
 	%tmp2 = load <8 x i16>* %B
@@ -74,6 +87,8 @@ define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwin
 
 define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
 ;CHECK: v_bslQi32:
+;CHECK: vldmia
+;CHECK: vldmia
 ;CHECK: vbsl
 	%tmp1 = load <4 x i32>* %A
 	%tmp2 = load <4 x i32>* %B
@@ -86,6 +101,9 @@ define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwin
 
 define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwind {
 ;CHECK: v_bslQi64:
+;CHECK: vldmia
+;CHECK: vldmia
+;CHECK: vldmia
 ;CHECK: vbsl
 	%tmp1 = load <2 x i64>* %A
 	%tmp2 = load <2 x i64>* %B
diff --git a/test/CodeGen/ARM/vfp.ll b/test/CodeGen/ARM/vfp.ll
index 390457f..49a6982 100644
--- a/test/CodeGen/ARM/vfp.ll
+++ b/test/CodeGen/ARM/vfp.ll
@@ -40,8 +40,8 @@ define void @test_add(float* %P, double* %D) {
 define void @test_ext_round(float* %P, double* %D) {
 ;CHECK: test_ext_round:
 	%a = load float* %P		; <float> [#uses=1]
-;CHECK: vcvt.f32.f64
 ;CHECK: vcvt.f64.f32
+;CHECK: vcvt.f32.f64
 	%b = fpext float %a to double		; <double> [#uses=1]
 	%A = load double* %D		; <double> [#uses=1]
 	%B = fptrunc double %A to float		; <float> [#uses=1]
diff --git a/test/CodeGen/ARM/vld1.ll b/test/CodeGen/ARM/vld1.ll
index 02e543c..e524395 100644
--- a/test/CodeGen/ARM/vld1.ll
+++ b/test/CodeGen/ARM/vld1.ll
@@ -133,8 +133,6 @@ declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32) nounwind readonly
 ; Do not crash if the vld1 result is not used.
 define void @unused_vld1_result() {
 entry:
-;CHECK: unused_vld1_result
-;CHECK: vld1.32
   %0 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) 
   call void @llvm.trap()
   unreachable
diff --git a/test/CodeGen/ARM/vldlane.ll b/test/CodeGen/ARM/vldlane.ll
index 68dd503..0d7d4ec 100644
--- a/test/CodeGen/ARM/vldlane.ll
+++ b/test/CodeGen/ARM/vldlane.ll
@@ -125,7 +125,7 @@ define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
 ;Check for a post-increment updating load.
 define <2 x i32> @vld2lanei32_update(i32** %ptr, <2 x i32>* %B) nounwind {
 ;CHECK: vld2lanei32_update:
-;CHECK: vld2.32 {d16[1], d17[1]}, [r1]!
+;CHECK: vld2.32 {d16[1], d17[1]}, [{{r[0-9]+}}]!
 	%A = load i32** %ptr
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <2 x i32>* %B
@@ -153,7 +153,7 @@ define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
 define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 ;CHECK: vld2laneQi16:
 ;Check the (default) alignment.
-;CHECK: vld2.16 {d17[1], d19[1]}, [r0]
+;CHECK: vld2.16 {d17[1], d19[1]}, [{{r[0-9]+}}]
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <8 x i16>* %B
 	%tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
@@ -166,7 +166,7 @@ define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
 ;CHECK: vld2laneQi32:
 ;Check the alignment value.  Max for this instruction is 64 bits:
-;CHECK: vld2.32 {d17[0], d19[0]}, [r0, :64]
+;CHECK: vld2.32 {d17[0], d19[0]}, [{{r[0-9]+}}, :64]
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <4 x i32>* %B
 	%tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
@@ -222,7 +222,7 @@ define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
 define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vld3lanei16:
 ;Check the (default) alignment value.  VLD3 does not support alignment.
-;CHECK: vld3.16 {d16[1], d17[1], d18[1]}, [r0]
+;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <4 x i16>* %B
 	%tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
@@ -265,7 +265,7 @@ define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
 define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 ;CHECK: vld3laneQi16:
 ;Check the (default) alignment value.  VLD3 does not support alignment.
-;CHECK: vld3.16 {d16[1], d18[1], d20[1]}, [r0]
+;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <8 x i16>* %B
 	%tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
@@ -280,7 +280,7 @@ define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 ;Check for a post-increment updating load with register increment.
 define <8 x i16> @vld3laneQi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind {
 ;CHECK: vld3laneQi16_update:
-;CHECK: vld3.16 {d16[1], d18[1], d20[1]}, [{{r[0-9]+}}], {{r[0-9]+}}
+;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}], {{r[0-9]+}}
 	%A = load i16** %ptr
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <8 x i16>* %B
@@ -344,7 +344,7 @@ declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x flo
 define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vld4lanei8:
 ;Check the alignment value.  Max for this instruction is 32 bits:
-;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0, :32]
+;CHECK: vld4.8 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}, :32]
 	%tmp1 = load <8 x i8>* %B
 	%tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
         %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
@@ -360,7 +360,7 @@ define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
 ;Check for a post-increment updating load.
 define <8 x i8> @vld4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
 ;CHECK: vld4lanei8_update:
-;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1, :32]!
+;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}, :32]!
 	%A = load i8** %ptr
 	%tmp1 = load <8 x i8>* %B
 	%tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
@@ -380,7 +380,7 @@ define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vld4lanei16:
 ;Check that a power-of-two alignment smaller than the total size of the memory
 ;being loaded is ignored.
-;CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [r0]
+;CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}]
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <4 x i16>* %B
 	%tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4)
@@ -398,7 +398,7 @@ define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vld4lanei32:
 ;Check the alignment value.  An 8-byte alignment is allowed here even though
 ;it is smaller than the total size of the memory being loaded.
-;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0, :64]
+;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}, :64]
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <2 x i32>* %B
 	%tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8)
@@ -431,7 +431,7 @@ define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
 define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 ;CHECK: vld4laneQi16:
 ;Check the alignment value.  Max for this instruction is 64 bits:
-;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0, :64]
+;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [{{r[0-9]+}}, :64]
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <8 x i16>* %B
 	%tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
@@ -448,7 +448,7 @@ define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
 ;CHECK: vld4laneQi32:
 ;Check the (default) alignment.
-;CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0]
+;CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [{{r[0-9]+}}]
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <4 x i32>* %B
 	%tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
@@ -491,7 +491,7 @@ declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x flo
 ; in the QPR_VFP2 regclass, it needs to be copied to a QPR regclass because
 ; we don't currently have a QQQQ_VFP2 super-regclass.  (The "0" for the low
 ; part of %ins67 is supposed to be loaded by a VLDRS instruction in this test.)
-define void @test_qqqq_regsequence_subreg([6 x i64] %b) nounwind {
+define <8 x i16> @test_qqqq_regsequence_subreg([6 x i64] %b) nounwind {
 ;CHECK: test_qqqq_regsequence_subreg
 ;CHECK: vld3.16
   %tmp63 = extractvalue [6 x i64] %b, 5
@@ -500,8 +500,12 @@ define void @test_qqqq_regsequence_subreg([6 x i64] %b) nounwind {
   %ins67 = or i128 %tmp65, 0
   %tmp78 = bitcast i128 %ins67 to <8 x i16>
   %vld3_lane = tail call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> %tmp78, i32 1, i32 2)
-  call void @llvm.trap()
-  unreachable
+  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 0
+  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 1
+  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 2
+  %tmp6 = add <8 x i16> %tmp3, %tmp4
+  %tmp7 = add <8 x i16> %tmp5, %tmp6
+  ret <8 x i16> %tmp7
 }
 
 declare void @llvm.trap() nounwind
diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll
index 1fd6581..1780d6e 100644
--- a/test/CodeGen/ARM/vmul.ll
+++ b/test/CodeGen/ARM/vmul.ll
@@ -439,9 +439,9 @@ define <2 x i64> @vmull_extvec_u32(<2 x i32> %arg) nounwind {
 }
 
 ; rdar://9197392
-define void @distribue(i16* %dst, i8* %src, i32 %mul) nounwind {
+define void @distribute(i16* %dst, i8* %src, i32 %mul) nounwind {
 entry:
-; CHECK: distribue:
+; CHECK: distribute:
 ; CHECK: vmull.u8 [[REG1:(q[0-9]+)]], d{{.*}}, [[REG2:(d[0-9]+)]]
 ; CHECK: vmlal.u8 [[REG1]], d{{.*}}, [[REG2]]
   %0 = trunc i32 %mul to i8
@@ -471,9 +471,9 @@ declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
 
 %struct.uint8x8_t = type { <8 x i8> }
 
-define void @distribue2(%struct.uint8x8_t* nocapture %dst, i8* %src, i32 %mul) nounwind {
+define void @distribute2(%struct.uint8x8_t* nocapture %dst, i8* %src, i32 %mul) nounwind {
 entry:
-; CHECK: distribue2
+; CHECK: distribute2
 ; CHECK-NOT: vadd.i8
 ; CHECK: vmul.i8
 ; CHECK: vmla.i8
@@ -492,3 +492,25 @@ entry:
   store <8 x i8> %10, <8 x i8>* %11, align 8
   ret void
 }
+
+define void @distribute2_commutative(%struct.uint8x8_t* nocapture %dst, i8* %src, i32 %mul) nounwind {
+entry:
+; CHECK: distribute2_commutative
+; CHECK-NOT: vadd.i8
+; CHECK: vmul.i8
+; CHECK: vmla.i8
+  %0 = trunc i32 %mul to i8
+  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %src, i32 1)
+  %4 = bitcast <16 x i8> %3 to <2 x double>
+  %5 = extractelement <2 x double> %4, i32 1
+  %6 = bitcast double %5 to <8 x i8>
+  %7 = extractelement <2 x double> %4, i32 0
+  %8 = bitcast double %7 to <8 x i8>
+  %9 = add <8 x i8> %6, %8
+  %10 = mul <8 x i8> %2, %9
+  %11 = getelementptr inbounds %struct.uint8x8_t* %dst, i32 0, i32 0
+  store <8 x i8> %10, <8 x i8>* %11, align 8
+  ret void
+}
diff --git a/test/CodeGen/ARM/vpadd.ll b/test/CodeGen/ARM/vpadd.ll
index 2125573..1ba68f5 100644
--- a/test/CodeGen/ARM/vpadd.ll
+++ b/test/CodeGen/ARM/vpadd.ll
@@ -138,6 +138,20 @@ define <2 x i64> @vpaddlQu32(<4 x i32>* %A) nounwind {
 	ret <2 x i64> %tmp2
 }
 
+; Test AddCombine optimization that generates a vpaddl.s
+define void @addCombineToVPADDL() nounwind ssp {
+; CHECK: vpaddl.s8
+  %cbcr = alloca <16 x i8>, align 16
+  %X = alloca <8 x i8>, align 8
+  %tmp = load <16 x i8>* %cbcr
+  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %tmp2 = load <16 x i8>* %cbcr
+  %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %add = add <8 x i8> %tmp3, %tmp1
+  store <8 x i8> %add, <8 x i8>* %X, align 8
+  ret void
+}
+
 declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
 declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) nounwind readnone
 declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vrev.ll b/test/CodeGen/ARM/vrev.ll
index f0f9e4e..34acd16 100644
--- a/test/CodeGen/ARM/vrev.ll
+++ b/test/CodeGen/ARM/vrev.ll
@@ -147,3 +147,34 @@ define void @test_with_vcombine(<4 x float>* %v) nounwind {
   store <4 x float> %tmp8, <4 x float>* %v, align 16
   ret void
 }
+
+; vrev <4 x i16> should use VREV32 and not VREV64
+define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp {
+; CHECK: test_vrev64:
+; CHECK: vext.16
+; CHECK: vrev32.16
+entry:
+  %0 = bitcast <4 x i16>* %source to <8 x i16>*
+  %tmp2 = load <8 x i16>* %0, align 4
+  %tmp3 = extractelement <8 x i16> %tmp2, i32 6
+  %tmp5 = insertelement <2 x i16> undef, i16 %tmp3, i32 0
+  %tmp9 = extractelement <8 x i16> %tmp2, i32 5
+  %tmp11 = insertelement <2 x i16> %tmp5, i16 %tmp9, i32 1
+  store <2 x i16> %tmp11, <2 x i16>* %dst, align 4
+  ret void
+}
+
+; Test vrev of float4
+define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest) nounwind noinline ssp {
+; CHECK: float_vrev64
+; CHECK: vext.32
+; CHECK: vrev64.32
+entry:
+  %0 = bitcast float* %source to <4 x float>*
+  %tmp2 = load <4 x float>* %0, align 4
+  %tmp5 = shufflevector <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x float> %tmp2, <4 x i32> <i32 0, i32 7, i32 0, i32 0>
+  %arrayidx8 = getelementptr inbounds <4 x float>* %dest, i32 11
+  store <4 x float> %tmp5, <4 x float>* %arrayidx8, align 4
+  ret void
+}
+
diff --git a/test/CodeGen/ARM/vst3.ll b/test/CodeGen/ARM/vst3.ll
index d262303..e3372a0 100644
--- a/test/CodeGen/ARM/vst3.ll
+++ b/test/CodeGen/ARM/vst3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon -O0 | FileCheck %s
+; RUN: llc < %s -march=arm -mattr=+neon -disable-arm-fast-isel -O0 | FileCheck %s
 
 define void @vst3i8(i8* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vst3i8:
diff --git a/test/CodeGen/ARM/vstlane.ll b/test/CodeGen/ARM/vstlane.ll
index d1bc15a..08b7232 100644
--- a/test/CodeGen/ARM/vstlane.ll
+++ b/test/CodeGen/ARM/vstlane.ll
@@ -54,7 +54,8 @@ define void @vst1lanef(float* %A, <2 x float>* %B) nounwind {
 
 define void @vst1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
 ;CHECK: vst1laneQi8:
-;CHECK: vst1.8 {d17[1]}, [r0]
+; // Can use scalar load. No need to use vectors.
+; // CHE-CK: vst1.8 {d17[1]}, [r0]
 	%tmp1 = load <16 x i8>* %B
         %tmp2 = extractelement <16 x i8> %tmp1, i32 9
         store i8 %tmp2, i8* %A, align 8
@@ -72,7 +73,8 @@ define void @vst1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 
 define void @vst1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
 ;CHECK: vst1laneQi32:
-;CHECK: vst1.32 {d17[1]}, [r0, :32]
+; // Can use scalar load. No need to use vectors.
+; // CHE-CK: vst1.32 {d17[1]}, [r0, :32]
 	%tmp1 = load <4 x i32>* %B
         %tmp2 = extractelement <4 x i32> %tmp1, i32 3
         store i32 %tmp2, i32* %A, align 8
@@ -82,7 +84,8 @@ define void @vst1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
 ;Check for a post-increment updating store.
 define void @vst1laneQi32_update(i32** %ptr, <4 x i32>* %B) nounwind {
 ;CHECK: vst1laneQi32_update:
-;CHECK: vst1.32 {d17[1]}, [r1, :32]!
+; // Can use scalar load. No need to use vectors.
+; // CHE-CK: vst1.32 {d17[1]}, [r1, :32]!
 	%A = load i32** %ptr
 	%tmp1 = load <4 x i32>* %B
 	%tmp2 = extractelement <4 x i32> %tmp1, i32 3
@@ -94,7 +97,8 @@ define void @vst1laneQi32_update(i32** %ptr, <4 x i32>* %B) nounwind {
 
 define void @vst1laneQf(float* %A, <4 x float>* %B) nounwind {
 ;CHECK: vst1laneQf:
-;CHECK: vst1.32 {d17[1]}, [r0]
+; // Can use scalar load. No need to use vectors.
+; // CHE-CK: vst1.32 {d17[1]}, [r0]
 	%tmp1 = load <4 x float>* %B
         %tmp2 = extractelement <4 x float> %tmp1, i32 3
         store float %tmp2, float* %A
diff --git a/test/CodeGen/Alpha/2005-07-12-TwoMallocCalls.ll b/test/CodeGen/Alpha/2005-07-12-TwoMallocCalls.ll
deleted file mode 100644
index 87d9928..0000000
--- a/test/CodeGen/Alpha/2005-07-12-TwoMallocCalls.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; There should be exactly two calls here (memset and malloc), no more.
-; RUN: llc < %s -march=alpha | grep jsr | count 2
-
-%typedef.bc_struct = type opaque
-declare void @llvm.memset.i64(i8*, i8, i64, i32)
-
-define i1 @l12_l94_bc_divide_endif_2E_3_2E_ce(i32* %tmp.71.reload, i32 %scale2.1.3, i32 %extra.0, %typedef.bc_struct* %n1, %typedef.bc_struct* %n2, i32* %tmp.92.reload, i32 %tmp.94.reload, i32* %tmp.98.reload, i32 %tmp.100.reload, i8** %tmp.112.out, i32* %tmp.157.out, i8** %tmp.158.out) {
-newFuncRoot:
-        %tmp.120 = add i32 %extra.0, 2          ; <i32> [#uses=1]
-        %tmp.122 = add i32 %tmp.120, %tmp.94.reload             ; <i32> [#uses=1]
-        %tmp.123 = add i32 %tmp.122, %tmp.100.reload            ; <i32> [#uses=2]
-        %tmp.112 = malloc i8, i32 %tmp.123              ; <i8*> [#uses=1]
-        %tmp.137 = zext i32 %tmp.123 to i64             ; <i64> [#uses=1]
-        tail call void @llvm.memset.i64( i8* %tmp.112, i8 0, i64 %tmp.137, i32 0 )
-        ret i1 true
-}
-
diff --git a/test/CodeGen/Alpha/add.ll b/test/CodeGen/Alpha/add.ll
index cd883f6..8a92695 100644
--- a/test/CodeGen/Alpha/add.ll
+++ b/test/CodeGen/Alpha/add.ll
@@ -17,19 +17,19 @@
 ; RUN: grep {s8subq} %t.s | count 2
 
 
-define i32 @al(i32 signext %x.s, i32 signext %y.s) signext {
+define signext i32 @al(i32 signext %x.s, i32 signext %y.s) {
 entry:
 	%tmp.3.s = add i32 %y.s, %x.s		; <i32> [#uses=1]
 	ret i32 %tmp.3.s
 }
 
-define i32 @ali(i32 signext %x.s) signext {
+define signext i32 @ali(i32 signext %x.s)  {
 entry:
 	%tmp.3.s = add i32 100, %x.s		; <i32> [#uses=1]
 	ret i32 %tmp.3.s
 }
 
-define i64 @aq(i64 signext %x.s, i64 signext %y.s) signext {
+define signext i64 @aq(i64 signext %x.s, i64 signext %y.s)  {
 entry:
 	%tmp.3.s = add i64 %y.s, %x.s		; <i64> [#uses=1]
 	ret i64 %tmp.3.s
@@ -41,13 +41,13 @@ entry:
 	ret i64 %tmp.3.s
 }
 
-define i32 @sl(i32 signext %x.s, i32 signext %y.s) signext {
+define signext i32 @sl(i32 signext %x.s, i32 signext %y.s)  {
 entry:
 	%tmp.3.s = sub i32 %y.s, %x.s		; <i32> [#uses=1]
 	ret i32 %tmp.3.s
 }
 
-define i32 @sli(i32 signext %x.s) signext {
+define signext i32 @sli(i32 signext %x.s)  {
 entry:
 	%tmp.3.s = sub i32 %x.s, 100		; <i32> [#uses=1]
 	ret i32 %tmp.3.s
@@ -65,14 +65,14 @@ entry:
 	ret i64 %tmp.3.s
 }
 
-define i32 @a4l(i32 signext %x.s, i32 signext %y.s) signext {
+define signext i32 @a4l(i32 signext %x.s, i32 signext %y.s)  {
 entry:
 	%tmp.1.s = shl i32 %y.s, 2		; <i32> [#uses=1]
 	%tmp.3.s = add i32 %tmp.1.s, %x.s		; <i32> [#uses=1]
 	ret i32 %tmp.3.s
 }
 
-define i32 @a8l(i32 signext %x.s, i32 signext %y.s) signext {
+define signext i32 @a8l(i32 signext %x.s, i32 signext %y.s)  {
 entry:
 	%tmp.1.s = shl i32 %y.s, 3		; <i32> [#uses=1]
 	%tmp.3.s = add i32 %tmp.1.s, %x.s		; <i32> [#uses=1]
@@ -93,14 +93,14 @@ entry:
 	ret i64 %tmp.3.s
 }
 
-define i32 @a4li(i32 signext %y.s) signext {
+define signext i32 @a4li(i32 signext %y.s)  {
 entry:
 	%tmp.1.s = shl i32 %y.s, 2		; <i32> [#uses=1]
 	%tmp.3.s = add i32 100, %tmp.1.s		; <i32> [#uses=1]
 	ret i32 %tmp.3.s
 }
 
-define i32 @a8li(i32 signext %y.s) signext {
+define signext i32 @a8li(i32 signext %y.s)  {
 entry:
 	%tmp.1.s = shl i32 %y.s, 3		; <i32> [#uses=1]
 	%tmp.3.s = add i32 100, %tmp.1.s		; <i32> [#uses=1]
@@ -121,14 +121,14 @@ entry:
 	ret i64 %tmp.3.s
 }
 
-define i32 @s4l(i32 signext %x.s, i32 signext %y.s) signext {
+define signext i32 @s4l(i32 signext %x.s, i32 signext %y.s)  {
 entry:
 	%tmp.1.s = shl i32 %y.s, 2		; <i32> [#uses=1]
 	%tmp.3.s = sub i32 %tmp.1.s, %x.s		; <i32> [#uses=1]
 	ret i32 %tmp.3.s
 }
 
-define i32 @s8l(i32 signext %x.s, i32 signext %y.s) signext {
+define signext i32 @s8l(i32 signext %x.s, i32 signext %y.s)  {
 entry:
 	%tmp.1.s = shl i32 %y.s, 3		; <i32> [#uses=1]
 	%tmp.3.s = sub i32 %tmp.1.s, %x.s		; <i32> [#uses=1]
@@ -149,14 +149,14 @@ entry:
 	ret i64 %tmp.3.s
 }
 
-define i32 @s4li(i32 signext %y.s) signext {
+define signext i32 @s4li(i32 signext %y.s)  {
 entry:
 	%tmp.1.s = shl i32 %y.s, 2		; <i32> [#uses=1]
 	%tmp.3.s = sub i32 %tmp.1.s, 100		; <i32> [#uses=1]
 	ret i32 %tmp.3.s
 }
 
-define i32 @s8li(i32 signext %y.s) signext {
+define signext i32 @s8li(i32 signext %y.s)  {
 entry:
 	%tmp.1.s = shl i32 %y.s, 3		; <i32> [#uses=1]
 	%tmp.3.s = sub i32 %tmp.1.s, 100		; <i32> [#uses=1]
diff --git a/test/CodeGen/Alpha/i32_sub_1.ll b/test/CodeGen/Alpha/i32_sub_1.ll
index ffeafbd..35b1d08 100644
--- a/test/CodeGen/Alpha/i32_sub_1.ll
+++ b/test/CodeGen/Alpha/i32_sub_1.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -march=alpha | grep -i {subl \$16,1,\$0}
 
 
-define i32 @foo(i32 signext %x) signext {
+define signext i32 @foo(i32 signext %x) {
 entry:
 	%tmp.1 = add i32 %x, -1		; <int> [#uses=1]
 	ret i32 %tmp.1
diff --git a/test/CodeGen/Alpha/zapnot.ll b/test/CodeGen/Alpha/zapnot.ll
index d00984a..a47035e 100644
--- a/test/CodeGen/Alpha/zapnot.ll
+++ b/test/CodeGen/Alpha/zapnot.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -march=alpha | grep zapnot
 
 
-define i16 @foo(i64 %y) zeroext {
+define zeroext i16 @foo(i64 %y)  {
 entry:
         %tmp.1 = trunc i64 %y to i16         ; <ushort> [#uses=1]
         ret i16 %tmp.1
diff --git a/test/CodeGen/CPP/llvm2cpp.ll b/test/CodeGen/CPP/llvm2cpp.ll
deleted file mode 100644
index d0ba0cf..0000000
--- a/test/CodeGen/CPP/llvm2cpp.ll
+++ /dev/null
@@ -1,756 +0,0 @@
-; RUN: llvm-as < %s | llvm-dis > /dev/null
-; RUN: llc < %s -march=cpp -cppgen=program -o -
-
-@X = global i32 4, align 16		; <i32*> [#uses=0]
-
-define i32* @test1012() align 32 {
-	%X = alloca i32, align 4		; <i32*> [#uses=1]
-	%Y = alloca i32, i32 42, align 16		; <i32*> [#uses=0]
-	%Z = alloca i32		; <i32*> [#uses=0]
-	ret i32* %X
-}
-
-define i32* @test1013() {
-	%X = malloc i32, align 4		; <i32*> [#uses=1]
-	%Y = malloc i32, i32 42, align 16		; <i32*> [#uses=0]
-	%Z = malloc i32		; <i32*> [#uses=0]
-	ret i32* %X
-}
-
-define void @void(i32, i32) {
-	add i32 0, 0		; <i32>:3 [#uses=2]
-	sub i32 0, 4		; <i32>:4 [#uses=2]
-	br label %5
-
-; <label>:5		; preds = %5, %2
-	add i32 %0, %1		; <i32>:6 [#uses=2]
-	sub i32 %6, %4		; <i32>:7 [#uses=1]
-	icmp sle i32 %7, %3		; <i1>:8 [#uses=1]
-	br i1 %8, label %9, label %5
-
-; <label>:9		; preds = %5
-	add i32 %0, %1		; <i32>:10 [#uses=0]
-	sub i32 %6, %4		; <i32>:11 [#uses=1]
-	icmp sle i32 %11, %3		; <i1>:12 [#uses=0]
-	ret void
-}
-
-define i32 @zarro() {
-Startup:
-	ret i32 0
-}
-
-define fastcc void @foo() {
-	ret void
-}
-
-define coldcc void @bar() {
-	call fastcc void @foo( )
-	ret void
-}
-
-define void @structret({ i8 }* sret  %P) {
-	call void @structret( { i8 }* %P sret  )
-	ret void
-}
-
-define void @foo4() {
-	ret void
-}
-
-define coldcc void @bar2() {
-	call fastcc void @foo( )
-	ret void
-}
-
-define cc42 void @bar3() {
-	invoke fastcc void @foo( )
-			to label %Ok unwind label %U
-
-Ok:		; preds = %0
-	ret void
-
-U:		; preds = %0
-	unwind
-}
-
-define void @bar4() {
-	call cc42 void @bar( )
-	invoke cc42 void @bar3( )
-			to label %Ok unwind label %U
-
-Ok:		; preds = %0
-	ret void
-
-U:		; preds = %0
-	unwind
-}
-; ModuleID = 'calltest.ll'
-	%FunTy = type i32 (i32)
-
-define i32 @test1000(i32 %i0) {
-	ret i32 %i0
-}
-
-define void @invoke(%FunTy* %x) {
-	%foo = call i32 %x( i32 123 )		; <i32> [#uses=0]
-	%foo2 = tail call i32 %x( i32 123 )		; <i32> [#uses=0]
-	ret void
-}
-
-define i32 @main(i32 %argc) {
-	%retval = call i32 @test1000( i32 %argc )		; <i32> [#uses=2]
-	%two = add i32 %retval, %retval		; <i32> [#uses=1]
-	%retval2 = invoke i32 @test1000( i32 %argc )
-			to label %Next unwind label %Error		; <i32> [#uses=1]
-
-Next:		; preds = %0
-	%two2 = add i32 %two, %retval2		; <i32> [#uses=1]
-	call void @invoke( %FunTy* @test1000 )
-	ret i32 %two2
-
-Error:		; preds = %0
-	ret i32 -1
-}
-; ModuleID = 'casttest.ll'
-
-define i16 @FunFunc(i64 %x, i8 %z) {
-bb0:
-	%cast110 = sext i8 %z to i16		; <i16> [#uses=1]
-	%cast10 = trunc i64 %x to i16		; <i16> [#uses=1]
-	%reg109 = add i16 %cast110, %cast10		; <i16> [#uses=1]
-	ret i16 %reg109
-}
-; ModuleID = 'cfgstructures.ll'
-
-define void @irreducible(i1 %cond) {
-	br i1 %cond, label %X, label %Y
-
-X:		; preds = %Y, %0
-	br label %Y
-
-Y:		; preds = %X, %0
-	br label %X
-}
-
-define void @sharedheader(i1 %cond) {
-	br label %A
-
-A:		; preds = %Y, %X, %0
-	br i1 %cond, label %X, label %Y
-
-X:		; preds = %A
-	br label %A
-
-Y:		; preds = %A
-	br label %A
-}
-
-define void @nested(i1 %cond1, i1 %cond2, i1 %cond3) {
-	br label %Loop1
-
-Loop1:		; preds = %L2Exit, %0
-	br label %Loop2
-
-Loop2:		; preds = %L3Exit, %Loop1
-	br label %Loop3
-
-Loop3:		; preds = %Loop3, %Loop2
-	br i1 %cond3, label %Loop3, label %L3Exit
-
-L3Exit:		; preds = %Loop3
-	br i1 %cond2, label %Loop2, label %L2Exit
-
-L2Exit:		; preds = %L3Exit
-	br i1 %cond1, label %Loop1, label %L1Exit
-
-L1Exit:		; preds = %L2Exit
-	ret void
-}
-; ModuleID = 'constexpr.ll'
-	%SAType = type { i32, { [2 x float], i64 } }
-	%SType = type { i32, { float, { i8 } }, i64 }
-global i64 1		; <i64*>:0 [#uses=0]
-global i64 74514		; <i64*>:1 [#uses=0]
-@t2 = global i32* @t1		; <i32**> [#uses=0]
-@t3 = global i32* @t1		; <i32**> [#uses=2]
-@t1 = global i32 4		; <i32*> [#uses=2]
-@t4 = global i32** @t3		; <i32***> [#uses=1]
-@t5 = global i32** @t3		; <i32***> [#uses=0]
-@t6 = global i32*** @t4		; <i32****> [#uses=0]
-@t7 = global float* inttoptr (i32 12345678 to float*)		; <float**> [#uses=0]
-@t9 = global i32 8		; <i32*> [#uses=0]
-global i32* bitcast (float* @4 to i32*)		; <i32**>:2 [#uses=0]
-global float* @4		; <float**>:3 [#uses=0]
-global float 0.000000e+00		; <float*>:4 [#uses=2]
-@array = constant [2 x i32] [ i32 12, i32 52 ]		; <[2 x i32]*> [#uses=1]
-@arrayPtr = global i32* getelementptr ([2 x i32]* @array, i64 0, i64 0)		; <i32**> [#uses=1]
-@arrayPtr5 = global i32** getelementptr (i32** @arrayPtr, i64 5)		; <i32***> [#uses=0]
-@somestr = constant [11 x i8] c"hello world"		; <[11 x i8]*> [#uses=2]
-@char5 = global i8* getelementptr ([11 x i8]* @somestr, i64 0, i64 5)		; <i8**> [#uses=0]
-@char8a = global i32* bitcast (i8* getelementptr ([11 x i8]* @somestr, i64 0, i64 8) to i32*)		; <i32**> [#uses=0]
-@char8b = global i8* getelementptr ([11 x i8]* @somestr, i64 0, i64 8)		; <i8**> [#uses=0]
-@S1 = global %SType* null		; <%SType**> [#uses=1]
-@S2c = constant %SType {
-    i32 1, 
-    { float, { i8 } } { float 2.000000e+00, { i8 } { i8 3 } }, 
-    i64 4 }		; <%SType*> [#uses=3]
-@S3c = constant %SAType { i32 1, { [2 x float], i64 } { [2 x float] [ float 2.000000e+00, float 3.000000e+00 ], i64 4 } }		; <%SAType*> [#uses=1]
-@S1ptr = global %SType** @S1		; <%SType***> [#uses=0]
-@S2 = global %SType* @S2c		; <%SType**> [#uses=0]
-@S3 = global %SAType* @S3c		; <%SAType**> [#uses=0]
-@S1fld1a = global float* getelementptr (%SType* @S2c, i64 0, i32 1, i32 0)		; <float**> [#uses=0]
-@S1fld1b = global float* getelementptr (%SType* @S2c, i64 0, i32 1, i32 0)		; <float**> [#uses=1]
-@S1fld1bptr = global float** @S1fld1b		; <float***> [#uses=0]
-@S2fld3 = global i8* getelementptr (%SType* @S2c, i64 0, i32 1, i32 1, i32 0)		; <i8**> [#uses=0]
-
-; ModuleID = 'constpointer.ll'
-@cpt3 = global i32* @cpt1		; <i32**> [#uses=1]
-@cpt1 = global i32 4		; <i32*> [#uses=2]
-@cpt4 = global i32** @cpt3		; <i32***> [#uses=0]
-@cpt2 = global i32* @cpt1		; <i32**> [#uses=0]
-global float* @7		; <float**>:0 [#uses=0]
-global float* @7		; <float**>:1 [#uses=0]
-global float 0.000000e+00		; <float*>:2 [#uses=3]
-global float* @7		; <float**>:3 [#uses=0]
-@fptr = global void ()* @f		; <void ()**> [#uses=0]
-@sptr1 = global [11 x i8]* @somestr		; <[11 x i8]**> [#uses=0]
-@somestr2 = constant [11 x i8] c"hello world"		; <[11 x i8]*> [#uses=2]
-@sptr2 = global [11 x i8]* @somestr2		; <[11 x i8]**> [#uses=0]
-
-declare void @f()
-; ModuleID = 'escaped_label.ll'
-
-define i32 @foo3() {
-	br label "foo`~!@#$%^&*()-_=+{}[]\\|;:',<.>/?"
-
-"foo`~!@#$%^&*()-_=+{}[]\\|;:',<.>/?":		; preds = %0
-	ret i32 17
-}
-; ModuleID = 'float.ll'
-@F1 = global float 4.000000e+00		; <float*> [#uses=0]
-@D1 = global double 4.000000e+00		; <double*> [#uses=0]
-; ModuleID = 'fold-fpcast.ll'
-
-define i32 @test1() {
-	ret i32 1080872141
-}
-
-define float @test1002() {
-	ret float 0x36E1000000000000
-}
-
-define i64 @test3() {
-	ret i64 4614256656431372362
-}
-
-define double @test4() {
-	ret double 2.075076e-322
-}
-; ModuleID = 'forwardreftest.ll'
-	%myfn = type float (i32, double, i32, i16)
-	%myty = type i32
-	%thisfuncty = type i32 (i32)*
-
-declare void @F(%thisfuncty, %thisfuncty, %thisfuncty)
-
-define i32 @zarro2(i32 %Func) {
-Startup:
-	add i32 0, 10		; <i32>:0 [#uses=0]
-	ret i32 0
-}
-
-define i32 @test1004(i32) {
-	call void @F( %thisfuncty @zarro2, %thisfuncty @test1004, %thisfuncty @foozball )
-	ret i32 0
-}
-
-define i32 @foozball(i32) {
-	ret i32 0
-}
-
-; ModuleID = 'globalredefinition.ll'
-@A = global i32* @B		; <i32**> [#uses=0]
-@B = global i32 7		; <i32*> [#uses=1]
-
-define void @test12312() {
-	ret void
-}
-; ModuleID = 'global_section.ll'
-@GlobSec = global i32 4, section "foo", align 16
-
-define void @test1005() section "bar" {
-	ret void
-}
-
-; ModuleID = 'globalvars.ll'
-@MyVar = external global i32		; <i32*> [#uses=1]
-@MyIntList = external global { \2*, i32 }		; <{ \2*, i32 }*> [#uses=1]
-external global i32		; <i32*>:0 [#uses=0]
-@AConst = constant i32 123		; <i32*> [#uses=0]
-@AString = constant [4 x i8] c"test"		; <[4 x i8]*> [#uses=0]
-@ZeroInit = global { [100 x i32], [40 x float] } zeroinitializer		; <{ [100 x i32], [40 x float] }*> [#uses=0]
-
-define i32 @foo10015(i32 %blah) {
-	store i32 5, i32* @MyVar
-	%idx = getelementptr { \2*, i32 }* @MyIntList, i64 0, i32 1		; <i32*> [#uses=1]
-	store i32 12, i32* %idx
-	ret i32 %blah
-}
-; ModuleID = 'indirectcall2.ll'
-
-define i64 @test1006(i64 %X) {
-	ret i64 %X
-}
-
-define i64 @fib(i64 %n) {
-; <label>:0
-	%T = icmp ult i64 %n, 2		; <i1> [#uses=1]
-	br i1 %T, label %BaseCase, label %RecurseCase
-
-RecurseCase:		; preds = %0
-	%result = call i64 @test1006( i64 %n )		; <i64> [#uses=0]
-	br label %BaseCase
-
-BaseCase:		; preds = %RecurseCase, %0
-	%X = phi i64 [ 1, %0 ], [ 2, %RecurseCase ]		; <i64> [#uses=1]
-	ret i64 %X
-}
-; ModuleID = 'indirectcall.ll'
-
-declare i32 @atoi(i8*)
-
-define i64 @fibonacc(i64 %n) {
-	icmp ult i64 %n, 2		; <i1>:1 [#uses=1]
-	br i1 %1, label %BaseCase, label %RecurseCase
-
-BaseCase:		; preds = %0
-	ret i64 1
-
-RecurseCase:		; preds = %0
-	%n2 = sub i64 %n, 2		; <i64> [#uses=1]
-	%n1 = sub i64 %n, 1		; <i64> [#uses=1]
-	%f2 = call i64 @fibonacc( i64 %n2 )		; <i64> [#uses=1]
-	%f1 = call i64 @fibonacc( i64 %n1 )		; <i64> [#uses=1]
-	%result = add i64 %f2, %f1		; <i64> [#uses=1]
-	ret i64 %result
-}
-
-define i64 @realmain(i32 %argc, i8** %argv) {
-; <label>:0
-	icmp eq i32 %argc, 2		; <i1>:1 [#uses=1]
-	br i1 %1, label %HasArg, label %Continue
-
-HasArg:		; preds = %0
-	%n1 = add i32 1, 1		; <i32> [#uses=1]
-	br label %Continue
-
-Continue:		; preds = %HasArg, %0
-	%n = phi i32 [ %n1, %HasArg ], [ 1, %0 ]		; <i32> [#uses=1]
-	%N = sext i32 %n to i64		; <i64> [#uses=1]
-	%F = call i64 @fib( i64 %N )		; <i64> [#uses=1]
-	ret i64 %F
-}
-
-define i64 @trampoline(i64 %n, i64 (i64)* %fibfunc) {
-	%F = call i64 %fibfunc( i64 %n )		; <i64> [#uses=1]
-	ret i64 %F
-}
-
-define i32 @main2() {
-	%Result = call i64 @trampoline( i64 10, i64 (i64)* @fib )		; <i64> [#uses=1]
-	%Result.upgrd.1 = trunc i64 %Result to i32		; <i32> [#uses=1]
-	ret i32 %Result.upgrd.1
-}
-; ModuleID = 'inlineasm.ll'
-module asm "this is an inline asm block"
-module asm "this is another inline asm block"
-
-define i32 @test1007() {
-	%X = call i32 asm "tricky here $0, $1", "=r,r"( i32 4 )		; <i32> [#uses=1]
-	call void asm sideeffect "eieio", ""( )
-	ret i32 %X
-}
-; ModuleID = 'instructions.ll'
-
-define i32 @test_extractelement(<4 x i32> %V) {
-	%R = extractelement <4 x i32> %V, i32 1		; <i32> [#uses=1]
-	ret i32 %R
-}
-
-define <4 x i32> @test_insertelement(<4 x i32> %V) {
-	%R = insertelement <4 x i32> %V, i32 0, i32 0		; <<4 x i32>> [#uses=1]
-	ret <4 x i32> %R
-}
-
-define <4 x i32> @test_shufflevector_u(<4 x i32> %V) {
-	%R = shufflevector <4 x i32> %V, <4 x i32> %V, <4 x i32> < i32 1, i32 undef, i32 7, i32 2 >		; <<4 x i32>> [#uses=1]
-	ret <4 x i32> %R
-}
-
-define <4 x float> @test_shufflevector_f(<4 x float> %V) {
-	%R = shufflevector <4 x float> %V, <4 x float> undef, <4 x i32> < i32 1, i32 undef, i32 7, i32 2 >		; <<4 x float>> [#uses=1]
-	ret <4 x float> %R
-}
-; ModuleID = 'intrinsics.ll'
-
-declare i1 @llvm.isunordered.f32(float, float)
-
-declare i1 @llvm.isunordered.f64(double, double)
-
-declare void @llvm.prefetch(i8*, i32, i32)
-
-declare float @llvm.sqrt.f32(float)
-
-declare double @llvm.sqrt.f64(double)
-
-define void @libm() {
-	fcmp uno float 1.000000e+00, 2.000000e+00		; <i1>:1 [#uses=0]
-	fcmp uno double 3.000000e+00, 4.000000e+00		; <i1>:2 [#uses=0]
-	call void @llvm.prefetch( i8* null, i32 1, i32 3 )
-	call float @llvm.sqrt.f32( float 5.000000e+00 )		; <float>:3 [#uses=0]
-	call double @llvm.sqrt.f64( double 6.000000e+00 )		; <double>:4 [#uses=0]
-	call i8 @llvm.ctpop.i8( i8 10 )		; <i32>:5 [#uses=1]
-	call i16 @llvm.ctpop.i16( i16 11 )		; <i32>:7 [#uses=1]
-	call i32 @llvm.ctpop.i32( i32 12 )		; <i32>:9 [#uses=1]
-	call i64 @llvm.ctpop.i64( i64 13 )		; <i32>:11 [#uses=1]
-	call i8 @llvm.ctlz.i8( i8 14 )		; <i32>:13 [#uses=1]
-	call i16 @llvm.ctlz.i16( i16 15 )		; <i32>:15 [#uses=1]
-	call i32 @llvm.ctlz.i32( i32 16 )		; <i32>:17 [#uses=1]
-	call i64 @llvm.ctlz.i64( i64 17 )		; <i32>:19 [#uses=1]
-	call i8 @llvm.cttz.i8( i8 18 )		; <i32>:21 [#uses=1]
-	call i16 @llvm.cttz.i16( i16 19 )		; <i32>:23 [#uses=1]
-	call i32 @llvm.cttz.i32( i32 20 )		; <i32>:25 [#uses=1]
-	call i64 @llvm.cttz.i64( i64 21 )		; <i32>:27 [#uses=1]
-	ret void
-}
-
-declare i8 @llvm.ctpop.i8(i8)
-
-declare i16 @llvm.ctpop.i16(i16)
-
-declare i32 @llvm.ctpop.i32(i32)
-
-declare i64 @llvm.ctpop.i64(i64)
-
-declare i8 @llvm.ctlz.i8(i8)
-
-declare i16 @llvm.ctlz.i16(i16)
-
-declare i32 @llvm.ctlz.i32(i32)
-
-declare i64 @llvm.ctlz.i64(i64)
-
-declare i8 @llvm.cttz.i8(i8)
-
-declare i16 @llvm.cttz.i16(i16)
-
-declare i32 @llvm.cttz.i32(i32)
-
-declare i64 @llvm.cttz.i64(i64)
-
-; ModuleID = 'packed.ll'
-@foo1 = external global <4 x float>		; <<4 x float>*> [#uses=2]
-@foo102 = external global <2 x i32>		; <<2 x i32>*> [#uses=2]
-
-define void @main3() {
-	store <4 x float> < float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00 >, <4 x float>* @foo1
-	store <2 x i32> < i32 4, i32 4 >, <2 x i32>* @foo102
-	%l1 = load <4 x float>* @foo1		; <<4 x float>> [#uses=0]
-	%l2 = load <2 x i32>* @foo102		; <<2 x i32>> [#uses=0]
-	ret void
-}
-
-; ModuleID = 'properties.ll'
-target datalayout = "e-p:32:32"
-target triple = "proc-vend-sys"
-deplibs = [ "m", "c" ]
-; ModuleID = 'prototype.ll'
-
-declare i32 @bar1017(i32 %in)
-
-define i32 @foo1016(i32 %blah) {
-	%xx = call i32 @bar1017( i32 %blah )		; <i32> [#uses=1]
-	ret i32 %xx
-}
-
-; ModuleID = 'recursivetype.ll'
-	%list = type { %list*, i32 }
-
-declare i8* @malloc(i32)
-
-define void @InsertIntoListTail(%list** %L, i32 %Data) {
-bb1:
-	%reg116 = load %list** %L		; <%list*> [#uses=1]
-	%cast1004 = inttoptr i64 0 to %list*		; <%list*> [#uses=1]
-	%cond1000 = icmp eq %list* %reg116, %cast1004		; <i1> [#uses=1]
-	br i1 %cond1000, label %bb3, label %bb2
-
-bb2:		; preds = %bb2, %bb1
-	%reg117 = phi %list** [ %reg118, %bb2 ], [ %L, %bb1 ]		; <%list**> [#uses=1]
-	%cast1010 = bitcast %list** %reg117 to %list***		; <%list***> [#uses=1]
-	%reg118 = load %list*** %cast1010		; <%list**> [#uses=3]
-	%reg109 = load %list** %reg118		; <%list*> [#uses=1]
-	%cast1005 = inttoptr i64 0 to %list*		; <%list*> [#uses=1]
-	%cond1001 = icmp ne %list* %reg109, %cast1005		; <i1> [#uses=1]
-	br i1 %cond1001, label %bb2, label %bb3
-
-bb3:		; preds = %bb2, %bb1
-	%reg119 = phi %list** [ %reg118, %bb2 ], [ %L, %bb1 ]		; <%list**> [#uses=1]
-	%cast1006 = bitcast %list** %reg119 to i8**		; <i8**> [#uses=1]
-	%reg111 = call i8* @malloc( i32 16 )		; <i8*> [#uses=3]
-	store i8* %reg111, i8** %cast1006
-	%reg111.upgrd.1 = ptrtoint i8* %reg111 to i64		; <i64> [#uses=1]
-	%reg1002 = add i64 %reg111.upgrd.1, 8		; <i64> [#uses=1]
-	%reg1002.upgrd.2 = inttoptr i64 %reg1002 to i8*		; <i8*> [#uses=1]
-	%cast1008 = bitcast i8* %reg1002.upgrd.2 to i32*		; <i32*> [#uses=1]
-	store i32 %Data, i32* %cast1008
-	%cast1003 = inttoptr i64 0 to i64*		; <i64*> [#uses=1]
-	%cast1009 = bitcast i8* %reg111 to i64**		; <i64**> [#uses=1]
-	store i64* %cast1003, i64** %cast1009
-	ret void
-}
-
-define %list* @FindData(%list* %L, i32 %Data) {
-bb1:
-	br label %bb2
-
-bb2:		; preds = %bb6, %bb1
-	%reg115 = phi %list* [ %reg116, %bb6 ], [ %L, %bb1 ]		; <%list*> [#uses=4]
-	%cast1014 = inttoptr i64 0 to %list*		; <%list*> [#uses=1]
-	%cond1011 = icmp ne %list* %reg115, %cast1014		; <i1> [#uses=1]
-	br i1 %cond1011, label %bb4, label %bb3
-
-bb3:		; preds = %bb2
-	ret %list* null
-
-bb4:		; preds = %bb2
-	%idx = getelementptr %list* %reg115, i64 0, i32 1		; <i32*> [#uses=1]
-	%reg111 = load i32* %idx		; <i32> [#uses=1]
-	%cond1013 = icmp ne i32 %reg111, %Data		; <i1> [#uses=1]
-	br i1 %cond1013, label %bb6, label %bb5
-
-bb5:		; preds = %bb4
-	ret %list* %reg115
-
-bb6:		; preds = %bb4
-	%idx2 = getelementptr %list* %reg115, i64 0, i32 0		; <%list**> [#uses=1]
-	%reg116 = load %list** %idx2		; <%list*> [#uses=1]
-	br label %bb2
-}
-; ModuleID = 'simplecalltest.ll'
-	%FunTy = type i32 (i32)
-
-define void @invoke1019(%FunTy* %x) {
-	%foo = call i32 %x( i32 123 )		; <i32> [#uses=0]
-	ret void
-}
-
-define i32 @main4(i32 %argc, i8** %argv, i8** %envp) {
-	%retval = call i32 @test1008( i32 %argc )		; <i32> [#uses=2]
-	%two = add i32 %retval, %retval		; <i32> [#uses=1]
-	%retval2 = call i32 @test1008( i32 %argc )		; <i32> [#uses=1]
-	%two2 = add i32 %two, %retval2		; <i32> [#uses=1]
-	call void @invoke1019( %FunTy* @test1008 )
-	ret i32 %two2
-}
-
-define i32 @test1008(i32 %i0) {
-	ret i32 %i0
-}
-; ModuleID = 'smallest.ll'
-; ModuleID = 'small.ll'
-	%x = type i32
-
-define i32 @foo1020(i32 %in) {
-label:
-	ret i32 2
-}
-; ModuleID = 'testalloca.ll'
-	%inners = type { float, { i8 } }
-	%struct = type { i32, %inners, i64 }
-
-define i32 @testfunction(i32 %i0, i32 %j0) {
-	alloca i8, i32 5		; <i8*>:1 [#uses=0]
-	%ptr = alloca i32		; <i32*> [#uses=2]
-	store i32 3, i32* %ptr
-	%val = load i32* %ptr		; <i32> [#uses=0]
-	%sptr = alloca %struct		; <%struct*> [#uses=2]
-	%nsptr = getelementptr %struct* %sptr, i64 0, i32 1		; <%inners*> [#uses=1]
-	%ubsptr = getelementptr %inners* %nsptr, i64 0, i32 1		; <{ i8 }*> [#uses=1]
-	%idx = getelementptr { i8 }* %ubsptr, i64 0, i32 0		; <i8*> [#uses=1]
-	store i8 4, i8* %idx
-	%fptr = getelementptr %struct* %sptr, i64 0, i32 1, i32 0		; <float*> [#uses=1]
-	store float 4.000000e+00, float* %fptr
-	ret i32 3
-}
-; ModuleID = 'testconstants.ll'
-@somestr3 = constant [11 x i8] c"hello world"
-@array99 = constant [2 x i32] [ i32 12, i32 52 ]
-constant { i32, i32 } { i32 4, i32 3 }		; <{ i32, i32 }*>:0 [#uses=0]
-
-define [2 x i32]* @testfunction99(i32 %i0, i32 %j0) {
-	ret [2 x i32]* @array
-}
-
-define i8* @otherfunc(i32, double) {
-	%somestr = getelementptr [11 x i8]* @somestr3, i64 0, i64 0		; <i8*> [#uses=1]
-	ret i8* %somestr
-}
-
-define i8* @yetanotherfunc(i32, double) {
-	ret i8* null
-}
-
-define i32 @negativeUnsigned() {
-	ret i32 -1
-}
-
-define i32 @largeSigned() {
-	ret i32 -394967296
-}
-; ModuleID = 'testlogical.ll'
-
-define i32 @simpleAdd(i32 %i0, i32 %j0) {
-	%t1 = xor i32 %i0, %j0		; <i32> [#uses=1]
-	%t2 = or i32 %i0, %j0		; <i32> [#uses=1]
-	%t3 = and i32 %t1, %t2		; <i32> [#uses=1]
-	ret i32 %t3
-}
-; ModuleID = 'testmemory.ll'
-	%complexty = type { i32, { [4 x i8*], float }, double }
-	%struct = type { i32, { float, { i8 } }, i64 }
-
-define i32 @main6() {
-	call i32 @testfunction98( i64 0, i64 1 )
-	ret i32 0
-}
-
-define i32 @testfunction98(i64 %i0, i64 %j0) {
-	%array0 = malloc [4 x i8]		; <[4 x i8]*> [#uses=2]
-	%size = add i32 2, 2		; <i32> [#uses=1]
-	%array1 = malloc i8, i32 4		; <i8*> [#uses=1]
-	%array2 = malloc i8, i32 %size		; <i8*> [#uses=1]
-	%idx = getelementptr [4 x i8]* %array0, i64 0, i64 2		; <i8*> [#uses=1]
-	store i8 123, i8* %idx
-	free [4 x i8]* %array0
-	free i8* %array1
-	free i8* %array2
-	%aa = alloca %complexty, i32 5		; <%complexty*> [#uses=1]
-	%idx2 = getelementptr %complexty* %aa, i64 %i0, i32 1, i32 0, i64 %j0		; <i8**> [#uses=1]
-	store i8* null, i8** %idx2
-	%ptr = alloca i32		; <i32*> [#uses=2]
-	store i32 3, i32* %ptr
-	%val = load i32* %ptr		; <i32> [#uses=0]
-	%sptr = alloca %struct		; <%struct*> [#uses=1]
-	%ubsptr = getelementptr %struct* %sptr, i64 0, i32 1, i32 1		; <{ i8 }*> [#uses=1]
-	%idx3 = getelementptr { i8 }* %ubsptr, i64 0, i32 0		; <i8*> [#uses=1]
-	store i8 4, i8* %idx3
-	ret i32 3
-}
-; ModuleID = 'testswitch.ll'
-	%int = type i32
-
-define i32 @squared(i32 %i0) {
-	switch i32 %i0, label %Default [
-		 i32 1, label %Case1
-		 i32 2, label %Case2
-		 i32 4, label %Case4
-	]
-
-Default:		; preds = %0
-	ret i32 -1
-
-Case1:		; preds = %0
-	ret i32 1
-
-Case2:		; preds = %0
-	ret i32 4
-
-Case4:		; preds = %0
-	ret i32 16
-}
-; ModuleID = 'testvarargs.ll'
-
-declare i32 @printf(i8*, ...)
-
-define i32 @testvarar() {
-	call i32 (i8*, ...)* @printf( i8* null, i32 12, i8 42 )		; <i32>:1 [#uses=1]
-	ret i32 %1
-}
-; ModuleID = 'undefined.ll'
-@X2 = global i32 undef		; <i32*> [#uses=0]
-
-declare i32 @atoi(i8*)
-
-define i32 @test1009() {
-	ret i32 undef
-}
-
-define i32 @test1003() {
-	%X = add i32 undef, 1		; <i32> [#uses=1]
-	ret i32 %X
-}
-; ModuleID = 'unreachable.ll'
-
-declare void @bar()
-
-define i32 @foo1021() {
-	unreachable
-}
-
-define double @xyz() {
-	call void @bar( )
-	unreachable
-}
-; ModuleID = 'varargs.ll'
-
-declare void @llvm.va_start(i8* %ap)
-
-declare void @llvm.va_copy(i8* %aq, i8* %ap)
-
-declare void @llvm.va_end(i8* %ap)
-
-define i32 @test1010(i32 %X, ...) {
-	%ap = alloca i8*		; <i8**> [#uses=4]
-	%va.upgrd.1 = bitcast i8** %ap to i8*		; <i8*> [#uses=1]
-	call void @llvm.va_start( i8* %va.upgrd.1 )
-	%tmp = va_arg i8** %ap, i32		; <i32> [#uses=1]
-	%aq = alloca i8*		; <i8**> [#uses=2]
-	%va0.upgrd.2 = bitcast i8** %aq to i8*		; <i8*> [#uses=1]
-	%va1.upgrd.3 = bitcast i8** %ap to i8*		; <i8*> [#uses=1]
-	call void @llvm.va_copy( i8* %va0.upgrd.2, i8* %va1.upgrd.3 )
-	%va.upgrd.4 = bitcast i8** %aq to i8*		; <i8*> [#uses=1]
-	call void @llvm.va_end( i8* %va.upgrd.4 )
-	%va.upgrd.5 = bitcast i8** %ap to i8*		; <i8*> [#uses=1]
-	call void @llvm.va_end( i8* %va.upgrd.5 )
-	ret i32 %tmp
-}
-; ModuleID = 'varargs_new.ll'
-
-declare void @llvm.va_start(i8*)
-
-declare void @llvm.va_copy(i8*, i8*)
-
-declare void @llvm.va_end(i8*)
-
-define i32 @test1011(i32 %X, ...) {
-	%ap = alloca i8*		; <i8**> [#uses=4]
-	%aq = alloca i8*		; <i8**> [#uses=2]
-	%va.upgrd.1 = bitcast i8** %ap to i8*		; <i8*> [#uses=1]
-	call void @llvm.va_start( i8* %va.upgrd.1 )
-	%tmp = va_arg i8** %ap, i32		; <i32> [#uses=1]
-	%apv = load i8** %ap		; <i8*> [#uses=1]
-	%va0.upgrd.2 = bitcast i8** %aq to i8*		; <i8*> [#uses=1]
-	%va1.upgrd.3 = bitcast i8* %apv to i8*		; <i8*> [#uses=1]
-	call void @llvm.va_copy( i8* %va0.upgrd.2, i8* %va1.upgrd.3 )
-	%va.upgrd.4 = bitcast i8** %aq to i8*		; <i8*> [#uses=1]
-	call void @llvm.va_end( i8* %va.upgrd.4 )
-	%va.upgrd.5 = bitcast i8** %ap to i8*		; <i8*> [#uses=1]
-	call void @llvm.va_end( i8* %va.upgrd.5 )
-	ret i32 %tmp
-}
-; ModuleID = 'weirdnames.ll'
-	"&^ " = type { i32 }
-@"%.*+ foo" = global "&^ " { i32 5 }		; <"&^ "*> [#uses=0]
-@"0" = global float 0.000000e+00		; <float*> [#uses=0]
diff --git a/test/CodeGen/CellSPU/and_ops.ll b/test/CodeGen/CellSPU/and_ops.ll
index 139e97b..72478a1 100644
--- a/test/CodeGen/CellSPU/and_ops.ll
+++ b/test/CodeGen/CellSPU/and_ops.ll
@@ -201,12 +201,12 @@ define <4 x i32> @andi_v4i32_4(<4 x i32> %in) {
         ret <4 x i32> %tmp2
 }
 
-define i32 @andi_u32(i32 zeroext  %in) zeroext  {
+define zeroext i32 @andi_u32(i32 zeroext  %in)   {
         %tmp37 = and i32 %in, 37
         ret i32 %tmp37
 }
 
-define i32 @andi_i32(i32 signext  %in) signext  {
+define signext i32 @andi_i32(i32 signext  %in)   {
         %tmp38 = and i32 %in, 37
         ret i32 %tmp38
 }
@@ -241,12 +241,12 @@ define <8 x i16> @andhi_v8i16_4(<8 x i16> %in) {
         ret <8 x i16> %tmp2
 }
 
-define i16 @andhi_u16(i16 zeroext  %in) zeroext  {
+define zeroext i16 @andhi_u16(i16 zeroext  %in)   {
         %tmp37 = and i16 %in, 37         ; <i16> [#uses=1]
         ret i16 %tmp37
 }
 
-define i16 @andhi_i16(i16 signext  %in) signext  {
+define signext i16 @andhi_i16(i16 signext  %in)   {
         %tmp38 = and i16 %in, 37         ; <i16> [#uses=1]
         ret i16 %tmp38
 }
@@ -260,13 +260,13 @@ define <16 x i8> @and_v16i8(<16 x i8> %in) {
         ret <16 x i8> %tmp2
 }
 
-define i8 @and_u8(i8 zeroext  %in) zeroext  {
+define zeroext i8 @and_u8(i8 zeroext  %in)   {
         ; ANDBI generated:
         %tmp37 = and i8 %in, 37
         ret i8 %tmp37
 }
 
-define i8 @and_sext8(i8 signext  %in) signext  {
+define signext i8 @and_sext8(i8 signext  %in)   {
         ; ANDBI generated
         %tmp38 = and i8 %in, 37
         ret i8 %tmp38
diff --git a/test/CodeGen/CellSPU/eqv.ll b/test/CodeGen/CellSPU/eqv.ll
index 22c8c3b..7967681 100644
--- a/test/CodeGen/CellSPU/eqv.ll
+++ b/test/CodeGen/CellSPU/eqv.ll
@@ -79,7 +79,7 @@ define i32 @equiv_i32_5(i32 %arg1, i32 %arg2) {
         ret i32 %C
 }
 
-define i16 @equiv_i16_1(i16 signext %arg1, i16 signext %arg2) signext {
+define signext i16 @equiv_i16_1(i16 signext %arg1, i16 signext %arg2)  {
         %A = and i16 %arg1, %arg2               ; <i16> [#uses=1]
         %B = or i16 %arg1, %arg2                ; <i16> [#uses=1]
         %Bnot = xor i16 %B, -1                  ; <i16> [#uses=1]
@@ -87,7 +87,7 @@ define i16 @equiv_i16_1(i16 signext %arg1, i16 signext %arg2) signext {
         ret i16 %C
 }
 
-define i16 @equiv_i16_2(i16 signext %arg1, i16 signext %arg2) signext {
+define signext i16 @equiv_i16_2(i16 signext %arg1, i16 signext %arg2) {
         %B = or i16 %arg1, %arg2                ; <i16> [#uses=1]
         %Bnot = xor i16 %B, -1                  ; <i16> [#uses=1]
         %A = and i16 %arg1, %arg2               ; <i16> [#uses=1]
@@ -95,7 +95,7 @@ define i16 @equiv_i16_2(i16 signext %arg1, i16 signext %arg2) signext {
         ret i16 %C
 }
 
-define i16 @equiv_i16_3(i16 signext %arg1, i16 signext %arg2) signext {
+define signext i16 @equiv_i16_3(i16 signext %arg1, i16 signext %arg2)  {
         %B = or i16 %arg1, %arg2                ; <i16> [#uses=1]
         %A = and i16 %arg1, %arg2               ; <i16> [#uses=1]
         %Bnot = xor i16 %B, -1                  ; <i16> [#uses=1]
@@ -103,7 +103,7 @@ define i16 @equiv_i16_3(i16 signext %arg1, i16 signext %arg2) signext {
         ret i16 %C
 }
 
-define i8 @equiv_i8_1(i8 signext %arg1, i8 signext %arg2) signext {
+define signext i8 @equiv_i8_1(i8 signext %arg1, i8 signext %arg2)  {
         %A = and i8 %arg1, %arg2                ; <i8> [#uses=1]
         %B = or i8 %arg1, %arg2         ; <i8> [#uses=1]
         %Bnot = xor i8 %B, -1                   ; <i8> [#uses=1]
@@ -111,7 +111,7 @@ define i8 @equiv_i8_1(i8 signext %arg1, i8 signext %arg2) signext {
         ret i8 %C
 }
 
-define i8 @equiv_i8_2(i8 signext %arg1, i8 signext %arg2) signext {
+define signext i8 @equiv_i8_2(i8 signext %arg1, i8 signext %arg2)  {
         %B = or i8 %arg1, %arg2         ; <i8> [#uses=1]
         %Bnot = xor i8 %B, -1                   ; <i8> [#uses=1]
         %A = and i8 %arg1, %arg2                ; <i8> [#uses=1]
@@ -119,7 +119,7 @@ define i8 @equiv_i8_2(i8 signext %arg1, i8 signext %arg2) signext {
         ret i8 %C
 }
 
-define i8 @equiv_i8_3(i8 signext %arg1, i8 signext %arg2) signext {
+define signext i8 @equiv_i8_3(i8 signext %arg1, i8 signext %arg2)  {
         %B = or i8 %arg1, %arg2         ; <i8> [#uses=1]
         %A = and i8 %arg1, %arg2                ; <i8> [#uses=1]
         %Bnot = xor i8 %B, -1                   ; <i8> [#uses=1]
@@ -127,7 +127,7 @@ define i8 @equiv_i8_3(i8 signext %arg1, i8 signext %arg2) signext {
         ret i8 %C
 }
 
-define i8 @equiv_u8_1(i8 zeroext %arg1, i8 zeroext %arg2) zeroext {
+define zeroext i8 @equiv_u8_1(i8 zeroext %arg1, i8 zeroext %arg2)  {
         %A = and i8 %arg1, %arg2                ; <i8> [#uses=1]
         %B = or i8 %arg1, %arg2         ; <i8> [#uses=1]
         %Bnot = xor i8 %B, -1                   ; <i8> [#uses=1]
@@ -135,7 +135,7 @@ define i8 @equiv_u8_1(i8 zeroext %arg1, i8 zeroext %arg2) zeroext {
         ret i8 %C
 }
 
-define i8 @equiv_u8_2(i8 zeroext %arg1, i8 zeroext %arg2) zeroext {
+define zeroext i8 @equiv_u8_2(i8 zeroext %arg1, i8 zeroext %arg2)  {
         %B = or i8 %arg1, %arg2         ; <i8> [#uses=1]
         %Bnot = xor i8 %B, -1                   ; <i8> [#uses=1]
         %A = and i8 %arg1, %arg2                ; <i8> [#uses=1]
@@ -143,7 +143,7 @@ define i8 @equiv_u8_2(i8 zeroext %arg1, i8 zeroext %arg2) zeroext {
         ret i8 %C
 }
 
-define i8 @equiv_u8_3(i8 zeroext %arg1, i8 zeroext %arg2) zeroext {
+define zeroext i8 @equiv_u8_3(i8 zeroext %arg1, i8 zeroext %arg2)  {
         %B = or i8 %arg1, %arg2         ; <i8> [#uses=1]
         %A = and i8 %arg1, %arg2                ; <i8> [#uses=1]
         %Bnot = xor i8 %B, -1                   ; <i8> [#uses=1]
diff --git a/test/CodeGen/CellSPU/mul-with-overflow.ll b/test/CodeGen/CellSPU/mul-with-overflow.ll
index d15da12..c04e69e 100644
--- a/test/CodeGen/CellSPU/mul-with-overflow.ll
+++ b/test/CodeGen/CellSPU/mul-with-overflow.ll
@@ -1,14 +1,14 @@
 ; RUN: llc < %s -march=cellspu
 
 declare {i16, i1} @llvm.smul.with.overflow.i16(i16 %a, i16 %b)
-define i1 @a(i16 %x) zeroext nounwind {
+define zeroext i1 @a(i16 %x)  nounwind {
   %res = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %x, i16 3)
   %obil = extractvalue {i16, i1} %res, 1
   ret i1 %obil
 }
 
 declare {i16, i1} @llvm.umul.with.overflow.i16(i16 %a, i16 %b)
-define i1 @b(i16 %x) zeroext nounwind {
+define zeroext i1 @b(i16 %x)  nounwind {
   %res = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %x, i16 3)
   %obil = extractvalue {i16, i1} %res, 1
   ret i1 %obil
diff --git a/test/CodeGen/CellSPU/nand.ll b/test/CodeGen/CellSPU/nand.ll
index e141923..b770cad 100644
--- a/test/CodeGen/CellSPU/nand.ll
+++ b/test/CodeGen/CellSPU/nand.ll
@@ -60,49 +60,49 @@ define i32 @nand_i32_2(i32 %arg1, i32 %arg2) {
         ret i32 %B
 }
 
-define i16 @nand_i16_1(i16 signext  %arg1, i16 signext  %arg2) signext  {
+define signext i16 @nand_i16_1(i16 signext  %arg1, i16 signext  %arg2)   {
         %A = and i16 %arg2, %arg1            ; <i16> [#uses=1]
         %B = xor i16 %A, -1                  ; <i16> [#uses=1]
         ret i16 %B
 }
 
-define i16 @nand_i16_2(i16 signext  %arg1, i16 signext  %arg2) signext  {
+define signext i16 @nand_i16_2(i16 signext  %arg1, i16 signext  %arg2)   {
         %A = and i16 %arg1, %arg2            ; <i16> [#uses=1]
         %B = xor i16 %A, -1                  ; <i16> [#uses=1]
         ret i16 %B
 }
 
-define i16 @nand_i16u_1(i16 zeroext  %arg1, i16 zeroext  %arg2) zeroext  {
+define zeroext i16 @nand_i16u_1(i16 zeroext  %arg1, i16 zeroext  %arg2)   {
         %A = and i16 %arg2, %arg1            ; <i16> [#uses=1]
         %B = xor i16 %A, -1                  ; <i16> [#uses=1]
         ret i16 %B
 }
 
-define i16 @nand_i16u_2(i16 zeroext  %arg1, i16 zeroext  %arg2) zeroext  {
+define zeroext i16 @nand_i16u_2(i16 zeroext  %arg1, i16 zeroext  %arg2)   {
         %A = and i16 %arg1, %arg2            ; <i16> [#uses=1]
         %B = xor i16 %A, -1                  ; <i16> [#uses=1]
         ret i16 %B
 }
 
-define i8 @nand_i8u_1(i8 zeroext  %arg1, i8 zeroext  %arg2) zeroext  {
+define zeroext i8 @nand_i8u_1(i8 zeroext  %arg1, i8 zeroext  %arg2)   {
         %A = and i8 %arg2, %arg1             ; <i8> [#uses=1]
         %B = xor i8 %A, -1                   ; <i8> [#uses=1]
         ret i8 %B
 }
 
-define i8 @nand_i8u_2(i8 zeroext  %arg1, i8 zeroext  %arg2) zeroext  {
+define zeroext i8 @nand_i8u_2(i8 zeroext  %arg1, i8 zeroext  %arg2)   {
         %A = and i8 %arg1, %arg2             ; <i8> [#uses=1]
         %B = xor i8 %A, -1                   ; <i8> [#uses=1]
         ret i8 %B
 }
 
-define i8 @nand_i8_1(i8 signext  %arg1, i8 signext  %arg2) signext  {
+define signext i8 @nand_i8_1(i8 signext  %arg1, i8 signext  %arg2)   {
         %A = and i8 %arg2, %arg1             ; <i8> [#uses=1]
         %B = xor i8 %A, -1                   ; <i8> [#uses=1]
         ret i8 %B
 }
 
-define i8 @nand_i8_2(i8 signext  %arg1, i8 signext  %arg2) signext  {
+define signext i8 @nand_i8_2(i8 signext  %arg1, i8 signext  %arg2) {
         %A = and i8 %arg1, %arg2             ; <i8> [#uses=1]
         %B = xor i8 %A, -1                   ; <i8> [#uses=1]
         ret i8 %B
diff --git a/test/CodeGen/CellSPU/or_ops.ll b/test/CodeGen/CellSPU/or_ops.ll
index 8aa1e99..46349b9 100644
--- a/test/CodeGen/CellSPU/or_ops.ll
+++ b/test/CodeGen/CellSPU/or_ops.ll
@@ -200,12 +200,12 @@ define <4 x i32> @ori_v4i32_4(<4 x i32> %in) {
         ret <4 x i32> %tmp2
 }
 
-define i32 @ori_u32(i32 zeroext  %in) zeroext  {
+define zeroext i32 @ori_u32(i32 zeroext  %in)   {
         %tmp37 = or i32 %in, 37         ; <i32> [#uses=1]
         ret i32 %tmp37
 }
 
-define i32 @ori_i32(i32 signext  %in) signext  {
+define signext i32 @ori_i32(i32 signext  %in)   {
         %tmp38 = or i32 %in, 37         ; <i32> [#uses=1]
         ret i32 %tmp38
 }
@@ -235,12 +235,12 @@ define <8 x i16> @orhi_v8i16_4(<8 x i16> %in) {
         ret <8 x i16> %tmp2
 }
 
-define i16 @orhi_u16(i16 zeroext  %in) zeroext  {
+define zeroext i16 @orhi_u16(i16 zeroext  %in)   {
         %tmp37 = or i16 %in, 37         ; <i16> [#uses=1]
         ret i16 %tmp37
 }
 
-define i16 @orhi_i16(i16 signext  %in) signext  {
+define signext i16 @orhi_i16(i16 signext  %in)   {
         %tmp38 = or i16 %in, 37         ; <i16> [#uses=1]
         ret i16 %tmp38
 }
@@ -253,12 +253,12 @@ define <16 x i8> @orbi_v16i8(<16 x i8> %in) {
         ret <16 x i8> %tmp2
 }
 
-define i8 @orbi_u8(i8 zeroext  %in) zeroext  {
+define zeroext i8 @orbi_u8(i8 zeroext  %in)   {
         %tmp37 = or i8 %in, 37         ; <i8> [#uses=1]
         ret i8 %tmp37
 }
 
-define i8 @orbi_i8(i8 signext  %in) signext  {
+define signext i8 @orbi_i8(i8 signext  %in)   {
         %tmp38 = or i8 %in, 37         ; <i8> [#uses=1]
         ret i8 %tmp38
 }
diff --git a/test/CodeGen/CellSPU/shift_ops.ll b/test/CodeGen/CellSPU/shift_ops.ll
index c4a5abd..3252c77 100644
--- a/test/CodeGen/CellSPU/shift_ops.ll
+++ b/test/CodeGen/CellSPU/shift_ops.ll
@@ -33,22 +33,22 @@ define i16 @shlh_i16_2(i16 %arg1, i16 %arg2) {
         ret i16 %A
 }
 
-define i16 @shlh_i16_3(i16 signext %arg1, i16 signext %arg2) signext {
+define signext i16 @shlh_i16_3(i16 signext %arg1, i16 signext %arg2) {
         %A = shl i16 %arg1, %arg2
         ret i16 %A
 }
 
-define i16 @shlh_i16_4(i16 signext %arg1, i16 signext %arg2) signext {
+define signext i16 @shlh_i16_4(i16 signext %arg1, i16 signext %arg2) {
         %A = shl i16 %arg2, %arg1
         ret i16 %A
 }
 
-define i16 @shlh_i16_5(i16 zeroext %arg1, i16 zeroext %arg2) zeroext {
+define zeroext i16 @shlh_i16_5(i16 zeroext %arg1, i16 zeroext %arg2)  {
         %A = shl i16 %arg1, %arg2
         ret i16 %A
 }
 
-define i16 @shlh_i16_6(i16 zeroext %arg1, i16 zeroext %arg2) zeroext {
+define zeroext i16 @shlh_i16_6(i16 zeroext %arg1, i16 zeroext %arg2) {
         %A = shl i16 %arg2, %arg1
         ret i16 %A
 }
@@ -76,46 +76,46 @@ define i16 @shlhi_i16_4(i16 %arg1) {
         ret i16 %A
 }
 
-define i16 @shlhi_i16_5(i16 signext %arg1) signext {
+define signext i16 @shlhi_i16_5(i16 signext %arg1)  {
         %A = shl i16 %arg1, 12
         ret i16 %A
 }
 
 ; Should not generate anything other than the return, arg1 << 0 = arg1
-define i16 @shlhi_i16_6(i16 signext %arg1) signext {
+define signext i16 @shlhi_i16_6(i16 signext %arg1) {
         %A = shl i16 %arg1, 0
         ret i16 %A
 }
 
-define i16 @shlhi_i16_7(i16 signext %arg1) signext {
+define signext i16 @shlhi_i16_7(i16 signext %arg1) {
         %A = shl i16 16383, %arg1
         ret i16 %A
 }
 
 ; Should generate 0, 0 << arg1 = 0
-define i16 @shlhi_i16_8(i16 signext %arg1) signext {
+define signext i16 @shlhi_i16_8(i16 signext %arg1)  {
         %A = shl i16 0, %arg1
         ret i16 %A
 }
 
-define i16 @shlhi_i16_9(i16 zeroext %arg1) zeroext {
+define zeroext i16 @shlhi_i16_9(i16 zeroext %arg1)  {
         %A = shl i16 %arg1, 12
         ret i16 %A
 }
 
 ; Should not generate anything other than the return, arg1 << 0 = arg1
-define i16 @shlhi_i16_10(i16 zeroext %arg1) zeroext {
+define zeroext i16 @shlhi_i16_10(i16 zeroext %arg1)  {
         %A = shl i16 %arg1, 0
         ret i16 %A
 }
 
-define i16 @shlhi_i16_11(i16 zeroext %arg1) zeroext {
+define zeroext i16 @shlhi_i16_11(i16 zeroext %arg1)  {
         %A = shl i16 16383, %arg1
         ret i16 %A
 }
 
 ; Should generate 0, 0 << arg1 = 0
-define i16 @shlhi_i16_12(i16 zeroext %arg1) zeroext {
+define zeroext i16 @shlhi_i16_12(i16 zeroext %arg1)  {
         %A = shl i16 0, %arg1
         ret i16 %A
 }
@@ -133,22 +133,22 @@ define i32 @shl_i32_2(i32 %arg1, i32 %arg2) {
         ret i32 %A
 }
 
-define i32 @shl_i32_3(i32 signext %arg1, i32 signext %arg2) signext {
+define signext i32 @shl_i32_3(i32 signext %arg1, i32 signext %arg2)  {
         %A = shl i32 %arg1, %arg2
         ret i32 %A
 }
 
-define i32 @shl_i32_4(i32 signext %arg1, i32 signext %arg2) signext {
+define signext i32 @shl_i32_4(i32 signext %arg1, i32 signext %arg2)  {
         %A = shl i32 %arg2, %arg1
         ret i32 %A
 }
 
-define i32 @shl_i32_5(i32 zeroext %arg1, i32 zeroext %arg2) zeroext {
+define zeroext i32 @shl_i32_5(i32 zeroext %arg1, i32 zeroext %arg2)  {
         %A = shl i32 %arg1, %arg2
         ret i32 %A
 }
 
-define i32 @shl_i32_6(i32 zeroext %arg1, i32 zeroext %arg2) zeroext {
+define zeroext i32 @shl_i32_6(i32 zeroext %arg1, i32 zeroext %arg2)  {
         %A = shl i32 %arg2, %arg1
         ret i32 %A
 }
@@ -176,46 +176,46 @@ define i32 @shli_i32_4(i32 %arg1) {
         ret i32 %A
 }
 
-define i32 @shli_i32_5(i32 signext %arg1) signext {
+define signext i32 @shli_i32_5(i32 signext %arg1)  {
         %A = shl i32 %arg1, 12
         ret i32 %A
 }
 
 ; Should not generate anything other than the return, arg1 << 0 = arg1
-define i32 @shli_i32_6(i32 signext %arg1) signext {
+define signext i32 @shli_i32_6(i32 signext %arg1) {
         %A = shl i32 %arg1, 0
         ret i32 %A
 }
 
-define i32 @shli_i32_7(i32 signext %arg1) signext {
+define signext i32 @shli_i32_7(i32 signext %arg1)  {
         %A = shl i32 16383, %arg1
         ret i32 %A
 }
 
 ; Should generate 0, 0 << arg1 = 0
-define i32 @shli_i32_8(i32 signext %arg1) signext {
+define signext i32 @shli_i32_8(i32 signext %arg1) {
         %A = shl i32 0, %arg1
         ret i32 %A
 }
 
-define i32 @shli_i32_9(i32 zeroext %arg1) zeroext {
+define zeroext i32 @shli_i32_9(i32 zeroext %arg1)  {
         %A = shl i32 %arg1, 12
         ret i32 %A
 }
 
 ; Should not generate anything other than the return, arg1 << 0 = arg1
-define i32 @shli_i32_10(i32 zeroext %arg1) zeroext {
+define zeroext i32 @shli_i32_10(i32 zeroext %arg1)  {
         %A = shl i32 %arg1, 0
         ret i32 %A
 }
 
-define i32 @shli_i32_11(i32 zeroext %arg1) zeroext {
+define zeroext i32 @shli_i32_11(i32 zeroext %arg1) {
         %A = shl i32 16383, %arg1
         ret i32 %A
 }
 
 ; Should generate 0, 0 << arg1 = 0
-define i32 @shli_i32_12(i32 zeroext %arg1) zeroext {
+define zeroext i32 @shli_i32_12(i32 zeroext %arg1) {
         %A = shl i32 0, %arg1
         ret i32 %A
 }
diff --git a/test/CodeGen/CellSPU/struct_1.ll b/test/CodeGen/CellSPU/struct_1.ll
index 8ee7d93..adbb5ef 100644
--- a/test/CodeGen/CellSPU/struct_1.ll
+++ b/test/CodeGen/CellSPU/struct_1.ll
@@ -47,19 +47,19 @@ target triple = "spu"
 ; struct hackstate state = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
 @state = global %struct.hackstate zeroinitializer, align 16
 
-define i8 @get_hackstate_c1() zeroext nounwind  {
+define zeroext i8 @get_hackstate_c1()  nounwind  {
 entry:
         %tmp2 = load i8* getelementptr (%struct.hackstate* @state, i32 0, i32 0), align 16
         ret i8 %tmp2
 }
 
-define i8 @get_hackstate_c2() zeroext nounwind  {
+define zeroext i8 @get_hackstate_c2()  nounwind  {
 entry:
         %tmp2 = load i8* getelementptr (%struct.hackstate* @state, i32 0, i32 1), align 16
         ret i8 %tmp2
 }
 
-define i8 @get_hackstate_c3() zeroext nounwind  {
+define zeroext i8 @get_hackstate_c3()  nounwind  {
 entry:
         %tmp2 = load i8* getelementptr (%struct.hackstate* @state, i32 0, i32 2), align 16
         ret i8 %tmp2
@@ -71,19 +71,19 @@ entry:
         ret i32 %tmp2
 }
 
-define i16 @get_hackstate_s1() signext nounwind  {
+define signext i16 @get_hackstate_s1()  nounwind  {
 entry:
         %tmp2 = load i16* getelementptr (%struct.hackstate* @state, i32 0, i32 4), align 16
         ret i16 %tmp2
 }
 
-define i8 @get_hackstate_c6() zeroext nounwind  {
+define zeroext i8 @get_hackstate_c6()  nounwind  {
 entry:
         %tmp2 = load i8* getelementptr (%struct.hackstate* @state, i32 0, i32 8), align 16
         ret i8 %tmp2
 }
 
-define i8 @get_hackstate_c7() zeroext nounwind  {
+define zeroext i8 @get_hackstate_c7()  nounwind  {
 entry:
         %tmp2 = load i8* getelementptr (%struct.hackstate* @state, i32 0, i32 9), align 16
         ret i8 %tmp2
diff --git a/test/CodeGen/CellSPU/v2f32.ll b/test/CodeGen/CellSPU/v2f32.ll
index efd0320..09e15ff 100644
--- a/test/CodeGen/CellSPU/v2f32.ll
+++ b/test/CodeGen/CellSPU/v2f32.ll
@@ -33,6 +33,7 @@ define %vec @test_mul(%vec %param)
  ret %vec %1
 }
 
+; CHECK: test_splat:
 define %vec @test_splat(float %param ) {
 ;CHECK: lqa
 ;CHECK: shufb
@@ -43,16 +44,17 @@ define %vec @test_splat(float %param ) {
 }
 
 define void @test_store(%vec %val, %vec* %ptr){
-
+; CHECK: test_store:
 ;CHECK: stqd 
-  store %vec undef, %vec* null
+  store %vec zeroinitializer, %vec* null
 
-;CHECK: stqd $3, 0(${{.}})
+;CHECK: stqd $3, 0(${{.*}})
 ;CHECK: bi $lr
   store %vec %val, %vec* %ptr
   ret void
 }
 
+; CHECK: test_insert:
 define %vec @test_insert(){
 ;CHECK: cwd
 ;CHECK: shufb $3
@@ -61,6 +63,8 @@ define %vec @test_insert(){
   ret %vec %rv
 }
 
+; CHECK: test_unaligned_store:
+
 define void @test_unaligned_store()  {
 ;CHECK:	cdd
 ;CHECK:	shufb
@@ -68,7 +72,7 @@ define void @test_unaligned_store()  {
   %data = alloca [4 x float], align 16         ; <[4 x float]*> [#uses=1]
   %ptr = getelementptr [4 x float]* %data, i32 0, i32 2 ; <float*> [#uses=1]
   %vptr = bitcast float* %ptr to  <2 x float>* ; <[1 x <2 x float>]*> [#uses=1]
-  store <2 x float> undef, <2 x float>* %vptr
+  store <2 x float> zeroinitializer, <2 x float>* %vptr
   ret void
 }
 
diff --git a/test/CodeGen/Generic/2010-11-04-BigByval.ll b/test/CodeGen/Generic/2010-11-04-BigByval.ll
index ecb354e..df2ca4c 100644
--- a/test/CodeGen/Generic/2010-11-04-BigByval.ll
+++ b/test/CodeGen/Generic/2010-11-04-BigByval.ll
@@ -1,6 +1,5 @@
 ; RUN: llc < %s
 ; PR7170
-; XFAIL: arm
 
 %big = type [131072 x i8]
 
diff --git a/test/CodeGen/Generic/badlive.ll b/test/CodeGen/Generic/badlive.ll
deleted file mode 100644
index 43b03e3..0000000
--- a/test/CodeGen/Generic/badlive.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; RUN: llc < %s
-
-define i32 @main() {
-bb0:
-        %reg109 = malloc i32, i32 100           ; <i32*> [#uses=2]
-        br label %bb2
-
-bb2:            ; preds = %bb2, %bb0
-        %cann-indvar1 = phi i32 [ 0, %bb0 ], [ %add1-indvar1, %bb2 ]            ; <i32> [#uses=2]
-        %reg127 = mul i32 %cann-indvar1, 2              ; <i32> [#uses=1]
-        %add1-indvar1 = add i32 %cann-indvar1, 1                ; <i32> [#uses=1]
-        store i32 999, i32* %reg109
-        %cond1015 = icmp sle i32 1, 99          ; <i1> [#uses=1]
-        %reg128 = add i32 %reg127, 2            ; <i32> [#uses=0]
-        br i1 %cond1015, label %bb2, label %bb4
-
-bb4:            ; preds = %bb4, %bb2
-        %cann-indvar = phi i32 [ %add1-indvar, %bb4 ], [ 0, %bb2 ]              ; <i32> [#uses=1]
-        %add1-indvar = add i32 %cann-indvar, 1          ; <i32> [#uses=2]
-        store i32 333, i32* %reg109
-        %reg131 = add i32 %add1-indvar, 3               ; <i32> [#uses=1]
-        %cond1017 = icmp ule i32 %reg131, 99            ; <i1> [#uses=1]
-        br i1 %cond1017, label %bb4, label %bb5
-
-bb5:            ; preds = %bb4
-        ret i32 0
-}
-
diff --git a/test/CodeGen/Generic/crash.ll b/test/CodeGen/Generic/crash.ll
index 0427398..e7cc7e3 100644
--- a/test/CodeGen/Generic/crash.ll
+++ b/test/CodeGen/Generic/crash.ll
@@ -38,3 +38,31 @@ unreachable
 declare void @Parse_Vector(double*)
 declare i32 @llvm.objectsize.i32(i8*, i1)
 
+
+; PR9578
+%struct.S0 = type { i32, i8, i32 }
+
+define void @func_82() nounwind optsize {
+entry:
+  br label %for.body.i
+
+for.body.i:                                       ; preds = %for.body.i, %entry
+  br i1 undef, label %func_74.exit.for.cond29.thread_crit_edge, label %for.body.i
+
+func_74.exit.for.cond29.thread_crit_edge:         ; preds = %for.body.i
+  %f13576.pre = getelementptr inbounds %struct.S0* undef, i64 0, i32 1
+  store i8 0, i8* %f13576.pre, align 4, !tbaa !0
+  br label %lbl_468
+
+lbl_468:                                          ; preds = %lbl_468, %func_74.exit.for.cond29.thread_crit_edge
+  %f13577.ph = phi i8* [ %f13576.pre, %func_74.exit.for.cond29.thread_crit_edge ], [ %f135.pre, %lbl_468 ]
+  store i8 1, i8* %f13577.ph, align 1
+  %f135.pre = getelementptr inbounds %struct.S0* undef, i64 0, i32 1
+  br i1 undef, label %lbl_468, label %for.end74
+
+for.end74:                                        ; preds = %lbl_468
+  ret void
+}
+
+!0 = metadata !{metadata !"omnipotent char", metadata !1}
+!1 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/CodeGen/Generic/edge-bundles-blockIDs.ll b/test/CodeGen/Generic/edge-bundles-blockIDs.ll
new file mode 100644
index 0000000..b4ae415
--- /dev/null
+++ b/test/CodeGen/Generic/edge-bundles-blockIDs.ll
@@ -0,0 +1,81 @@
+; Make sure EdgeBoundles handles the case when the function size is less then 
+; the number of block IDs.
+; RUN: llc -regalloc=fast < %s
+
+define void @foo() nounwind {
+entry:
+  br i1 undef, label %bb5.i1632, label %bb1.i1605
+
+bb1.i1605:                                        ; preds = %entry
+  br i1 undef, label %bb5.i73.i, label %bb3.i68.i
+
+bb3.i68.i:                                        ; preds = %bb1.i1605
+  unreachable
+
+bb5.i73.i:                                        ; preds = %bb1.i1605
+  br i1 undef, label %bb7.i79.i, label %bb6.i76.i
+
+bb6.i76.i:                                        ; preds = %bb5.i73.i
+  unreachable
+
+bb7.i79.i:                                        ; preds = %bb5.i73.i
+  br i1 undef, label %bb.i.i1608, label %bb8.i82.i
+
+bb8.i82.i:                                        ; preds = %bb7.i79.i
+  unreachable
+
+bb.i.i1608:                                       ; preds = %bb.i.i1608, %bb7.i79.i
+  br i1 undef, label %bb1.i.dis.preheader_crit_edge.i, label %bb.i.i1608
+
+bb1.i.dis.preheader_crit_edge.i: ; preds = %bb.i.i1608
+  br label %dis.i
+
+bb3.i.i1610:                                      ; preds = %bb8.i.i, %bb7.i.i1615
+  br i1 undef, label %bb5.i.i1613, label %bb4.i.i1611
+
+bb4.i.i1611:                                      ; preds = %bb3.i.i1610
+  br label %bb5.i.i1613
+
+bb5.i.i1613:                                      ; preds = %bb4.i.i1611, %bb3.i.i1610
+  unreachable
+
+bb7.i.i1615:                                      ; preds = %getfloder.exit.i
+  br i1 undef, label %bb3.i.i1610, label %bb8.i.i
+
+bb8.i.i:                                          ; preds = %bb7.i.i1615
+  br i1 undef, label %bb3.i.i1610, label %bb9.i.i
+
+bb9.i.i:                                          ; preds = %bb8.i.i
+  br label %bb12.i.i
+
+bb12.i.i:                                         ; preds = %bb12.i.i, %bb9.i.i
+  br i1 undef, label %bb13.i.bb14.i_crit_edge.i, label %bb12.i.i
+
+bb13.i.bb14.i_crit_edge.i:                        ; preds = %bb12.i.i
+  br i1 undef, label %bb25.i.i, label %bb20.i.i
+
+bb19.i.i:                                         ; preds = %bb20.i.i
+  br label %bb20.i.i
+
+bb20.i.i:                                         ; preds = %bb19.i.i, %bb13.i.bb14.i_crit_edge.i
+  %or.cond.i = or i1 undef, undef
+  br i1 %or.cond.i, label %bb25.i.i, label %bb19.i.i
+
+bb25.i.i:                                         ; preds = %bb20.i.i, %bb13.i.bb14.i_crit_edge.i
+  unreachable
+
+bb5.i1632:                                        ; preds = %entry
+  unreachable
+
+dis.i:                     ; preds = %getfloder.exit.i, %bb1.i.dis.preheader_crit_edge.i
+  br i1 undef, label %bb.i96.i, label %bb1.i102.i
+
+bb.i96.i:                                         ; preds = %dis.i
+  br label %getfloder.exit.i
+
+bb1.i102.i:                                       ; preds = %dis.i
+  br label %getfloder.exit.i
+
+getfloder.exit.i:                           ; preds = %bb1.i102.i, %bb.i96.i
+  br i1 undef, label %bb7.i.i1615, label %dis.i
+}
diff --git a/test/CodeGen/Generic/promote-integers.ll b/test/CodeGen/Generic/promote-integers.ll
new file mode 100644
index 0000000..5812592
--- /dev/null
+++ b/test/CodeGen/Generic/promote-integers.ll
@@ -0,0 +1,15 @@
+; Test that vectors are scalarized/lowered correctly.
+; RUN: llc -march=x86 -promote-elements < %s | FileCheck %s
+
+; This test is the poster-child for integer-element-promotion.
+; Until this feature is complete, we mark this test as expected to fail.
+; XFAIL: *
+; CHECK: vector_code
+; CHECK: ret
+define <4 x float> @vector_code(<4 x i64> %A, <4 x i64> %B, <4 x float> %R0, <4 x float> %R1 )  {
+   %C = icmp eq <4 x i64> %A, %B
+   %K = xor <4 x i1> <i1 1, i1 1, i1 1, i1 1>, %C
+   %D = select <4 x i1> %K, <4 x float> %R1, <4 x float> %R0
+   ret <4 x float> %D
+}
+
diff --git a/test/CodeGen/Generic/zero-sized-array.ll b/test/CodeGen/Generic/zero-sized-array.ll
new file mode 100644
index 0000000..280ba00
--- /dev/null
+++ b/test/CodeGen/Generic/zero-sized-array.ll
@@ -0,0 +1,81 @@
+; RUN: llc < %s
+; PR9900
+
+%zero = type [0 x i8]
+%foobar = type { i32, %zero }
+
+define void @f(%foobar %arg) {
+  %arg1 = extractvalue %foobar %arg, 0
+  %arg2 = extractvalue %foobar %arg, 1
+  call i32 @f2(%zero %arg2, i32 5, i32 42)
+  ret void
+}
+
+define i32 @f2(%zero %x, i32 %y, i32 %z) {
+  ret i32 %y
+}
+
+define void @f3(%zero %x, i32 %y) {
+  call i32 @f2(%zero %x, i32 5, i32 %y)
+  ret void
+}
+
+define void @f4(%zero %z) {
+  insertvalue %foobar undef, %zero %z, 1
+  ret void
+}
+
+define void @f5(%foobar %x) {
+allocas:
+  %y = extractvalue %foobar %x, 1
+  br  label %b1
+
+b1:
+  %insert120 = insertvalue %foobar undef, %zero %y, 1
+  ret void
+}
+
+define void @f6(%zero %x, %zero %y) {
+b1:
+  br i1 undef, label %end, label %b2
+
+b2:
+  br label %end
+
+end:
+  %z = phi %zero [ %y, %b1 ], [ %x, %b2 ]
+  call void @f4(%zero %z)
+  ret void
+}
+
+%zero2 = type {}
+
+define i32 @g1(%zero2 %x, i32 %y, i32 %z) {
+  ret i32 %y
+}
+
+define void @g2(%zero2 %x, i32 %y) {
+  call i32 @g1(%zero2 %x, i32 5, i32 %y)
+  ret void
+}
+
+%zero2r = type {%zero2}
+
+define i32 @h1(%zero2r %x, i32 %y, i32 %z) {
+  ret i32 %y
+}
+
+define void @h2(%zero2r %x, i32 %y) {
+  call i32 @h1(%zero2r %x, i32 5, i32 %y)
+  ret void
+}
+
+%foobar2 = type { i32, %zero2r }
+
+define void @h3(%foobar2 %arg) {
+  %arg1 = extractvalue %foobar2 %arg, 0
+  %arg2 = extractvalue %foobar2 %arg, 1
+  %arg21 = extractvalue %zero2r %arg2, 0
+  call void @g2(%zero2 %arg21, i32 5)
+  ret void
+}
diff --git a/test/CodeGen/MBlaze/fsl.ll b/test/CodeGen/MBlaze/fsl.ll
index f9c6205..5444f82 100644
--- a/test/CodeGen/MBlaze/fsl.ll
+++ b/test/CodeGen/MBlaze/fsl.ll
@@ -3,7 +3,7 @@
 ; dynamic version of the instructions and that constant values use the
 ; constant version of the instructions.
 ;
-; RUN: llc < %s -march=mblaze | FileCheck %s
+; RUN: llc -O3 < %s -march=mblaze | FileCheck %s
 
 declare i32 @llvm.mblaze.fsl.get(i32 %port)
 declare i32 @llvm.mblaze.fsl.aget(i32 %port)
@@ -55,8 +55,7 @@ declare void @llvm.mblaze.fsl.tnaput(i32 %port)
 declare void @llvm.mblaze.fsl.tncput(i32 %port)
 declare void @llvm.mblaze.fsl.tncaput(i32 %port)
 
-define i32 @fsl_get(i32 %port)
-{
+define void @fsl_get(i32 %port) {
     ; CHECK:        fsl_get:
     %v0  = call i32 @llvm.mblaze.fsl.get(i32 %port)
     ; CHECK:        getd
@@ -122,12 +121,11 @@ define i32 @fsl_get(i32 %port)
     ; CHECK-NEXT:   tnecgetd
     %v31 = call i32 @llvm.mblaze.fsl.tnecaget(i32 %port)
     ; CHECK-NEXT:   tnecagetd
-    ret i32 1
+    ret void
     ; CHECK:        rtsd
 }
 
-define i32 @fslc_get()
-{
+define void @fslc_get() {
     ; CHECK:        fslc_get:
     %v0  = call i32 @llvm.mblaze.fsl.get(i32 1)
     ; CHECK:        get
@@ -224,12 +222,11 @@ define i32 @fslc_get()
     %v31 = call i32 @llvm.mblaze.fsl.tnecaget(i32 1)
     ; CHECK-NOT:    tnecagetd
     ; CHECK:        tnecaget
-    ret i32 1
+    ret void
     ; CHECK:        rtsd
 }
 
-define void @putfsl(i32 %value, i32 %port)
-{
+define void @putfsl(i32 %value, i32 %port) {
     ; CHECK:        putfsl:
     call void @llvm.mblaze.fsl.put(i32 %value, i32 %port)
     ; CHECK:        putd
@@ -267,8 +264,7 @@ define void @putfsl(i32 %value, i32 %port)
     ; CHECK:        rtsd
 }
 
-define void @putfsl_const(i32 %value)
-{
+define void @putfsl_const(i32 %value) {
     ; CHECK:        putfsl_const:
     call void @llvm.mblaze.fsl.put(i32 %value, i32 1)
     ; CHECK-NOT:    putd
diff --git a/test/CodeGen/MBlaze/loop.ll b/test/CodeGen/MBlaze/loop.ll
index 8973f75..7439d0b 100644
--- a/test/CodeGen/MBlaze/loop.ll
+++ b/test/CodeGen/MBlaze/loop.ll
@@ -29,14 +29,12 @@ loop_inner_finish:
     %inner.5 = add i32 %inner.2, 1
     call i32 (i8*,...)* @printf( i8* getelementptr([19 x i8]* @MSG,i32 0,i32 0),
                                  i32 %inner.0, i32 %inner.1, i32 %inner.2 )
-    ; CHECK:        brlid
-    ; CHECK:        addik {{.*, 1}}
 
     %inner.6 = icmp eq i32 %inner.5, 100
-    ; CHECK:        cmp
+    ; CHECK:        cmp [[REG:r[0-9]*]]
 
     br i1 %inner.6, label %loop_inner, label %loop_outer_finish
-    ; CHECK:        {{beq|bne}}
+    ; CHECK:        {{beqid|bneid}} [[REG]]
 
 loop_outer_finish:
     %outer.1 = add i32 %outer.0, 1
diff --git a/test/CodeGen/MSP430/Inst8rr.ll b/test/CodeGen/MSP430/Inst8rr.ll
index 0f5fc12..45342e2 100644
--- a/test/CodeGen/MSP430/Inst8rr.ll
+++ b/test/CodeGen/MSP430/Inst8rr.ll
@@ -10,7 +10,7 @@ define i8 @mov(i8 %a, i8 %b) nounwind {
 
 define i8 @add(i8 %a, i8 %b) nounwind {
 ; CHECK: add:
-; CHECK: add.b	r12, r15
+; CHECK: add.b
 	%1 = add i8 %a, %b
 	ret i8 %1
 }
diff --git a/test/CodeGen/Mips/2008-07-16-SignExtInReg.ll b/test/CodeGen/Mips/2008-07-16-SignExtInReg.ll
index 41ae5dd..855194a 100644
--- a/test/CodeGen/Mips/2008-07-16-SignExtInReg.ll
+++ b/test/CodeGen/Mips/2008-07-16-SignExtInReg.ll
@@ -5,13 +5,13 @@
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
 target triple = "mipsallegrexel-unknown-psp-elf"
 
-define i8 @A(i8 %e.0, i8 signext %sum) signext nounwind {
+define signext i8 @A(i8 %e.0, i8 signext %sum)  nounwind {
 entry:
 	add i8 %sum, %e.0		; <i8>:0 [#uses=1]
 	ret i8 %0
 }
 
-define i16 @B(i16 %e.0, i16 signext %sum) signext nounwind {
+define signext i16 @B(i16 %e.0, i16 signext %sum) nounwind {
 entry:
 	add i16 %sum, %e.0		; <i16>:0 [#uses=1]
 	ret i16 %0
diff --git a/test/CodeGen/Mips/2008-07-31-fcopysign.ll b/test/CodeGen/Mips/2008-07-31-fcopysign.ll
index 47382f9..f152acc 100644
--- a/test/CodeGen/Mips/2008-07-31-fcopysign.ll
+++ b/test/CodeGen/Mips/2008-07-31-fcopysign.ll
@@ -2,6 +2,10 @@
 ; RUN: grep abs.s  %t | count 1
 ; RUN: grep neg.s %t | count 1
 
+; FIXME: Should not emit abs.s or neg.s since these instructions produce
+;        incorrect results if the operand is NaN.
+; REQUIRES: disabled
+
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
 target triple = "mipsallegrexel-unknown-psp-elf"
 
diff --git a/test/CodeGen/Mips/2011-05-26-BranchKillsVreg.ll b/test/CodeGen/Mips/2011-05-26-BranchKillsVreg.ll
new file mode 100644
index 0000000..1255949
--- /dev/null
+++ b/test/CodeGen/Mips/2011-05-26-BranchKillsVreg.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -verify-coalescing
+; PR10046
+;
+; PHI elimination splits the critical edge from %while.end415 to %if.end427.
+; This requires updating the BNE-J terminators to a BEQ. The BNE instruction
+; kills a virtual register, and LiveVariables must be updated with the new kill
+; instruction.
+
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-n32"
+target triple = "mips-ellcc-linux"
+
+define i32 @mergesort(i8* %base, i32 %nmemb, i32 %size, i32 (i8*, i8*)* nocapture %cmp) nounwind {
+entry:
+  br i1 undef, label %return, label %if.end13
+
+if.end13:                                         ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body, %if.end13
+  %list1.0482 = phi i8* [ %base, %if.end13 ], [ null, %while.body ]
+  br i1 undef, label %while.end415, label %while.body
+
+while.end415:                                     ; preds = %while.body
+  br i1 undef, label %if.then419, label %if.end427
+
+if.then419:                                       ; preds = %while.end415
+  %call425 = tail call i8* @memmove(i8* %list1.0482, i8* undef, i32 undef) nounwind
+  br label %if.end427
+
+if.end427:                                        ; preds = %if.then419, %while.end415
+  %list2.1 = phi i8* [ undef, %if.then419 ], [ %list1.0482, %while.end415 ]
+  tail call void @free(i8* %list2.1)
+  unreachable
+
+return:                                           ; preds = %entry
+  ret i32 -1
+}
+
+
+declare i8* @memmove(i8*, i8*, i32)
+
+declare void @free(i8*)
+
diff --git a/test/CodeGen/Mips/alloca.ll b/test/CodeGen/Mips/alloca.ll
new file mode 100644
index 0000000..50eeecf
--- /dev/null
+++ b/test/CodeGen/Mips/alloca.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=mipsel -mcpu=4ke < %s | FileCheck %s
+
+define i32 @twoalloca(i32 %size) nounwind {
+entry:
+; CHECK: subu  $[[T0:[0-9]+]], $sp, $[[SZ:[0-9]+]]
+; CHECK: addu  $sp, $zero, $[[T0]]
+; CHECK: addu  $[[SP1:[0-9]+]], $zero, $sp
+; CHECK: subu  $[[T1:[0-9]+]], $sp, $[[SZ]]
+; CHECK: addu  $sp, $zero, $[[T1]]
+; CHECK: addu  $[[SP2:[0-9]+]], $zero, $sp
+; CHECK: lw  $25, %call16(foo)($gp)
+; CHECK: addiu $4, $[[SP1]], 24
+; CHECK: jalr  $25
+; CHECK: lw  $25, %call16(foo)($gp)
+; CHECK: addiu $4, $[[SP2]], 24
+; CHECK: jalr  $25
+  %tmp1 = alloca i8, i32 %size, align 4
+  %add.ptr = getelementptr inbounds i8* %tmp1, i32 5
+  store i8 97, i8* %add.ptr, align 1
+  %tmp4 = alloca i8, i32 %size, align 4
+  call void @foo2(double 1.000000e+00, double 2.000000e+00, i32 3) nounwind
+  %call = call i32 @foo(i8* %tmp1) nounwind
+  %call7 = call i32 @foo(i8* %tmp4) nounwind
+  %add = add nsw i32 %call7, %call
+  ret i32 %add
+}
+
+declare void @foo2(double, double, i32)
+
+declare i32 @foo(i8*)
+
diff --git a/test/CodeGen/Mips/atomic.ll b/test/CodeGen/Mips/atomic.ll
new file mode 100644
index 0000000..2d5555b
--- /dev/null
+++ b/test/CodeGen/Mips/atomic.ll
@@ -0,0 +1,253 @@
+; RUN: llc -march=mipsel -mcpu=mips2 < %s | FileCheck %s
+
+
+declare i32 @llvm.atomic.load.add.i32.p0i32(i32* nocapture, i32) nounwind
+declare i32 @llvm.atomic.load.nand.i32.p0i32(i32* nocapture, i32) nounwind
+declare i32 @llvm.atomic.swap.i32.p0i32(i32* nocapture, i32) nounwind
+declare i32 @llvm.atomic.cmp.swap.i32.p0i32(i32* nocapture, i32, i32) nounwind
+
+declare i8 @llvm.atomic.load.add.i8.p0i8(i8* nocapture, i8) nounwind
+declare i8 @llvm.atomic.load.sub.i8.p0i8(i8* nocapture, i8) nounwind
+declare i8 @llvm.atomic.load.nand.i8.p0i8(i8* nocapture, i8) nounwind
+declare i8 @llvm.atomic.swap.i8.p0i8(i8* nocapture, i8) nounwind
+declare i8 @llvm.atomic.cmp.swap.i8.p0i8(i8* nocapture, i8, i8) nounwind
+
+
+@x = common global i32 0, align 4
+
+define i32 @AtomicLoadAdd32(i32 %incr) nounwind {
+entry:
+  %0 = call i32 @llvm.atomic.load.add.i32.p0i32(i32* @x, i32 %incr)
+  ret i32 %0
+
+; CHECK:   AtomicLoadAdd32:
+; CHECK:   lw      $[[R0:[0-9]+]], %got(x)($gp)
+; CHECK:   $[[BB0:[A-Z_0-9]+]]:
+; CHECK:   ll      $[[R1:[0-9]+]], 0($[[R0]])
+; CHECK:   or      $2, $zero, $[[R1]]
+; CHECK:   addu    $[[R2:[0-9]+]], $[[R1]], $4
+; CHECK:   sc      $[[R2]], 0($[[R0]])
+; CHECK:   beq     $[[R2]], $zero, $[[BB0]]
+}
+
+define i32 @AtomicLoadNand32(i32 %incr) nounwind {
+entry:
+  %0 = call i32 @llvm.atomic.load.nand.i32.p0i32(i32* @x, i32 %incr)
+  ret i32 %0
+
+; CHECK:   AtomicLoadNand32:
+; CHECK:   lw      $[[R0:[0-9]+]], %got(x)($gp)
+; CHECK:   $[[BB0:[A-Z_0-9]+]]:
+; CHECK:   ll      $[[R1:[0-9]+]], 0($[[R0]])
+; CHECK:   or      $2, $zero, $[[R1]]
+; CHECK:   and     $[[R1]], $[[R1]], $4
+; CHECK:   nor     $[[R2:[0-9]+]], $zero, $[[R1]]
+; CHECK:   sc      $[[R2]], 0($[[R0]])
+; CHECK:   beq     $[[R2]], $zero, $[[BB0]]
+}
+
+define i32 @AtomicSwap32(i32 %oldval) nounwind {
+entry:
+  %0 = call i32 @llvm.atomic.swap.i32.p0i32(i32* @x, i32 %oldval)
+  ret i32 %0
+
+; CHECK:   AtomicSwap32:
+; CHECK:   lw      $[[R0:[0-9]+]], %got(x)($gp)
+; CHECK:   sw      $4, [[OFFSET:[0-9]+]]($sp)
+; CHECK:   $[[BB0:[A-Z_0-9]+]]:
+; CHECK:   ll      $[[R1:[0-9]+]], 0($[[R0]])
+; CHECK:   or      $2, $zero, $[[R1]]
+; CHECK:   lw      $[[R2:[0-9]+]], [[OFFSET]]($sp)
+; CHECK:   or      $[[R3:[0-9]+]], $zero, $[[R2]]
+; CHECK:   sc      $[[R3]], 0($[[R0]])
+; CHECK:   beq     $[[R3]], $zero, $[[BB0]]
+}
+
+define i32 @AtomicCmpSwap32(i32 %oldval, i32 %newval) nounwind {
+entry:
+  %0 = call i32 @llvm.atomic.cmp.swap.i32.p0i32(i32* @x, i32 %oldval, i32 %newval)
+  ret i32 %0
+
+; CHECK:   AtomicCmpSwap32:
+; CHECK:   lw      $[[R0:[0-9]+]], %got(x)($gp)
+; CHECK:   sw      $5, [[OFFSET:[0-9]+]]($sp)
+; CHECK:   $[[BB0:[A-Z_0-9]+]]:
+; CHECK:   ll      $2, 0($[[R0]])
+; CHECK:   bne     $2, $4, $[[BB1:[A-Z_0-9]+]]
+; CHECK:   lw      $[[R1:[0-9]+]], [[OFFSET]]($sp)
+; CHECK:   or      $[[R2:[0-9]+]], $zero, $[[R1]]
+; CHECK:   sc      $[[R2]], 0($[[R0]])
+; CHECK:   beq     $[[R2]], $zero, $[[BB0]]
+; CHECK:   $[[BB1]]:
+}
+
+
+
+@y = common global i8 0, align 1
+
+define signext i8 @AtomicLoadAdd8(i8 signext %incr) nounwind {
+entry:
+  %0 = call i8 @llvm.atomic.load.add.i8.p0i8(i8* @y, i8 %incr)
+  ret i8 %0
+
+; CHECK:   AtomicLoadAdd8:
+; CHECK:   lw      $[[R0:[0-9]+]], %got(y)($gp)
+; CHECK:   addiu   $[[R1:[0-9]+]], $zero, -4
+; CHECK:   and     $[[R2:[0-9]+]], $[[R0]], $[[R1]]
+; CHECK:   andi    $[[R3:[0-9]+]], $[[R0]], 3
+; CHECK:   sll     $[[R4:[0-9]+]], $[[R3]], 3
+; CHECK:   ori     $[[R5:[0-9]+]], $zero, 255
+; CHECK:   sll     $[[R6:[0-9]+]], $[[R5]], $[[R4]]
+; CHECK:   nor     $[[R7:[0-9]+]], $zero, $[[R6]]
+; CHECK:   andi    $[[R8:[0-9]+]], $4, 255
+; CHECK:   sll     $[[R9:[0-9]+]], $[[R8]], $[[R4]]
+
+; CHECK:   $[[BB0:[A-Z_0-9]+]]:
+; CHECK:   ll      $[[R10:[0-9]+]], 0($[[R2]])
+; CHECK:   addu    $[[R11:[0-9]+]], $[[R10]], $[[R9]]
+; CHECK:   and     $[[R12:[0-9]+]], $[[R11]], $[[R6]]
+; CHECK:   and     $[[R13:[0-9]+]], $[[R10]], $[[R7]]
+; CHECK:   or      $[[R14:[0-9]+]], $[[R13]], $[[R12]]
+; CHECK:   sc      $[[R14]], 0($[[R2]])
+; CHECK:   beq     $[[R14]], $zero, $[[BB0]]
+
+; CHECK:   and     $[[R15:[0-9]+]], $[[R10]], $[[R6]]
+; CHECK:   srl     $[[R16:[0-9]+]], $[[R15]], $[[R4]]
+; CHECK:   sll     $[[R17:[0-9]+]], $[[R16]], 24
+; CHECK:   sra     $2, $[[R17]], 24
+}
+
+define signext i8 @AtomicLoadSub8(i8 signext %incr) nounwind {
+entry:
+  %0 = call i8 @llvm.atomic.load.sub.i8.p0i8(i8* @y, i8 %incr)
+  ret i8 %0
+
+; CHECK:   AtomicLoadSub8:
+; CHECK:   lw      $[[R0:[0-9]+]], %got(y)($gp)
+; CHECK:   addiu   $[[R1:[0-9]+]], $zero, -4
+; CHECK:   and     $[[R2:[0-9]+]], $[[R0]], $[[R1]]
+; CHECK:   andi    $[[R3:[0-9]+]], $[[R0]], 3
+; CHECK:   sll     $[[R4:[0-9]+]], $[[R3]], 3
+; CHECK:   ori     $[[R5:[0-9]+]], $zero, 255
+; CHECK:   sll     $[[R6:[0-9]+]], $[[R5]], $[[R4]]
+; CHECK:   nor     $[[R7:[0-9]+]], $zero, $[[R6]]
+; CHECK:   subu    $[[R18:[0-9]+]], $zero, $4
+; CHECK:   andi    $[[R8:[0-9]+]], $[[R18]], 255
+; CHECK:   sll     $[[R9:[0-9]+]], $[[R8]], $[[R4]]
+
+; CHECK:   $[[BB0:[A-Z_0-9]+]]:
+; CHECK:   ll      $[[R10:[0-9]+]], 0($[[R2]])
+; CHECK:   addu    $[[R11:[0-9]+]], $[[R10]], $[[R9]]
+; CHECK:   and     $[[R12:[0-9]+]], $[[R11]], $[[R6]]
+; CHECK:   and     $[[R13:[0-9]+]], $[[R10]], $[[R7]]
+; CHECK:   or      $[[R14:[0-9]+]], $[[R13]], $[[R12]]
+; CHECK:   sc      $[[R14]], 0($[[R2]])
+; CHECK:   beq     $[[R14]], $zero, $[[BB0]]
+
+; CHECK:   and     $[[R15:[0-9]+]], $[[R10]], $[[R6]]
+; CHECK:   srl     $[[R16:[0-9]+]], $[[R15]], $[[R4]]
+; CHECK:   sll     $[[R17:[0-9]+]], $[[R16]], 24
+; CHECK:   sra     $2, $[[R17]], 24
+}
+
+define signext i8 @AtomicLoadNand8(i8 signext %incr) nounwind {
+entry:
+  %0 = call i8 @llvm.atomic.load.nand.i8.p0i8(i8* @y, i8 %incr)
+  ret i8 %0
+
+; CHECK:   AtomicLoadNand8:
+; CHECK:   lw      $[[R0:[0-9]+]], %got(y)($gp)
+; CHECK:   addiu   $[[R1:[0-9]+]], $zero, -4
+; CHECK:   and     $[[R2:[0-9]+]], $[[R0]], $[[R1]]
+; CHECK:   andi    $[[R3:[0-9]+]], $[[R0]], 3
+; CHECK:   sll     $[[R4:[0-9]+]], $[[R3]], 3
+; CHECK:   ori     $[[R5:[0-9]+]], $zero, 255
+; CHECK:   sll     $[[R6:[0-9]+]], $[[R5]], $[[R4]]
+; CHECK:   nor     $[[R7:[0-9]+]], $zero, $[[R6]]
+; CHECK:   andi    $[[R8:[0-9]+]], $4, 255
+; CHECK:   sll     $[[R9:[0-9]+]], $[[R8]], $[[R4]]
+
+; CHECK:   $[[BB0:[A-Z_0-9]+]]:
+; CHECK:   ll      $[[R10:[0-9]+]], 0($[[R2]])
+; CHECK:   and     $[[R18:[0-9]+]], $[[R10]], $[[R9]]
+; CHECK:   nor     $[[R11:[0-9]+]], $zero, $[[R18]]
+; CHECK:   and     $[[R12:[0-9]+]], $[[R11]], $[[R6]]
+; CHECK:   and     $[[R13:[0-9]+]], $[[R10]], $[[R7]]
+; CHECK:   or      $[[R14:[0-9]+]], $[[R13]], $[[R12]]
+; CHECK:   sc      $[[R14]], 0($[[R2]])
+; CHECK:   beq     $[[R14]], $zero, $[[BB0]]
+
+; CHECK:   and     $[[R15:[0-9]+]], $[[R10]], $[[R6]]
+; CHECK:   srl     $[[R16:[0-9]+]], $[[R15]], $[[R4]]
+; CHECK:   sll     $[[R17:[0-9]+]], $[[R16]], 24
+; CHECK:   sra     $2, $[[R17]], 24
+}
+
+define signext i8 @AtomicSwap8(i8 signext %oldval) nounwind {
+entry:
+  %0 = call i8 @llvm.atomic.swap.i8.p0i8(i8* @y, i8 %oldval)
+  ret i8 %0
+
+; CHECK:   AtomicSwap8:
+; CHECK:   lw      $[[R0:[0-9]+]], %got(y)($gp)
+; CHECK:   addiu   $[[R1:[0-9]+]], $zero, -4
+; CHECK:   and     $[[R2:[0-9]+]], $[[R0]], $[[R1]]
+; CHECK:   andi    $[[R3:[0-9]+]], $[[R0]], 3
+; CHECK:   sll     $[[R4:[0-9]+]], $[[R3]], 3
+; CHECK:   ori     $[[R5:[0-9]+]], $zero, 255
+; CHECK:   sll     $[[R6:[0-9]+]], $[[R5]], $[[R4]]
+; CHECK:   nor     $[[R7:[0-9]+]], $zero, $[[R6]]
+; CHECK:   andi    $[[R8:[0-9]+]], $4, 255
+; CHECK:   sll     $[[R9:[0-9]+]], $[[R8]], $[[R4]]
+; CHECK:   sw      $[[R9]], [[OFFSET:[0-9]+]]($sp)
+
+; CHECK:   $[[BB0:[A-Z_0-9]+]]:
+; CHECK:   ll      $[[R10:[0-9]+]], 0($[[R2]])
+; CHECK:   lw      $[[R18:[0-9]+]], [[OFFSET]]($sp)
+; CHECK:   or      $[[R11:[0-9]+]], $zero, $[[R18]]
+; CHECK:   and     $[[R12:[0-9]+]], $[[R11]], $[[R6]]
+; CHECK:   and     $[[R13:[0-9]+]], $[[R10]], $[[R7]]
+; CHECK:   or      $[[R14:[0-9]+]], $[[R13]], $[[R12]]
+; CHECK:   sc      $[[R14]], 0($[[R2]])
+; CHECK:   beq     $[[R14]], $zero, $[[BB0]]
+
+; CHECK:   and     $[[R15:[0-9]+]], $[[R10]], $[[R6]]
+; CHECK:   srl     $[[R16:[0-9]+]], $[[R15]], $[[R4]]
+; CHECK:   sll     $[[R17:[0-9]+]], $[[R16]], 24
+; CHECK:   sra     $2, $[[R17]], 24
+}
+
+define signext i8 @AtomicCmpSwap8(i8 signext %oldval, i8 signext %newval) nounwind {
+entry:
+  %0 = call i8 @llvm.atomic.cmp.swap.i8.p0i8(i8* @y, i8 %oldval, i8 %newval)
+  ret i8 %0
+
+; CHECK:   AtomicCmpSwap8:
+; CHECK:   lw      $[[R0:[0-9]+]], %got(y)($gp)
+; CHECK:   addiu   $[[R1:[0-9]+]], $zero, -4
+; CHECK:   and     $[[R2:[0-9]+]], $[[R0]], $[[R1]]
+; CHECK:   andi    $[[R3:[0-9]+]], $[[R0]], 3
+; CHECK:   sll     $[[R4:[0-9]+]], $[[R3]], 3
+; CHECK:   ori     $[[R5:[0-9]+]], $zero, 255
+; CHECK:   sll     $[[R6:[0-9]+]], $[[R5]], $[[R4]]
+; CHECK:   nor     $[[R7:[0-9]+]], $zero, $[[R6]]
+; CHECK:   andi    $[[R8:[0-9]+]], $4, 255
+; CHECK:   sll     $[[R9:[0-9]+]], $[[R8]], $[[R4]]
+; CHECK:   andi    $[[R10:[0-9]+]], $5, 255
+; CHECK:   sll     $[[R11:[0-9]+]], $[[R10]], $[[R4]]
+
+; CHECK:   $[[BB0:[A-Z_0-9]+]]:
+; CHECK:   ll      $[[R12:[0-9]+]], 0($[[R2]])
+; CHECK:   and     $[[R13:[0-9]+]], $[[R12]], $[[R6]]
+; CHECK:   bne     $[[R13]], $[[R9]], $[[BB1:[A-Z_0-9]+]]
+
+; CHECK:   and     $[[R14:[0-9]+]], $[[R12]], $[[R7]]
+; CHECK:   or      $[[R15:[0-9]+]], $[[R14]], $[[R11]]
+; CHECK:   sc      $[[R15]], 0($[[R2]])
+; CHECK:   beq     $[[R15]], $zero, $[[BB0]]
+
+; CHECK:   $[[BB1]]:
+; CHECK:   srl     $[[R16:[0-9]+]], $[[R13]], $[[R4]]
+; CHECK:   sll     $[[R17:[0-9]+]], $[[R16]], 24
+; CHECK:   sra     $2, $[[R17]], 24
+}
diff --git a/test/CodeGen/Mips/blockaddr.ll b/test/CodeGen/Mips/blockaddr.ll
index 2b06314..6de6b77 100644
--- a/test/CodeGen/Mips/blockaddr.ll
+++ b/test/CodeGen/Mips/blockaddr.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s -check-prefix=CHECK-PIC
+; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck %s -check-prefix=CHECK-STATIC
 
 @reg = common global i8* null, align 4
 
@@ -7,10 +8,14 @@ entry:
   ret i8* %x
 }
 
-; CHECK: lw  $2, %got($tmp1)($gp)
-; CHECK: addiu $4, $2, %lo($tmp1)
-; CHECK: lw  $2, %got($tmp2)($gp)
-; CHECK: addiu $2, $2, %lo($tmp2)
+; CHECK-PIC: lw  $[[R0:[0-9]+]], %got($tmp[[T0:[0-9]+]])($gp)
+; CHECK-PIC: addiu ${{[0-9]+}}, $[[R0]], %lo($tmp[[T0]])
+; CHECK-PIC: lw  $[[R1:[0-9]+]], %got($tmp[[T1:[0-9]+]])($gp)
+; CHECK-PIC: addiu ${{[0-9]+}}, $[[R1]], %lo($tmp[[T1]])
+; CHECK-STATIC: lui  $[[R2:[0-9]+]], %hi($tmp[[T0:[0-9]+]])
+; CHECK-STATIC: addiu ${{[0-9]+}}, $[[R2]], %lo($tmp[[T0]])
+; CHECK-STATIC: lui   $[[R3:[0-9]+]], %hi($tmp[[T1:[0-9]+]])
+; CHECK-STATIC: addiu ${{[0-9]+}}, $[[R3]], %lo($tmp[[T1]])
 define void @f() nounwind {
 entry:
   %call = tail call i8* @dummy(i8* blockaddress(@f, %baz))
diff --git a/test/CodeGen/Mips/buildpairextractelementf64.ll b/test/CodeGen/Mips/buildpairextractelementf64.ll
new file mode 100644
index 0000000..585bc25
--- /dev/null
+++ b/test/CodeGen/Mips/buildpairextractelementf64.ll
@@ -0,0 +1,23 @@
+; RUN: llc  < %s -march=mipsel | FileCheck %s
+; RUN: llc  < %s -march=mips   | FileCheck %s
+@a = external global i32
+
+define double @f(i32 %a1, double %d) nounwind {
+entry:
+; CHECK: mtc1
+; CHECK: mtc1
+  store i32 %a1, i32* @a, align 4
+  %add = fadd double %d, 2.000000e+00
+  ret double %add
+}
+
+define void @f3(double %d, i32 %a1) nounwind {
+entry:
+; CHECK: mfc1
+; CHECK: mfc1
+  tail call void @f2(i32 %a1, double %d) nounwind
+  ret void
+}
+
+declare void @f2(i32, double)
+
diff --git a/test/CodeGen/Mips/cmov.ll b/test/CodeGen/Mips/cmov.ll
index 8329c89..ec37961 100755
--- a/test/CodeGen/Mips/cmov.ll
+++ b/test/CodeGen/Mips/cmov.ll
@@ -4,8 +4,8 @@
 @i1 = global [3 x i32] [i32 1, i32 2, i32 3], align 4
 @i3 = common global i32* null, align 4
 
-; CHECK:  lw  ${{[0-9]+}}, %got(i3)($gp)
 ; CHECK:  addiu ${{[0-9]+}}, $gp, %got(i1)
+; CHECK:  lw  ${{[0-9]+}}, %got(i3)($gp)
 define i32* @cmov1(i32 %s) nounwind readonly {
 entry:
   %tobool = icmp ne i32 %s, 0
@@ -14,3 +14,19 @@ entry:
   ret i32* %cond
 }
 
+@c = global i32 1, align 4
+@d = global i32 0, align 4
+
+; CHECK: cmov2:
+; CHECK: addiu $[[R0:[0-9]+]], $gp, %got(c)
+; CHECK: addiu $[[R1:[0-9]+]], $gp, %got(d)
+; CHECK: movn  $[[R1]], $[[R0]], ${{[0-9]+}}
+define i32 @cmov2(i32 %s) nounwind readonly {
+entry:
+  %tobool = icmp ne i32 %s, 0
+  %tmp1 = load i32* @c, align 4
+  %tmp2 = load i32* @d, align 4
+  %cond = select i1 %tobool, i32 %tmp1, i32 %tmp2
+  ret i32 %cond
+}
+
diff --git a/test/CodeGen/Mips/double2int.ll b/test/CodeGen/Mips/double2int.ll
new file mode 100644
index 0000000..3d033e1
--- /dev/null
+++ b/test/CodeGen/Mips/double2int.ll
@@ -0,0 +1,8 @@
+; RUN: llc -march=mips -mcpu=4ke < %s | FileCheck %s
+
+define i32 @f1(double %d) nounwind readnone {
+entry:
+; CHECK: trunc.w.d $f{{[0-9]+}}, $f12
+  %conv = fptosi double %d to i32
+  ret i32 %conv
+}
diff --git a/test/CodeGen/Mips/eh.ll b/test/CodeGen/Mips/eh.ll
new file mode 100644
index 0000000..765b778
--- /dev/null
+++ b/test/CodeGen/Mips/eh.ll
@@ -0,0 +1,78 @@
+; RUN: llc  < %s -march=mipsel -mcpu=4ke | FileCheck %s -check-prefix=CHECK-EL
+; RUN: llc  < %s -march=mips   -mcpu=4ke | FileCheck %s -check-prefix=CHECK-EB
+
+@g1 = global double 0.000000e+00, align 8
+@_ZTId = external constant i8*
+
+define void @_Z1fd(double %i2) {
+entry:
+; CHECK-EL:  addiu $sp, $sp
+; CHECK-EL:  .cfi_def_cfa_offset
+; CHECK-EL:  sdc1 $f20
+; CHECK-EL:  sw  $ra
+; CHECK-EL:  sw  $17
+; CHECK-EL:  sw  $16
+; CHECK-EL:  .cfi_offset 52, -8
+; CHECK-EL:  .cfi_offset 53, -4
+; CHECK-EB:  .cfi_offset 53, -8
+; CHECK-EB:  .cfi_offset 52, -4
+; CHECK-EL:  .cfi_offset 31, -12
+; CHECK-EL:  .cfi_offset 17, -16
+; CHECK-EL:  .cfi_offset 16, -20
+; CHECK-EL:  .cprestore 
+
+  %exception = tail call i8* @__cxa_allocate_exception(i32 8) nounwind
+  %0 = bitcast i8* %exception to double*
+  store double 3.200000e+00, double* %0, align 8, !tbaa !0
+  invoke void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTId to i8*), i8* null) noreturn
+          to label %unreachable unwind label %lpad
+
+lpad:                                             ; preds = %entry
+; CHECK-EL:  # %lpad
+; CHECK-EL:  lw  $gp
+; CHECK-EL:  beq $5
+
+  %exn = tail call i8* @llvm.eh.exception() nounwind
+  %eh.selector = tail call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* bitcast (i8** @_ZTId to i8*)) nounwind
+  %1 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTId to i8*)) nounwind
+  %2 = icmp eq i32 %eh.selector, %1
+  br i1 %2, label %catch, label %eh.resume
+
+catch:                                            ; preds = %lpad
+  %3 = tail call i8* @__cxa_begin_catch(i8* %exn) nounwind
+  %4 = bitcast i8* %3 to double*
+  %exn.scalar = load double* %4, align 8
+  %add = fadd double %exn.scalar, %i2
+  store double %add, double* @g1, align 8, !tbaa !0
+  tail call void @__cxa_end_catch() nounwind
+  ret void
+
+eh.resume:                                        ; preds = %lpad
+  tail call void @llvm.eh.resume(i8* %exn, i32 %eh.selector) noreturn
+  unreachable
+
+unreachable:                                      ; preds = %entry
+  unreachable
+}
+
+declare i8* @__cxa_allocate_exception(i32)
+
+declare i8* @llvm.eh.exception() nounwind readonly
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i32 @llvm.eh.selector(i8*, i8*, ...) nounwind
+
+declare i32 @llvm.eh.typeid.for(i8*) nounwind
+
+declare void @llvm.eh.resume(i8*, i32)
+
+declare void @__cxa_throw(i8*, i8*, i8*)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+!0 = metadata !{metadata !"double", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/CodeGen/Mips/fcopysign.ll b/test/CodeGen/Mips/fcopysign.ll
new file mode 100644
index 0000000..14c6507
--- /dev/null
+++ b/test/CodeGen/Mips/fcopysign.ll
@@ -0,0 +1,55 @@
+; RUN: llc  < %s -march=mipsel -mcpu=4ke | FileCheck %s -check-prefix=CHECK-EL
+; RUN: llc  < %s -march=mips -mcpu=4ke | FileCheck %s -check-prefix=CHECK-EB
+
+define double @func0(double %d0, double %d1) nounwind readnone {
+entry:
+; CHECK-EL: func0:
+; CHECK-EL: lui $[[T0:[0-9]+]], 32767
+; CHECK-EL: lui $[[T1:[0-9]+]], 32768
+; CHECK-EL: mfc1 $[[HI0:[0-9]+]], $f13
+; CHECK-EL: ori $[[MSK0:[0-9]+]], $[[T0]], 65535
+; CHECK-EL: mfc1 $[[HI1:[0-9]+]], $f15
+; CHECK-EL: ori $[[MSK1:[0-9]+]], $[[T1]], 0
+; CHECK-EL: and $[[AND0:[0-9]+]], $[[HI0]], $[[MSK0]]
+; CHECK-EL: and $[[AND1:[0-9]+]], $[[HI1]], $[[MSK1]]
+; CHECK-EL: mfc1 $[[LO0:[0-9]+]], $f12
+; CHECK-EL: or  $[[OR:[0-9]+]], $[[AND0]], $[[AND1]]
+; CHECK-EL: mtc1 $[[LO0]], $f0
+; CHECK-EL: mtc1 $[[OR]], $f1
+;
+; CHECK-EB: lui $[[T0:[0-9]+]], 32767
+; CHECK-EB: lui $[[T1:[0-9]+]], 32768
+; CHECK-EB: mfc1 $[[HI0:[0-9]+]], $f12
+; CHECK-EB: ori $[[MSK0:[0-9]+]], $[[T0]], 65535
+; CHECK-EB: mfc1 $[[HI1:[0-9]+]], $f14
+; CHECK-EB: ori $[[MSK1:[0-9]+]], $[[T1]], 0
+; CHECK-EB: and $[[AND0:[0-9]+]], $[[HI0]], $[[MSK0]]
+; CHECK-EB: and $[[AND1:[0-9]+]], $[[HI1]], $[[MSK1]]
+; CHECK-EB: or  $[[OR:[0-9]+]], $[[AND0]], $[[AND1]]
+; CHECK-EB: mfc1 $[[LO0:[0-9]+]], $f13
+; CHECK-EB: mtc1 $[[OR]], $f0
+; CHECK-EB: mtc1 $[[LO0]], $f1
+  %call = tail call double @copysign(double %d0, double %d1) nounwind readnone
+  ret double %call
+}
+
+declare double @copysign(double, double) nounwind readnone
+
+define float @func1(float %f0, float %f1) nounwind readnone {
+entry:
+; CHECK-EL: func1:
+; CHECK-EL: lui $[[T0:[0-9]+]], 32767
+; CHECK-EL: lui $[[T1:[0-9]+]], 32768
+; CHECK-EL: mfc1 $[[ARG0:[0-9]+]], $f12
+; CHECK-EL: ori $[[MSK0:[0-9]+]], $[[T0]], 65535
+; CHECK-EL: mfc1 $[[ARG1:[0-9]+]], $f14
+; CHECK-EL: ori $[[MSK1:[0-9]+]], $[[T1]], 0
+; CHECK-EL: and $[[T2:[0-9]+]], $[[ARG0]], $[[MSK0]]
+; CHECK-EL: and $[[T3:[0-9]+]], $[[ARG1]], $[[MSK1]]
+; CHECK-EL: or  $[[T4:[0-9]+]], $[[T2]], $[[T3]]
+; CHECK-EL: mtc1 $[[T4]], $f0
+  %call = tail call float @copysignf(float %f0, float %f1) nounwind readnone
+  ret float %call
+}
+
+declare float @copysignf(float, float) nounwind readnone
diff --git a/test/CodeGen/Mips/frame-address.ll b/test/CodeGen/Mips/frame-address.ll
new file mode 100644
index 0000000..c48ce7e
--- /dev/null
+++ b/test/CodeGen/Mips/frame-address.ll
@@ -0,0 +1,12 @@
+; RUN: llc -march=mipsel -mcpu=mips2 < %s | FileCheck %s
+
+declare i8* @llvm.frameaddress(i32) nounwind readnone
+
+define i8* @f() nounwind {
+entry:
+  %0 = call i8* @llvm.frameaddress(i32 0)
+  ret i8* %0
+
+; CHECK:   addu    $fp, $sp, $zero
+; CHECK:   addu    $2, $zero, $fp
+}
diff --git a/test/CodeGen/Mips/gprestore.ll b/test/CodeGen/Mips/gprestore.ll
new file mode 100644
index 0000000..ee7e131
--- /dev/null
+++ b/test/CodeGen/Mips/gprestore.ll
@@ -0,0 +1,32 @@
+; RUN: llc -march=mips < %s | FileCheck %s
+
+@p = external global i32
+@q = external global i32
+@r = external global i32
+
+define void @f0() nounwind {
+entry:
+; CHECK: jalr
+; CHECK-NOT: got({{.*}})($gp)
+; CHECK: lw $gp
+; CHECK: jalr
+; CHECK-NOT: got({{.*}})($gp)
+; CHECK: lw $gp
+; CHECK: jalr
+; CHECK-NOT: got({{.*}})($gp)
+; CHECK: lw $gp
+  tail call void (...)* @f1() nounwind
+  %tmp = load i32* @p, align 4
+  tail call void @f2(i32 %tmp) nounwind
+  %tmp1 = load i32* @q, align 4
+  %tmp2 = load i32* @r, align 4
+  tail call void @f3(i32 %tmp1, i32 %tmp2) nounwind
+  ret void
+}
+
+declare void @f1(...)
+
+declare void @f2(i32)
+
+declare void @f3(i32, i32)
+
diff --git a/test/CodeGen/Mips/i64arg.ll b/test/CodeGen/Mips/i64arg.ll
new file mode 100644
index 0000000..9a30453
--- /dev/null
+++ b/test/CodeGen/Mips/i64arg.ll
@@ -0,0 +1,34 @@
+; RUN: llc -march=mips -mcpu=4ke < %s | FileCheck %s
+
+define void @f1(i64 %ll1, float %f, i64 %ll, i32 %i, float %f2) nounwind {
+entry:
+; CHECK: addu $[[R1:[0-9]+]], $zero, $5
+; CHECK: addu $[[R0:[0-9]+]], $zero, $4
+; CHECK: lw  $25, %call16(ff1)
+; CHECK: ori $6, ${{[0-9]+}}, 3855
+; CHECK: ori $7, ${{[0-9]+}}, 22136
+; CHECK: jalr
+  tail call void @ff1(i32 %i, i64 1085102592623924856) nounwind
+; CHECK: lw $25, %call16(ff2)
+; CHECK: lw $[[R2:[0-9]+]], 80($sp)
+; CHECK: lw $[[R3:[0-9]+]], 84($sp)
+; CHECK: addu $4, $zero, $[[R2]]
+; CHECK: addu $5, $zero, $[[R3]]
+; CHECK: jalr $25
+  tail call void @ff2(i64 %ll, double 3.000000e+00) nounwind
+  %sub = add nsw i32 %i, -1
+; CHECK: sw $[[R0]], 24($sp)
+; CHECK: sw $[[R1]], 28($sp)
+; CHECK: lw $25, %call16(ff3)
+; CHECK: addu $6, $zero, $[[R2]]
+; CHECK: addu $7, $zero, $[[R3]]
+; CHECK: jalr $25
+  tail call void @ff3(i32 %i, i64 %ll, i32 %sub, i64 %ll1) nounwind
+  ret void
+}
+
+declare void @ff1(i32, i64)
+
+declare void @ff2(i64, double)
+
+declare void @ff3(i32, i64, i32, i64)
diff --git a/test/CodeGen/Mips/internalfunc.ll b/test/CodeGen/Mips/internalfunc.ll
index fdfa01a..50d0993 100644
--- a/test/CodeGen/Mips/internalfunc.ll
+++ b/test/CodeGen/Mips/internalfunc.ll
@@ -1,4 +1,4 @@
-; RUN: llc  < %s -march=mips | FileCheck %s
+; RUN: llc  < %s -march=mipsel -mcpu=4ke  | FileCheck %s
 
 @caller.sf1 = internal unnamed_addr global void (...)* null, align 4
 @gf1 = external global void (...)*
diff --git a/test/CodeGen/Mips/largeimmprinting.ll b/test/CodeGen/Mips/largeimmprinting.ll
new file mode 100644
index 0000000..fd7ae9e
--- /dev/null
+++ b/test/CodeGen/Mips/largeimmprinting.ll
@@ -0,0 +1,23 @@
+; RUN: llc -march=mipsel -mcpu=4ke < %s | FileCheck %s
+
+%struct.S1 = type { [65536 x i8] }
+
+@s1 = external global %struct.S1
+
+define void @f() nounwind {
+entry:
+; CHECK:  lui $at, 65534
+; CHECK:  addu  $at, $sp, $at
+; CHECK:  addiu $sp, $at, -16
+; CHECK:  .cprestore  65536
+
+  %agg.tmp = alloca %struct.S1, align 1
+  %tmp = getelementptr inbounds %struct.S1* %agg.tmp, i32 0, i32 0, i32 0
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp, i8* getelementptr inbounds (%struct.S1* @s1, i32 0, i32 0, i32 0), i32 65536, i32 1, i1 false)
+  call void @f2(%struct.S1* byval %agg.tmp) nounwind
+  ret void
+}
+
+declare void @f2(%struct.S1* byval)
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
diff --git a/test/CodeGen/Mips/o32_cc_byval.ll b/test/CodeGen/Mips/o32_cc_byval.ll
new file mode 100644
index 0000000..b78c393
--- /dev/null
+++ b/test/CodeGen/Mips/o32_cc_byval.ll
@@ -0,0 +1,127 @@
+; RUN: llc -march=mipsel -mcpu=4ke < %s | FileCheck %s
+
+%0 = type { i8, i16, i32, i64, double, i32, [4 x i8] }
+%struct.S1 = type { i8, i16, i32, i64, double, i32 }
+%struct.S2 = type { [4 x i32] }
+%struct.S3 = type { i8 }
+
+@f1.s1 = internal unnamed_addr constant %0 { i8 1, i16 2, i32 3, i64 4, double 5.000000e+00, i32 6, [4 x i8] undef }, align 8
+@f1.s2 = internal unnamed_addr constant %struct.S2 { [4 x i32] [i32 7, i32 8, i32 9, i32 10] }, align 4
+
+define void @f1() nounwind {
+entry:
+; CHECK: lw  $[[R1:[0-9]+]], %got(f1.s1)($gp)
+; CHECK: addiu $[[R0:[0-9]+]], $[[R1]], %lo(f1.s1)
+; CHECK: lw  $[[R2:[0-9]+]], 8($[[R0]])
+; CHECK: lw  $[[R7:[0-9]+]], 12($[[R0]])
+; CHECK: lw  $[[R3:[0-9]+]], 16($[[R0]])
+; CHECK: lw  $[[R4:[0-9]+]], 20($[[R0]])
+; CHECK: lw  $[[R5:[0-9]+]], 24($[[R0]])
+; CHECK: lw  $[[R6:[0-9]+]], 28($[[R0]])
+; CHECK: sw  $[[R2]], 16($sp)
+; CHECK: sw  $[[R7]], 20($sp)
+; CHECK: sw  $[[R3]], 24($sp)
+; CHECK: sw  $[[R4]], 28($sp)
+; CHECK: sw  $[[R5]], 32($sp)
+; CHECK: sw  $[[R6]], 36($sp)
+; CHECK: lw  $6, 0($[[R0]])
+; CHECK: lw  $7, 4($[[R0]])
+  %agg.tmp10 = alloca %struct.S3, align 4
+  call void @callee1(float 2.000000e+01, %struct.S1* byval bitcast (%0* @f1.s1 to %struct.S1*)) nounwind
+  call void @callee2(%struct.S2* byval @f1.s2) nounwind
+  %tmp11 = getelementptr inbounds %struct.S3* %agg.tmp10, i32 0, i32 0
+  store i8 11, i8* %tmp11, align 4
+  call void @callee3(float 2.100000e+01, %struct.S3* byval %agg.tmp10, %struct.S1* byval bitcast (%0* @f1.s1 to %struct.S1*)) nounwind
+  ret void
+}
+
+declare void @callee1(float, %struct.S1* byval)
+
+declare void @callee2(%struct.S2* byval)
+
+declare void @callee3(float, %struct.S3* byval, %struct.S1* byval)
+
+define void @f2(float %f, %struct.S1* nocapture byval %s1) nounwind {
+entry:
+; CHECK: addiu $sp, $sp, -56
+; CHECK: sw  $6, 64($sp)
+; CHECK: sw  $7, 68($sp)
+; CHECK: ldc1 $f[[F0:[0-9]+]], 80($sp)
+; CHECK: lw  $[[R2:[0-9]+]], 68($sp)
+; CHECK: lh  $[[R1:[0-9]+]], 66($sp)
+; CHECK: lb  $[[R0:[0-9]+]], 64($sp)
+; CHECK: lw  $[[R3:[0-9]+]], 72($sp)
+; CHECK: lw  $[[R4:[0-9]+]], 76($sp)
+; CHECK: lw  $4, 88($sp)
+; CHECK: sw  $[[R3]], 16($sp)
+; CHECK: sw  $[[R4]], 20($sp)
+; CHECK: sw  $[[R2]], 24($sp)
+; CHECK: sw  $[[R1]], 28($sp)
+; CHECK: sw  $[[R0]], 32($sp)
+; CHECK: mfc1 $6, $f[[F0]]
+
+  %i2 = getelementptr inbounds %struct.S1* %s1, i32 0, i32 5
+  %tmp = load i32* %i2, align 4, !tbaa !0
+  %d = getelementptr inbounds %struct.S1* %s1, i32 0, i32 4
+  %tmp1 = load double* %d, align 8, !tbaa !3
+  %ll = getelementptr inbounds %struct.S1* %s1, i32 0, i32 3
+  %tmp2 = load i64* %ll, align 8, !tbaa !4
+  %i = getelementptr inbounds %struct.S1* %s1, i32 0, i32 2
+  %tmp3 = load i32* %i, align 4, !tbaa !0
+  %s = getelementptr inbounds %struct.S1* %s1, i32 0, i32 1
+  %tmp4 = load i16* %s, align 2, !tbaa !5
+  %c = getelementptr inbounds %struct.S1* %s1, i32 0, i32 0
+  %tmp5 = load i8* %c, align 1, !tbaa !1
+  tail call void @callee4(i32 %tmp, double %tmp1, i64 %tmp2, i32 %tmp3, i16 signext %tmp4, i8 signext %tmp5, float %f) nounwind
+  ret void
+}
+
+declare void @callee4(i32, double, i64, i32, i16 signext, i8 signext, float)
+
+define void @f3(%struct.S2* nocapture byval %s2) nounwind {
+entry:
+; CHECK: addiu $sp, $sp, -56
+; CHECK: sw  $4, 56($sp)
+; CHECK: sw  $5, 60($sp)
+; CHECK: sw  $6, 64($sp)
+; CHECK: sw  $7, 68($sp)
+; CHECK: lw  $[[R0:[0-9]+]], 68($sp)
+; CHECK: lw  $4, 56($sp)
+; CHECK: sw  $[[R0]], 24($sp)
+
+  %arrayidx = getelementptr inbounds %struct.S2* %s2, i32 0, i32 0, i32 0
+  %tmp = load i32* %arrayidx, align 4, !tbaa !0
+  %arrayidx2 = getelementptr inbounds %struct.S2* %s2, i32 0, i32 0, i32 3
+  %tmp3 = load i32* %arrayidx2, align 4, !tbaa !0
+  tail call void @callee4(i32 %tmp, double 2.000000e+00, i64 3, i32 %tmp3, i16 signext 4, i8 signext 5, float 6.000000e+00) nounwind
+  ret void
+}
+
+define void @f4(float %f, %struct.S3* nocapture byval %s3, %struct.S1* nocapture byval %s1) nounwind {
+entry:
+; CHECK: addiu $sp, $sp, -56
+; CHECK: sw  $5, 60($sp)
+; CHECK: sw  $6, 64($sp)
+; CHECK: sw  $7, 68($sp)
+; CHECK: lw  $[[R1:[0-9]+]], 88($sp)
+; CHECK: lb  $[[R0:[0-9]+]], 60($sp)
+; CHECK: lw  $4, 68($sp)
+; CHECK: sw  $[[R1]], 24($sp)
+; CHECK: sw  $[[R0]], 32($sp)
+
+  %i = getelementptr inbounds %struct.S1* %s1, i32 0, i32 2
+  %tmp = load i32* %i, align 4, !tbaa !0
+  %i2 = getelementptr inbounds %struct.S1* %s1, i32 0, i32 5
+  %tmp1 = load i32* %i2, align 4, !tbaa !0
+  %c = getelementptr inbounds %struct.S3* %s3, i32 0, i32 0
+  %tmp2 = load i8* %c, align 1, !tbaa !1
+  tail call void @callee4(i32 %tmp, double 2.000000e+00, i64 3, i32 %tmp1, i16 signext 4, i8 signext %tmp2, float 6.000000e+00) nounwind
+  ret void
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
+!3 = metadata !{metadata !"double", metadata !1}
+!4 = metadata !{metadata !"long long", metadata !1}
+!5 = metadata !{metadata !"short", metadata !1}
diff --git a/test/CodeGen/Mips/o32_cc_vararg.ll b/test/CodeGen/Mips/o32_cc_vararg.ll
index 6601d25..14ce04b 100644
--- a/test/CodeGen/Mips/o32_cc_vararg.ll
+++ b/test/CodeGen/Mips/o32_cc_vararg.ll
@@ -1,12 +1,11 @@
-; RUN: llc -march=mipsel -mcpu=mips2 < %s | FileCheck %s
-; RUN: llc -march=mipsel -mcpu=mips2 < %s -regalloc=basic | FileCheck %s
+; RUN: llc -march=mipsel -mcpu=mips2 -pre-RA-sched=source < %s | FileCheck %s
 
 
 ; All test functions do the same thing - they return the first variable
 ; argument.
 
-; All CHECK's do the same thing - they check whether variable arguments from 
-; registers are placed on correct stack locations, and whether the first 
+; All CHECK's do the same thing - they check whether variable arguments from
+; registers are placed on correct stack locations, and whether the first
 ; variable argument is returned from the correct stack location.
 
 
@@ -30,15 +29,15 @@ entry:
   ret i32 %tmp
 
 ; CHECK: va1:
-; CHECK: addiu   $sp, $sp, -32
-; CHECK: sw      $5, 36($sp)
-; CHECK: sw      $6, 40($sp)
-; CHECK: sw      $7, 44($sp)
-; CHECK: lw      $2, 36($sp)
+; CHECK: addiu   $sp, $sp, -16
+; CHECK: sw      $7, 28($sp)
+; CHECK: sw      $6, 24($sp)
+; CHECK: sw      $5, 20($sp)
+; CHECK: lw      $2, 20($sp)
 }
 
-; check whether the variable double argument will be accessed from the 8-byte 
-; aligned location (i.e. whether the address is computed by adding 7 and 
+; check whether the variable double argument will be accessed from the 8-byte
+; aligned location (i.e. whether the address is computed by adding 7 and
 ; clearing lower 3 bits)
 define double @va2(i32 %a, ...) nounwind {
 entry:
@@ -56,11 +55,11 @@ entry:
   ret double %tmp
 
 ; CHECK: va2:
-; CHECK: addiu   $sp, $sp, -40
-; CHECK: addiu   $[[R0:[0-9]+]], $sp, 44
-; CHECK: sw      $5, 44($sp)
-; CHECK: sw      $6, 48($sp)
-; CHECK: sw      $7, 52($sp)
+; CHECK: addiu   $sp, $sp, -16
+; CHECK: sw      $7, 28($sp)
+; CHECK: sw      $6, 24($sp)
+; CHECK: sw      $5, 20($sp)
+; CHECK: addiu   $[[R0:[0-9]+]], $sp, 20
 ; CHECK: addiu   $[[R1:[0-9]+]], $[[R0]], 7
 ; CHECK: addiu   $[[R2:[0-9]+]], $zero, -8
 ; CHECK: and     $[[R3:[0-9]+]], $[[R1]], $[[R2]]
@@ -84,10 +83,10 @@ entry:
   ret i32 %tmp
 
 ; CHECK: va3:
-; CHECK: addiu   $sp, $sp, -40
-; CHECK: sw      $6, 48($sp)
-; CHECK: sw      $7, 52($sp)
-; CHECK: lw      $2, 48($sp)
+; CHECK: addiu   $sp, $sp, -16
+; CHECK: sw      $7, 28($sp)
+; CHECK: sw      $6, 24($sp)
+; CHECK: lw      $2, 24($sp)
 }
 
 ; double
@@ -107,14 +106,11 @@ entry:
   ret double %tmp
 
 ; CHECK: va4:
-; CHECK: addiu   $sp, $sp, -48
-; CHECK: sw      $6, 56($sp)
-; CHECK: sw      $7, 60($sp)
-; CHECK: addiu   $[[R0:[0-9]+]], $sp, 56
-; CHECK: addiu   $[[R1:[0-9]+]], $[[R0]], 7
-; CHECK: addiu   $[[R2:[0-9]+]], $zero, -8
-; CHECK: and     $[[R3:[0-9]+]], $[[R1]], $[[R2]]
-; CHECK: ldc1    $f0, 0($[[R3]])
+; CHECK: addiu   $sp, $sp, -24
+; CHECK: sw      $7, 36($sp)
+; CHECK: sw      $6, 32($sp)
+; CHECK: addiu   ${{[0-9]+}}, $sp, 32
+; CHECK: ldc1    $f0, 32($sp)
 }
 
 ; int
@@ -138,9 +134,9 @@ entry:
   ret i32 %tmp
 
 ; CHECK: va5:
-; CHECK: addiu   $sp, $sp, -40
-; CHECK: sw      $7, 52($sp)
-; CHECK: lw      $2, 52($sp)
+; CHECK: addiu   $sp, $sp, -24
+; CHECK: sw      $7, 36($sp)
+; CHECK: lw      $2, 36($sp)
 }
 
 ; double
@@ -164,9 +160,9 @@ entry:
   ret double %tmp
 
 ; CHECK: va6:
-; CHECK: addiu   $sp, $sp, -48
-; CHECK: sw      $7, 60($sp)
-; CHECK: addiu   $[[R0:[0-9]+]], $sp, 60
+; CHECK: addiu   $sp, $sp, -24
+; CHECK: sw      $7, 36($sp)
+; CHECK: addiu   $[[R0:[0-9]+]], $sp, 36
 ; CHECK: addiu   $[[R1:[0-9]+]], $[[R0]], 7
 ; CHECK: addiu   $[[R2:[0-9]+]], $zero, -8
 ; CHECK: and     $[[R3:[0-9]+]], $[[R1]], $[[R2]]
@@ -192,8 +188,8 @@ entry:
   ret i32 %tmp
 
 ; CHECK: va7:
-; CHECK: addiu   $sp, $sp, -40
-; CHECK: lw      $2, 56($sp)
+; CHECK: addiu   $sp, $sp, -24
+; CHECK: lw      $2, 40($sp)
 }
 
 ; double
@@ -215,12 +211,9 @@ entry:
   ret double %tmp
 
 ; CHECK: va8:
-; CHECK: addiu   $sp, $sp, -48
-; CHECK: addiu   $[[R0:[0-9]+]], $sp, 64
-; CHECK: addiu   $[[R1:[0-9]+]], $[[R0]], 7
-; CHECK: addiu   $[[R2:[0-9]+]], $zero, -8
-; CHECK: and     $[[R3:[0-9]+]], $[[R1]], $[[R2]]
-; CHECK: ldc1    $f0, 0($[[R3]])
+; CHECK: addiu   $sp, $sp, -32
+; CHECK: addiu   ${{[0-9]+}}, $sp, 48
+; CHECK: ldc1    $f0, 48($sp)
 }
 
 ; int
@@ -244,8 +237,8 @@ entry:
   ret i32 %tmp
 
 ; CHECK: va9:
-; CHECK: addiu   $sp, $sp, -56
-; CHECK: lw      $2, 76($sp)
+; CHECK: addiu   $sp, $sp, -32
+; CHECK: lw      $2, 52($sp)
 }
 
 ; double
@@ -269,8 +262,8 @@ entry:
   ret double %tmp
 
 ; CHECK: va10:
-; CHECK: addiu   $sp, $sp, -56
-; CHECK: addiu   $[[R0:[0-9]+]], $sp, 76
+; CHECK: addiu   $sp, $sp, -32
+; CHECK: addiu   $[[R0:[0-9]+]], $sp, 52
 ; CHECK: addiu   $[[R1:[0-9]+]], $[[R0]], 7
 ; CHECK: addiu   $[[R2:[0-9]+]], $zero, -8
 ; CHECK: and     $[[R3:[0-9]+]], $[[R1]], $[[R2]]
diff --git a/test/CodeGen/Mips/tls.ll b/test/CodeGen/Mips/tls.ll
new file mode 100644
index 0000000..034738b
--- /dev/null
+++ b/test/CodeGen/Mips/tls.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=mipsel -mcpu=mips2 < %s | FileCheck %s -check-prefix=PIC
+; RUN: llc -march=mipsel -mcpu=mips2 -relocation-model=static < %s \
+; RUN:                             | FileCheck %s -check-prefix=STATIC
+
+
+@t1 = thread_local global i32 0, align 4
+
+define i32 @f1() nounwind {
+entry:
+  %tmp = load i32* @t1, align 4
+  ret i32 %tmp
+
+; CHECK: f1:
+
+; PIC:   lw      $25, %call16(__tls_get_addr)($gp)
+; PIC:   addiu   $4, $gp, %tlsgd(t1)
+; PIC:   jalr    $25
+; PIC:   lw      $2, 0($2)
+
+; STATIC:   rdhwr   $3, $29
+; STATIC:   lui     $[[R0:[0-9]+]], %tprel_hi(t1)
+; STATIC:   addiu   $[[R1:[0-9]+]], $[[R0]], %tprel_lo(t1)
+; STATIC:   addu    $[[R2:[0-9]+]], $3, $[[R1]]
+; STATIC:   lw      $2, 0($[[R2]])
+}
+
+
+@t2 = external thread_local global i32
+
+define i32 @f2() nounwind {
+entry:
+  %tmp = load i32* @t2, align 4
+  ret i32 %tmp
+
+; CHECK: f2:
+
+; PIC:   lw      $25, %call16(__tls_get_addr)($gp)
+; PIC:   addiu   $4, $gp, %tlsgd(t2)
+; PIC:   jalr    $25
+; PIC:   lw      $2, 0($2)
+
+; STATIC:   rdhwr   $3, $29
+; STATIC:   lw      $[[R0:[0-9]+]], %gottprel(t2)($gp)
+; STATIC:   addu    $[[R1:[0-9]+]], $3, $[[R0]]
+; STATIC:   lw      $2, 0($[[R1]])
+}
diff --git a/test/CodeGen/Mips/weak.ll b/test/CodeGen/Mips/weak.ll
new file mode 100644
index 0000000..09dd2a4
--- /dev/null
+++ b/test/CodeGen/Mips/weak.ll
@@ -0,0 +1,12 @@
+; RUN: llc -march=mips < %s | FileCheck %s
+
+@t = common global i32 (...)* null, align 4
+
+define void @f() nounwind {
+entry:
+  store i32 (...)* @test_weak, i32 (...)** @t, align 4
+  ret void
+}
+
+; CHECK: .weak test_weak
+declare extern_weak i32 @test_weak(...)
diff --git a/test/CodeGen/PTX/add.ll b/test/CodeGen/PTX/add.ll
index 598591c..b89a2f6 100644
--- a/test/CodeGen/PTX/add.ll
+++ b/test/CodeGen/PTX/add.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ptx | FileCheck %s
+; RUN: llc < %s -march=ptx32 | FileCheck %s
 
 define ptx_device i16 @t1_u16(i16 %x, i16 %y) {
 ; CHECK: add.u16 rh0, rh1, rh2;
@@ -22,14 +22,14 @@ define ptx_device i64 @t1_u64(i64 %x, i64 %y) {
 }
 
 define ptx_device float @t1_f32(float %x, float %y) {
-; CHECK: add.f32 f0, f1, f2
+; CHECK: add.f32 r0, r1, r2
 ; CHECK-NEXT: ret;
   %z = fadd float %x, %y
   ret float %z
 }
 
 define ptx_device double @t1_f64(double %x, double %y) {
-; CHECK: add.f64 fd0, fd1, fd2
+; CHECK: add.f64 rd0, rd1, rd2
 ; CHECK-NEXT: ret;
   %z = fadd double %x, %y
   ret double %z
@@ -57,14 +57,14 @@ define ptx_device i64 @t2_u64(i64 %x) {
 }
 
 define ptx_device float @t2_f32(float %x) {
-; CHECK: add.f32 f0, f1, 0F3F800000;
+; CHECK: add.f32 r0, r1, 0F3F800000;
 ; CHECK-NEXT: ret;
   %z = fadd float %x, 1.0
   ret float %z
 }
 
 define ptx_device double @t2_f64(double %x) {
-; CHECK: add.f64 fd0, fd1, 0D3FF0000000000000;
+; CHECK: add.f64 rd0, rd1, 0D3FF0000000000000;
 ; CHECK-NEXT: ret;
   %z = fadd double %x, 1.0
   ret double %z
diff --git a/test/CodeGen/PTX/bitwise.ll b/test/CodeGen/PTX/bitwise.ll
new file mode 100644
index 0000000..dbc77e5
--- /dev/null
+++ b/test/CodeGen/PTX/bitwise.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -march=ptx32 | FileCheck %s
+
+; preds
+
+define ptx_device i32 @t1_and_preds(i1 %x, i1 %y) {
+; CHECK: and.pred p0, p1, p2
+  %c = and i1 %x, %y
+  %d = zext i1 %c to i32 
+  ret i32 %d
+}
+
+define ptx_device i32 @t1_or_preds(i1 %x, i1 %y) {
+; CHECK: or.pred p0, p1, p2
+  %a = or i1 %x, %y
+  %b = zext i1 %a to i32 
+  ret i32 %b
+}
+
+define ptx_device i32 @t1_xor_preds(i1 %x, i1 %y) {
+; CHECK: xor.pred p0, p1, p2
+  %a = xor i1 %x, %y
+  %b = zext i1 %a to i32 
+  ret i32 %b
+}
diff --git a/test/CodeGen/PTX/bra.ll b/test/CodeGen/PTX/bra.ll
index 0506a99..49383eb 100644
--- a/test/CodeGen/PTX/bra.ll
+++ b/test/CodeGen/PTX/bra.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ptx | FileCheck %s
+; RUN: llc < %s -march=ptx32 | FileCheck %s
 
 define ptx_device void @test_bra_direct() {
 ; CHECK: bra $L__BB0_1;
diff --git a/test/CodeGen/PTX/cvt.ll b/test/CodeGen/PTX/cvt.ll
new file mode 100644
index 0000000..984cb4d
--- /dev/null
+++ b/test/CodeGen/PTX/cvt.ll
@@ -0,0 +1,234 @@
+; RUN: llc < %s -march=ptx32 | FileCheck %s
+
+; preds 
+; (note: we convert back to i32 to return)
+
+define ptx_device i32 @cvt_pred_i16(i16 %x, i1 %y) {
+; CHECK: cvt.pred.u16 p0, rh1;
+; CHECK: ret;
+	%a = trunc i16 %x to i1
+	%b = and i1 %a, %y
+	%c = zext i1 %b to i32
+	ret i32 %c
+}
+
+define ptx_device i32 @cvt_pred_i32(i32 %x, i1 %y) {
+; CHECK: cvt.pred.u32 p0, r1;
+; CHECK: ret;
+	%a = trunc i32 %x to i1
+	%b = and i1 %a, %y
+	%c = zext i1 %b to i32
+	ret i32 %c
+}
+
+define ptx_device i32 @cvt_pred_i64(i64 %x, i1 %y) {
+; CHECK: cvt.pred.u64 p0, rd1;
+; CHECK: ret;
+	%a = trunc i64 %x to i1
+	%b = and i1 %a, %y
+	%c = zext i1 %b to i32
+	ret i32 %c
+}
+
+define ptx_device i32 @cvt_pred_f32(float %x, i1 %y) {
+; CHECK: cvt.rni.pred.f32 p0, r1;
+; CHECK: ret;
+	%a = fptoui float %x to i1
+	%b = and i1 %a, %y
+	%c = zext i1 %b to i32
+	ret i32 %c
+}
+
+define ptx_device i32 @cvt_pred_f64(double %x, i1 %y) {
+; CHECK: cvt.rni.pred.f64 p0, rd1;
+; CHECK: ret;
+	%a = fptoui double %x to i1
+	%b = and i1 %a, %y
+	%c = zext i1 %b to i32
+	ret i32 %c
+}
+
+; i16
+
+define ptx_device i16 @cvt_i16_preds(i1 %x) {
+; CHECK: cvt.u16.pred rh0, p1;
+; CHECK: ret;
+	%a = zext i1 %x to i16
+	ret i16 %a
+}
+
+define ptx_device i16 @cvt_i16_i32(i32 %x) {
+; CHECK: cvt.u16.u32 rh0, r1;
+; CHECK: ret;
+	%a = trunc i32 %x to i16
+	ret i16 %a
+}
+
+define ptx_device i16 @cvt_i16_i64(i64 %x) {
+; CHECK: cvt.u16.u64 rh0, rd1;
+; CHECK: ret;
+	%a = trunc i64 %x to i16
+	ret i16 %a
+}
+
+define ptx_device i16 @cvt_i16_f32(float %x) {
+; CHECK: cvt.rni.u16.f32 rh0, r1;
+; CHECK: ret;
+	%a = fptoui float %x to i16
+	ret i16 %a
+}
+
+define ptx_device i16 @cvt_i16_f64(double %x) {
+; CHECK: cvt.rni.u16.f64 rh0, rd1;
+; CHECK: ret;
+	%a = fptoui double %x to i16
+	ret i16 %a
+}
+
+; i32
+
+define ptx_device i32 @cvt_i32_preds(i1 %x) {
+; CHECK: cvt.u32.pred r0, p1;
+; CHECK: ret;
+	%a = zext i1 %x to i32
+	ret i32 %a
+}
+
+define ptx_device i32 @cvt_i32_i16(i16 %x) {
+; CHECK: cvt.u32.u16 r0, rh1;
+; CHECK: ret;
+	%a = zext i16 %x to i32
+	ret i32 %a
+}
+
+define ptx_device i32 @cvt_i32_i64(i64 %x) {
+; CHECK: cvt.u32.u64 r0, rd1;
+; CHECK: ret;
+	%a = trunc i64 %x to i32
+	ret i32 %a
+}
+
+define ptx_device i32 @cvt_i32_f32(float %x) {
+; CHECK: cvt.rni.u32.f32 r0, r1;
+; CHECK: ret;
+	%a = fptoui float %x to i32
+	ret i32 %a
+}
+
+define ptx_device i32 @cvt_i32_f64(double %x) {
+; CHECK: cvt.rni.u32.f64 r0, rd1;
+; CHECK: ret;
+	%a = fptoui double %x to i32
+	ret i32 %a
+}
+
+; i64
+
+define ptx_device i64 @cvt_i64_preds(i1 %x) {
+; CHECK: cvt.u64.pred rd0, p1;
+; CHECK: ret;
+	%a = zext i1 %x to i64
+	ret i64 %a
+}
+
+define ptx_device i64 @cvt_i64_i16(i16 %x) {
+; CHECK: cvt.u64.u16 rd0, rh1;
+; CHECK: ret;
+	%a = zext i16 %x to i64
+	ret i64 %a
+}
+
+define ptx_device i64 @cvt_i64_i32(i32 %x) {
+; CHECK: cvt.u64.u32 rd0, r1;
+; CHECK: ret;
+	%a = zext i32 %x to i64
+	ret i64 %a
+}
+
+define ptx_device i64 @cvt_i64_f32(float %x) {
+; CHECK: cvt.rni.u64.f32 rd0, r1;
+; CHECK: ret;
+	%a = fptoui float %x to i64
+	ret i64 %a
+}
+
+define ptx_device i64 @cvt_i64_f64(double %x) {
+; CHECK: cvt.rni.u64.f64 rd0, rd1;
+; CHECK: ret;
+	%a = fptoui double %x to i64
+	ret i64 %a
+}
+
+; f32
+
+define ptx_device float @cvt_f32_preds(i1 %x) {
+; CHECK: cvt.rn.f32.pred r0, p1;
+; CHECK: ret;
+	%a = uitofp i1 %x to float
+	ret float %a
+}
+
+define ptx_device float @cvt_f32_i16(i16 %x) {
+; CHECK: cvt.rn.f32.u16 r0, rh1;
+; CHECK: ret;
+	%a = uitofp i16 %x to float
+	ret float %a
+}
+
+define ptx_device float @cvt_f32_i32(i32 %x) {
+; CHECK: cvt.rn.f32.u32 r0, r1;
+; CHECK: ret;
+	%a = uitofp i32 %x to float
+	ret float %a
+}
+
+define ptx_device float @cvt_f32_i64(i64 %x) {
+; CHECK: cvt.rn.f32.u64 r0, rd1;
+; CHECK: ret;
+	%a = uitofp i64 %x to float
+	ret float %a
+}
+
+define ptx_device float @cvt_f32_f64(double %x) {
+; CHECK: cvt.rn.f32.f64 r0, rd1;
+; CHECK: ret;
+	%a = fptrunc double %x to float
+	ret float %a
+}
+
+; f64
+
+define ptx_device double @cvt_f64_preds(i1 %x) {
+; CHECK: cvt.rn.f64.pred rd0, p1;
+; CHECK: ret;
+	%a = uitofp i1 %x to double
+	ret double %a
+}
+
+define ptx_device double @cvt_f64_i16(i16 %x) {
+; CHECK: cvt.rn.f64.u16 rd0, rh1;
+; CHECK: ret;
+	%a = uitofp i16 %x to double
+	ret double %a
+}
+
+define ptx_device double @cvt_f64_i32(i32 %x) {
+; CHECK: cvt.rn.f64.u32 rd0, r1;
+; CHECK: ret;
+	%a = uitofp i32 %x to double
+	ret double %a
+}
+
+define ptx_device double @cvt_f64_i64(i64 %x) {
+; CHECK: cvt.rn.f64.u64 rd0, rd1;
+; CHECK: ret;
+	%a = uitofp i64 %x to double
+	ret double %a
+}
+
+define ptx_device double @cvt_f64_f32(float %x) {
+; CHECK: cvt.f64.f32 rd0, r1;
+; CHECK: ret;
+	%a = fpext float %x to double
+	ret double %a
+}
diff --git a/test/CodeGen/PTX/exit.ll b/test/CodeGen/PTX/exit.ll
index 4071bab..7816c80 100644
--- a/test/CodeGen/PTX/exit.ll
+++ b/test/CodeGen/PTX/exit.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ptx | FileCheck %s
+; RUN: llc < %s -march=ptx32 | FileCheck %s
 
 define ptx_kernel void @t1() {
 ; CHECK: exit;
diff --git a/test/CodeGen/PTX/fdiv-sm10.ll b/test/CodeGen/PTX/fdiv-sm10.ll
index 42f615d..9aff251 100644
--- a/test/CodeGen/PTX/fdiv-sm10.ll
+++ b/test/CodeGen/PTX/fdiv-sm10.ll
@@ -1,14 +1,14 @@
-; RUN: llc < %s -march=ptx -mattr=+sm10 | FileCheck %s
+; RUN: llc < %s -march=ptx32 -mattr=+sm10 | FileCheck %s
 
 define ptx_device float @t1_f32(float %x, float %y) {
-; CHECK: div.approx.f32 f0, f1, f2;
+; CHECK: div.approx.f32 r0, r1, r2;
 ; CHECK-NEXT: ret;
 	%a = fdiv float %x, %y
 	ret float %a
 }
 
 define ptx_device double @t1_f64(double %x, double %y) {
-; CHECK: div.f64 fd0, fd1, fd2;
+; CHECK: div.f64 rd0, rd1, rd2;
 ; CHECK-NEXT: ret;
 	%a = fdiv double %x, %y
 	ret double %a
diff --git a/test/CodeGen/PTX/fdiv-sm13.ll b/test/CodeGen/PTX/fdiv-sm13.ll
index eb20f78..84e0ada 100644
--- a/test/CodeGen/PTX/fdiv-sm13.ll
+++ b/test/CodeGen/PTX/fdiv-sm13.ll
@@ -1,14 +1,14 @@
-; RUN: llc < %s -march=ptx -mattr=+sm13 | FileCheck %s
+; RUN: llc < %s -march=ptx32 -mattr=+sm13 | FileCheck %s
 
 define ptx_device float @t1_f32(float %x, float %y) {
-; CHECK: div.approx.f32 f0, f1, f2;
+; CHECK: div.approx.f32 r0, r1, r2;
 ; CHECK-NEXT: ret;
 	%a = fdiv float %x, %y
 	ret float %a
 }
 
 define ptx_device double @t1_f64(double %x, double %y) {
-; CHECK: div.rn.f64 fd0, fd1, fd2;
+; CHECK: div.rn.f64 rd0, rd1, rd2;
 ; CHECK-NEXT: ret;
 	%a = fdiv double %x, %y
 	ret double %a
diff --git a/test/CodeGen/PTX/fneg.ll b/test/CodeGen/PTX/fneg.ll
new file mode 100644
index 0000000..185c37c
--- /dev/null
+++ b/test/CodeGen/PTX/fneg.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=ptx32 | FileCheck %s
+
+define ptx_device float @t1_f32(float %x) {
+; CHECK: neg.f32 r0, r1;
+; CHECK-NEXT: ret;
+	%y = fsub float -0.000000e+00, %x
+	ret float %y
+}
+
+define ptx_device double @t1_f64(double %x) {
+; CHECK: neg.f64 rd0, rd1;
+; CHECK-NEXT: ret;
+	%y = fsub double -0.000000e+00, %x
+	ret double %y
+}
diff --git a/test/CodeGen/PTX/intrinsic.ll b/test/CodeGen/PTX/intrinsic.ll
index 7405dd6..cea4182 100644
--- a/test/CodeGen/PTX/intrinsic.ll
+++ b/test/CodeGen/PTX/intrinsic.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ptx -mattr=+ptx20,+sm20 | FileCheck %s
+; RUN: llc < %s -march=ptx32 -mattr=+ptx20,+sm20 | FileCheck %s
 
 define ptx_device i32 @test_tid_x() {
 ; CHECK: mov.u32 r0, %tid.x;
diff --git a/test/CodeGen/PTX/ld.ll b/test/CodeGen/PTX/ld.ll
index 1119aa4..9b75998 100644
--- a/test/CodeGen/PTX/ld.ll
+++ b/test/CodeGen/PTX/ld.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ptx | FileCheck %s
+; RUN: llc < %s -march=ptx32 | FileCheck %s
 
 ;CHECK: .extern .global .b8 array_i16[20];
 @array_i16 = external global [10 x i16]
@@ -64,7 +64,7 @@
 define ptx_device i16 @t1_u16(i16* %p) {
 entry:
 ;CHECK: ld.global.u16 rh0, [r1];
-;CHECK-NEXT; ret;
+;CHECK-NEXT: ret;
   %x = load i16* %p
   ret i16 %x
 }
@@ -87,7 +87,7 @@ entry:
 
 define ptx_device float @t1_f32(float* %p) {
 entry:
-;CHECK: ld.global.f32 f0, [r1];
+;CHECK: ld.global.f32 r0, [r1];
 ;CHECK-NEXT: ret;
   %x = load float* %p
   ret float %x
@@ -95,7 +95,7 @@ entry:
 
 define ptx_device double @t1_f64(double* %p) {
 entry:
-;CHECK: ld.global.f64 fd0, [r1];
+;CHECK: ld.global.f64 rd0, [r1];
 ;CHECK-NEXT: ret;
   %x = load double* %p
   ret double %x
@@ -130,7 +130,7 @@ entry:
 
 define ptx_device float @t2_f32(float* %p) {
 entry:
-;CHECK: ld.global.f32 f0, [r1+4];
+;CHECK: ld.global.f32 r0, [r1+4];
 ;CHECK-NEXT: ret;
   %i = getelementptr float* %p, i32 1
   %x = load float* %i
@@ -139,7 +139,7 @@ entry:
 
 define ptx_device double @t2_f64(double* %p) {
 entry:
-;CHECK: ld.global.f64 fd0, [r1+8];
+;CHECK: ld.global.f64 rd0, [r1+8];
 ;CHECK-NEXT: ret;
   %i = getelementptr double* %p, i32 1
   %x = load double* %i
@@ -180,7 +180,7 @@ define ptx_device float @t3_f32(float* %p, i32 %q) {
 entry:
 ;CHECK: shl.b32 r0, r2, 2;
 ;CHECK-NEXT: add.u32 r0, r1, r0;
-;CHECK-NEXT: ld.global.f32 f0, [r0];
+;CHECK-NEXT: ld.global.f32 r0, [r0];
   %i = getelementptr float* %p, i32 %q
   %x = load float* %i
   ret float %x
@@ -190,7 +190,7 @@ define ptx_device double @t3_f64(double* %p, i32 %q) {
 entry:
 ;CHECK: shl.b32 r0, r2, 3;
 ;CHECK-NEXT: add.u32 r0, r1, r0;
-;CHECK-NEXT: ld.global.f64 fd0, [r0];
+;CHECK-NEXT: ld.global.f64 rd0, [r0];
   %i = getelementptr double* %p, i32 %q
   %x = load double* %i
   ret double %x
@@ -229,7 +229,7 @@ entry:
 define ptx_device float @t4_global_f32() {
 entry:
 ;CHECK: mov.u32 r0, array_float;
-;CHECK-NEXT: ld.global.f32 f0, [r0];
+;CHECK-NEXT: ld.global.f32 r0, [r0];
 ;CHECK-NEXT: ret;
   %i = getelementptr [10 x float]* @array_float, i32 0, i32 0
   %x = load float* %i
@@ -239,7 +239,7 @@ entry:
 define ptx_device double @t4_global_f64() {
 entry:
 ;CHECK: mov.u32 r0, array_double;
-;CHECK-NEXT: ld.global.f64 fd0, [r0];
+;CHECK-NEXT: ld.global.f64 rd0, [r0];
 ;CHECK-NEXT: ret;
   %i = getelementptr [10 x double]* @array_double, i32 0, i32 0
   %x = load double* %i
@@ -279,7 +279,7 @@ entry:
 define ptx_device float @t4_const_f32() {
 entry:
 ;CHECK: mov.u32 r0, array_constant_float;
-;CHECK-NEXT: ld.const.f32 f0, [r0];
+;CHECK-NEXT: ld.const.f32 r0, [r0];
 ;CHECK-NEXT: ret;
   %i = getelementptr [10 x float] addrspace(1)* @array_constant_float, i32 0, i32 0
   %x = load float addrspace(1)* %i
@@ -289,7 +289,7 @@ entry:
 define ptx_device double @t4_const_f64() {
 entry:
 ;CHECK: mov.u32 r0, array_constant_double;
-;CHECK-NEXT: ld.const.f64 fd0, [r0];
+;CHECK-NEXT: ld.const.f64 rd0, [r0];
 ;CHECK-NEXT: ret;
   %i = getelementptr [10 x double] addrspace(1)* @array_constant_double, i32 0, i32 0
   %x = load double addrspace(1)* %i
@@ -329,7 +329,7 @@ entry:
 define ptx_device float @t4_local_f32() {
 entry:
 ;CHECK: mov.u32 r0, array_local_float;
-;CHECK-NEXT: ld.local.f32 f0, [r0];
+;CHECK-NEXT: ld.local.f32 r0, [r0];
 ;CHECK-NEXT: ret;
   %i = getelementptr [10 x float] addrspace(2)* @array_local_float, i32 0, i32 0
   %x = load float addrspace(2)* %i
@@ -339,7 +339,7 @@ entry:
 define ptx_device double @t4_local_f64() {
 entry:
 ;CHECK: mov.u32 r0, array_local_double;
-;CHECK-NEXT: ld.local.f64 fd0, [r0];
+;CHECK-NEXT: ld.local.f64 rd0, [r0];
 ;CHECK-NEXT: ret;
   %i = getelementptr [10 x double] addrspace(2)* @array_local_double, i32 0, i32 0
   %x = load double addrspace(2)* %i
@@ -379,7 +379,7 @@ entry:
 define ptx_device float @t4_shared_f32() {
 entry:
 ;CHECK: mov.u32 r0, array_shared_float;
-;CHECK-NEXT: ld.shared.f32 f0, [r0];
+;CHECK-NEXT: ld.shared.f32 r0, [r0];
 ;CHECK-NEXT: ret;
   %i = getelementptr [10 x float] addrspace(4)* @array_shared_float, i32 0, i32 0
   %x = load float addrspace(4)* %i
@@ -389,7 +389,7 @@ entry:
 define ptx_device double @t4_shared_f64() {
 entry:
 ;CHECK: mov.u32 r0, array_shared_double;
-;CHECK-NEXT: ld.shared.f64 fd0, [r0];
+;CHECK-NEXT: ld.shared.f64 rd0, [r0];
 ;CHECK-NEXT: ret;
   %i = getelementptr [10 x double] addrspace(4)* @array_shared_double, i32 0, i32 0
   %x = load double addrspace(4)* %i
@@ -429,7 +429,7 @@ entry:
 define ptx_device float @t5_f32() {
 entry:
 ;CHECK: mov.u32 r0, array_float;
-;CHECK-NEXT: ld.global.f32 f0, [r0+4];
+;CHECK-NEXT: ld.global.f32 r0, [r0+4];
 ;CHECK-NEXT: ret;
   %i = getelementptr [10 x float]* @array_float, i32 0, i32 1
   %x = load float* %i
@@ -439,7 +439,7 @@ entry:
 define ptx_device double @t5_f64() {
 entry:
 ;CHECK: mov.u32 r0, array_double;
-;CHECK-NEXT: ld.global.f64 fd0, [r0+8];
+;CHECK-NEXT: ld.global.f64 rd0, [r0+8];
 ;CHECK-NEXT: ret;
   %i = getelementptr [10 x double]* @array_double, i32 0, i32 1
   %x = load double* %i
diff --git a/test/CodeGen/PTX/llvm-intrinsic.ll b/test/CodeGen/PTX/llvm-intrinsic.ll
index 3ce4c29..a317645 100644
--- a/test/CodeGen/PTX/llvm-intrinsic.ll
+++ b/test/CodeGen/PTX/llvm-intrinsic.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -march=ptx -mattr=+ptx20,+sm20 | FileCheck %s
+; RUN: llc < %s -march=ptx32 -mattr=+ptx20,+sm20 | FileCheck %s
 
 define ptx_device float @test_sqrt_f32(float %x) {
 entry:
-; CHECK: sqrt.rn.f32 f0, f1;
+; CHECK: sqrt.rn.f32 r0, r1;
 ; CHECK-NEXT: ret;
   %y = call float @llvm.sqrt.f32(float %x)
   ret float %y
@@ -10,7 +10,7 @@ entry:
 
 define ptx_device double @test_sqrt_f64(double %x) {
 entry:
-; CHECK: sqrt.rn.f64 fd0, fd1;
+; CHECK: sqrt.rn.f64 rd0, rd1;
 ; CHECK-NEXT: ret;
   %y = call double @llvm.sqrt.f64(double %x)
   ret double %y
@@ -18,7 +18,7 @@ entry:
 
 define ptx_device float @test_sin_f32(float %x) {
 entry:
-; CHECK: sin.approx.f32 f0, f1;
+; CHECK: sin.approx.f32 r0, r1;
 ; CHECK-NEXT: ret;
   %y = call float @llvm.sin.f32(float %x)
   ret float %y
@@ -26,7 +26,7 @@ entry:
 
 define ptx_device double @test_sin_f64(double %x) {
 entry:
-; CHECK: sin.approx.f64 fd0, fd1;
+; CHECK: sin.approx.f64 rd0, rd1;
 ; CHECK-NEXT: ret;
   %y = call double @llvm.sin.f64(double %x)
   ret double %y
@@ -34,7 +34,7 @@ entry:
 
 define ptx_device float @test_cos_f32(float %x) {
 entry:
-; CHECK: cos.approx.f32 f0, f1;
+; CHECK: cos.approx.f32 r0, r1;
 ; CHECK-NEXT: ret;
   %y = call float @llvm.cos.f32(float %x)
   ret float %y
@@ -42,7 +42,7 @@ entry:
 
 define ptx_device double @test_cos_f64(double %x) {
 entry:
-; CHECK: cos.approx.f64 fd0, fd1;
+; CHECK: cos.approx.f64 rd0, rd1;
 ; CHECK-NEXT: ret;
   %y = call double @llvm.cos.f64(double %x)
   ret double %y
diff --git a/test/CodeGen/PTX/mad-disabling.ll b/test/CodeGen/PTX/mad-disabling.ll
new file mode 100644
index 0000000..ad7b341
--- /dev/null
+++ b/test/CodeGen/PTX/mad-disabling.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=ptx32 -mattr=+ptx20,+sm20 | grep "mad"
+; RUN: llc < %s -march=ptx32 -mattr=+ptx20,+sm20,+no-fma | grep -v "mad"
+
+define ptx_device float @test_mul_add_f(float %x, float %y, float %z) {
+entry:
+  %a = fmul float %x, %y
+  %b = fadd float %a, %z
+  ret float %b
+}
+
+define ptx_device double @test_mul_add_d(double %x, double %y, double %z) {
+entry:
+  %a = fmul double %x, %y
+  %b = fadd double %a, %z
+  ret double %b
+}
diff --git a/test/CodeGen/PTX/mad.ll b/test/CodeGen/PTX/mad.ll
index 786345b..56d3811 100644
--- a/test/CodeGen/PTX/mad.ll
+++ b/test/CodeGen/PTX/mad.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=ptx -mattr=+sm13 | FileCheck %s
+; RUN: llc < %s -march=ptx32 -mattr=+sm13 | FileCheck %s
 
 define ptx_device float @t1_f32(float %x, float %y, float %z) {
-; CHECK: mad.rn.f32 f0, f1, f2, f3;
+; CHECK: mad.rn.f32 r0, r1, r2, r3;
 ; CHECK-NEXT: ret;
 	%a = fmul float %x, %y
   %b = fadd float %a, %z
@@ -9,7 +9,7 @@ define ptx_device float @t1_f32(float %x, float %y, float %z) {
 }
 
 define ptx_device double @t1_f64(double %x, double %y, double %z) {
-; CHECK: mad.rn.f64 fd0, fd1, fd2, fd3;
+; CHECK: mad.rn.f64 rd0, rd1, rd2, rd3;
 ; CHECK-NEXT: ret;
 	%a = fmul double %x, %y
   %b = fadd double %a, %z
diff --git a/test/CodeGen/PTX/mov.ll b/test/CodeGen/PTX/mov.ll
index 00dcf19..05ce4c0 100644
--- a/test/CodeGen/PTX/mov.ll
+++ b/test/CodeGen/PTX/mov.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ptx | FileCheck %s
+; RUN: llc < %s -march=ptx32 | FileCheck %s
 
 define ptx_device i16 @t1_u16() {
 ; CHECK: mov.u16 rh0, 0;
@@ -19,13 +19,13 @@ define ptx_device i64 @t1_u64() {
 }
 
 define ptx_device float @t1_f32() {
-; CHECK: mov.f32 f0, 0F00000000;
+; CHECK: mov.f32 r0, 0F00000000;
 ; CHECK: ret;
 	ret float 0.0
 }
 
 define ptx_device double @t1_f64() {
-; CHECK: mov.f64 fd0, 0D0000000000000000;
+; CHECK: mov.f64 rd0, 0D0000000000000000;
 ; CHECK: ret;
 	ret double 0.0
 }
@@ -49,13 +49,13 @@ define ptx_device i64 @t2_u64(i64 %x) {
 }
 
 define ptx_device float @t3_f32(float %x) {
-; CHECK: mov.f32 f0, f1;
+; CHECK: mov.u32 r0, r1;
 ; CHECK-NEXT: ret;
 	ret float %x
 }
 
 define ptx_device double @t3_f64(double %x) {
-; CHECK: mov.f64 fd0, fd1;
+; CHECK: mov.u64 rd0, rd1;
 ; CHECK-NEXT: ret;
 	ret double %x
 }
diff --git a/test/CodeGen/PTX/mul.ll b/test/CodeGen/PTX/mul.ll
index fd0788f..93f94e3 100644
--- a/test/CodeGen/PTX/mul.ll
+++ b/test/CodeGen/PTX/mul.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ptx | FileCheck %s
+; RUN: llc < %s -march=ptx32 | FileCheck %s
 
 ;define ptx_device i32 @t1(i32 %x, i32 %y) {
 ;	%z = mul i32 %x, %y
@@ -11,28 +11,28 @@
 ;}
 
 define ptx_device float @t1_f32(float %x, float %y) {
-; CHECK: mul.f32 f0, f1, f2
+; CHECK: mul.f32 r0, r1, r2
 ; CHECK-NEXT: ret;
   %z = fmul float %x, %y
   ret float %z
 }
 
 define ptx_device double @t1_f64(double %x, double %y) {
-; CHECK: mul.f64 fd0, fd1, fd2
+; CHECK: mul.f64 rd0, rd1, rd2
 ; CHECK-NEXT: ret;
   %z = fmul double %x, %y
   ret double %z
 }
 
 define ptx_device float @t2_f32(float %x) {
-; CHECK: mul.f32 f0, f1, 0F40A00000;
+; CHECK: mul.f32 r0, r1, 0F40A00000;
 ; CHECK-NEXT: ret;
   %z = fmul float %x, 5.0
   ret float %z
 }
 
 define ptx_device double @t2_f64(double %x) {
-; CHECK: mul.f64 fd0, fd1, 0D4014000000000000;
+; CHECK: mul.f64 rd0, rd1, 0D4014000000000000;
 ; CHECK-NEXT: ret;
   %z = fmul double %x, 5.0
   ret double %z
diff --git a/test/CodeGen/PTX/options.ll b/test/CodeGen/PTX/options.ll
index 6576a6d..92effa6 100644
--- a/test/CodeGen/PTX/options.ll
+++ b/test/CodeGen/PTX/options.ll
@@ -1,9 +1,10 @@
-; RUN: llc < %s -march=ptx -mattr=ptx20 | grep ".version 2.0"
-; RUN: llc < %s -march=ptx -mattr=ptx21 | grep ".version 2.1"
-; RUN: llc < %s -march=ptx -mattr=ptx22 | grep ".version 2.2"
-; RUN: llc < %s -march=ptx -mattr=sm10 | grep ".target sm_10"
-; RUN: llc < %s -march=ptx -mattr=sm13 | grep ".target sm_13"
-; RUN: llc < %s -march=ptx -mattr=sm20 | grep ".target sm_20"
+; RUN: llc < %s -march=ptx32 -mattr=ptx20 | grep ".version 2.0"
+; RUN: llc < %s -march=ptx32 -mattr=ptx21 | grep ".version 2.1"
+; RUN: llc < %s -march=ptx32 -mattr=ptx22 | grep ".version 2.2"
+; RUN: llc < %s -march=ptx32 -mattr=ptx23 | grep ".version 2.3"
+; RUN: llc < %s -march=ptx32 -mattr=sm10 | grep ".target sm_10"
+; RUN: llc < %s -march=ptx32 -mattr=sm13 | grep ".target sm_13"
+; RUN: llc < %s -march=ptx32 -mattr=sm20 | grep ".target sm_20"
 
 define ptx_device void @t1() {
 	ret void
diff --git a/test/CodeGen/PTX/parameter-order.ll b/test/CodeGen/PTX/parameter-order.ll
index dbbbb67..5486472 100644
--- a/test/CodeGen/PTX/parameter-order.ll
+++ b/test/CodeGen/PTX/parameter-order.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -march=ptx | FileCheck %s
+; RUN: llc < %s -march=ptx32 | FileCheck %s
 
-; CHECK: .func (.reg .u32 r0) test_parameter_order (.reg .u32 r1, .reg .u32 r2)
-define ptx_device i32 @test_parameter_order(i32 %x, i32 %y) {
-; CHECK: sub.u32 r0, r1, r2
-	%z = sub i32 %x, %y
-	ret i32 %z
+; CHECK: .func (.reg .b32 r0) test_parameter_order (.reg .b32 r1, .reg .b32 r2, .reg .b32 r3, .reg .b32 r4)
+define ptx_device i32 @test_parameter_order(float %a, i32 %b, i32 %c, float %d) {
+; CHECK: sub.u32 r0, r2, r3
+	%result = sub i32 %b, %c
+	ret i32 %result
 }
diff --git a/test/CodeGen/PTX/ret.ll b/test/CodeGen/PTX/ret.ll
index d5037f2..ba0523f 100644
--- a/test/CodeGen/PTX/ret.ll
+++ b/test/CodeGen/PTX/ret.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ptx | FileCheck %s
+; RUN: llc < %s -march=ptx32 | FileCheck %s
 
 define ptx_device void @t1() {
 ; CHECK: ret;
diff --git a/test/CodeGen/PTX/selp.ll b/test/CodeGen/PTX/selp.ll
new file mode 100644
index 0000000..19cfa53
--- /dev/null
+++ b/test/CodeGen/PTX/selp.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=ptx32 | FileCheck %s
+
+define ptx_device i32 @test_selp_i32(i1 %x, i32 %y, i32 %z) {
+; CHECK: selp.u32 r0, r1, r2, p1;
+	%a = select i1 %x, i32 %y, i32 %z
+	ret i32 %a
+}
+
+define ptx_device i64 @test_selp_i64(i1 %x, i64 %y, i64 %z) {
+; CHECK: selp.u64 rd0, rd1, rd2, p1;
+	%a = select i1 %x, i64 %y, i64 %z
+	ret i64 %a
+}
+
+define ptx_device float @test_selp_f32(i1 %x, float %y, float %z) {
+; CHECK: selp.f32 r0, r1, r2, p1;
+	%a = select i1 %x, float %y, float %z
+	ret float %a
+}
+
+define ptx_device double @test_selp_f64(i1 %x, double %y, double %z) {
+; CHECK: selp.f64 rd0, rd1, rd2, p1;
+	%a = select i1 %x, double %y, double %z
+	ret double %a
+}
diff --git a/test/CodeGen/PTX/setp.ll b/test/CodeGen/PTX/setp.ll
index 5348482..5836122 100644
--- a/test/CodeGen/PTX/setp.ll
+++ b/test/CodeGen/PTX/setp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ptx | FileCheck %s
+; RUN: llc < %s -march=ptx32 | FileCheck %s
 
 define ptx_device i32 @test_setp_eq_u32_rr(i32 %x, i32 %y) {
 ; CHECK: setp.eq.u32 p0, r1, r2;
diff --git a/test/CodeGen/PTX/shl.ll b/test/CodeGen/PTX/shl.ll
index b564b43..6e72c92 100644
--- a/test/CodeGen/PTX/shl.ll
+++ b/test/CodeGen/PTX/shl.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ptx | FileCheck %s
+; RUN: llc < %s -march=ptx32 | FileCheck %s
 
 define ptx_device i32 @t1(i32 %x, i32 %y) {
 ; CHECK: shl.b32 r0, r1, r2
diff --git a/test/CodeGen/PTX/shr.ll b/test/CodeGen/PTX/shr.ll
index 3f8ade8..8693e0e 100644
--- a/test/CodeGen/PTX/shr.ll
+++ b/test/CodeGen/PTX/shr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ptx | FileCheck %s
+; RUN: llc < %s -march=ptx32 | FileCheck %s
 
 define ptx_device i32 @t1(i32 %x, i32 %y) {
 ; CHECK: shr.u32 r0, r1, r2
diff --git a/test/CodeGen/PTX/st.ll b/test/CodeGen/PTX/st.ll
index 4e9b08a..612967a 100644
--- a/test/CodeGen/PTX/st.ll
+++ b/test/CodeGen/PTX/st.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ptx | FileCheck %s
+; RUN: llc < %s -march=ptx32 | FileCheck %s
 
 ;CHECK: .extern .global .b8 array_i16[20];
 @array_i16 = external global [10 x i16]
@@ -87,7 +87,7 @@ entry:
 
 define ptx_device void @t1_f32(float* %p, float %x) {
 entry:
-;CHECK: st.global.f32 [r1], f1;
+;CHECK: st.global.f32 [r1], r2;
 ;CHECK-NEXT: ret;
   store float %x, float* %p
   ret void
@@ -95,7 +95,7 @@ entry:
 
 define ptx_device void @t1_f64(double* %p, double %x) {
 entry:
-;CHECK: st.global.f64 [r1], fd1;
+;CHECK: st.global.f64 [r1], rd1;
 ;CHECK-NEXT: ret;
   store double %x, double* %p
   ret void
@@ -130,7 +130,7 @@ entry:
 
 define ptx_device void @t2_f32(float* %p, float %x) {
 entry:
-;CHECK: st.global.f32 [r1+4], f1;
+;CHECK: st.global.f32 [r1+4], r2;
 ;CHECK-NEXT: ret;
   %i = getelementptr float* %p, i32 1
   store float %x, float* %i
@@ -139,7 +139,7 @@ entry:
 
 define ptx_device void @t2_f64(double* %p, double %x) {
 entry:
-;CHECK: st.global.f64 [r1+8], fd1;
+;CHECK: st.global.f64 [r1+8], rd1;
 ;CHECK-NEXT: ret;
   %i = getelementptr double* %p, i32 1
   store double %x, double* %i
@@ -183,7 +183,7 @@ define ptx_device void @t3_f32(float* %p, i32 %q, float %x) {
 entry:
 ;CHECK: shl.b32 r0, r2, 2;
 ;CHECK-NEXT: add.u32 r0, r1, r0;
-;CHECK-NEXT: st.global.f32 [r0], f1;
+;CHECK-NEXT: st.global.f32 [r0], r3;
 ;CHECK-NEXT: ret;
   %i = getelementptr float* %p, i32 %q
   store float %x, float* %i
@@ -194,7 +194,7 @@ define ptx_device void @t3_f64(double* %p, i32 %q, double %x) {
 entry:
 ;CHECK: shl.b32 r0, r2, 3;
 ;CHECK-NEXT: add.u32 r0, r1, r0;
-;CHECK-NEXT: st.global.f64 [r0], fd1;
+;CHECK-NEXT: st.global.f64 [r0], rd1;
 ;CHECK-NEXT: ret;
   %i = getelementptr double* %p, i32 %q
   store double %x, double* %i
@@ -234,7 +234,7 @@ entry:
 define ptx_device void @t4_global_f32(float %x) {
 entry:
 ;CHECK: mov.u32 r0, array_float;
-;CHECK-NEXT: st.global.f32 [r0], f1;
+;CHECK-NEXT: st.global.f32 [r0], r1;
 ;CHECK-NEXT: ret;
   %i = getelementptr [10 x float]* @array_float, i32 0, i32 0
   store float %x, float* %i
@@ -244,7 +244,7 @@ entry:
 define ptx_device void @t4_global_f64(double %x) {
 entry:
 ;CHECK: mov.u32 r0, array_double;
-;CHECK-NEXT: st.global.f64 [r0], fd1;
+;CHECK-NEXT: st.global.f64 [r0], rd1;
 ;CHECK-NEXT: ret;
   %i = getelementptr [10 x double]* @array_double, i32 0, i32 0
   store double %x, double* %i
@@ -284,7 +284,7 @@ entry:
 define ptx_device void @t4_local_f32(float %x) {
 entry:
 ;CHECK: mov.u32 r0, array_local_float;
-;CHECK-NEXT: st.local.f32 [r0], f1;
+;CHECK-NEXT: st.local.f32 [r0], r1;
 ;CHECK-NEXT: ret;
   %i = getelementptr [10 x float] addrspace(2)* @array_local_float, i32 0, i32 0
   store float %x, float addrspace(2)* %i
@@ -294,7 +294,7 @@ entry:
 define ptx_device void @t4_local_f64(double %x) {
 entry:
 ;CHECK: mov.u32 r0, array_local_double;
-;CHECK-NEXT: st.local.f64 [r0], fd1;
+;CHECK-NEXT: st.local.f64 [r0], rd1;
 ;CHECK-NEXT: ret;
   %i = getelementptr [10 x double] addrspace(2)* @array_local_double, i32 0, i32 0
   store double %x, double addrspace(2)* %i
@@ -334,7 +334,7 @@ entry:
 define ptx_device void @t4_shared_f32(float %x) {
 entry:
 ;CHECK: mov.u32 r0, array_shared_float;
-;CHECK-NEXT: st.shared.f32 [r0], f1;
+;CHECK-NEXT: st.shared.f32 [r0], r1;
 ;CHECK-NEXT: ret;
   %i = getelementptr [10 x float] addrspace(4)* @array_shared_float, i32 0, i32 0
   store float %x, float addrspace(4)* %i
@@ -344,7 +344,7 @@ entry:
 define ptx_device void @t4_shared_f64(double %x) {
 entry:
 ;CHECK: mov.u32 r0, array_shared_double;
-;CHECK-NEXT: st.shared.f64 [r0], fd1;
+;CHECK-NEXT: st.shared.f64 [r0], rd1;
 ;CHECK-NEXT: ret;
   %i = getelementptr [10 x double] addrspace(4)* @array_shared_double, i32 0, i32 0
   store double %x, double addrspace(4)* %i
@@ -384,7 +384,7 @@ entry:
 define ptx_device void @t5_f32(float %x) {
 entry:
 ;CHECK: mov.u32 r0, array_float;
-;CHECK-NEXT: st.global.f32 [r0+4], f1;
+;CHECK-NEXT: st.global.f32 [r0+4], r1;
 ;CHECK-NEXT: ret;
   %i = getelementptr [10 x float]* @array_float, i32 0, i32 1
   store float %x, float* %i
@@ -394,7 +394,7 @@ entry:
 define ptx_device void @t5_f64(double %x) {
 entry:
 ;CHECK: mov.u32 r0, array_double;
-;CHECK-NEXT: st.global.f64 [r0+8], fd1;
+;CHECK-NEXT: st.global.f64 [r0+8], rd1;
 ;CHECK-NEXT: ret;
   %i = getelementptr [10 x double]* @array_double, i32 0, i32 1
   store double %x, double* %i
diff --git a/test/CodeGen/PTX/sub.ll b/test/CodeGen/PTX/sub.ll
index 4810e4f..9efeaac 100644
--- a/test/CodeGen/PTX/sub.ll
+++ b/test/CodeGen/PTX/sub.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ptx | FileCheck %s
+; RUN: llc < %s -march=ptx32 | FileCheck %s
 
 define ptx_device i16 @t1_u16(i16 %x, i16 %y) {
 ; CHECK: sub.u16 rh0, rh1, rh2;
@@ -22,14 +22,14 @@ define ptx_device i64 @t1_u64(i64 %x, i64 %y) {
 }
 
 define ptx_device float @t1_f32(float %x, float %y) {
-; CHECK: sub.f32 f0, f1, f2
+; CHECK: sub.f32 r0, r1, r2
 ; CHECK-NEXT: ret;
   %z = fsub float %x, %y
   ret float %z
 }
 
 define ptx_device double @t1_f64(double %x, double %y) {
-; CHECK: sub.f64 fd0, fd1, fd2
+; CHECK: sub.f64 rd0, rd1, rd2
 ; CHECK-NEXT: ret;
   %z = fsub double %x, %y
   ret double %z
@@ -57,14 +57,14 @@ define ptx_device i64 @t2_u64(i64 %x) {
 }
 
 define ptx_device float @t2_f32(float %x) {
-; CHECK: add.f32 f0, f1, 0FBF800000;
+; CHECK: add.f32 r0, r1, 0FBF800000;
 ; CHECK-NEXT: ret;
   %z = fsub float %x, 1.0
   ret float %z
 }
 
 define ptx_device double @t2_f64(double %x) {
-; CHECK: add.f64 fd0, fd1, 0DBFF0000000000000;
+; CHECK: add.f64 rd0, rd1, 0DBFF0000000000000;
 ; CHECK-NEXT: ret;
   %z = fsub double %x, 1.0
   ret double %z
diff --git a/test/CodeGen/PowerPC/2007-05-30-dagcombine-miscomp.ll b/test/CodeGen/PowerPC/2007-05-30-dagcombine-miscomp.ll
index 2938c70..72e93a9 100644
--- a/test/CodeGen/PowerPC/2007-05-30-dagcombine-miscomp.ll
+++ b/test/CodeGen/PowerPC/2007-05-30-dagcombine-miscomp.ll
@@ -4,7 +4,7 @@ target triple = "powerpc-apple-darwin8.8.0"
 ; RUN: llc < %s -march=ppc32 | grep {rlwinm r3, r3, 23, 30, 30}
 ; PR1473
 
-define i8 @foo(i16 zeroext  %a) zeroext  {
+define zeroext i8 @foo(i16 zeroext  %a)   {
         %tmp2 = lshr i16 %a, 10         ; <i16> [#uses=1]
         %tmp23 = trunc i16 %tmp2 to i8          ; <i8> [#uses=1]
         %tmp4 = shl i8 %tmp23, 1                ; <i8> [#uses=1]
diff --git a/test/CodeGen/PowerPC/2008-03-24-CoalescerBug.ll b/test/CodeGen/PowerPC/2008-03-24-CoalescerBug.ll
index 8776d9a..01c83cb 100644
--- a/test/CodeGen/PowerPC/2008-03-24-CoalescerBug.ll
+++ b/test/CodeGen/PowerPC/2008-03-24-CoalescerBug.ll
@@ -18,7 +18,7 @@ define void @"-[PFTPersistentSymbols saveSymbolWithName:address:path:lineNumber:
 entry:
 	br i1 false, label %bb12, label %bb21
 bb12:		; preds = %entry
-	%tmp17 = tail call i8 inttoptr (i64 4294901504 to i8 (%struct..0objc_object*, %struct.objc_selector*, %struct.NSArray*)*)( %struct..0objc_object* null, %struct.objc_selector* null, %struct.NSArray* bitcast (%struct.__builtin_CFString* @0 to %struct.NSArray*) ) signext nounwind 		; <i8> [#uses=0]
+	%tmp17 = tail call signext i8 inttoptr (i64 4294901504 to i8 (%struct..0objc_object*, %struct.objc_selector*, %struct.NSArray*)*)( %struct..0objc_object* null, %struct.objc_selector* null, %struct.NSArray* bitcast (%struct.__builtin_CFString* @0 to %struct.NSArray*) )  nounwind 		; <i8> [#uses=0]
 	br i1 false, label %bb25, label %bb21
 bb21:		; preds = %bb12, %entry
 	%tmp24 = or i64 %flags, 4		; <i64> [#uses=1]
diff --git a/test/CodeGen/PowerPC/2008-07-15-SignExtendInreg.ll b/test/CodeGen/PowerPC/2008-07-15-SignExtendInreg.ll
index 5cd8c34..21b0c61 100644
--- a/test/CodeGen/PowerPC/2008-07-15-SignExtendInreg.ll
+++ b/test/CodeGen/PowerPC/2008-07-15-SignExtendInreg.ll
@@ -2,7 +2,7 @@
 target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
 target triple = "powerpc-apple-darwin9"
 
-define i16 @t(i16* %dct) signext nounwind  {
+define signext i16 @t(i16* %dct)  nounwind  {
 entry:
          load i16* null, align 2         ; <i16>:0 [#uses=2]
          lshr i16 %0, 11         ; <i16>:1 [#uses=0]
diff --git a/test/CodeGen/PowerPC/2008-12-12-EH.ll b/test/CodeGen/PowerPC/2008-12-12-EH.ll
index 2315e36..a2a5e9e 100644
--- a/test/CodeGen/PowerPC/2008-12-12-EH.ll
+++ b/test/CodeGen/PowerPC/2008-12-12-EH.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s  -march=ppc32 -mtriple=powerpc-apple-darwin9 | grep ^__Z1fv.eh
+; RUN: llc < %s -disable-cfi -march=ppc32 -mtriple=powerpc-apple-darwin9 | grep ^__Z1fv.eh
 
 define void @_Z1fv() {
 entry:
diff --git a/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll b/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
index d094509..6b31397 100644
--- a/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
+++ b/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc32 -mcpu=g5 -mtriple=powerpc-apple-darwin10.0 | FileCheck %s
+; RUN: llc < %s -march=ppc32 -mcpu=g5 -mtriple=powerpc-apple-darwin10.0 -join-physregs | FileCheck %s
 ; ModuleID = 'nn.c'
 target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
 target triple = "powerpc-apple-darwin11.0"
diff --git a/test/CodeGen/PowerPC/and-elim.ll b/test/CodeGen/PowerPC/and-elim.ll
index 3685361..a1ec29b 100644
--- a/test/CodeGen/PowerPC/and-elim.ll
+++ b/test/CodeGen/PowerPC/and-elim.ll
@@ -9,7 +9,7 @@ define void @test(i8* %P) {
 	ret void
 }
 
-define i16 @test2(i16 zeroext %crc) zeroext { 
+define zeroext i16 @test2(i16 zeroext %crc)  { 
         ; No and's should be needed for the i16s here.
         %tmp.1 = lshr i16 %crc, 1
         %tmp.7 = xor i16 %tmp.1, 40961
diff --git a/test/CodeGen/PowerPC/and_sext.ll b/test/CodeGen/PowerPC/and_sext.ll
index c6d234e..df48ccf 100644
--- a/test/CodeGen/PowerPC/and_sext.ll
+++ b/test/CodeGen/PowerPC/and_sext.ll
@@ -9,7 +9,7 @@ define i32 @test1(i32 %mode.0.i.0) {
         ret i32 %tmp.81
 }
 
-define i16 @test2(i16 signext %X, i16 signext %x) signext {
+define signext i16 @test2(i16 signext %X, i16 signext %x)  {
         %tmp = sext i16 %X to i32
         %tmp1 = sext i16 %x to i32
         %tmp2 = add i32 %tmp, %tmp1
@@ -20,7 +20,7 @@ define i16 @test2(i16 signext %X, i16 signext %x) signext {
         ret i16 %retval
 }
 
-define i16 @test3(i32 zeroext %X) signext {
+define signext i16 @test3(i32 zeroext %X)  {
         %tmp1 = lshr i32 %X, 16
         %tmp2 = trunc i32 %tmp1 to i16
         ret i16 %tmp2
diff --git a/test/CodeGen/PowerPC/big-endian-formal-args.ll b/test/CodeGen/PowerPC/big-endian-formal-args.ll
index e46e1ec..318ccb0 100644
--- a/test/CodeGen/PowerPC/big-endian-formal-args.ll
+++ b/test/CodeGen/PowerPC/big-endian-formal-args.ll
@@ -1,14 +1,12 @@
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-unknown-linux-gnu | \
-; RUN:   grep {li 6, 3}
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-unknown-linux-gnu | \
-; RUN:   grep {li 4, 2}
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-unknown-linux-gnu | \
-; RUN:   grep {li 3, 0}
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-unknown-linux-gnu | \
-; RUN:   grep {mr 5, 3}
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-unknown-linux-gnu | FileCheck %s
 
 declare void @bar(i64 %x, i64 %y)
 
+; CHECK: li 4, 2
+; CHECK: li {{[53]}}, 0
+; CHECK: li 6, 3
+; CHECK: mr {{[53]}}, {{[53]}}
+
 define void @foo() {
   call void @bar(i64 2, i64 3)
   ret void
diff --git a/test/CodeGen/PowerPC/calls.ll b/test/CodeGen/PowerPC/calls.ll
index 0db184f..29bcb20 100644
--- a/test/CodeGen/PowerPC/calls.ll
+++ b/test/CodeGen/PowerPC/calls.ll
@@ -1,7 +1,7 @@
 ; Test various forms of calls.
 
 ; RUN: llc < %s -march=ppc32 | \
-; RUN:   grep {bl } | count 2
+; RUN:   grep {bl } | count 1
 ; RUN: llc < %s -march=ppc32 | \
 ; RUN:   grep {bctrl} | count 1
 ; RUN: llc < %s -march=ppc32 | \
@@ -14,11 +14,6 @@ define void @test_direct() {
         ret void
 }
 
-define void @test_extsym(i8* %P) {
-        free i8* %P
-        ret void
-}
-
 define void @test_indirect(void ()* %fp) {
         call void %fp( )
         ret void
diff --git a/test/CodeGen/PowerPC/indirectbr.ll b/test/CodeGen/PowerPC/indirectbr.ll
index ac56625..29c620e 100644
--- a/test/CodeGen/PowerPC/indirectbr.ll
+++ b/test/CodeGen/PowerPC/indirectbr.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -relocation-model=pic -march=ppc32 -mtriple=powerpc-apple-darwin | FileCheck %s -check-prefix=PIC
 ; RUN: llc < %s -relocation-model=static -march=ppc32 -mtriple=powerpc-apple-darwin | FileCheck %s -check-prefix=STATIC
+; RUN: llc < %s -relocation-model=pic -march=ppc64 -mtriple=powerpc64-apple-darwin | FileCheck %s -check-prefix=PPC64
 
 @nextaddr = global i8* null                       ; <i8**> [#uses=2]
 @C.0.2070 = private constant [5 x i8*] [i8* blockaddress(@foo, %L1), i8* blockaddress(@foo, %L2), i8* blockaddress(@foo, %L3), i8* blockaddress(@foo, %L4), i8* blockaddress(@foo, %L5)] ; <[5 x i8*]*> [#uses=1]
@@ -7,6 +8,7 @@
 define internal i32 @foo(i32 %i) nounwind {
 ; PIC: foo:
 ; STATIC: foo:
+; PPC64: foo:
 entry:
   %0 = load i8** @nextaddr, align 4               ; <i8*> [#uses=2]
   %1 = icmp eq i8* %0, null                       ; <i1> [#uses=1]
@@ -18,6 +20,8 @@ bb2:                                              ; preds = %entry, %bb3
 ; PIC-NEXT: bctr
 ; STATIC: mtctr
 ; STATIC-NEXT: bctr
+; PPC64: mtctr
+; PPC64-NEXT: bctr
   indirectbr i8* %gotovar.4.0, [label %L5, label %L4, label %L3, label %L2, label %L1]
 
 bb3:                                              ; preds = %entry
diff --git a/test/CodeGen/PowerPC/mul-with-overflow.ll b/test/CodeGen/PowerPC/mul-with-overflow.ll
index f03e3cb..76d06df 100644
--- a/test/CodeGen/PowerPC/mul-with-overflow.ll
+++ b/test/CodeGen/PowerPC/mul-with-overflow.ll
@@ -1,14 +1,14 @@
 ; RUN: llc < %s -march=ppc32
 
 declare {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-define i1 @a(i32 %x) zeroext nounwind {
+define zeroext i1 @a(i32 %x)  nounwind {
   %res = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %x, i32 3)
   %obil = extractvalue {i32, i1} %res, 1
   ret i1 %obil
 }
 
 declare {i32, i1} @llvm.smul.with.overflow.i32(i32 %a, i32 %b)
-define i1 @b(i32 %x) zeroext nounwind {
+define zeroext i1 @b(i32 %x)  nounwind {
   %res = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %x, i32 3)
   %obil = extractvalue {i32, i1} %res, 1
   ret i1 %obil
diff --git a/test/CodeGen/PowerPC/mulhs.ll b/test/CodeGen/PowerPC/mulhs.ll
index 9ab8d99..5b02e18 100644
--- a/test/CodeGen/PowerPC/mulhs.ll
+++ b/test/CodeGen/PowerPC/mulhs.ll
@@ -5,7 +5,7 @@
 ; RUN: not grep add %t 
 ; RUN: grep mulhw %t | count 1
 
-define i32 @mulhs(i32 %a, i32 %b) {
+define i32 @mulhs(i32 %a, i32 %b) nounwind {
 entry:
         %tmp.1 = sext i32 %a to i64             ; <i64> [#uses=1]
         %tmp.3 = sext i32 %b to i64             ; <i64> [#uses=1]
diff --git a/test/CodeGen/PowerPC/ppc-prologue.ll b/test/CodeGen/PowerPC/ppc-prologue.ll
index 2ebfd3c..5538371 100644
--- a/test/CodeGen/PowerPC/ppc-prologue.ll
+++ b/test/CodeGen/PowerPC/ppc-prologue.ll
@@ -5,9 +5,7 @@ define i32 @_Z4funci(i32 %a) ssp {
 ; CHECK-NEXT:  stw r31, -4(r1)
 ; CHECK-NEXT:  stw r0, 8(r1)
 ; CHECK-NEXT:  stwu r1, -80(r1)
-; CHECK-NEXT: Ltmp0:
-; CHECK-NEXT:  mr r31, r1
-; CHECK-NEXT: Ltmp1:
+; CHECK:  mr r31, r1
 entry:
   %a_addr = alloca i32                            ; <i32*> [#uses=2]
   %retval = alloca i32                            ; <i32*> [#uses=2]
diff --git a/test/CodeGen/PowerPC/small-arguments.ll b/test/CodeGen/PowerPC/small-arguments.ll
index 31bcee6..b4767b0 100644
--- a/test/CodeGen/PowerPC/small-arguments.ll
+++ b/test/CodeGen/PowerPC/small-arguments.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=ppc32 | not grep {extsh\\|rlwinm}
 
-declare i16 @foo() signext 
+declare signext i16 @foo()  
 
 define i32 @test1(i16 signext %X) {
 	%Y = sext i16 %X to i32  ;; dead
@@ -14,12 +14,12 @@ define i32 @test2(i16 zeroext %X) {
 }
 
 define void @test3() {
-	%tmp.0 = call i16 @foo() signext            ;; no extsh!
+	%tmp.0 = call signext i16 @foo()             ;; no extsh!
 	%tmp.1 = icmp slt i16 %tmp.0, 1234
 	br i1 %tmp.1, label %then, label %UnifiedReturnBlock
 
 then:	
-	call i32 @test1(i16 0 signext)
+	call i32 @test1(i16 signext 0)
 	ret void
 UnifiedReturnBlock:
 	ret void
@@ -46,7 +46,7 @@ define i32 @test6(i32* %P) {
         ret i32 %tmp.2
 }
 
-define i16 @test7(float %a) zeroext {
+define zeroext i16 @test7(float %a)  {
         %tmp.1 = fptoui float %a to i16
         ret i16 %tmp.1
 }
diff --git a/test/CodeGen/SPARC/2011-01-22-SRet.ll b/test/CodeGen/SPARC/2011-01-22-SRet.ll
index 506d3a8..5393392 100644
--- a/test/CodeGen/SPARC/2011-01-22-SRet.ll
+++ b/test/CodeGen/SPARC/2011-01-22-SRet.ll
@@ -6,7 +6,6 @@ define weak void @make_foo(%struct.foo_t* noalias sret %agg.result, i32 %a, i32
 entry:
 ;CHECK: make_foo
 ;CHECK: ld [%fp+64], {{.+}}
-;CHECK: or {{.+}}, {{.+}}, %i0
 ;CHECK: jmp %i7+12
   %0 = getelementptr inbounds %struct.foo_t* %agg.result, i32 0, i32 0
   store i32 %a, i32* %0, align 4
diff --git a/test/CodeGen/SystemZ/02-MemArith.ll b/test/CodeGen/SystemZ/02-MemArith.ll
index 04022a0..ee9e5e9 100644
--- a/test/CodeGen/SystemZ/02-MemArith.ll
+++ b/test/CodeGen/SystemZ/02-MemArith.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=systemz | FileCheck %s
 
-define i32 @foo1(i32 %a, i32 *%b, i64 %idx) signext {
+define signext i32 @foo1(i32 %a, i32 *%b, i64 %idx)  {
 ; CHECK: foo1:
 ; CHECK:  a %r2, 4(%r1,%r3)
 entry:
@@ -11,7 +11,7 @@ entry:
     ret i32 %d
 }
 
-define i32 @foo2(i32 %a, i32 *%b, i64 %idx) signext {
+define signext i32 @foo2(i32 %a, i32 *%b, i64 %idx) {
 ; CHECK: foo2:
 ; CHECK:  ay %r2, -4(%r1,%r3)
 entry:
@@ -22,7 +22,7 @@ entry:
     ret i32 %d
 }
 
-define i64 @foo3(i64 %a, i64 *%b, i64 %idx) signext {
+define signext i64 @foo3(i64 %a, i64 *%b, i64 %idx)  {
 ; CHECK: foo3:
 ; CHECK:  ag %r2, 8(%r1,%r3)
 entry:
@@ -33,7 +33,7 @@ entry:
     ret i64 %d
 }
 
-define i32 @foo4(i32 %a, i32 *%b, i64 %idx) signext {
+define signext i32 @foo4(i32 %a, i32 *%b, i64 %idx)  {
 ; CHECK: foo4:
 ; CHECK:  n %r2, 4(%r1,%r3)
 entry:
@@ -44,7 +44,7 @@ entry:
     ret i32 %d
 }
 
-define i32 @foo5(i32 %a, i32 *%b, i64 %idx) signext {
+define signext i32 @foo5(i32 %a, i32 *%b, i64 %idx) {
 ; CHECK: foo5:
 ; CHECK:  ny %r2, -4(%r1,%r3)
 entry:
@@ -55,7 +55,7 @@ entry:
     ret i32 %d
 }
 
-define i64 @foo6(i64 %a, i64 *%b, i64 %idx) signext {
+define signext i64 @foo6(i64 %a, i64 *%b, i64 %idx)  {
 ; CHECK: foo6:
 ; CHECK:  ng %r2, 8(%r1,%r3)
 entry:
@@ -66,7 +66,7 @@ entry:
     ret i64 %d
 }
 
-define i32 @foo7(i32 %a, i32 *%b, i64 %idx) signext {
+define signext i32 @foo7(i32 %a, i32 *%b, i64 %idx) {
 ; CHECK: foo7:
 ; CHECK:  o %r2, 4(%r1,%r3)
 entry:
@@ -77,7 +77,7 @@ entry:
     ret i32 %d
 }
 
-define i32 @foo8(i32 %a, i32 *%b, i64 %idx) signext {
+define signext i32 @foo8(i32 %a, i32 *%b, i64 %idx)  {
 ; CHECK: foo8:
 ; CHECK:  oy %r2, -4(%r1,%r3)
 entry:
@@ -88,7 +88,7 @@ entry:
     ret i32 %d
 }
 
-define i64 @foo9(i64 %a, i64 *%b, i64 %idx) signext {
+define signext i64 @foo9(i64 %a, i64 *%b, i64 %idx)  {
 ; CHECK: foo9:
 ; CHECK:  og %r2, 8(%r1,%r3)
 entry:
@@ -99,7 +99,7 @@ entry:
     ret i64 %d
 }
 
-define i32 @foo10(i32 %a, i32 *%b, i64 %idx) signext {
+define signext i32 @foo10(i32 %a, i32 *%b, i64 %idx)  {
 ; CHECK: foo10:
 ; CHECK:  x %r2, 4(%r1,%r3)
 entry:
@@ -110,7 +110,7 @@ entry:
     ret i32 %d
 }
 
-define i32 @foo11(i32 %a, i32 *%b, i64 %idx) signext {
+define signext i32 @foo11(i32 %a, i32 *%b, i64 %idx)  {
 ; CHECK: foo11:
 ; CHECK:  xy %r2, -4(%r1,%r3)
 entry:
@@ -121,7 +121,7 @@ entry:
     ret i32 %d
 }
 
-define i64 @foo12(i64 %a, i64 *%b, i64 %idx) signext {
+define signext i64 @foo12(i64 %a, i64 *%b, i64 %idx)  {
 ; CHECK: foo12:
 ; CHECK:  xg %r2, 8(%r1,%r3)
 entry:
diff --git a/test/CodeGen/SystemZ/03-RetAddImmSubreg.ll b/test/CodeGen/SystemZ/03-RetAddImmSubreg.ll
index 0a81271..0a7f5ee 100644
--- a/test/CodeGen/SystemZ/03-RetAddImmSubreg.ll
+++ b/test/CodeGen/SystemZ/03-RetAddImmSubreg.ll
@@ -16,25 +16,25 @@ entry:
     ret i32 %c
 }
 
-define i32 @foo3(i32 %a, i32 %b) zeroext {
+define zeroext i32 @foo3(i32 %a, i32 %b)  {
 entry:
     %c = add i32 %a, 1
     ret i32 %c
 }
 
-define i32 @foo4(i32 %a, i32 %b) zeroext {
+define zeroext i32 @foo4(i32 %a, i32 %b)  {
 entry:
     %c = add i32 %a, 131072
     ret i32 %c
 }
 
-define i32 @foo5(i32 %a, i32 %b) signext {
+define signext i32 @foo5(i32 %a, i32 %b)  {
 entry:
     %c = add i32 %a, 1
     ret i32 %c
 }
 
-define i32 @foo6(i32 %a, i32 %b) signext {
+define signext i32 @foo6(i32 %a, i32 %b)  {
 entry:
     %c = add i32 %a, 131072
     ret i32 %c
diff --git a/test/CodeGen/SystemZ/03-RetAddSubreg.ll b/test/CodeGen/SystemZ/03-RetAddSubreg.ll
index 2787083..337bb3f 100644
--- a/test/CodeGen/SystemZ/03-RetAddSubreg.ll
+++ b/test/CodeGen/SystemZ/03-RetAddSubreg.ll
@@ -8,13 +8,13 @@ entry:
     ret i32 %c
 }
 
-define i32 @foo1(i32 %a, i32 %b) zeroext {
+define zeroext i32 @foo1(i32 %a, i32 %b)  {
 entry:
     %c = add i32 %a, %b
     ret i32 %c
 }
 
-define i32 @foo2(i32 %a, i32 %b) signext {
+define signext i32 @foo2(i32 %a, i32 %b)  {
 entry:
     %c = add i32 %a, %b
     ret i32 %c
diff --git a/test/CodeGen/SystemZ/03-RetAndImmSubreg.ll b/test/CodeGen/SystemZ/03-RetAndImmSubreg.ll
index 32673dd..c5326ab 100644
--- a/test/CodeGen/SystemZ/03-RetAndImmSubreg.ll
+++ b/test/CodeGen/SystemZ/03-RetAndImmSubreg.ll
@@ -12,25 +12,25 @@ entry:
     ret i32 %c
 }
 
-define i32 @foo3(i32 %a, i32 %b) zeroext {
+define zeroext i32 @foo3(i32 %a, i32 %b)  {
 entry:
     %c = and i32 %a, 1
     ret i32 %c
 }
 
-define i32 @foo4(i32 %a, i32 %b) signext {
+define signext i32 @foo4(i32 %a, i32 %b)  {
 entry:
     %c = and i32 %a, 131072
     ret i32 %c
 }
 
-define i32 @foo5(i32 %a, i32 %b) zeroext {
+define zeroext i32 @foo5(i32 %a, i32 %b)  {
 entry:
     %c = and i32 %a, 1
     ret i32 %c
 }
 
-define i32 @foo6(i32 %a, i32 %b) signext {
+define signext i32 @foo6(i32 %a, i32 %b)  {
 entry:
     %c = and i32 %a, 131072
     ret i32 %c
diff --git a/test/CodeGen/SystemZ/03-RetAndSubreg.ll b/test/CodeGen/SystemZ/03-RetAndSubreg.ll
index ed5e526..75dc90a 100644
--- a/test/CodeGen/SystemZ/03-RetAndSubreg.ll
+++ b/test/CodeGen/SystemZ/03-RetAndSubreg.ll
@@ -7,13 +7,13 @@ entry:
     ret i32 %c
 }
 
-define i32 @foo1(i32 %a, i32 %b) zeroext {
+define zeroext i32 @foo1(i32 %a, i32 %b)  {
 entry:
     %c = and i32 %a, %b
     ret i32 %c
 }
 
-define i32 @foo2(i32 %a, i32 %b) signext {
+define signext i32 @foo2(i32 %a, i32 %b)  {
 entry:
     %c = and i32 %a, %b
     ret i32 %c
diff --git a/test/CodeGen/SystemZ/03-RetArgSubreg.ll b/test/CodeGen/SystemZ/03-RetArgSubreg.ll
index 0c9bb14..476821a 100644
--- a/test/CodeGen/SystemZ/03-RetArgSubreg.ll
+++ b/test/CodeGen/SystemZ/03-RetArgSubreg.ll
@@ -8,12 +8,12 @@ entry:
     ret i32 %b
 }
 
-define i32 @foo1(i32 %a, i32 %b) zeroext {
+define zeroext i32 @foo1(i32 %a, i32 %b)  {
 entry:
     ret i32 %b
 }
 
-define i32 @foo2(i32 %a, i32 %b) signext {
+define signext i32 @foo2(i32 %a, i32 %b)  {
 entry:
     ret i32 %b
 }
diff --git a/test/CodeGen/SystemZ/03-RetImmSubreg.ll b/test/CodeGen/SystemZ/03-RetImmSubreg.ll
index 343e30b..70da913 100644
--- a/test/CodeGen/SystemZ/03-RetImmSubreg.ll
+++ b/test/CodeGen/SystemZ/03-RetImmSubreg.ll
@@ -30,12 +30,12 @@ entry:
     ret i32 4294967295
 }
 
-define i32 @foo6() zeroext {
+define zeroext i32 @foo6()  {
 entry:
     ret i32 4294967295
 }
 
-define i32 @foo7() signext {
+define signext i32 @foo7()  {
 entry:
     ret i32 4294967295
 }
diff --git a/test/CodeGen/SystemZ/03-RetOrImmSubreg.ll b/test/CodeGen/SystemZ/03-RetOrImmSubreg.ll
index 6d118b5..99adea8 100644
--- a/test/CodeGen/SystemZ/03-RetOrImmSubreg.ll
+++ b/test/CodeGen/SystemZ/03-RetOrImmSubreg.ll
@@ -22,37 +22,37 @@ entry:
     ret i32 %c
 }
 
-define i32 @foo3(i32 %a, i32 %b) zeroext {
+define zeroext i32 @foo3(i32 %a, i32 %b)  {
 entry:
     %c = or i32 %a, 1
     ret i32 %c
 }
 
-define i32 @foo8(i32 %a, i32 %b) zeroext {
+define zeroext i32 @foo8(i32 %a, i32 %b)  {
 entry:
     %c = or i32 %a, 123456
     ret i32 %c
 }
 
-define i32 @foo4(i32 %a, i32 %b) signext {
+define signext i32 @foo4(i32 %a, i32 %b)  {
 entry:
     %c = or i32 %a, 131072
     ret i32 %c
 }
 
-define i32 @foo5(i32 %a, i32 %b) zeroext {
+define zeroext i32 @foo5(i32 %a, i32 %b)  {
 entry:
     %c = or i32 %a, 1
     ret i32 %c
 }
 
-define i32 @foo6(i32 %a, i32 %b) signext {
+define signext i32 @foo6(i32 %a, i32 %b)  {
 entry:
     %c = or i32 %a, 131072
     ret i32 %c
 }
 
-define i32 @foo9(i32 %a, i32 %b) signext {
+define signext i32 @foo9(i32 %a, i32 %b)  {
 entry:
     %c = or i32 %a, 123456
     ret i32 %c
diff --git a/test/CodeGen/SystemZ/03-RetOrSubreg.ll b/test/CodeGen/SystemZ/03-RetOrSubreg.ll
index 4d7661a..7dab5ca 100644
--- a/test/CodeGen/SystemZ/03-RetOrSubreg.ll
+++ b/test/CodeGen/SystemZ/03-RetOrSubreg.ll
@@ -9,13 +9,13 @@ entry:
     ret i32 %c
 }
 
-define i32 @foo1(i32 %a, i32 %b) zeroext {
+define zeroext i32 @foo1(i32 %a, i32 %b)  {
 entry:
     %c = or i32 %a, %b
     ret i32 %c
 }
 
-define i32 @foo2(i32 %a, i32 %b) signext {
+define signext i32 @foo2(i32 %a, i32 %b)  {
 entry:
     %c = or i32 %a, %b
     ret i32 %c
diff --git a/test/CodeGen/SystemZ/03-RetSubImmSubreg.ll b/test/CodeGen/SystemZ/03-RetSubImmSubreg.ll
index 11ca796..21ea9b5 100644
--- a/test/CodeGen/SystemZ/03-RetSubImmSubreg.ll
+++ b/test/CodeGen/SystemZ/03-RetSubImmSubreg.ll
@@ -16,25 +16,25 @@ entry:
     ret i32 %c
 }
 
-define i32 @foo3(i32 %a, i32 %b) zeroext {
+define zeroext i32 @foo3(i32 %a, i32 %b)  {
 entry:
     %c = sub i32 %a, 1
     ret i32 %c
 }
 
-define i32 @foo4(i32 %a, i32 %b) signext {
+define signext i32 @foo4(i32 %a, i32 %b)  {
 entry:
     %c = sub i32 %a, 131072
     ret i32 %c
 }
 
-define i32 @foo5(i32 %a, i32 %b) zeroext {
+define zeroext i32 @foo5(i32 %a, i32 %b)  {
 entry:
     %c = sub i32 %a, 1
     ret i32 %c
 }
 
-define i32 @foo6(i32 %a, i32 %b) signext {
+define signext i32 @foo6(i32 %a, i32 %b)  {
 entry:
     %c = sub i32 %a, 131072
     ret i32 %c
diff --git a/test/CodeGen/SystemZ/03-RetSubSubreg.ll b/test/CodeGen/SystemZ/03-RetSubSubreg.ll
index b3e1ac2..24b7631 100644
--- a/test/CodeGen/SystemZ/03-RetSubSubreg.ll
+++ b/test/CodeGen/SystemZ/03-RetSubSubreg.ll
@@ -8,13 +8,13 @@ entry:
     ret i32 %c
 }
 
-define i32 @foo1(i32 %a, i32 %b) zeroext {
+define zeroext i32 @foo1(i32 %a, i32 %b)  {
 entry:
     %c = sub i32 %a, %b
     ret i32 %c
 }
 
-define i32 @foo2(i32 %a, i32 %b) signext {
+define signext i32 @foo2(i32 %a, i32 %b)  {
 entry:
     %c = sub i32 %a, %b
     ret i32 %c
diff --git a/test/CodeGen/SystemZ/03-RetXorImmSubreg.ll b/test/CodeGen/SystemZ/03-RetXorImmSubreg.ll
index 0033126..70ee454 100644
--- a/test/CodeGen/SystemZ/03-RetXorImmSubreg.ll
+++ b/test/CodeGen/SystemZ/03-RetXorImmSubreg.ll
@@ -20,37 +20,37 @@ entry:
     ret i32 %c
 }
 
-define i32 @foo3(i32 %a, i32 %b) zeroext {
+define zeroext i32 @foo3(i32 %a, i32 %b)  {
 entry:
     %c = xor i32 %a, 1
     ret i32 %c
 }
 
-define i32 @foo8(i32 %a, i32 %b) zeroext {
+define zeroext i32 @foo8(i32 %a, i32 %b)  {
 entry:
     %c = xor i32 %a, 123456
     ret i32 %c
 }
 
-define i32 @foo4(i32 %a, i32 %b) signext {
+define signext i32 @foo4(i32 %a, i32 %b)  {
 entry:
     %c = xor i32 %a, 131072
     ret i32 %c
 }
 
-define i32 @foo5(i32 %a, i32 %b) zeroext {
+define zeroext i32 @foo5(i32 %a, i32 %b)  {
 entry:
     %c = xor i32 %a, 1
     ret i32 %c
 }
 
-define i32 @foo6(i32 %a, i32 %b) signext {
+define signext i32 @foo6(i32 %a, i32 %b)  {
 entry:
     %c = xor i32 %a, 131072
     ret i32 %c
 }
 
-define i32 @foo9(i32 %a, i32 %b) signext {
+define signext i32 @foo9(i32 %a, i32 %b)  {
 entry:
     %c = xor i32 %a, 123456
     ret i32 %c
diff --git a/test/CodeGen/SystemZ/03-RetXorSubreg.ll b/test/CodeGen/SystemZ/03-RetXorSubreg.ll
index a9af231..02c4a2a 100644
--- a/test/CodeGen/SystemZ/03-RetXorSubreg.ll
+++ b/test/CodeGen/SystemZ/03-RetXorSubreg.ll
@@ -9,13 +9,13 @@ entry:
     ret i32 %c
 }
 
-define i32 @foo1(i32 %a, i32 %b) zeroext {
+define zeroext i32 @foo1(i32 %a, i32 %b)  {
 entry:
     %c = xor i32 %a, %b
     ret i32 %c
 }
 
-define i32 @foo2(i32 %a, i32 %b) signext {
+define signext i32 @foo2(i32 %a, i32 %b)  {
 entry:
     %c = xor i32 %a, %b
     ret i32 %c
diff --git a/test/CodeGen/SystemZ/11-BSwap.ll b/test/CodeGen/SystemZ/11-BSwap.ll
index b170a80..1aa9c67 100644
--- a/test/CodeGen/SystemZ/11-BSwap.ll
+++ b/test/CodeGen/SystemZ/11-BSwap.ll
@@ -5,34 +5,34 @@ target datalayout = "E-p:64:64:64-i8:8:16-i16:16:16-i32:32:32-i64:64:64-f32:32:3
 target triple = "s390x-ibm-linux"
 
 
-define i16 @foo(i16 zeroext %a) zeroext {
+define zeroext i16 @foo(i16 zeroext %a)  {
 	%res = tail call i16 @llvm.bswap.i16(i16 %a)
 	ret i16 %res
 }
 
-define i32 @foo2(i32 zeroext %a) zeroext {
+define zeroext i32 @foo2(i32 zeroext %a)  {
 ; CHECK: foo2:
-; CHECK:  lrvr %r1, %r2
+; CHECK:  lrvr [[R1:%r.]], %r2
         %res = tail call i32 @llvm.bswap.i32(i32 %a)
         ret i32 %res
 }
 
-define i64 @foo3(i64 %a) zeroext {
+define zeroext i64 @foo3(i64 %a)  {
 ; CHECK: foo3:
 ; CHECK:  lrvgr %r2, %r2
         %res = tail call i64 @llvm.bswap.i64(i64 %a)
         ret i64 %res
 }
 
-define i16 @foo4(i16* %b) zeroext {
+define zeroext i16 @foo4(i16* %b)  {
 	%a = load i16* %b
         %res = tail call i16 @llvm.bswap.i16(i16 %a)
         ret i16 %res
 }
 
-define i32 @foo5(i32* %b) zeroext {
+define zeroext i32 @foo5(i32* %b)  {
 ; CHECK: foo5:
-; CHECK:  lrv %r1, 0(%r2)
+; CHECK:  lrv [[R1:%r.]], 0(%r2)
 	%a = load i32* %b
         %res = tail call i32 @llvm.bswap.i32(i32 %a)
         ret i32 %res
diff --git a/test/CodeGen/Thumb/2007-03-06-AddR7.ll b/test/CodeGen/Thumb/2007-03-06-AddR7.ll
deleted file mode 100644
index 8d139e9..0000000
--- a/test/CodeGen/Thumb/2007-03-06-AddR7.ll
+++ /dev/null
@@ -1,117 +0,0 @@
-; RUN: llc < %s -march=thumb
-; RUN: llc < %s -mtriple=thumb-apple-darwin -relocation-model=pic \
-; RUN:   -mattr=+v6,+vfp2 | not grep {add r., r7, #2 \\* 4}
-
-	%struct.__fooAllocator = type opaque
-	%struct.__fooY = type { %struct.fooXBase, %struct.__fooString*, %struct.__fooU*, %struct.__fooV*, i8** }
-	%struct.__fooZ = type opaque
-	%struct.__fooU = type opaque
-	%struct.__fooString = type opaque
-	%struct.__fooV = type opaque
-	%struct.fooXBase = type { i32, [4 x i8] }
-	%struct.fooXClass = type { i32, i8*, void (i8*)*, i8* (%struct.__fooAllocator*, i8*)*, void (i8*)*, i8 (i8*, i8*) zeroext *, i32 (i8*)*, %struct.__fooString* (i8*, %struct.__fooZ*)*, %struct.__fooString* (i8*)* }
-	%struct.aa_cache = type { i32, i32, [1 x %struct.aa_method*] }
-	%struct.aa_class = type { %struct.aa_class*, %struct.aa_class*, i8*, i32, i32, i32, %struct.aa_ivar_list*, %struct.aa_method_list**, %struct.aa_cache*, %struct.aa_protocol_list* }
-	%struct.aa_ivar = type { i8*, i8*, i32 }
-	%struct.aa_ivar_list = type { i32, [1 x %struct.aa_ivar] }
-	%struct.aa_method = type { %struct.aa_ss*, i8*, %struct.aa_object* (%struct.aa_object*, %struct.aa_ss*, ...)* }
-	%struct.aa_method_list = type { %struct.aa_method_list*, i32, [1 x %struct.aa_method] }
-	%struct.aa_object = type { %struct.aa_class* }
-	%struct.aa_protocol_list = type { %struct.aa_protocol_list*, i32, [1 x %struct.aa_object*] }
-	%struct.aa_ss = type opaque
-@__kfooYTypeID = external global i32		; <i32*> [#uses=3]
-@__fooYClass = external constant %struct.fooXClass		; <%struct.fooXClass*> [#uses=1]
-@__fooXClassTableSize = external global i32		; <i32*> [#uses=1]
-@__fooXAaClassTable = external global i32*		; <i32**> [#uses=1]
-@s.10319 = external global %struct.aa_ss*		; <%struct.aa_ss**> [#uses=2]
-@str15 = external constant [24 x i8]		; <[24 x i8]*> [#uses=1]
-
-
-define i8 @test(%struct.__fooY* %calendar, double* %atp, i8* %componentDesc, ...) zeroext  {
-entry:
-	%args = alloca i8*, align 4		; <i8**> [#uses=5]
-	%args4 = bitcast i8** %args to i8*		; <i8*> [#uses=2]
-	call void @llvm.va_start( i8* %args4 )
-	%tmp6 = load i32* @__kfooYTypeID		; <i32> [#uses=1]
-	icmp eq i32 %tmp6, 0		; <i1>:0 [#uses=1]
-	br i1 %0, label %cond_true, label %cond_next
-
-cond_true:		; preds = %entry
-	%tmp7 = call i32 @_fooXRegisterClass( %struct.fooXClass* @__fooYClass )		; <i32> [#uses=1]
-	store i32 %tmp7, i32* @__kfooYTypeID
-	br label %cond_next
-
-cond_next:		; preds = %cond_true, %entry
-	%tmp8 = load i32* @__kfooYTypeID		; <i32> [#uses=2]
-	%tmp15 = load i32* @__fooXClassTableSize		; <i32> [#uses=1]
-	icmp ugt i32 %tmp15, %tmp8		; <i1>:1 [#uses=1]
-	br i1 %1, label %cond_next18, label %cond_true58
-
-cond_next18:		; preds = %cond_next
-	%tmp21 = getelementptr %struct.__fooY* %calendar, i32 0, i32 0, i32 0		; <i32*> [#uses=1]
-	%tmp22 = load i32* %tmp21		; <i32> [#uses=2]
-	%tmp29 = load i32** @__fooXAaClassTable		; <i32*> [#uses=1]
-	%tmp31 = getelementptr i32* %tmp29, i32 %tmp8		; <i32*> [#uses=1]
-	%tmp32 = load i32* %tmp31		; <i32> [#uses=1]
-	icmp eq i32 %tmp22, %tmp32		; <i1>:2 [#uses=1]
-	%.not = xor i1 %2, true		; <i1> [#uses=1]
-	icmp ugt i32 %tmp22, 4095		; <i1>:3 [#uses=1]
-	%bothcond = and i1 %.not, %3		; <i1> [#uses=1]
-	br i1 %bothcond, label %cond_true58, label %bb48
-
-bb48:		; preds = %cond_next18
-	%tmp78 = call i32 @strlen( i8* %componentDesc )		; <i32> [#uses=4]
-	%tmp92 = alloca i32, i32 %tmp78		; <i32*> [#uses=2]
-	icmp sgt i32 %tmp78, 0		; <i1>:4 [#uses=1]
-	br i1 %4, label %cond_true111, label %bb114
-
-cond_true58:		; preds = %cond_next18, %cond_next
-	%tmp59 = load %struct.aa_ss** @s.10319		; <%struct.aa_ss*> [#uses=2]
-	icmp eq %struct.aa_ss* %tmp59, null		; <i1>:5 [#uses=1]
-	%tmp6869 = bitcast %struct.__fooY* %calendar to i8*		; <i8*> [#uses=2]
-	br i1 %5, label %cond_true60, label %cond_next64
-
-cond_true60:		; preds = %cond_true58
-	%tmp63 = call %struct.aa_ss* @sel_registerName( i8* getelementptr ([24 x i8]* @str15, i32 0, i32 0) )		; <%struct.aa_ss*> [#uses=2]
-	store %struct.aa_ss* %tmp63, %struct.aa_ss** @s.10319
-	%tmp66137 = volatile load i8** %args		; <i8*> [#uses=1]
-	%tmp73138 = call i8 (i8*, %struct.aa_ss*, ...) zeroext * bitcast (%struct.aa_object* (%struct.aa_object*, %struct.aa_ss*, ...)* @aa_mm to i8 (i8*, %struct.aa_ss*, ...) zeroext *)( i8* %tmp6869, %struct.aa_ss* %tmp63, double* %atp, i8* %componentDesc, i8* %tmp66137) zeroext 		; <i8> [#uses=1]
-	ret i8 %tmp73138
-
-cond_next64:		; preds = %cond_true58
-	%tmp66 = volatile load i8** %args		; <i8*> [#uses=1]
-	%tmp73 = call i8 (i8*, %struct.aa_ss*, ...) zeroext * bitcast (%struct.aa_object* (%struct.aa_object*, %struct.aa_ss*, ...)* @aa_mm to i8 (i8*, %struct.aa_ss*, ...) zeroext *)( i8* %tmp6869, %struct.aa_ss* %tmp59, double* %atp, i8* %componentDesc, i8* %tmp66 ) zeroext 		; <i8> [#uses=1]
-	ret i8 %tmp73
-
-cond_true111:		; preds = %cond_true111, %bb48
-	%idx.2132.0 = phi i32 [ 0, %bb48 ], [ %indvar.next, %cond_true111 ]		; <i32> [#uses=2]
-	%tmp95 = volatile load i8** %args		; <i8*> [#uses=2]
-	%tmp97 = getelementptr i8* %tmp95, i32 4		; <i8*> [#uses=1]
-	volatile store i8* %tmp97, i8** %args
-	%tmp9899 = bitcast i8* %tmp95 to i32*		; <i32*> [#uses=1]
-	%tmp100 = load i32* %tmp9899		; <i32> [#uses=1]
-	%tmp104 = getelementptr i32* %tmp92, i32 %idx.2132.0		; <i32*> [#uses=1]
-	store i32 %tmp100, i32* %tmp104
-	%indvar.next = add i32 %idx.2132.0, 1		; <i32> [#uses=2]
-	icmp eq i32 %indvar.next, %tmp78		; <i1>:6 [#uses=1]
-	br i1 %6, label %bb114, label %cond_true111
-
-bb114:		; preds = %cond_true111, %bb48
-	call void @llvm.va_end( i8* %args4 )
-	%tmp122 = call i8 @_fooYCCV( %struct.__fooY* %calendar, double* %atp, i8* %componentDesc, i32* %tmp92, i32 %tmp78 ) zeroext 		; <i8> [#uses=1]
-	ret i8 %tmp122
-}
-
-declare i32 @_fooXRegisterClass(%struct.fooXClass*)
-
-declare i8 @_fooYCCV(%struct.__fooY*, double*, i8*, i32*, i32) zeroext 
-
-declare %struct.aa_object* @aa_mm(%struct.aa_object*, %struct.aa_ss*, ...)
-
-declare %struct.aa_ss* @sel_registerName(i8*)
-
-declare void @llvm.va_start(i8*)
-
-declare i32 @strlen(i8*)
-
-declare void @llvm.va_end(i8*)
diff --git a/test/CodeGen/Thumb/2009-07-19-SPDecBug.ll b/test/CodeGen/Thumb/2009-07-19-SPDecBug.ll
deleted file mode 100644
index 9cdcd31..0000000
--- a/test/CodeGen/Thumb/2009-07-19-SPDecBug.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-; RUN: llc < %s -mtriple=thumbv6-elf | not grep "subs sp"
-; PR4567
-
-define i8* @__gets_chk(i8* %s, i32 %slen) nounwind {
-entry:
-	br i1 undef, label %bb, label %bb1
-
-bb:		; preds = %entry
-	ret i8* undef
-
-bb1:		; preds = %entry
-	br i1 undef, label %bb3, label %bb2
-
-bb2:		; preds = %bb1
-	%0 = alloca i8, i32 undef, align 4		; <i8*> [#uses=0]
-	br label %bb4
-
-bb3:		; preds = %bb1
-	%1 = malloc i8, i32 undef		; <i8*> [#uses=0]
-	br label %bb4
-
-bb4:		; preds = %bb3, %bb2
-	br i1 undef, label %bb5, label %bb6
-
-bb5:		; preds = %bb4
-	%2 = call  i8* @gets(i8* %s) nounwind		; <i8*> [#uses=1]
-	ret i8* %2
-
-bb6:		; preds = %bb4
-	unreachable
-}
-
-declare i8* @gets(i8*) nounwind
diff --git a/test/CodeGen/Thumb/2009-08-20-ISelBug.ll b/test/CodeGen/Thumb/2009-08-20-ISelBug.ll
index d6ca0d7..7876557 100644
--- a/test/CodeGen/Thumb/2009-08-20-ISelBug.ll
+++ b/test/CodeGen/Thumb/2009-08-20-ISelBug.ll
@@ -11,7 +11,7 @@
 
 define i32 @t(%struct.asl_file_t* %s, i64 %off, i64* %out) nounwind optsize {
 ; CHECK: t:
-; CHECK: adds r0, #8
+; CHECK: adds {{r[0-7]}}, #8
 entry:
   %val = alloca i64, align 4                      ; <i64*> [#uses=3]
   %0 = icmp eq %struct.asl_file_t* %s, null       ; <i1> [#uses=1]
diff --git a/test/CodeGen/Thumb/2010-01-15-local-alloc-spill-physical.ll b/test/CodeGen/Thumb/2010-01-15-local-alloc-spill-physical.ll
deleted file mode 100644
index fad2669..0000000
--- a/test/CodeGen/Thumb/2010-01-15-local-alloc-spill-physical.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; RUN: llc < %s -regalloc=fast -relocation-model=pic | FileCheck %s
-
-target triple = "thumbv6-apple-darwin10"
-
-@fred = internal global i32 0              ; <i32*> [#uses=1]
-
-define void @foo() nounwind {
-entry:
-; CHECK: str r0, [sp
-  %0 = call  i32 (...)* @bar() nounwind ; <i32> [#uses=1]
-; CHECK: blx _bar
-; CHECK: ldr r1, [sp
-  store i32 %0, i32* @fred, align 4
-  br label %return
-
-return:                                           ; preds = %entry
-  ret void
-}
-
-declare i32 @bar(...)
diff --git a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
index 06c0dfe..9f5a677 100644
--- a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
+++ b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
@@ -10,7 +10,7 @@
 define void @_Z19getClosestDiagonal3ii(%0* noalias sret, i32, i32) nounwind {
 ; CHECK: blx ___muldf3
 ; CHECK: blx ___muldf3
-; CHECK: beq LBB0_7
+; CHECK: beq LBB0
 ; CHECK: blx ___muldf3
 ; <label>:3
   switch i32 %1, label %4 [
diff --git a/test/CodeGen/Thumb/2011-05-11-DAGLegalizer.ll b/test/CodeGen/Thumb/2011-05-11-DAGLegalizer.ll
new file mode 100644
index 0000000..ed55bb5
--- /dev/null
+++ b/test/CodeGen/Thumb/2011-05-11-DAGLegalizer.ll
@@ -0,0 +1,60 @@
+; RUN: llc -mtriple=thumbv6-apple-darwin < %s
+; rdar://problem/9416774
+; ModuleID = 'reduced.ll'
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-ios"
+
+%struct.MMMMMMMMMMMM = type { [4 x %struct.RRRRRRRR] }
+%struct.RRRRRRRR = type { [78 x i32] }
+
+@kkkkkk = external constant i8*
+@__PRETTY_FUNCTION__._ZN12CLGll = private unnamed_addr constant [62 x i8] c"static void tttttttttttt::lllllllllllll(const MMMMMMMMMMMM &)\00"
+@.str = private unnamed_addr constant [75 x i8] c"\09GGGGGGGGGGGGGGGGGGGGGGG:,BE:0x%08lx,ALM:0x%08lx,LTO:0x%08lx,CBEE:0x%08lx\0A\00"
+
+define void @_ZN12CLGll(%struct.MMMMMMMMMMMM* %aidData) ssp align 2 {
+entry:
+  %aidData.addr = alloca %struct.MMMMMMMMMMMM*, align 4
+  %agg.tmp = alloca %struct.RRRRRRRR, align 4
+  %agg.tmp4 = alloca %struct.RRRRRRRR, align 4
+  %agg.tmp10 = alloca %struct.RRRRRRRR, align 4
+  %agg.tmp16 = alloca %struct.RRRRRRRR, align 4
+  store %struct.MMMMMMMMMMMM* %aidData, %struct.MMMMMMMMMMMM** %aidData.addr, align 4
+  br label %do.body
+
+do.body:                                          ; preds = %entry
+  %tmp = load i8** @kkkkkk, align 4
+  %tmp1 = load %struct.MMMMMMMMMMMM** %aidData.addr
+  %eph = getelementptr inbounds %struct.MMMMMMMMMMMM* %tmp1, i32 0, i32 0
+  %arrayidx = getelementptr inbounds [4 x %struct.RRRRRRRR]* %eph, i32 0, i32 0
+  %tmp2 = bitcast %struct.RRRRRRRR* %agg.tmp to i8*
+  %tmp3 = bitcast %struct.RRRRRRRR* %arrayidx to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp2, i8* %tmp3, i32 312, i32 4, i1 false)
+  %tmp5 = load %struct.MMMMMMMMMMMM** %aidData.addr
+  %eph6 = getelementptr inbounds %struct.MMMMMMMMMMMM* %tmp5, i32 0, i32 0
+  %arrayidx7 = getelementptr inbounds [4 x %struct.RRRRRRRR]* %eph6, i32 0, i32 1
+  %tmp8 = bitcast %struct.RRRRRRRR* %agg.tmp4 to i8*
+  %tmp9 = bitcast %struct.RRRRRRRR* %arrayidx7 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp8, i8* %tmp9, i32 312, i32 4, i1 false)
+  %tmp11 = load %struct.MMMMMMMMMMMM** %aidData.addr
+  %eph12 = getelementptr inbounds %struct.MMMMMMMMMMMM* %tmp11, i32 0, i32 0
+  %arrayidx13 = getelementptr inbounds [4 x %struct.RRRRRRRR]* %eph12, i32 0, i32 2
+  %tmp14 = bitcast %struct.RRRRRRRR* %agg.tmp10 to i8*
+  %tmp15 = bitcast %struct.RRRRRRRR* %arrayidx13 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp14, i8* %tmp15, i32 312, i32 4, i1 false)
+  %tmp17 = load %struct.MMMMMMMMMMMM** %aidData.addr
+  %eph18 = getelementptr inbounds %struct.MMMMMMMMMMMM* %tmp17, i32 0, i32 0
+  %arrayidx19 = getelementptr inbounds [4 x %struct.RRRRRRRR]* %eph18, i32 0, i32 3
+  %tmp20 = bitcast %struct.RRRRRRRR* %agg.tmp16 to i8*
+  %tmp21 = bitcast %struct.RRRRRRRR* %arrayidx19 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp20, i8* %tmp21, i32 312, i32 4, i1 false)
+  call void (i8*, i32, i8*, i8*, ...)* @CLLoggingLog(i8* %tmp, i32 2, i8* getelementptr inbounds ([62 x i8]* @__PRETTY_FUNCTION__._ZN12CLGll, i32 0, i32 0), i8* getelementptr inbounds ([75 x i8]* @.str, i32 0, i32 0), %struct.RRRRRRRR* byval %agg.tmp, %struct.RRRRRRRR* byval %agg.tmp4, %struct.RRRRRRRR* byval %agg.tmp10, %struct.RRRRRRRR* byval %agg.tmp16)
+  br label %do.end
+
+do.end:                                           ; preds = %do.body
+  ret void
+}
+
+declare void @CLLoggingLog(i8*, i32, i8*, i8*, ...)
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
diff --git a/test/CodeGen/Thumb/2011-06-16-NoGPRs.ll b/test/CodeGen/Thumb/2011-06-16-NoGPRs.ll
new file mode 100644
index 0000000..d39a760
--- /dev/null
+++ b/test/CodeGen/Thumb/2011-06-16-NoGPRs.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s
+;
+; This test would crash because isel creates a GPR register for the return
+; value from f1. The register is only used by tBLXr_r9 which accepts a full GPR
+; register, but we cannot have live GPRs in thumb mode because we don't know how
+; to spill them.
+;
+; <rdar://problem/9624323>
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv6-apple-darwin10"
+
+%0 = type opaque
+
+declare i8* (i8*, i8*, ...)* @f1(i8*, i8*) optsize
+declare i8* @f2(i8*, i8*, ...)
+
+define internal void @f(i8* %self, i8* %_cmd, %0* %inObjects, %0* %inIndexes) optsize ssp {
+entry:
+  %call14 = tail call i8* (i8*, i8*, ...)* (i8*, i8*)* @f1(i8* undef, i8* %_cmd) optsize
+  %0 = bitcast i8* (i8*, i8*, ...)* %call14 to void (i8*, i8*, %0*, %0*)*
+  tail call void %0(i8* %self, i8* %_cmd, %0* %inObjects, %0* %inIndexes) optsize
+  tail call void bitcast (i8* (i8*, i8*, ...)* @f2 to void (i8*, i8*, i32, %0*, %0*)*)(i8* %self, i8* undef, i32 2, %0* %inIndexes, %0* undef) optsize
+  ret void
+}
diff --git a/test/CodeGen/Thumb/rev.ll b/test/CodeGen/Thumb/rev.ll
new file mode 100644
index 0000000..5e163f8
--- /dev/null
+++ b/test/CodeGen/Thumb/rev.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -march=thumb -mattr=+v6 | FileCheck %s
+
+define i32 @test1(i32 %X) nounwind {
+; CHECK: test1
+; CHECK: rev16 r0, r0
+        %tmp1 = lshr i32 %X, 8
+        %X15 = bitcast i32 %X to i32
+        %tmp4 = shl i32 %X15, 8
+        %tmp2 = and i32 %tmp1, 16711680
+        %tmp5 = and i32 %tmp4, -16777216
+        %tmp9 = and i32 %tmp1, 255
+        %tmp13 = and i32 %tmp4, 65280
+        %tmp6 = or i32 %tmp5, %tmp2
+        %tmp10 = or i32 %tmp6, %tmp13
+        %tmp14 = or i32 %tmp10, %tmp9
+        ret i32 %tmp14
+}
+
+define i32 @test2(i32 %X) nounwind {
+; CHECK: test2
+; CHECK: revsh r0, r0
+        %tmp1 = lshr i32 %X, 8
+        %tmp1.upgrd.1 = trunc i32 %tmp1 to i16
+        %tmp3 = trunc i32 %X to i16
+        %tmp2 = and i16 %tmp1.upgrd.1, 255
+        %tmp4 = shl i16 %tmp3, 8
+        %tmp5 = or i16 %tmp2, %tmp4
+        %tmp5.upgrd.2 = sext i16 %tmp5 to i32
+        ret i32 %tmp5.upgrd.2
+}
+
+; rdar://9147637
+define i32 @test3(i16 zeroext %a) nounwind {
+entry:
+; CHECK: test3:
+; CHECK: revsh r0, r0
+  %0 = tail call i16 @llvm.bswap.i16(i16 %a)
+  %1 = sext i16 %0 to i32
+  ret i32 %1
+}
+
+declare i16 @llvm.bswap.i16(i16) nounwind readnone
+
+define i32 @test4(i16 zeroext %a) nounwind {
+entry:
+; CHECK: test4:
+; CHECK: revsh r0, r0
+  %conv = zext i16 %a to i32
+  %shr9 = lshr i16 %a, 8
+  %conv2 = zext i16 %shr9 to i32
+  %shl = shl nuw nsw i32 %conv, 8
+  %or = or i32 %conv2, %shl
+  %sext = shl i32 %or, 16
+  %conv8 = ashr exact i32 %sext, 16
+  ret i32 %conv8
+}
diff --git a/test/CodeGen/Thumb/select.ll b/test/CodeGen/Thumb/select.ll
index 780e5fa..3f10b05 100644
--- a/test/CodeGen/Thumb/select.ll
+++ b/test/CodeGen/Thumb/select.ll
@@ -1,10 +1,5 @@
-; RUN: llc < %s -march=thumb | grep beq | count 1
-; RUN: llc < %s -march=thumb | grep bgt | count 1
-; RUN: llc < %s -march=thumb | grep blt | count 3
-; RUN: llc < %s -march=thumb | grep ble | count 1
-; RUN: llc < %s -march=thumb | grep bls | count 1
-; RUN: llc < %s -march=thumb | grep bhi | count 1
-; RUN: llc < %s -mtriple=thumb-apple-darwin | grep __ltdf2
+; RUN: llc < %s -mtriple=thumb-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=thumb-pc-linux-gnueabi | FileCheck -check-prefix=CHECK-EABI %s
 
 define i32 @f1(i32 %a.s) {
 entry:
@@ -12,6 +7,10 @@ entry:
     %tmp1.s = select i1 %tmp, i32 2, i32 3
     ret i32 %tmp1.s
 }
+; CHECK: f1:
+; CHECK: beq
+; CHECK-EABI: f1:
+; CHECK-EABI: beq
 
 define i32 @f2(i32 %a.s) {
 entry:
@@ -19,6 +18,10 @@ entry:
     %tmp1.s = select i1 %tmp, i32 2, i32 3
     ret i32 %tmp1.s
 }
+; CHECK: f2:
+; CHECK: bgt
+; CHECK-EABI: f2:
+; CHECK-EABI: bgt
 
 define i32 @f3(i32 %a.s, i32 %b.s) {
 entry:
@@ -26,6 +29,10 @@ entry:
     %tmp1.s = select i1 %tmp, i32 2, i32 3
     ret i32 %tmp1.s
 }
+; CHECK: f3:
+; CHECK: blt
+; CHECK-EABI: f3:
+; CHECK-EABI: blt
 
 define i32 @f4(i32 %a.s, i32 %b.s) {
 entry:
@@ -33,6 +40,10 @@ entry:
     %tmp1.s = select i1 %tmp, i32 2, i32 3
     ret i32 %tmp1.s
 }
+; CHECK: f4:
+; CHECK: ble
+; CHECK-EABI: f4:
+; CHECK-EABI: ble
 
 define i32 @f5(i32 %a.u, i32 %b.u) {
 entry:
@@ -40,6 +51,10 @@ entry:
     %tmp1.s = select i1 %tmp, i32 2, i32 3
     ret i32 %tmp1.s
 }
+; CHECK: f5:
+; CHECK: bls
+; CHECK-EABI: f5:
+; CHECK-EABI: bls
 
 define i32 @f6(i32 %a.u, i32 %b.u) {
 entry:
@@ -47,9 +62,21 @@ entry:
     %tmp1.s = select i1 %tmp, i32 2, i32 3
     ret i32 %tmp1.s
 }
+; CHECK: f6:
+; CHECK: bhi
+; CHECK-EABI: f6:
+; CHECK-EABI: bhi
 
 define double @f7(double %a, double %b) {
     %tmp = fcmp olt double %a, 1.234e+00
     %tmp1 = select i1 %tmp, double -1.000e+00, double %b
     ret double %tmp1
 }
+; CHECK: f7:
+; CHECK: blt
+; CHECK: blt
+; CHECK: __ltdf2
+; CHECK-EABI: f7:
+; CHECK-EABI: __aeabi_dcmplt
+; CHECK-EABI: bne
+; CHECK-EABI: bne
diff --git a/test/CodeGen/Thumb2/2009-10-15-ITBlockBranch.ll b/test/CodeGen/Thumb2/2009-10-15-ITBlockBranch.ll
index 789a891..9aee910 100644
--- a/test/CodeGen/Thumb2/2009-10-15-ITBlockBranch.ll
+++ b/test/CodeGen/Thumb2/2009-10-15-ITBlockBranch.ll
@@ -12,8 +12,8 @@
 define weak arm_aapcs_vfpcc i32 @_ZNKSs7compareERKSs(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this, %"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %__str) {
 ; CHECK: _ZNKSs7compareERKSs:
 ; CHECK:      it  eq
-; CHECK-NEXT: subeq r0, r{{[0-9]+}}, r{{[0-9]+}}
-; CHECK-NEXT: ldmia.w sp!, {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT: subeq{{(.w)?}} r0, r{{[0-9]+}}, r{{[0-9]+}}
+; CHECK-NEXT: ldmia.w sp!,
 entry:
   %0 = tail call arm_aapcs_vfpcc  i32 @_ZNKSs4sizeEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this) ; <i32> [#uses=3]
   %1 = tail call arm_aapcs_vfpcc  i32 @_ZNKSs4sizeEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %__str) ; <i32> [#uses=3]
diff --git a/test/CodeGen/Thumb2/2010-08-10-VarSizedAllocaBug.ll b/test/CodeGen/Thumb2/2010-08-10-VarSizedAllocaBug.ll
index 41f7f29..47d7a9c 100644
--- a/test/CodeGen/Thumb2/2010-08-10-VarSizedAllocaBug.ll
+++ b/test/CodeGen/Thumb2/2010-08-10-VarSizedAllocaBug.ll
@@ -7,8 +7,8 @@ entry:
 ; CHECK: Callee:
 ; CHECK: push
 ; CHECK: mov r4, sp
-; CHECK: sub.w r12, r4, #1000
-; CHECK: mov sp, r12
+; CHECK: sub.w [[R12:r[0-9]+]], r4, #1000
+; CHECK: mov sp, [[R12]]
   %0 = icmp eq i32 %i, 0                          ; <i1> [#uses=1]
   br i1 %0, label %bb2, label %bb
 
diff --git a/test/CodeGen/Thumb2/2011-04-21-FILoweringBug.ll b/test/CodeGen/Thumb2/2011-04-21-FILoweringBug.ll
new file mode 100644
index 0000000..604a352
--- /dev/null
+++ b/test/CodeGen/Thumb2/2011-04-21-FILoweringBug.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin | FileCheck %s
+
+; Use sp, #imm to lower frame indices when the offset is multiple of 4
+; and in the range of 0-1020. This saves code size by utilizing
+; 16-bit instructions.
+; rdar://9321541
+
+define i32 @t() nounwind {
+entry:
+; CHECK: t:
+; CHECK: sub sp, #12
+; CHECK-NOT: sub
+; CHECK: add r0, sp, #4
+; CHECK: add r1, sp, #8
+; CHECK: mov r2, sp
+  %size = alloca i32, align 4
+  %count = alloca i32, align 4
+  %index = alloca i32, align 4
+  %0 = call i32 @foo(i32* %count, i32* %size, i32* %index) nounwind
+  ret i32 %0
+}
+
+declare i32 @foo(i32*, i32*, i32*)
diff --git a/test/CodeGen/Thumb2/2011-06-07-TwoAddrEarlyClobber.ll b/test/CodeGen/Thumb2/2011-06-07-TwoAddrEarlyClobber.ll
new file mode 100644
index 0000000..9e6d78e
--- /dev/null
+++ b/test/CodeGen/Thumb2/2011-06-07-TwoAddrEarlyClobber.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-darwin10"
+
+%struct.op = type { %struct.op*, %struct.op*, %struct.op* ()*, i32, i16, i16, i8, i8 }
+
+; CHECK: Perl_ck_sort
+; CHECK: ldr
+; CHECK: mov [[REGISTER:(r[0-9]+)|(lr)]]
+; CHECK: str {{(r[0-9])|(lr)}}, {{\[}}[[REGISTER]]{{\]}}, #24
+
+define void @Perl_ck_sort() nounwind optsize {
+entry:
+  %tmp27 = load %struct.op** undef, align 4
+  switch i16 undef, label %if.end151 [
+    i16 178, label %if.then60
+    i16 177, label %if.then60
+  ]
+
+if.then60:                                        ; preds = %if.then40
+  br i1 undef, label %if.then67, label %if.end95
+
+if.then67:                                        ; preds = %if.then60
+  %op_next71 = getelementptr inbounds %struct.op* %tmp27, i32 0, i32 0
+  store %struct.op* %tmp27, %struct.op** %op_next71, align 4
+  %0 = getelementptr inbounds %struct.op* %tmp27, i32 1, i32 0
+  br label %if.end95
+
+if.end95:                                         ; preds = %if.else92, %if.then67
+  %.pre-phi = phi %struct.op** [ undef, %if.then60 ], [ %0, %if.then67 ]
+  %tmp98 = load %struct.op** %.pre-phi, align 4
+  br label %if.end151
+
+if.end151:                                        ; preds = %if.end100, %if.end, %entry
+  ret void
+}
diff --git a/test/CodeGen/Thumb2/bfi.ll b/test/CodeGen/Thumb2/bfi.ll
index 0405d98..3612e27 100644
--- a/test/CodeGen/Thumb2/bfi.ll
+++ b/test/CodeGen/Thumb2/bfi.ll
@@ -30,9 +30,8 @@ entry:
 define i32 @f3(i32 %A, i32 %B) nounwind readnone optsize {
 entry:
 ; CHECK: f3
-; CHECK: lsrs  r2, r0, #7
-; CHECK: mov r0, r1
-; CHECK: bfi r0, r2, #7, #16
+; CHECK: lsrs {{.*}}, #7
+; CHECK: bfi {{.*}}, #7, #16
   %and = and i32 %A, 8388480                      ; <i32> [#uses=1]
   %and2 = and i32 %B, -8388481                    ; <i32> [#uses=1]
   %or = or i32 %and2, %and                        ; <i32> [#uses=1]
@@ -42,8 +41,8 @@ entry:
 ; rdar://8752056
 define i32 @f4(i32 %a) nounwind {
 ; CHECK: f4
-; CHECK: movw r1, #3137
-; CHECK: bfi r1, r0, #15, #5
+; CHECK: movw [[R1:r[0-9]+]], #3137
+; CHECK: bfi [[R1]], {{.*}}, #15, #5
   %1 = shl i32 %a, 15
   %ins7 = and i32 %1, 1015808
   %ins12 = or i32 %ins7, 3137
@@ -53,7 +52,7 @@ define i32 @f4(i32 %a) nounwind {
 ; rdar://9177502
 define i32 @f5(i32 %a, i32 %b) nounwind readnone {
 entry:
-; CHECK f5
+; CHECK: f5
 ; CHECK-NOT: bfi r0, r2, #0, #1
 %and = and i32 %a, 2
 %b.masked = and i32 %b, -2
diff --git a/test/CodeGen/Thumb2/ldr-str-imm12.ll b/test/CodeGen/Thumb2/ldr-str-imm12.ll
index 08265b8..e1932bd 100644
--- a/test/CodeGen/Thumb2/ldr-str-imm12.ll
+++ b/test/CodeGen/Thumb2/ldr-str-imm12.ll
@@ -22,7 +22,7 @@
 
 define %union.rec* @Manifest(%union.rec* %x, %union.rec* %env, %struct.STYLE* %style, %union.rec** %bthr, %union.rec** %fthr, %union.rec** %target, %union.rec** %crs, i32 %ok, i32 %need_expand, %union.rec** %enclose, i32 %fcr) nounwind {
 entry:
-; CHECK:       ldr.w	{{(r[0-9])|(lr)}}, [r7, #28]
+; CHECK:       ldr.w	{{(r[0-9]+)|(lr)}}, [r7, #28]
   %xgaps.i = alloca [32 x %union.rec*], align 4   ; <[32 x %union.rec*]*> [#uses=0]
   %ycomp.i = alloca [32 x %union.rec*], align 4   ; <[32 x %union.rec*]*> [#uses=0]
   br label %bb20
diff --git a/test/CodeGen/Thumb2/machine-licm.ll b/test/CodeGen/Thumb2/machine-licm.ll
index 5e776dd..ee054a1 100644
--- a/test/CodeGen/Thumb2/machine-licm.ll
+++ b/test/CodeGen/Thumb2/machine-licm.ll
@@ -14,19 +14,19 @@ entry:
 
 bb.nph:                                           ; preds = %entry
 ; CHECK: BB#1
-; CHECK: movw r2, :lower16:L_GV$non_lazy_ptr
-; CHECK: movt r2, :upper16:L_GV$non_lazy_ptr
-; CHECK: ldr r2, [r2]
-; CHECK: ldr r3, [r2]
+; CHECK: movw r[[R2:[0-9]+]], :lower16:L_GV$non_lazy_ptr
+; CHECK: movt r[[R2]], :upper16:L_GV$non_lazy_ptr
+; CHECK: ldr{{(.w)?}} r[[R2b:[0-9]+]], [r[[R2]]
+; CHECK: ldr{{.*}}, [r[[R2b]]
 ; CHECK: LBB0_2
 ; CHECK-NOT: LCPI0_0:
 
 ; PIC: BB#1
-; PIC: movw r2, :lower16:(L_GV$non_lazy_ptr-(LPC0_0+4))
-; PIC: movt r2, :upper16:(L_GV$non_lazy_ptr-(LPC0_0+4))
-; PIC: add r2, pc
-; PIC: ldr r2, [r2]
-; PIC: ldr r3, [r2]
+; PIC: movw r[[R2:[0-9]+]], :lower16:(L_GV$non_lazy_ptr-(LPC0_0+4))
+; PIC: movt r[[R2]], :upper16:(L_GV$non_lazy_ptr-(LPC0_0+4))
+; PIC: add r[[R2]], pc
+; PIC: ldr{{(.w)?}} r[[R2b:[0-9]+]], [r[[R2]]
+; PIC: ldr{{.*}}, [r[[R2b]]
 ; PIC: LBB0_2
 ; PIC-NOT: LCPI0_0:
 ; PIC: .section
@@ -88,10 +88,10 @@ define zeroext i16 @t3(i8 zeroext %data, i16 zeroext %crc) nounwind readnone {
 bb.nph:
 ; CHECK: bb.nph
 ; CHECK: movw {{(r[0-9])|(lr)}}, #32768
-; CHECK: movs {{(r[0-9])|(lr)}}, #8
-; CHECK: movw [[REGISTER:(r[0-9])|(lr)]], #16386
-; CHECK: movw {{(r[0-9])|(lr)}}, #65534
-; CHECK: movt {{(r[0-9])|(lr)}}, #65535
+; CHECK: movs {{(r[0-9]+)|(lr)}}, #0
+; CHECK: movw [[REGISTER:(r[0-9]+)|(lr)]], #16386
+; CHECK: movw {{(r[0-9]+)|(lr)}}, #65534
+; CHECK: movt {{(r[0-9]+)|(lr)}}, #65535
   br label %bb
 
 bb:                                               ; preds = %bb, %bb.nph
diff --git a/test/CodeGen/Thumb2/thumb2-cbnz.ll b/test/CodeGen/Thumb2/thumb2-cbnz.ll
index 10a4985..0992fa8 100644
--- a/test/CodeGen/Thumb2/thumb2-cbnz.ll
+++ b/test/CodeGen/Thumb2/thumb2-cbnz.ll
@@ -3,26 +3,29 @@
 
 declare double @floor(double) nounwind readnone
 
-define void @t(i1 %a, double %b) {
+define void @t(i32 %c, double %b) {
 entry:
-  br i1 %a, label %bb3, label %bb1
+  %cmp1 = icmp ne i32 %c, 0
+  br i1 %cmp1, label %bb3, label %bb1
 
 bb1:                                              ; preds = %entry
   unreachable
 
 bb3:                                              ; preds = %entry
-  br i1 %a, label %bb7, label %bb5
+  %cmp2 = icmp ne i32 %c, 0
+  br i1 %cmp2, label %bb7, label %bb5
 
 bb5:                                              ; preds = %bb3
   unreachable
 
 bb7:                                              ; preds = %bb3
-  br i1 %a, label %bb11, label %bb9
+  %cmp3 = icmp ne i32 %c, 0
+  br i1 %cmp3, label %bb11, label %bb9
 
 bb9:                                              ; preds = %bb7
-; CHECK:      cmp r0, #0
-; CHECK:      cmp r0, #0
-; CHECK-NEXT: cbnz
+; CHECK:      cmp	r0, #0
+; CHECK:      cmp	r0, #0
+; CHECK-NEXT:      cbnz
   %0 = tail call  double @floor(double %b) nounwind readnone ; <double> [#uses=0]
   br label %bb11
 
diff --git a/test/CodeGen/Thumb2/thumb2-cmn.ll b/test/CodeGen/Thumb2/thumb2-cmn.ll
index eeaaa7f..df221b9 100644
--- a/test/CodeGen/Thumb2/thumb2-cmn.ll
+++ b/test/CodeGen/Thumb2/thumb2-cmn.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
+
+; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
+; test as 'mov.w r0, #0'. So far, that requires physreg joining.
 
 define i1 @f1(i32 %a, i32 %b) {
     %nb = sub i32 0, %b
diff --git a/test/CodeGen/Thumb2/thumb2-cmp.ll b/test/CodeGen/Thumb2/thumb2-cmp.ll
index 63249f4..da12114 100644
--- a/test/CodeGen/Thumb2/thumb2-cmp.ll
+++ b/test/CodeGen/Thumb2/thumb2-cmp.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
+
+; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
+; test as 'mov.w r0, #0'. So far, that requires physreg joining.
 
 ; 0x000000bb = 187
 define i1 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-cmp2.ll b/test/CodeGen/Thumb2/thumb2-cmp2.ll
index 55c321d..15052e0 100644
--- a/test/CodeGen/Thumb2/thumb2-cmp2.ll
+++ b/test/CodeGen/Thumb2/thumb2-cmp2.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
+
+; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
+; test as 'mov.w r0, #0'. So far, that requires physreg joining.
 
 define i1 @f1(i32 %a, i32 %b) {
 ; CHECK: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-lsr3.ll b/test/CodeGen/Thumb2/thumb2-lsr3.ll
index 5cfd3f5..e7ba782 100644
--- a/test/CodeGen/Thumb2/thumb2-lsr3.ll
+++ b/test/CodeGen/Thumb2/thumb2-lsr3.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2
+; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
 
 define i1 @test1(i64 %poscnt, i32 %work) {
 entry:
-; CHECK: rrx r0, r0
 ; CHECK: lsrs.w r1, r1, #1
+; CHECK: rrx r0, r0
 	%0 = lshr i64 %poscnt, 1
 	%1 = icmp eq i64 %0, 0
 	ret i1 %1
@@ -11,8 +11,8 @@ entry:
 
 define i1 @test2(i64 %poscnt, i32 %work) {
 entry:
-; CHECK: rrx r0, r0
 ; CHECK: asrs.w r1, r1, #1
+; CHECK: rrx r0, r0
 	%0 = ashr i64 %poscnt, 1
 	%1 = icmp eq i64 %0, 0
 	ret i1 %1
diff --git a/test/CodeGen/Thumb2/thumb2-ror.ll b/test/CodeGen/Thumb2/thumb2-ror.ll
index 0200116..590c333 100644
--- a/test/CodeGen/Thumb2/thumb2-ror.ll
+++ b/test/CodeGen/Thumb2/thumb2-ror.ll
@@ -1,11 +1,24 @@
 ; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
 
 
+; CHECK: f1:
+; CHECK: 	ror.w	r0, r0, #22
 define i32 @f1(i32 %a) {
     %l8 = shl i32 %a, 10
     %r8 = lshr i32 %a, 22
     %tmp = or i32 %l8, %r8
     ret i32 %tmp
 }
-; CHECK: f1:
-; CHECK: 	ror.w	r0, r0, #22
+
+; CHECK: f2:
+; CHECK-NOT: and
+; CHECK: ror
+define i32 @f2(i32 %v, i32 %nbits) {
+entry:
+  %and = and i32 %nbits, 31
+  %shr = lshr i32 %v, %and
+  %sub = sub i32 32, %and
+  %shl = shl i32 %v, %sub
+  %or = or i32 %shl, %shr
+  ret i32 %or
+}
+\ No newline at end of file
diff --git a/test/CodeGen/Thumb2/thumb2-ror2.ll b/test/CodeGen/Thumb2/thumb2-ror2.ll
deleted file mode 100644
index ffd1dd7..0000000
--- a/test/CodeGen/Thumb2/thumb2-ror2.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
-
-define i32 @f1(i32 %a, i32 %b) {
-; CHECK: f1:
-; CHECK: rors r0, r1
-    %db = sub i32 32, %b
-    %l8 = shl i32 %a, %b
-    %r8 = lshr i32 %a, %db
-    %tmp = or i32 %l8, %r8
-    ret i32 %tmp
-}
diff --git a/test/CodeGen/Thumb2/thumb2-sbc.ll b/test/CodeGen/Thumb2/thumb2-sbc.ll
index de6502d..53f45ea 100644
--- a/test/CodeGen/Thumb2/thumb2-sbc.ll
+++ b/test/CodeGen/Thumb2/thumb2-sbc.ll
@@ -39,7 +39,7 @@ define i64 @f5(i64 %a) {
 ; CHECK: f5
 ; CHECK: subs  r0, #2
 ; CHECK: adc r1, r1, #-1448498775
-    %tmp = sub i64 %a, 6221254862626095106 
+    %tmp = sub i64 %a, 6221254862626095106
     ret i64 %tmp
 }
 
@@ -48,7 +48,22 @@ define i64 @f6(i64 %a) {
 ; CHECK: f6
 ; CHECK: subs  r0, #2
 ; CHECK: sbc r1, r1, #66846720
-    %tmp = sub i64 %a, 287104476244869122 
+    %tmp = sub i64 %a, 287104476244869122
     ret i64 %tmp
 }
 
+; Example from numerics code that manually computes wider-than-64 values.
+;
+; CHECK: livecarry:
+; CHECK: adds
+; CHECK: adcs
+; CHECK: adc
+define i64 @livecarry(i64 %carry, i32 %digit) nounwind {
+  %ch = lshr i64 %carry, 32
+  %cl = and i64 %carry, 4294967295
+  %truncdigit = zext i32 %digit to i64
+  %prod = add i64 %cl, %truncdigit
+  %ph = lshr i64 %prod, 32
+  %carryresult = add i64 %ch, %ph
+  ret i64 %carryresult
+}
diff --git a/test/CodeGen/Thumb2/thumb2-sub3.ll b/test/CodeGen/Thumb2/thumb2-sub3.ll
index 855ad06..1dbda57 100644
--- a/test/CodeGen/Thumb2/thumb2-sub3.ll
+++ b/test/CodeGen/Thumb2/thumb2-sub3.ll
@@ -4,7 +4,7 @@
 define i64 @f1(i64 %a) {
 ; CHECK: f1
 ; CHECK: subs  r0, #171
-; CHECK: adc r1, r1, #-1
+; CHECK: sbc r1, r1, #0
     %tmp = sub i64 %a, 171
     ret i64 %tmp
 }
@@ -13,7 +13,7 @@ define i64 @f1(i64 %a) {
 define i64 @f2(i64 %a) {
 ; CHECK: f2
 ; CHECK: subs.w  r0, r0, #1179666
-; CHECK: adc r1, r1, #-1
+; CHECK: sbc r1, r1, #0
     %tmp = sub i64 %a, 1179666
     ret i64 %tmp
 }
@@ -22,7 +22,7 @@ define i64 @f2(i64 %a) {
 define i64 @f3(i64 %a) {
 ; CHECK: f3
 ; CHECK: subs.w  r0, r0, #872428544
-; CHECK: adc r1, r1, #-1
+; CHECK: sbc r1, r1, #0
     %tmp = sub i64 %a, 872428544
     ret i64 %tmp
 }
@@ -31,7 +31,7 @@ define i64 @f3(i64 %a) {
 define i64 @f4(i64 %a) {
 ; CHECK: f4
 ; CHECK: subs.w  r0, r0, #1448498774
-; CHECK: adc r1, r1, #-1
+; CHECK: sbc r1, r1, #0
     %tmp = sub i64 %a, 1448498774
     ret i64 %tmp
 }
@@ -40,7 +40,7 @@ define i64 @f4(i64 %a) {
 define i64 @f5(i64 %a) {
 ; CHECK: f5
 ; CHECK: subs.w  r0, r0, #66846720
-; CHECK: adc r1, r1, #-1
+; CHECK: sbc r1, r1, #0
     %tmp = sub i64 %a, 66846720
     ret i64 %tmp
 }
diff --git a/test/CodeGen/Thumb2/thumb2-sub5.ll b/test/CodeGen/Thumb2/thumb2-sub5.ll
index c3b56bc..6edd789 100644
--- a/test/CodeGen/Thumb2/thumb2-sub5.ll
+++ b/test/CodeGen/Thumb2/thumb2-sub5.ll
@@ -1,9 +1,10 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -mattr=+32bit | FileCheck %s
 
 define i64 @f1(i64 %a, i64 %b) {
 ; CHECK: f1:
-; CHECK: subs r0, r0, r2
-; CHECK: sbcs r1, r3
+; CHECK: subs.w r0, r0, r2
+; To test dead_carry, +32bit prevents sbc conveting to 16-bit sbcs
+; CHECK: sbc.w  r1, r1, r3
     %tmp = sub i64 %a, %b
     ret i64 %tmp
 }
diff --git a/test/CodeGen/Thumb2/thumb2-sxt_rot.ll b/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
index 4b685a8..f3d0edf 100644
--- a/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
+++ b/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
@@ -7,7 +7,7 @@ define i32 @test0(i8 %A) {
 	ret i32 %B
 }
 
-define i8 @test1(i32 %A) signext {
+define signext i8 @test1(i32 %A)  {
 ; CHECK: test1
 ; CHECK: sxtb.w r0, r0, ror #8
 	%B = lshr i32 %A, 8
@@ -17,7 +17,7 @@ define i8 @test1(i32 %A) signext {
 	ret i8 %E
 }
 
-define i32 @test2(i32 %A, i32 %X) signext {
+define signext i32 @test2(i32 %A, i32 %X)  {
 ; CHECK: test2
 ; CHECK: lsrs r0, r0, #8
 ; CHECK: sxtab  r0, r1, r0
diff --git a/test/CodeGen/Thumb2/thumb2-teq.ll b/test/CodeGen/Thumb2/thumb2-teq.ll
index 69f0383..566408a 100644
--- a/test/CodeGen/Thumb2/thumb2-teq.ll
+++ b/test/CodeGen/Thumb2/thumb2-teq.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
 
+; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
+; test as 'mov.w r0, #0'. So far, that requires physreg joining.
 
 ; 0x000000bb = 187
 define i1 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-teq2.ll b/test/CodeGen/Thumb2/thumb2-teq2.ll
index 0f122f2..cdd3489 100644
--- a/test/CodeGen/Thumb2/thumb2-teq2.ll
+++ b/test/CodeGen/Thumb2/thumb2-teq2.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
+
+; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
+; tst as 'mov.w r0, #0'. So far, that requires physreg joining.
 
 define i1 @f1(i32 %a, i32 %b) {
 ; CHECK: f1
diff --git a/test/CodeGen/Thumb2/thumb2-tst.ll b/test/CodeGen/Thumb2/thumb2-tst.ll
index d905217..47f553f 100644
--- a/test/CodeGen/Thumb2/thumb2-tst.ll
+++ b/test/CodeGen/Thumb2/thumb2-tst.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
 
+; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
+; tst as 'mov.w r0, #0'. So far, that requires physreg joining.
 
 ; 0x000000bb = 187
 define i1 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-tst2.ll b/test/CodeGen/Thumb2/thumb2-tst2.ll
index db202dd..405b3bb 100644
--- a/test/CodeGen/Thumb2/thumb2-tst2.ll
+++ b/test/CodeGen/Thumb2/thumb2-tst2.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
+
+; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
+; tst as 'mov.w r0, #0'. So far, that requires physreg joining.
 
 define i1 @f1(i32 %a, i32 %b) {
 ; CHECK: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-uxt_rot.ll b/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
index b8e4381..03189aa 100644
--- a/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
+++ b/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
@@ -1,13 +1,13 @@
 ; RUN: llc < %s -march=thumb -mattr=+thumb2,+t2xtpk | FileCheck %s
 
-define i8 @test1(i32 %A.u) zeroext {
+define zeroext i8 @test1(i32 %A.u)  {
 ; CHECK: test1
 ; CHECK: uxtb r0, r0
     %B.u = trunc i32 %A.u to i8
     ret i8 %B.u
 }
 
-define i32 @test2(i32 %A.u, i32 %B.u) zeroext {
+define zeroext i32 @test2(i32 %A.u, i32 %B.u)  {
 ; CHECK: test2
 ; CHECK: uxtab  r0, r0, r1
     %C.u = trunc i32 %B.u to i8
@@ -16,7 +16,7 @@ define i32 @test2(i32 %A.u, i32 %B.u) zeroext {
     ret i32 %E.u
 }
 
-define i32 @test3(i32 %A.u) zeroext {
+define zeroext i32 @test3(i32 %A.u)  {
 ; CHECK: test3
 ; CHECK: uxth.w r0, r0, ror #8
     %B.u = lshr i32 %A.u, 8
diff --git a/test/CodeGen/Thumb2/thumb2-uxtb.ll b/test/CodeGen/Thumb2/thumb2-uxtb.ll
index 2074f98..35914b1 100644
--- a/test/CodeGen/Thumb2/thumb2-uxtb.ll
+++ b/test/CodeGen/Thumb2/thumb2-uxtb.ll
@@ -128,9 +128,9 @@ define i32 @test10(i32 %p0) {
 
 ; ARMv7M: test10
 ; ARMv7M: mov.w r1, #16253176
+; ARMv7M: mov.w r2, #458759
 ; ARMv7M: and.w r0, r1, r0, lsr #7
-; ARMv7M: mov.w r1, #458759
-; ARMv7M: and.w r1, r1, r0, lsr #5
+; ARMv7M: and.w r1, r2, r0, lsr #5
 ; ARMv7M: orrs r0, r1
 	%tmp1 = lshr i32 %p0, 7		; <i32> [#uses=1]
 	%tmp2 = and i32 %tmp1, 16253176		; <i32> [#uses=2]
diff --git a/test/CodeGen/X86/2006-05-22-FPSetEQ.ll b/test/CodeGen/X86/2006-05-22-FPSetEQ.ll
index 35b0159..6c5a4fb 100644
--- a/test/CodeGen/X86/2006-05-22-FPSetEQ.ll
+++ b/test/CodeGen/X86/2006-05-22-FPSetEQ.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 | grep setnp
-; RUN: llc < %s -march=x86 -enable-unsafe-fp-math -enable-no-nans-fp-math | \
+; RUN: llc < %s -march=x86 -mattr=-sse | grep setnp
+; RUN: llc < %s -march=x86 -mattr=-sse -enable-unsafe-fp-math -enable-no-nans-fp-math | \
 ; RUN:   not grep setnp
 
 define i32 @test(float %f) {
diff --git a/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll b/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll
index a662dd5..11c0bf9 100644
--- a/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll
+++ b/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll
@@ -8,8 +8,8 @@ entry:
 
 bb26:		; preds = %bb26, %entry
 
-; CHECK:  addl  %e
-; CHECK:  adcl  %e
+; CHECK:  addl
+; CHECK:  adcl
 
 	%i.037.0 = phi i32 [ 0, %entry ], [ %tmp25, %bb26 ]		; <i32> [#uses=3]
 	%sum.035.0 = phi <1 x i64> [ zeroinitializer, %entry ], [ %tmp22, %bb26 ]		; <<1 x i64>> [#uses=1]
diff --git a/test/CodeGen/X86/2007-05-05-Personality.ll b/test/CodeGen/X86/2007-05-05-Personality.ll
index a9b17d3..0f49d2e 100644
--- a/test/CodeGen/X86/2007-05-05-Personality.ll
+++ b/test/CodeGen/X86/2007-05-05-Personality.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -mtriple=i686-pc-linux-gnu -o - | grep zPL
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu -o - | FileCheck %s
+
+; CHECK: .cfi_personality 0, __gnat_eh_personality
+; CHECK: .cfi_lsda 0, .Lexception0
 
 @error = external global i8		; <i8*> [#uses=2]
 
diff --git a/test/CodeGen/X86/2007-05-07-InvokeSRet.ll b/test/CodeGen/X86/2007-05-07-InvokeSRet.ll
index ae49bd0..22e2750 100644
--- a/test/CodeGen/X86/2007-05-07-InvokeSRet.ll
+++ b/test/CodeGen/X86/2007-05-07-InvokeSRet.ll
@@ -7,7 +7,7 @@ declare void @invokee(%struct.S* sret )
 
 define void @invoker(%struct.S* %name.0.0) {
 entry:
-	invoke void @invokee( %struct.S* %name.0.0 sret  )
+	invoke void @invokee( %struct.S* sret %name.0.0   )
 			to label %return unwind label %return
 
 return:		; preds = %entry, %entry
diff --git a/test/CodeGen/X86/2007-05-14-LiveIntervalAssert.ll b/test/CodeGen/X86/2007-05-14-LiveIntervalAssert.ll
index 8ef2538..ecc5835 100644
--- a/test/CodeGen/X86/2007-05-14-LiveIntervalAssert.ll
+++ b/test/CodeGen/X86/2007-05-14-LiveIntervalAssert.ll
@@ -3,7 +3,7 @@
 	%struct.XDesc = type <{ i32, %struct.OpaqueXDataStorageType** }>
 	%struct.OpaqueXDataStorageType = type opaque
 
-declare i16 @GetParamDesc(%struct.XDesc*, i32, i32, %struct.XDesc*) signext 
+declare signext i16 @GetParamDesc(%struct.XDesc*, i32, i32, %struct.XDesc*)  
 
 declare void @r_raise(i64, i8*, ...)
 
@@ -18,7 +18,7 @@ cond_true109:		; preds = %entry
 	br i1 false, label %cond_next164, label %cond_true239
 
 cond_next164:		; preds = %cond_true109
-	%tmp176 = call i16 @GetParamDesc( %struct.XDesc* null, i32 1701999219, i32 1413830740, %struct.XDesc* null ) signext 		; <i16> [#uses=0]
+	%tmp176 = call signext i16 @GetParamDesc( %struct.XDesc* null, i32 1701999219, i32 1413830740, %struct.XDesc* null ) 
 	call void (i64, i8*, ...)* @r_raise( i64 0, i8* null )
 	unreachable
 
diff --git a/test/CodeGen/X86/2007-06-04-tailmerge4.ll b/test/CodeGen/X86/2007-06-04-tailmerge4.ll
deleted file mode 100644
index d5ec089..0000000
--- a/test/CodeGen/X86/2007-06-04-tailmerge4.ll
+++ /dev/null
@@ -1,454 +0,0 @@
-; RUN: llc < %s -asm-verbose | grep invcont131
-; PR 1496:  tail merge was incorrectly removing this block
-
-; ModuleID = 'report.1.bc'
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
-target triple = "i686-pc-linux-gnu"
-  %struct.ALLOC = type { %struct.string___XUB, [2 x i8] }
-  %struct.RETURN = type { i32, i32, i32, i64 }
-  %struct.ada__streams__root_stream_type = type { %struct.ada__tags__dispatch_table* }
-  %struct.ada__tags__dispatch_table = type { [1 x i8*] }
-  %struct.ada__text_io__text_afcb = type { %struct.system__file_control_block__afcb, i32, i32, i32, i32, i32, %struct.ada__text_io__text_afcb*, i8, i8 }
-  %struct.string___XUB = type { i32, i32 }
-  %struct.string___XUP = type { i8*, %struct.string___XUB* }
-  %struct.system__file_control_block__afcb = type { %struct.ada__streams__root_stream_type, i32, %struct.string___XUP, i32, %struct.string___XUP, i8, i8, i8, i8, i8, i8, i8, %struct.system__file_control_block__afcb*, %struct.system__file_control_block__afcb* }
-  %struct.system__secondary_stack__mark_id = type { i8*, i32 }
-  %struct.wide_string___XUP = type { i16*, %struct.string___XUB* }
-@report_E = global i8 0   ; <i8*> [#uses=0]
-@report__test_status = internal global i8 1   ; <i8*> [#uses=8]
-@report__test_name = internal global [15 x i8] zeroinitializer    ; <[15 x i8]*> [#uses=10]
-@report__test_name_len = internal global i32 0    ; <i32*> [#uses=15]
-@.str = internal constant [12 x i8] c"report.adb\00\00"   ; <[12 x i8]*> [#uses=1]
-@C.26.599 = internal constant %struct.string___XUB { i32 1, i32 1 }   ; <%struct.string___XUB*> [#uses=1]
-@.str1 = internal constant [1 x i8] c":"    ; <[1 x i8]*> [#uses=1]
-@.str2 = internal constant [1 x i8] c" "    ; <[1 x i8]*> [#uses=1]
-@.str3 = internal constant [1 x i8] c"-"    ; <[1 x i8]*> [#uses=1]
-@.str5 = internal constant [10 x i8] c"0123456789"    ; <[10 x i8]*> [#uses=12]
-@C.59.855 = internal constant %struct.string___XUB { i32 1, i32 0 }   ; <%struct.string___XUB*> [#uses=1]
-@C.69.876 = internal constant %struct.string___XUB { i32 1, i32 3 }   ; <%struct.string___XUB*> [#uses=1]
-@C.70.879 = internal constant %struct.string___XUB { i32 1, i32 6 }   ; <%struct.string___XUB*> [#uses=1]
-@C.81.900 = internal constant %struct.string___XUB { i32 1, i32 5 }   ; <%struct.string___XUB*> [#uses=1]
-@.str6 = internal constant [0 x i8] zeroinitializer   ; <[0 x i8]*> [#uses=1]
-@.str7 = internal constant [3 x i8] c"2.5"    ; <[3 x i8]*> [#uses=1]
-@.str8 = internal constant [6 x i8] c"ACATS "   ; <[6 x i8]*> [#uses=1]
-@.str9 = internal constant [5 x i8] c",.,. "    ; <[5 x i8]*> [#uses=1]
-@.str10 = internal constant [1 x i8] c"."   ; <[1 x i8]*> [#uses=1]
-@.str11 = internal constant [5 x i8] c"---- "   ; <[5 x i8]*> [#uses=1]
-@.str12 = internal constant [5 x i8] c"   - "   ; <[5 x i8]*> [#uses=1]
-@.str13 = internal constant [5 x i8] c"   * "   ; <[5 x i8]*> [#uses=1]
-@.str14 = internal constant [5 x i8] c"   + "   ; <[5 x i8]*> [#uses=1]
-@.str15 = internal constant [5 x i8] c"   ! "   ; <[5 x i8]*> [#uses=1]
-@C.209.1380 = internal constant %struct.string___XUB { i32 1, i32 37 }    ; <%struct.string___XUB*> [#uses=1]
-@.str16 = internal constant [37 x i8] c" PASSED ============================."    ; <[37 x i8]*> [#uses=1]
-@.str17 = internal constant [5 x i8] c"==== "   ; <[5 x i8]*> [#uses=1]
-@.str18 = internal constant [37 x i8] c" NOT-APPLICABLE ++++++++++++++++++++."    ; <[37 x i8]*> [#uses=1]
-@.str19 = internal constant [5 x i8] c"++++ "   ; <[5 x i8]*> [#uses=1]
-@.str20 = internal constant [37 x i8] c" TENTATIVELY PASSED !!!!!!!!!!!!!!!!."    ; <[37 x i8]*> [#uses=1]
-@.str21 = internal constant [5 x i8] c"!!!! "   ; <[5 x i8]*> [#uses=1]
-@.str22 = internal constant [37 x i8] c" SEE '!' COMMENTS FOR SPECIAL NOTES!!"    ; <[37 x i8]*> [#uses=1]
-@.str23 = internal constant [37 x i8] c" FAILED ****************************."    ; <[37 x i8]*> [#uses=1]
-@.str24 = internal constant [5 x i8] c"**** "   ; <[5 x i8]*> [#uses=1]
-@__gnat_others_value = external constant i32    ; <i32*> [#uses=2]
-@system__soft_links__abort_undefer = external global void ()*   ; <void ()**> [#uses=1]
-@C.320.1854 = internal constant %struct.string___XUB { i32 2, i32 6 }   ; <%struct.string___XUB*> [#uses=1]
-
-declare void @report__put_msg(i64 %msg.0.0)
-
-declare void @__gnat_rcheck_05(i8*, i32)
-
-declare void @__gnat_rcheck_12(i8*, i32)
-
-declare %struct.ada__text_io__text_afcb* @ada__text_io__standard_output()
-
-declare void @ada__text_io__set_col(%struct.ada__text_io__text_afcb*, i32)
-
-declare void @ada__text_io__put_line(%struct.ada__text_io__text_afcb*, i64)
-
-declare void @report__time_stamp(%struct.string___XUP* sret  %agg.result)
-
-declare i64 @ada__calendar__clock()
-
-declare void @ada__calendar__split(%struct.RETURN* sret , i64)
-
-declare void @system__string_ops_concat_5__str_concat_5(%struct.string___XUP* sret , i64, i64, i64, i64, i64)
-
-declare void @system__string_ops_concat_3__str_concat_3(%struct.string___XUP* sret , i64, i64, i64)
-
-declare i8* @system__secondary_stack__ss_allocate(i32)
-
-declare void @report__test(i64 %name.0.0, i64 %descr.0.0)
-
-declare void @system__secondary_stack__ss_mark(%struct.system__secondary_stack__mark_id* sret )
-
-declare i8* @llvm.eh.exception()
-
-declare i32 @llvm.eh.selector(i8*, i8*, ...)
-
-declare i32 @llvm.eh.typeid.for(i8*)
-
-declare i32 @__gnat_eh_personality(...)
-
-declare i32 @_Unwind_Resume(...)
-
-declare void @__gnat_rcheck_07(i8*, i32)
-
-declare void @system__secondary_stack__ss_release(i64)
-
-declare void @report__comment(i64 %descr.0.0)
-
-declare void @report__failed(i64 %descr.0.0)
-
-declare void @report__not_applicable(i64 %descr.0.0)
-
-declare void @report__special_action(i64 %descr.0.0)
-
-define void @report__result() {
-entry:
-  %tmp = alloca %struct.system__secondary_stack__mark_id, align 8   ; <%struct.system__secondary_stack__mark_id*> [#uses=3]
-  %A.210 = alloca %struct.string___XUB, align 8   ; <%struct.string___XUB*> [#uses=3]
-  %tmp5 = alloca %struct.string___XUP, align 8    ; <%struct.string___XUP*> [#uses=3]
-  %A.229 = alloca %struct.string___XUB, align 8   ; <%struct.string___XUB*> [#uses=3]
-  %tmp10 = alloca %struct.string___XUP, align 8   ; <%struct.string___XUP*> [#uses=3]
-  %A.248 = alloca %struct.string___XUB, align 8   ; <%struct.string___XUB*> [#uses=3]
-  %tmp15 = alloca %struct.string___XUP, align 8   ; <%struct.string___XUP*> [#uses=3]
-  %A.270 = alloca %struct.string___XUB, align 8   ; <%struct.string___XUB*> [#uses=3]
-  %tmp20 = alloca %struct.string___XUP, align 8   ; <%struct.string___XUP*> [#uses=3]
-  %A.284 = alloca %struct.string___XUB, align 8   ; <%struct.string___XUB*> [#uses=3]
-  %tmp25 = alloca %struct.string___XUP, align 8   ; <%struct.string___XUP*> [#uses=3]
-  call void @system__secondary_stack__ss_mark( %struct.system__secondary_stack__mark_id* %tmp sret  )
-  %tmp28 = getelementptr %struct.system__secondary_stack__mark_id* %tmp, i32 0, i32 0   ; <i8**> [#uses=1]
-  %tmp29 = load i8** %tmp28   ; <i8*> [#uses=2]
-  %tmp31 = getelementptr %struct.system__secondary_stack__mark_id* %tmp, i32 0, i32 1   ; <i32*> [#uses=1]
-  %tmp32 = load i32* %tmp31   ; <i32> [#uses=2]
-  %tmp33 = load i8* @report__test_status    ; <i8> [#uses=1]
-  switch i8 %tmp33, label %bb483 [
-     i8 0, label %bb
-     i8 2, label %bb143
-     i8 3, label %bb261
-  ]
-
-bb:   ; preds = %entry
-  %tmp34 = load i32* @report__test_name_len   ; <i32> [#uses=4]
-  %tmp35 = icmp sgt i32 %tmp34, 0   ; <i1> [#uses=2]
-  %tmp40 = icmp sgt i32 %tmp34, 15    ; <i1> [#uses=1]
-  %bothcond139 = and i1 %tmp35, %tmp40    ; <i1> [#uses=1]
-  br i1 %bothcond139, label %cond_true43, label %cond_next44
-
-cond_true43:    ; preds = %bb
-  invoke void @__gnat_rcheck_12( i8* getelementptr ([12 x i8]* @.str, i32 0, i32 0), i32 212 )
-      to label %UnifiedUnreachableBlock unwind label %unwind
-
-unwind:   ; preds = %invcont589, %cond_next567, %bb555, %cond_true497, %invcont249, %cond_next227, %bb215, %cond_true157, %invcont131, %cond_next109, %bb97, %cond_true43
-  %eh_ptr = call i8* @llvm.eh.exception( )    ; <i8*> [#uses=1]
-  br label %cleanup717
-
-cond_next44:    ; preds = %bb
-  %tmp72 = getelementptr %struct.string___XUB* %A.210, i32 0, i32 0   ; <i32*> [#uses=1]
-  store i32 1, i32* %tmp72
-  %tmp73 = getelementptr %struct.string___XUB* %A.210, i32 0, i32 1   ; <i32*> [#uses=1]
-  store i32 %tmp34, i32* %tmp73
-  br i1 %tmp35, label %cond_true80, label %cond_next109
-
-cond_true80:    ; preds = %cond_next44
-  %tmp45.off = add i32 %tmp34, -1   ; <i32> [#uses=1]
-  %bothcond = icmp ugt i32 %tmp45.off, 14   ; <i1> [#uses=1]
-  br i1 %bothcond, label %bb97, label %cond_next109
-
-bb97:   ; preds = %cond_true80
-  invoke void @__gnat_rcheck_05( i8* getelementptr ([12 x i8]* @.str, i32 0, i32 0), i32 212 )
-      to label %UnifiedUnreachableBlock unwind label %unwind
-
-cond_next109:   ; preds = %cond_true80, %cond_next44
-  %A.210128 = ptrtoint %struct.string___XUB* %A.210 to i32    ; <i32> [#uses=1]
-  %A.210128129 = zext i32 %A.210128 to i64    ; <i64> [#uses=1]
-  %A.210128129130 = shl i64 %A.210128129, 32    ; <i64> [#uses=1]
-  %A.210128129130.ins = or i64 %A.210128129130, zext (i32 ptrtoint ([15 x i8]* @report__test_name to i32) to i64)   ; <i64> [#uses=1]
-  invoke void @system__string_ops_concat_3__str_concat_3( %struct.string___XUP* %tmp5 sret , i64 or (i64 zext (i32 ptrtoint ([5 x i8]* @.str17 to i32) to i64), i64 shl (i64 zext (i32 ptrtoint (%struct.string___XUB* @C.81.900 to i32) to i64), i64 32)), i64 %A.210128129130.ins, i64 or (i64 zext (i32 ptrtoint ([37 x i8]* @.str16 to i32) to i64), i64 shl (i64 zext (i32 ptrtoint (%struct.string___XUB* @C.209.1380 to i32) to i64), i64 32)) )
-      to label %invcont131 unwind label %unwind
-
-invcont131:   ; preds = %cond_next109
-  %tmp133 = getelementptr %struct.string___XUP* %tmp5, i32 0, i32 0   ; <i8**> [#uses=1]
-  %tmp134 = load i8** %tmp133   ; <i8*> [#uses=1]
-  %tmp134120 = ptrtoint i8* %tmp134 to i32    ; <i32> [#uses=1]
-  %tmp134120121 = zext i32 %tmp134120 to i64    ; <i64> [#uses=1]
-  %tmp136 = getelementptr %struct.string___XUP* %tmp5, i32 0, i32 1   ; <%struct.string___XUB**> [#uses=1]
-  %tmp137 = load %struct.string___XUB** %tmp136   ; <%struct.string___XUB*> [#uses=1]
-  %tmp137116 = ptrtoint %struct.string___XUB* %tmp137 to i32    ; <i32> [#uses=1]
-  %tmp137116117 = zext i32 %tmp137116 to i64    ; <i64> [#uses=1]
-  %tmp137116117118 = shl i64 %tmp137116117, 32    ; <i64> [#uses=1]
-  %tmp137116117118.ins = or i64 %tmp137116117118, %tmp134120121   ; <i64> [#uses=1]
-  invoke fastcc void @report__put_msg( i64 %tmp137116117118.ins )
-      to label %cond_next618 unwind label %unwind
-
-bb143:    ; preds = %entry
-  %tmp144 = load i32* @report__test_name_len    ; <i32> [#uses=4]
-  %tmp147 = icmp sgt i32 %tmp144, 0   ; <i1> [#uses=2]
-  %tmp154 = icmp sgt i32 %tmp144, 15    ; <i1> [#uses=1]
-  %bothcond140 = and i1 %tmp147, %tmp154    ; <i1> [#uses=1]
-  br i1 %bothcond140, label %cond_true157, label %cond_next160
-
-cond_true157:   ; preds = %bb143
-  invoke void @__gnat_rcheck_12( i8* getelementptr ([12 x i8]* @.str, i32 0, i32 0), i32 215 )
-      to label %UnifiedUnreachableBlock unwind label %unwind
-
-cond_next160:   ; preds = %bb143
-  %tmp189 = getelementptr %struct.string___XUB* %A.229, i32 0, i32 0    ; <i32*> [#uses=1]
-  store i32 1, i32* %tmp189
-  %tmp190 = getelementptr %struct.string___XUB* %A.229, i32 0, i32 1    ; <i32*> [#uses=1]
-  store i32 %tmp144, i32* %tmp190
-  br i1 %tmp147, label %cond_true197, label %cond_next227
-
-cond_true197:   ; preds = %cond_next160
-  %tmp161.off = add i32 %tmp144, -1   ; <i32> [#uses=1]
-  %bothcond1 = icmp ugt i32 %tmp161.off, 14   ; <i1> [#uses=1]
-  br i1 %bothcond1, label %bb215, label %cond_next227
-
-bb215:    ; preds = %cond_true197
-  invoke void @__gnat_rcheck_05( i8* getelementptr ([12 x i8]* @.str, i32 0, i32 0), i32 215 )
-      to label %UnifiedUnreachableBlock unwind label %unwind
-
-cond_next227:   ; preds = %cond_true197, %cond_next160
-  %A.229105 = ptrtoint %struct.string___XUB* %A.229 to i32    ; <i32> [#uses=1]
-  %A.229105106 = zext i32 %A.229105 to i64    ; <i64> [#uses=1]
-  %A.229105106107 = shl i64 %A.229105106, 32    ; <i64> [#uses=1]
-  %A.229105106107.ins = or i64 %A.229105106107, zext (i32 ptrtoint ([15 x i8]* @report__test_name to i32) to i64)   ; <i64> [#uses=1]
-  invoke void @system__string_ops_concat_3__str_concat_3( %struct.string___XUP* %tmp10 sret , i64 or (i64 zext (i32 ptrtoint ([5 x i8]* @.str19 to i32) to i64), i64 shl (i64 zext (i32 ptrtoint (%struct.string___XUB* @C.81.900 to i32) to i64), i64 32)), i64 %A.229105106107.ins, i64 or (i64 zext (i32 ptrtoint ([37 x i8]* @.str18 to i32) to i64), i64 shl (i64 zext (i32 ptrtoint (%struct.string___XUB* @C.209.1380 to i32) to i64), i64 32)) )
-      to label %invcont249 unwind label %unwind
-
-invcont249:   ; preds = %cond_next227
-  %tmp251 = getelementptr %struct.string___XUP* %tmp10, i32 0, i32 0    ; <i8**> [#uses=1]
-  %tmp252 = load i8** %tmp251   ; <i8*> [#uses=1]
-  %tmp25297 = ptrtoint i8* %tmp252 to i32   ; <i32> [#uses=1]
-  %tmp2529798 = zext i32 %tmp25297 to i64   ; <i64> [#uses=1]
-  %tmp254 = getelementptr %struct.string___XUP* %tmp10, i32 0, i32 1    ; <%struct.string___XUB**> [#uses=1]
-  %tmp255 = load %struct.string___XUB** %tmp254   ; <%struct.string___XUB*> [#uses=1]
-  %tmp25593 = ptrtoint %struct.string___XUB* %tmp255 to i32   ; <i32> [#uses=1]
-  %tmp2559394 = zext i32 %tmp25593 to i64   ; <i64> [#uses=1]
-  %tmp255939495 = shl i64 %tmp2559394, 32   ; <i64> [#uses=1]
-  %tmp255939495.ins = or i64 %tmp255939495, %tmp2529798   ; <i64> [#uses=1]
-  invoke fastcc void @report__put_msg( i64 %tmp255939495.ins )
-      to label %cond_next618 unwind label %unwind
-
-bb261:    ; preds = %entry
-  %tmp262 = call i8* @llvm.stacksave( )   ; <i8*> [#uses=2]
-  %tmp263 = load i32* @report__test_name_len    ; <i32> [#uses=4]
-  %tmp266 = icmp sgt i32 %tmp263, 0   ; <i1> [#uses=2]
-  %tmp273 = icmp sgt i32 %tmp263, 15    ; <i1> [#uses=1]
-  %bothcond141 = and i1 %tmp266, %tmp273    ; <i1> [#uses=1]
-  br i1 %bothcond141, label %cond_true276, label %cond_next281
-
-cond_true276:   ; preds = %bb261
-  invoke void @__gnat_rcheck_12( i8* getelementptr ([12 x i8]* @.str, i32 0, i32 0), i32 218 )
-      to label %UnifiedUnreachableBlock unwind label %unwind277
-
-unwind277:    ; preds = %invcont467, %cond_next442, %invcont370, %cond_next348, %bb336, %cond_true276
-  %eh_ptr278 = call i8* @llvm.eh.exception( )   ; <i8*> [#uses=1]
-  call void @llvm.stackrestore( i8* %tmp262 )
-  br label %cleanup717
-
-cond_next281:   ; preds = %bb261
-  %tmp310 = getelementptr %struct.string___XUB* %A.248, i32 0, i32 0    ; <i32*> [#uses=1]
-  store i32 1, i32* %tmp310
-  %tmp311 = getelementptr %struct.string___XUB* %A.248, i32 0, i32 1    ; <i32*> [#uses=1]
-  store i32 %tmp263, i32* %tmp311
-  br i1 %tmp266, label %cond_true318, label %cond_next348
-
-cond_true318:   ; preds = %cond_next281
-  %tmp282.off = add i32 %tmp263, -1   ; <i32> [#uses=1]
-  %bothcond2 = icmp ugt i32 %tmp282.off, 14   ; <i1> [#uses=1]
-  br i1 %bothcond2, label %bb336, label %cond_next348
-
-bb336:    ; preds = %cond_true318
-  invoke void @__gnat_rcheck_05( i8* getelementptr ([12 x i8]* @.str, i32 0, i32 0), i32 218 )
-      to label %UnifiedUnreachableBlock unwind label %unwind277
-
-cond_next348:   ; preds = %cond_true318, %cond_next281
-  %A.24882 = ptrtoint %struct.string___XUB* %A.248 to i32   ; <i32> [#uses=1]
-  %A.2488283 = zext i32 %A.24882 to i64   ; <i64> [#uses=1]
-  %A.248828384 = shl i64 %A.2488283, 32   ; <i64> [#uses=1]
-  %A.248828384.ins = or i64 %A.248828384, zext (i32 ptrtoint ([15 x i8]* @report__test_name to i32) to i64)   ; <i64> [#uses=1]
-  invoke void @system__string_ops_concat_3__str_concat_3( %struct.string___XUP* %tmp15 sret , i64 or (i64 zext (i32 ptrtoint ([5 x i8]* @.str21 to i32) to i64), i64 shl (i64 zext (i32 ptrtoint (%struct.string___XUB* @C.81.900 to i32) to i64), i64 32)), i64 %A.248828384.ins, i64 or (i64 zext (i32 ptrtoint ([37 x i8]* @.str20 to i32) to i64), i64 shl (i64 zext (i32 ptrtoint (%struct.string___XUB* @C.209.1380 to i32) to i64), i64 32)) )
-      to label %invcont370 unwind label %unwind277
-
-invcont370:   ; preds = %cond_next348
-  %tmp372 = getelementptr %struct.string___XUP* %tmp15, i32 0, i32 0    ; <i8**> [#uses=1]
-  %tmp373 = load i8** %tmp372   ; <i8*> [#uses=1]
-  %tmp37374 = ptrtoint i8* %tmp373 to i32   ; <i32> [#uses=1]
-  %tmp3737475 = zext i32 %tmp37374 to i64   ; <i64> [#uses=1]
-  %tmp375 = getelementptr %struct.string___XUP* %tmp15, i32 0, i32 1    ; <%struct.string___XUB**> [#uses=1]
-  %tmp376 = load %struct.string___XUB** %tmp375   ; <%struct.string___XUB*> [#uses=1]
-  %tmp37670 = ptrtoint %struct.string___XUB* %tmp376 to i32   ; <i32> [#uses=1]
-  %tmp3767071 = zext i32 %tmp37670 to i64   ; <i64> [#uses=1]
-  %tmp376707172 = shl i64 %tmp3767071, 32   ; <i64> [#uses=1]
-  %tmp376707172.ins = or i64 %tmp376707172, %tmp3737475   ; <i64> [#uses=1]
-  invoke fastcc void @report__put_msg( i64 %tmp376707172.ins )
-      to label %invcont381 unwind label %unwind277
-
-invcont381:   ; preds = %invcont370
-  %tmp382 = load i32* @report__test_name_len    ; <i32> [#uses=6]
-  %tmp415 = icmp sgt i32 %tmp382, -1    ; <i1> [#uses=1]
-  %max416 = select i1 %tmp415, i32 %tmp382, i32 0   ; <i32> [#uses=1]
-  %tmp417 = alloca i8, i32 %max416    ; <i8*> [#uses=3]
-  %tmp423 = icmp sgt i32 %tmp382, 0   ; <i1> [#uses=1]
-  br i1 %tmp423, label %bb427, label %cond_next442
-
-bb427:    ; preds = %invcont381
-  store i8 32, i8* %tmp417
-  %tmp434 = icmp eq i32 %tmp382, 1    ; <i1> [#uses=1]
-  br i1 %tmp434, label %cond_next442, label %cond_next438.preheader
-
-cond_next438.preheader:   ; preds = %bb427
-  %tmp. = add i32 %tmp382, -1   ; <i32> [#uses=1]
-  br label %cond_next438
-
-cond_next438:   ; preds = %cond_next438, %cond_next438.preheader
-  %indvar = phi i32 [ 0, %cond_next438.preheader ], [ %J130b.513.5, %cond_next438 ]   ; <i32> [#uses=1]
-  %J130b.513.5 = add i32 %indvar, 1   ; <i32> [#uses=3]
-  %tmp43118 = getelementptr i8* %tmp417, i32 %J130b.513.5   ; <i8*> [#uses=1]
-  store i8 32, i8* %tmp43118
-  %exitcond = icmp eq i32 %J130b.513.5, %tmp.   ; <i1> [#uses=1]
-  br i1 %exitcond, label %cond_next442, label %cond_next438
-
-cond_next442:   ; preds = %cond_next438, %bb427, %invcont381
-  %tmp448 = getelementptr %struct.string___XUB* %A.270, i32 0, i32 0    ; <i32*> [#uses=1]
-  store i32 1, i32* %tmp448
-  %tmp449 = getelementptr %struct.string___XUB* %A.270, i32 0, i32 1    ; <i32*> [#uses=1]
-  store i32 %tmp382, i32* %tmp449
-  %tmp41762 = ptrtoint i8* %tmp417 to i32   ; <i32> [#uses=1]
-  %tmp4176263 = zext i32 %tmp41762 to i64   ; <i64> [#uses=1]
-  %A.27058 = ptrtoint %struct.string___XUB* %A.270 to i32   ; <i32> [#uses=1]
-  %A.2705859 = zext i32 %A.27058 to i64   ; <i64> [#uses=1]
-  %A.270585960 = shl i64 %A.2705859, 32   ; <i64> [#uses=1]
-  %A.270585960.ins = or i64 %tmp4176263, %A.270585960   ; <i64> [#uses=1]
-  invoke void @system__string_ops_concat_3__str_concat_3( %struct.string___XUP* %tmp20 sret , i64 or (i64 zext (i32 ptrtoint ([5 x i8]* @.str21 to i32) to i64), i64 shl (i64 zext (i32 ptrtoint (%struct.string___XUB* @C.81.900 to i32) to i64), i64 32)), i64 %A.270585960.ins, i64 or (i64 zext (i32 ptrtoint ([37 x i8]* @.str22 to i32) to i64), i64 shl (i64 zext (i32 ptrtoint (%struct.string___XUB* @C.209.1380 to i32) to i64), i64 32)) )
-      to label %invcont467 unwind label %unwind277
-
-invcont467:   ; preds = %cond_next442
-  %tmp469 = getelementptr %struct.string___XUP* %tmp20, i32 0, i32 0    ; <i8**> [#uses=1]
-  %tmp470 = load i8** %tmp469   ; <i8*> [#uses=1]
-  %tmp47050 = ptrtoint i8* %tmp470 to i32   ; <i32> [#uses=1]
-  %tmp4705051 = zext i32 %tmp47050 to i64   ; <i64> [#uses=1]
-  %tmp472 = getelementptr %struct.string___XUP* %tmp20, i32 0, i32 1    ; <%struct.string___XUB**> [#uses=1]
-  %tmp473 = load %struct.string___XUB** %tmp472   ; <%struct.string___XUB*> [#uses=1]
-  %tmp47346 = ptrtoint %struct.string___XUB* %tmp473 to i32   ; <i32> [#uses=1]
-  %tmp4734647 = zext i32 %tmp47346 to i64   ; <i64> [#uses=1]
-  %tmp473464748 = shl i64 %tmp4734647, 32   ; <i64> [#uses=1]
-  %tmp473464748.ins = or i64 %tmp473464748, %tmp4705051   ; <i64> [#uses=1]
-  invoke fastcc void @report__put_msg( i64 %tmp473464748.ins )
-      to label %cleanup unwind label %unwind277
-
-cleanup:    ; preds = %invcont467
-  call void @llvm.stackrestore( i8* %tmp262 )
-  br label %cond_next618
-
-bb483:    ; preds = %entry
-  %tmp484 = load i32* @report__test_name_len    ; <i32> [#uses=4]
-  %tmp487 = icmp sgt i32 %tmp484, 0   ; <i1> [#uses=2]
-  %tmp494 = icmp sgt i32 %tmp484, 15    ; <i1> [#uses=1]
-  %bothcond142 = and i1 %tmp487, %tmp494    ; <i1> [#uses=1]
-  br i1 %bothcond142, label %cond_true497, label %cond_next500
-
-cond_true497:   ; preds = %bb483
-  invoke void @__gnat_rcheck_12( i8* getelementptr ([12 x i8]* @.str, i32 0, i32 0), i32 223 )
-      to label %UnifiedUnreachableBlock unwind label %unwind
-
-cond_next500:   ; preds = %bb483
-  %tmp529 = getelementptr %struct.string___XUB* %A.284, i32 0, i32 0    ; <i32*> [#uses=1]
-  store i32 1, i32* %tmp529
-  %tmp530 = getelementptr %struct.string___XUB* %A.284, i32 0, i32 1    ; <i32*> [#uses=1]
-  store i32 %tmp484, i32* %tmp530
-  br i1 %tmp487, label %cond_true537, label %cond_next567
-
-cond_true537:   ; preds = %cond_next500
-  %tmp501.off = add i32 %tmp484, -1   ; <i32> [#uses=1]
-  %bothcond3 = icmp ugt i32 %tmp501.off, 14   ; <i1> [#uses=1]
-  br i1 %bothcond3, label %bb555, label %cond_next567
-
-bb555:    ; preds = %cond_true537
-  invoke void @__gnat_rcheck_05( i8* getelementptr ([12 x i8]* @.str, i32 0, i32 0), i32 223 )
-      to label %UnifiedUnreachableBlock unwind label %unwind
-
-cond_next567:   ; preds = %cond_true537, %cond_next500
-  %A.28435 = ptrtoint %struct.string___XUB* %A.284 to i32   ; <i32> [#uses=1]
-  %A.2843536 = zext i32 %A.28435 to i64   ; <i64> [#uses=1]
-  %A.284353637 = shl i64 %A.2843536, 32   ; <i64> [#uses=1]
-  %A.284353637.ins = or i64 %A.284353637, zext (i32 ptrtoint ([15 x i8]* @report__test_name to i32) to i64)   ; <i64> [#uses=1]
-  invoke void @system__string_ops_concat_3__str_concat_3( %struct.string___XUP* %tmp25 sret , i64 or (i64 zext (i32 ptrtoint ([5 x i8]* @.str24 to i32) to i64), i64 shl (i64 zext (i32 ptrtoint (%struct.string___XUB* @C.81.900 to i32) to i64), i64 32)), i64 %A.284353637.ins, i64 or (i64 zext (i32 ptrtoint ([37 x i8]* @.str23 to i32) to i64), i64 shl (i64 zext (i32 ptrtoint (%struct.string___XUB* @C.209.1380 to i32) to i64), i64 32)) )
-      to label %invcont589 unwind label %unwind
-
-invcont589:   ; preds = %cond_next567
-  %tmp591 = getelementptr %struct.string___XUP* %tmp25, i32 0, i32 0    ; <i8**> [#uses=1]
-  %tmp592 = load i8** %tmp591   ; <i8*> [#uses=1]
-  %tmp59228 = ptrtoint i8* %tmp592 to i32   ; <i32> [#uses=1]
-  %tmp5922829 = zext i32 %tmp59228 to i64   ; <i64> [#uses=1]
-  %tmp594 = getelementptr %struct.string___XUP* %tmp25, i32 0, i32 1    ; <%struct.string___XUB**> [#uses=1]
-  %tmp595 = load %struct.string___XUB** %tmp594   ; <%struct.string___XUB*> [#uses=1]
-  %tmp59524 = ptrtoint %struct.string___XUB* %tmp595 to i32   ; <i32> [#uses=1]
-  %tmp5952425 = zext i32 %tmp59524 to i64   ; <i64> [#uses=1]
-  %tmp595242526 = shl i64 %tmp5952425, 32   ; <i64> [#uses=1]
-  %tmp595242526.ins = or i64 %tmp595242526, %tmp5922829   ; <i64> [#uses=1]
-  invoke fastcc void @report__put_msg( i64 %tmp595242526.ins )
-      to label %cond_next618 unwind label %unwind
-
-cond_next618:   ; preds = %invcont589, %cleanup, %invcont249, %invcont131
-  store i8 1, i8* @report__test_status
-  store i32 7, i32* @report__test_name_len
-  store i8 78, i8* getelementptr ([15 x i8]* @report__test_name, i32 0, i32 0)
-  store i8 79, i8* getelementptr ([15 x i8]* @report__test_name, i32 0, i32 1)
-  store i8 95, i8* getelementptr ([15 x i8]* @report__test_name, i32 0, i32 2)
-  store i8 78, i8* getelementptr ([15 x i8]* @report__test_name, i32 0, i32 3)
-  store i8 65, i8* getelementptr ([15 x i8]* @report__test_name, i32 0, i32 4)
-  store i8 77, i8* getelementptr ([15 x i8]* @report__test_name, i32 0, i32 5)
-  store i8 69, i8* getelementptr ([15 x i8]* @report__test_name, i32 0, i32 6)
-  %CHAIN.310.0.0.0.val5.i = ptrtoint i8* %tmp29 to i32    ; <i32> [#uses=1]
-  %CHAIN.310.0.0.0.val56.i = zext i32 %CHAIN.310.0.0.0.val5.i to i64    ; <i64> [#uses=1]
-  %CHAIN.310.0.0.1.val2.i = zext i32 %tmp32 to i64    ; <i64> [#uses=1]
-  %CHAIN.310.0.0.1.val23.i = shl i64 %CHAIN.310.0.0.1.val2.i, 32    ; <i64> [#uses=1]
-  %CHAIN.310.0.0.1.val23.ins.i = or i64 %CHAIN.310.0.0.1.val23.i, %CHAIN.310.0.0.0.val56.i    ; <i64> [#uses=1]
-  call void @system__secondary_stack__ss_release( i64 %CHAIN.310.0.0.1.val23.ins.i )
-  ret void
-
-cleanup717:   ; preds = %unwind277, %unwind
-  %eh_exception.0 = phi i8* [ %eh_ptr278, %unwind277 ], [ %eh_ptr, %unwind ]    ; <i8*> [#uses=1]
-  %CHAIN.310.0.0.0.val5.i8 = ptrtoint i8* %tmp29 to i32   ; <i32> [#uses=1]
-  %CHAIN.310.0.0.0.val56.i9 = zext i32 %CHAIN.310.0.0.0.val5.i8 to i64    ; <i64> [#uses=1]
-  %CHAIN.310.0.0.1.val2.i10 = zext i32 %tmp32 to i64    ; <i64> [#uses=1]
-  %CHAIN.310.0.0.1.val23.i11 = shl i64 %CHAIN.310.0.0.1.val2.i10, 32    ; <i64> [#uses=1]
-  %CHAIN.310.0.0.1.val23.ins.i12 = or i64 %CHAIN.310.0.0.1.val23.i11, %CHAIN.310.0.0.0.val56.i9   ; <i64> [#uses=1]
-  call void @system__secondary_stack__ss_release( i64 %CHAIN.310.0.0.1.val23.ins.i12 )
-  call i32 (...)* @_Unwind_Resume( i8* %eh_exception.0 )    ; <i32>:0 [#uses=0]
-  unreachable
-
-UnifiedUnreachableBlock:    ; preds = %bb555, %cond_true497, %bb336, %cond_true276, %bb215, %cond_true157, %bb97, %cond_true43
-  unreachable
-}
-
-declare i8* @llvm.stacksave()
-
-declare void @llvm.stackrestore(i8*)
-
-declare i32 @report__ident_int(i32 %x)
-
-declare i8 @report__equal(i32 %x, i32 %y)
-
-declare i8 @report__ident_char(i8 zeroext  %x)
-
-declare i16 @report__ident_wide_char(i16 zeroext  %x)
-
-declare i8 @report__ident_bool(i8 %x)
-
-declare void @report__ident_str(%struct.string___XUP* sret  %agg.result, i64 %x.0.0)
-
-declare void @llvm.memcpy.i32(i8*, i8*, i32, i32)
-
-declare void @report__ident_wide_str(%struct.wide_string___XUP* sret  %agg.result, i64 %x.0.0)
-
-declare void @__gnat_begin_handler(i8*)
-
-declare void @__gnat_end_handler(i8*)
-
-declare void @report__legal_file_name(%struct.string___XUP* sret  %agg.result, i32 %x, i64 %nam.0.0)
-
-declare void @__gnat_rcheck_06(i8*, i32)
-
-declare void @system__string_ops__str_concat_cs(%struct.string___XUP* sret , i8 zeroext , i64)
diff --git a/test/CodeGen/X86/2007-08-01-LiveVariablesBug.ll b/test/CodeGen/X86/2007-08-01-LiveVariablesBug.ll
index 3cd8052..62624a7 100644
--- a/test/CodeGen/X86/2007-08-01-LiveVariablesBug.ll
+++ b/test/CodeGen/X86/2007-08-01-LiveVariablesBug.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=x86 | not grep movl
 
-define i8 @t(i8 zeroext  %x, i8 zeroext  %y) zeroext  {
+define zeroext i8 @t(i8 zeroext  %x, i8 zeroext  %y)   {
 	%tmp2 = add i8 %x, 2
 	%tmp4 = add i8 %y, -2
 	%tmp5 = mul i8 %tmp4, %tmp2
diff --git a/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll b/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
index e93092f..77291f0 100644
--- a/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
+++ b/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
@@ -2,7 +2,7 @@
 
 @X = global i32 0               ; <i32*> [#uses=1]
 
-define i8 @_Z3fooi(i32 %x) signext  {
+define signext i8 @_Z3fooi(i32 %x)   {
 entry:
         store i32 %x, i32* @X, align 4
         %retval67 = trunc i32 %x to i8          ; <i8> [#uses=1]
diff --git a/test/CodeGen/X86/2007-09-17-ObjcFrameEH.ll b/test/CodeGen/X86/2007-09-17-ObjcFrameEH.ll
index c3403a0..8518d4c 100644
--- a/test/CodeGen/X86/2007-09-17-ObjcFrameEH.ll
+++ b/test/CodeGen/X86/2007-09-17-ObjcFrameEH.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin | grep {isNullOrNil].eh"} | count 2
+; RUN: llc < %s -disable-cfi -march=x86 -mtriple=i686-apple-darwin | grep {isNullOrNil].eh"} | FileCheck %s
+
+; CHECK: "_-[NSString(local) isNullOrNil].eh":
 
 	%struct.NSString = type {  }
 	%struct._objc__method_prototype_list = type opaque
@@ -24,7 +26,7 @@
     [1 x %struct._objc_method] [ %struct._objc_method {
         %struct.objc_selector* bitcast ([12 x i8]* @"\01L_OBJC_METH_VAR_NAME_0" to %struct.objc_selector*), 
         i8* getelementptr ([7 x i8]* @"\01L_OBJC_METH_VAR_TYPE_0", i32 0, i32 0), 
-        i8* bitcast (i8 (%struct.NSString*, %struct.objc_selector*) signext * @"-[NSString(local) isNullOrNil]" to i8*) } ] }, section "__OBJC,__cat_inst_meth,regular,no_dead_strip"		; <{ i32, i32, [1 x %struct._objc_method] }*> [#uses=3]
+        i8* bitcast (i8 (%struct.NSString*, %struct.objc_selector*)  * @"-[NSString(local) isNullOrNil]" to i8*) } ] }, section "__OBJC,__cat_inst_meth,regular,no_dead_strip"		; <{ i32, i32, [1 x %struct._objc_method] }*> [#uses=3]
 @"\01L_OBJC_CATEGORY_NSString_local" = internal global { i8*, i8*, %struct._objc_method_list*, i32, i32, i32, i32 } {
     i8* getelementptr ([6 x i8]* @"\01L_OBJC_CLASS_NAME_0", i32 0, i32 0), 
     i8* getelementptr ([9 x i8]* @"\01L_OBJC_CLASS_NAME_1", i32 0, i32 0), 
@@ -49,7 +51,7 @@
 @"\01L_OBJC_METH_VAR_TYPE_0" = internal global [7 x i8] c"c8@0:4\00", section "__TEXT,__cstring,cstring_literals"		; <[7 x i8]*> [#uses=2]
 @llvm.used = appending global [11 x i8*] [ i8* bitcast ({ i32, i32, i16, i16, [1 x %struct._objc_category*] }* @"\01L_OBJC_SYMBOLS" to i8*), i8* bitcast ({ i32, i32, [1 x %struct._objc_method] }* @"\01L_OBJC_CATEGORY_INSTANCE_METHODS_NSString_local" to i8*), i8* bitcast ({ i8*, i8*, %struct._objc_method_list*, i32, i32, i32, i32 }* @"\01L_OBJC_CATEGORY_NSString_local" to i8*), i8* bitcast ([2 x i32]* @"\01L_OBJC_IMAGE_INFO" to i8*), i8* bitcast (%struct._objc_module* @"\01L_OBJC_MODULES" to i8*), i8* bitcast (i32* @"\01.objc_category_name_NSString_local" to i8*), i8* getelementptr ([1 x i8]* @"\01L_OBJC_CLASS_NAME_2", i32 0, i32 0), i8* getelementptr ([9 x i8]* @"\01L_OBJC_CLASS_NAME_1", i32 0, i32 0), i8* getelementptr ([6 x i8]* @"\01L_OBJC_CLASS_NAME_0", i32 0, i32 0), i8* getelementptr ([12 x i8]* @"\01L_OBJC_METH_VAR_NAME_0", i32 0, i32 0), i8* getelementptr ([7 x i8]* @"\01L_OBJC_METH_VAR_TYPE_0", i32 0, i32 0) ], section "llvm.metadata"		; <[11 x i8*]*> [#uses=0]
 
-define internal i8 @"-[NSString(local) isNullOrNil]"(%struct.NSString* %self, %struct.objc_selector* %_cmd) signext  {
+define internal signext i8 @"-[NSString(local) isNullOrNil]"(%struct.NSString* %self, %struct.objc_selector* %_cmd)   {
 entry:
 	%self_addr = alloca %struct.NSString*		; <%struct.NSString**> [#uses=1]
 	%_cmd_addr = alloca %struct.objc_selector*		; <%struct.objc_selector**> [#uses=1]
diff --git a/test/CodeGen/X86/2007-09-27-LDIntrinsics.ll b/test/CodeGen/X86/2007-09-27-LDIntrinsics.ll
index 4d69715..f7ffb93 100644
--- a/test/CodeGen/X86/2007-09-27-LDIntrinsics.ll
+++ b/test/CodeGen/X86/2007-09-27-LDIntrinsics.ll
@@ -23,7 +23,7 @@ entry:
 ; CHECK: fldt 4(%esp)
 ; CHECK-NEXT: fld	%st(0)
 ; CHECK-NEXT: fmul	%st(1)
-; CHECK-NEXT: fmulp	%st(1)
+; CHECK-NEXT: fmulp
 ; CHECK-NEXT: ret
 }
 
diff --git a/test/CodeGen/X86/2007-10-05-3AddrConvert.ll b/test/CodeGen/X86/2007-10-05-3AddrConvert.ll
deleted file mode 100644
index 2c2706d..0000000
--- a/test/CodeGen/X86/2007-10-05-3AddrConvert.ll
+++ /dev/null
@@ -1,48 +0,0 @@
-; RUN: llc < %s -march=x86 | grep lea
-
-	%struct.anon = type { [3 x double], double, %struct.node*, [64 x %struct.bnode*], [64 x %struct.bnode*] }
-	%struct.bnode = type { i16, double, [3 x double], i32, i32, [3 x double], [3 x double], [3 x double], double, %struct.bnode*, %struct.bnode* }
-	%struct.node = type { i16, double, [3 x double], i32, i32 }
-
-define i32 @main(i32 %argc, i8** nocapture %argv) nounwind {
-entry:
-	%0 = malloc %struct.anon		; <%struct.anon*> [#uses=2]
-	%1 = getelementptr %struct.anon* %0, i32 0, i32 2		; <%struct.node**> [#uses=1]
-	br label %bb14.i
-
-bb14.i:		; preds = %bb14.i, %entry
-	%i8.0.reg2mem.0.i = phi i32 [ 0, %entry ], [ %2, %bb14.i ]		; <i32> [#uses=1]
-	%2 = add i32 %i8.0.reg2mem.0.i, 1		; <i32> [#uses=2]
-	%exitcond74.i = icmp eq i32 %2, 32		; <i1> [#uses=1]
-	br i1 %exitcond74.i, label %bb32.i, label %bb14.i
-
-bb32.i:		; preds = %bb32.i, %bb14.i
-	%tmp.0.reg2mem.0.i = phi i32 [ %indvar.next63.i, %bb32.i ], [ 0, %bb14.i ]		; <i32> [#uses=1]
-	%indvar.next63.i = add i32 %tmp.0.reg2mem.0.i, 1		; <i32> [#uses=2]
-	%exitcond64.i = icmp eq i32 %indvar.next63.i, 64		; <i1> [#uses=1]
-	br i1 %exitcond64.i, label %bb47.loopexit.i, label %bb32.i
-
-bb.i.i:		; preds = %bb47.loopexit.i
-	unreachable
-
-stepsystem.exit.i:		; preds = %bb47.loopexit.i
-	store %struct.node* null, %struct.node** %1, align 4
-	br label %bb.i6.i
-
-bb.i6.i:		; preds = %bb.i6.i, %stepsystem.exit.i
-	%tmp.0.i.i = add i32 0, -1		; <i32> [#uses=1]
-	%3 = icmp slt i32 %tmp.0.i.i, 0		; <i1> [#uses=1]
-	br i1 %3, label %bb107.i.i, label %bb.i6.i
-
-bb107.i.i:		; preds = %bb107.i.i, %bb.i6.i
-	%q_addr.0.i.i.in = phi %struct.bnode** [ null, %bb107.i.i ], [ %4, %bb.i6.i ]		; <%struct.bnode**> [#uses=1]
-	%q_addr.0.i.i = load %struct.bnode** %q_addr.0.i.i.in		; <%struct.bnode*> [#uses=1]
-	%q_addr.1 = getelementptr %struct.anon* %0, i32 0, i32 4, i32 1
-	store %struct.bnode* %q_addr.0.i.i, %struct.bnode** %q_addr.1, align 4
-	br label %bb107.i.i
-
-bb47.loopexit.i:		; preds = %bb32.i
-	%4 = getelementptr %struct.anon* %0, i32 0, i32 4, i32 0		; <%struct.bnode**> [#uses=1]
-	%5 = icmp eq %struct.node* null, null		; <i1> [#uses=1]
-	br i1 %5, label %stepsystem.exit.i, label %bb.i.i
-}
diff --git a/test/CodeGen/X86/2007-10-12-CoalesceExtSubReg.ll b/test/CodeGen/X86/2007-10-12-CoalesceExtSubReg.ll
index db13fde..8091bd1 100644
--- a/test/CodeGen/X86/2007-10-12-CoalesceExtSubReg.ll
+++ b/test/CodeGen/X86/2007-10-12-CoalesceExtSubReg.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=x86 | not grep movb
 
-define i16 @f(i32* %bp, i32* %ss) signext  {
+define signext i16 @f(i32* %bp, i32* %ss)   {
 entry:
 	br label %cond_next127
 
diff --git a/test/CodeGen/X86/2007-10-12-SpillerUnfold2.ll b/test/CodeGen/X86/2007-10-12-SpillerUnfold2.ll
index a3872ad..7a3d72d 100644
--- a/test/CodeGen/X86/2007-10-12-SpillerUnfold2.ll
+++ b/test/CodeGen/X86/2007-10-12-SpillerUnfold2.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=x86 | grep sarl | not grep esp
 
-define i16 @t(i16* %qmatrix, i16* %dct, i16* %acBaseTable, i16* %acExtTable, i16 signext  %acBaseRes, i16 signext  %acMaskRes, i16 signext  %acExtRes, i32* %bitptr, i32* %source, i32 %markerPrefix, i8** %byteptr, i32 %scale, i32 %round, i32 %bits) signext  {
+define signext   i16 @t(i16* %qmatrix, i16* %dct, i16* %acBaseTable, i16* %acExtTable, i16 signext  %acBaseRes, i16 signext  %acMaskRes, i16 signext  %acExtRes, i32* %bitptr, i32* %source, i32 %markerPrefix, i8** %byteptr, i32 %scale, i32 %round, i32 %bits) {
 entry:
 	br label %cond_next127
 
diff --git a/test/CodeGen/X86/2007-10-15-CoalescerCrash.ll b/test/CodeGen/X86/2007-10-15-CoalescerCrash.ll
index 1e4ae84..c68628d 100644
--- a/test/CodeGen/X86/2007-10-15-CoalescerCrash.ll
+++ b/test/CodeGen/X86/2007-10-15-CoalescerCrash.ll
@@ -362,7 +362,7 @@ bb1159:		; preds = %cond_next1150
 
 cond_true1169:		; preds = %bb1159
 	%tmp11741175 = trunc i64 %lsum.11225.0 to i32		; <i32> [#uses=1]
-	%tmp1178 = tail call i32 (%struct._IO_FILE* noalias , i8* noalias , ...)* @fprintf( %struct._IO_FILE* %file noalias , i8* getelementptr ([49 x i8]* @.str32, i32 0, i64 0) noalias , i32 %tmp11741175, i32 0 )		; <i32> [#uses=0]
+	%tmp1178 = tail call i32 (%struct._IO_FILE* noalias , i8* noalias , ...)* @fprintf( %struct._IO_FILE* noalias %file  , i8* getelementptr ([49 x i8]* @.str32, i32 0, i64 0)  , i32 %tmp11741175, i32 0 )		; <i32> [#uses=0]
 	ret void
 
 UnifiedReturnBlock:		; preds = %bb1159
@@ -379,9 +379,9 @@ declare i32 @reg_preferred_class(i32)
 
 declare i32 @reg_alternate_class(i32)
 
-declare i8 @maybe_hot_bb_p(%struct.basic_block_def*) zeroext 
+declare zeroext i8 @maybe_hot_bb_p(%struct.basic_block_def*)  
 
-declare i8 @probably_never_executed_bb_p(%struct.basic_block_def*) zeroext 
+declare zeroext i8 @probably_never_executed_bb_p(%struct.basic_block_def*)  
 
 declare void @dump_regset(%struct.bitmap_head_def*, %struct._IO_FILE*)
 
diff --git a/test/CodeGen/X86/2007-10-19-SpillerUnfold.ll b/test/CodeGen/X86/2007-10-19-SpillerUnfold.ll
index 600bd1f..d3120f3 100644
--- a/test/CodeGen/X86/2007-10-19-SpillerUnfold.ll
+++ b/test/CodeGen/X86/2007-10-19-SpillerUnfold.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | grep inc | not grep PTR
 
-define i16 @t(i32* %bitptr, i32* %source, i8** %byteptr, i32 %scale, i32 %round) signext  {
+define signext   i16 @t(i32* %bitptr, i32* %source, i8** %byteptr, i32 %scale, i32 %round) {
 entry:
 	br label %bb
 
diff --git a/test/CodeGen/X86/2007-10-29-ExtendSetCC.ll b/test/CodeGen/X86/2007-10-29-ExtendSetCC.ll
index 86d3bbf..573a217 100644
--- a/test/CodeGen/X86/2007-10-29-ExtendSetCC.ll
+++ b/test/CodeGen/X86/2007-10-29-ExtendSetCC.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=x86 | grep mov | count 1
 
-define i16 @t() signext  {
+define signext i16 @t()   {
 entry:
 	%tmp180 = load i16* null, align 2		; <i16> [#uses=3]
 	%tmp180181 = sext i16 %tmp180 to i32		; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/2007-11-02-BadAsm.ll b/test/CodeGen/X86/2007-11-02-BadAsm.ll
deleted file mode 100644
index 4e11cda..0000000
--- a/test/CodeGen/X86/2007-11-02-BadAsm.ll
+++ /dev/null
@@ -1,144 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | grep movl | not grep rax
-
-	%struct.color_sample = type { i64 }
-	%struct.gs_matrix = type { float, i64, float, i64, float, i64, float, i64, float, i64, float, i64 }
-	%struct.ref = type { %struct.color_sample, i16, i16 }
-	%struct.status = type { %struct.gs_matrix, i8*, i32, i32, i8*, i32, i32, i32, i32, i32, i32, i32 }
-
-define i32 @ztype1imagepath(%struct.ref* %op) {
-entry:
-	br i1 false, label %cond_next, label %UnifiedReturnBlock
-
-cond_next:		; preds = %entry
-	br i1 false, label %cond_next68, label %UnifiedReturnBlock
-
-cond_next68:		; preds = %cond_next
-	%tmp5.i.i = malloc i8, i32 0		; <i8*> [#uses=2]
-	br i1 false, label %bb81.outer.i, label %xit.i
-
-bb81.outer.i:		; preds = %bb87.i, %cond_next68
-	%tmp67.i = add i32 0, 1		; <i32> [#uses=1]
-	br label %bb81.i
-
-bb61.i:		; preds = %bb81.i
-	%tmp71.i = getelementptr i8* %tmp5.i.i, i64 0		; <i8*> [#uses=1]
-	%tmp72.i = load i8* %tmp71.i, align 1		; <i8> [#uses=1]
-	%tmp73.i = icmp eq i8 %tmp72.i, 0		; <i1> [#uses=1]
-	br i1 %tmp73.i, label %bb81.i, label %xit.i
-
-bb81.i:		; preds = %bb61.i, %bb81.outer.i
-	br i1 false, label %bb87.i, label %bb61.i
-
-bb87.i:		; preds = %bb81.i
-	br i1 false, label %bb81.outer.i, label %xit.i
-
-xit.i:		; preds = %bb87.i, %bb61.i, %cond_next68
-	%lsbx.0.reg2mem.1.i = phi i32 [ 0, %cond_next68 ], [ 0, %bb61.i ], [ %tmp67.i, %bb87.i ]		; <i32> [#uses=1]
-	%tmp6162.i.i = fptrunc double 0.000000e+00 to float		; <float> [#uses=1]
-	%tmp67.i15.i = fptrunc double 0.000000e+00 to float		; <float> [#uses=1]
-	%tmp24.i27.i = icmp eq i64 0, 0		; <i1> [#uses=1]
-	br i1 %tmp24.i27.i, label %cond_next.i79.i, label %cond_true.i34.i
-
-cond_true.i34.i:		; preds = %xit.i
-	ret i32 0
-
-cond_next.i79.i:		; preds = %xit.i
-	%phitmp167.i = fptosi double 0.000000e+00 to i64		; <i64> [#uses=1]
-	%tmp142143.i = fpext float %tmp6162.i.i to double		; <double> [#uses=1]
-	%tmp2.i139.i = fadd double %tmp142143.i, 5.000000e-01		; <double> [#uses=1]
-	%tmp23.i140.i = fptosi double %tmp2.i139.i to i64		; <i64> [#uses=1]
-	br i1 false, label %cond_true.i143.i, label %round_coord.exit148.i
-
-cond_true.i143.i:		; preds = %cond_next.i79.i
-	%tmp8.i142.i = icmp sgt i64 %tmp23.i140.i, -32768		; <i1> [#uses=1]
-	br i1 %tmp8.i142.i, label %cond_true11.i145.i, label %round_coord.exit148.i
-
-cond_true11.i145.i:		; preds = %cond_true.i143.i
-	ret i32 0
-
-round_coord.exit148.i:		; preds = %cond_true.i143.i, %cond_next.i79.i
-	%tmp144149.i = phi i32 [ 32767, %cond_next.i79.i ], [ -32767, %cond_true.i143.i ]		; <i32> [#uses=1]
-	store i32 %tmp144149.i, i32* null, align 8
-	%tmp147148.i = fpext float %tmp67.i15.i to double		; <double> [#uses=1]
-	%tmp2.i128.i = fadd double %tmp147148.i, 5.000000e-01		; <double> [#uses=1]
-	%tmp23.i129.i = fptosi double %tmp2.i128.i to i64		; <i64> [#uses=2]
-	%tmp5.i130.i = icmp slt i64 %tmp23.i129.i, 32768		; <i1> [#uses=1]
-	br i1 %tmp5.i130.i, label %cond_true.i132.i, label %round_coord.exit137.i
-
-cond_true.i132.i:		; preds = %round_coord.exit148.i
-	%tmp8.i131.i = icmp sgt i64 %tmp23.i129.i, -32768		; <i1> [#uses=1]
-	br i1 %tmp8.i131.i, label %cond_true11.i134.i, label %round_coord.exit137.i
-
-cond_true11.i134.i:		; preds = %cond_true.i132.i
-	br label %round_coord.exit137.i
-
-round_coord.exit137.i:		; preds = %cond_true11.i134.i, %cond_true.i132.i, %round_coord.exit148.i
-	%tmp149138.i = phi i32 [ 0, %cond_true11.i134.i ], [ 32767, %round_coord.exit148.i ], [ -32767, %cond_true.i132.i ]		; <i32> [#uses=1]
-	br i1 false, label %cond_true.i121.i, label %round_coord.exit126.i
-
-cond_true.i121.i:		; preds = %round_coord.exit137.i
-	br i1 false, label %cond_true11.i123.i, label %round_coord.exit126.i
-
-cond_true11.i123.i:		; preds = %cond_true.i121.i
-	br label %round_coord.exit126.i
-
-round_coord.exit126.i:		; preds = %cond_true11.i123.i, %cond_true.i121.i, %round_coord.exit137.i
-	%tmp153127.i = phi i32 [ 0, %cond_true11.i123.i ], [ 32767, %round_coord.exit137.i ], [ -32767, %cond_true.i121.i ]		; <i32> [#uses=1]
-	br i1 false, label %cond_true.i110.i, label %round_coord.exit115.i
-
-cond_true.i110.i:		; preds = %round_coord.exit126.i
-	br i1 false, label %cond_true11.i112.i, label %round_coord.exit115.i
-
-cond_true11.i112.i:		; preds = %cond_true.i110.i
-	br label %round_coord.exit115.i
-
-round_coord.exit115.i:		; preds = %cond_true11.i112.i, %cond_true.i110.i, %round_coord.exit126.i
-	%tmp157116.i = phi i32 [ 0, %cond_true11.i112.i ], [ 32767, %round_coord.exit126.i ], [ -32767, %cond_true.i110.i ]		; <i32> [#uses=2]
-	br i1 false, label %cond_true.i99.i, label %round_coord.exit104.i
-
-cond_true.i99.i:		; preds = %round_coord.exit115.i
-	br i1 false, label %cond_true11.i101.i, label %round_coord.exit104.i
-
-cond_true11.i101.i:		; preds = %cond_true.i99.i
-	%tmp1213.i100.i = trunc i64 %phitmp167.i to i32		; <i32> [#uses=1]
-	br label %cond_next172.i
-
-round_coord.exit104.i:		; preds = %cond_true.i99.i, %round_coord.exit115.i
-	%UnifiedRetVal.i102.i = phi i32 [ 32767, %round_coord.exit115.i ], [ -32767, %cond_true.i99.i ]		; <i32> [#uses=1]
-	%tmp164.i = call fastcc i32 @put_int( %struct.status* null, i32 %tmp157116.i )		; <i32> [#uses=0]
-	br label %cond_next172.i
-
-cond_next172.i:		; preds = %round_coord.exit104.i, %cond_true11.i101.i
-	%tmp161105.reg2mem.0.i = phi i32 [ %tmp1213.i100.i, %cond_true11.i101.i ], [ %UnifiedRetVal.i102.i, %round_coord.exit104.i ]		; <i32> [#uses=1]
-	%tmp174.i = icmp eq i32 %tmp153127.i, 0		; <i1> [#uses=1]
-	%bothcond.i = and i1 false, %tmp174.i		; <i1> [#uses=1]
-	%tmp235.i = call fastcc i32 @put_int( %struct.status* null, i32 %tmp149138.i )		; <i32> [#uses=0]
-	%tmp245.i = load i8** null, align 8		; <i8*> [#uses=2]
-	%tmp246.i = getelementptr i8* %tmp245.i, i64 1		; <i8*> [#uses=1]
-	br i1 %bothcond.i, label %cond_next254.i, label %bb259.i
-
-cond_next254.i:		; preds = %cond_next172.i
-	store i8 13, i8* %tmp245.i, align 1
-	br label %bb259.i
-
-bb259.i:		; preds = %cond_next254.i, %cond_next172.i
-	%storemerge.i = phi i8* [ %tmp246.i, %cond_next254.i ], [ null, %cond_next172.i ]		; <i8*> [#uses=0]
-	%tmp261.i = shl i32 %lsbx.0.reg2mem.1.i, 2		; <i32> [#uses=1]
-	store i32 %tmp261.i, i32* null, align 8
-	%tmp270.i = add i32 0, %tmp157116.i		; <i32> [#uses=1]
-	store i32 %tmp270.i, i32* null, align 8
-	%tmp275.i = add i32 0, %tmp161105.reg2mem.0.i		; <i32> [#uses=0]
-	br i1 false, label %trace_cells.exit.i, label %bb.preheader.i.i
-
-bb.preheader.i.i:		; preds = %bb259.i
-	ret i32 0
-
-trace_cells.exit.i:		; preds = %bb259.i
-	free i8* %tmp5.i.i
-	ret i32 0
-
-UnifiedReturnBlock:		; preds = %cond_next, %entry
-	ret i32 -20
-}
-
-declare fastcc i32 @put_int(%struct.status*, i32)
diff --git a/test/CodeGen/X86/2007-12-11-FoldImpDefSpill.ll b/test/CodeGen/X86/2007-12-11-FoldImpDefSpill.ll
deleted file mode 100644
index ca995cc..0000000
--- a/test/CodeGen/X86/2007-12-11-FoldImpDefSpill.ll
+++ /dev/null
@@ -1,680 +0,0 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin | not grep IMPLICIT_DEF
-
-	%struct.__sbuf = type { i8*, i32 }
-	%struct.ggBRDF = type { i32 (...)** }
-	%"struct.ggBST<ggMaterial>" = type { %"struct.ggBSTNode<ggMaterial>"*, i32 }
-	%"struct.ggBST<ggRasterSurfaceTexture>" = type { %"struct.ggBSTNode<ggRasterSurfaceTexture>"*, i32 }
-	%"struct.ggBST<ggSolidTexture>" = type { %"struct.ggBSTNode<ggSolidTexture>"*, i32 }
-	%"struct.ggBST<ggSpectrum>" = type { %"struct.ggBSTNode<ggSpectrum>"*, i32 }
-	%"struct.ggBST<mrObjectRecord>" = type { %"struct.ggBSTNode<mrObjectRecord>"*, i32 }
-	%"struct.ggBSTNode<ggMaterial>" = type { %"struct.ggBSTNode<ggMaterial>"*, %"struct.ggBSTNode<ggMaterial>"*, %struct.ggString, %struct.ggMaterial* }
-	%"struct.ggBSTNode<ggRasterSurfaceTexture>" = type { %"struct.ggBSTNode<ggRasterSurfaceTexture>"*, %"struct.ggBSTNode<ggRasterSurfaceTexture>"*, %struct.ggString, %struct.ggRasterSurfaceTexture* }
-	%"struct.ggBSTNode<ggSolidTexture>" = type { %"struct.ggBSTNode<ggSolidTexture>"*, %"struct.ggBSTNode<ggSolidTexture>"*, %struct.ggString, %struct.ggBRDF* }
-	%"struct.ggBSTNode<ggSpectrum>" = type { %"struct.ggBSTNode<ggSpectrum>"*, %"struct.ggBSTNode<ggSpectrum>"*, %struct.ggString, %struct.ggSpectrum* }
-	%"struct.ggBSTNode<mrObjectRecord>" = type { %"struct.ggBSTNode<mrObjectRecord>"*, %"struct.ggBSTNode<mrObjectRecord>"*, %struct.ggString, %struct.mrObjectRecord* }
-	%"struct.ggDictionary<ggMaterial>" = type { %"struct.ggBST<ggMaterial>" }
-	%"struct.ggDictionary<ggRasterSurfaceTexture>" = type { %"struct.ggBST<ggRasterSurfaceTexture>" }
-	%"struct.ggDictionary<ggSolidTexture>" = type { %"struct.ggBST<ggSolidTexture>" }
-	%"struct.ggDictionary<ggSpectrum>" = type { %"struct.ggBST<ggSpectrum>" }
-	%"struct.ggDictionary<mrObjectRecord>" = type { %"struct.ggBST<mrObjectRecord>" }
-	%struct.ggHAffineMatrix3 = type { %struct.ggHMatrix3 }
-	%struct.ggHBoxMatrix3 = type { %struct.ggHAffineMatrix3 }
-	%struct.ggHMatrix3 = type { [4 x [4 x double]] }
-	%struct.ggMaterial = type { i32 (...)**, %struct.ggBRDF* }
-	%struct.ggPoint3 = type { [3 x double] }
-	%"struct.ggRGBPixel<char>" = type { [3 x i8], i8 }
-	%"struct.ggRaster<ggRGBPixel<unsigned char> >" = type { i32, i32, %"struct.ggRGBPixel<char>"* }
-	%struct.ggRasterSurfaceTexture = type { %"struct.ggRaster<ggRGBPixel<unsigned char> >"* }
-	%struct.ggSolidNoise3 = type { i32, [256 x %struct.ggPoint3], [256 x i32] }
-	%struct.ggSpectrum = type { [8 x float] }
-	%struct.ggString = type { %"struct.ggString::StringRep"* }
-	%"struct.ggString::StringRep" = type { i32, i32, [1 x i8] }
-	%"struct.ggTrain<mrPixelRenderer*>" = type { %struct.ggBRDF**, i32, i32 }
-	%struct.mrObjectRecord = type { %struct.ggHBoxMatrix3, %struct.ggHBoxMatrix3, %struct.mrSurfaceList, %struct.ggMaterial*, i32, %struct.ggRasterSurfaceTexture*, %struct.ggBRDF*, i32, i32 }
-	%struct.mrScene = type { %struct.ggSpectrum, %struct.ggSpectrum, %struct.ggBRDF*, %struct.ggBRDF*, %struct.ggBRDF*, i32, double, %"struct.ggDictionary<mrObjectRecord>", %"struct.ggDictionary<ggRasterSurfaceTexture>", %"struct.ggDictionary<ggSolidTexture>", %"struct.ggDictionary<ggSpectrum>", %"struct.ggDictionary<ggMaterial>" }
-	%struct.mrSurfaceList = type { %struct.ggBRDF, %"struct.ggTrain<mrPixelRenderer*>" }
-	%"struct.std::__codecvt_abstract_base<char,char,__mbstate_t>" = type { %"struct.std::locale::facet" }
-	%"struct.std::basic_ios<char,std::char_traits<char> >" = type { %"struct.std::ios_base", %"struct.std::basic_ostream<char,std::char_traits<char> >"*, i8, i8, %"struct.std::basic_streambuf<char,std::char_traits<char> >"*, %"struct.std::ctype<char>"*, %"struct.std::__codecvt_abstract_base<char,char,__mbstate_t>"*, %"struct.std::__codecvt_abstract_base<char,char,__mbstate_t>"* }
-	%"struct.std::basic_istream<char,std::char_traits<char> >" = type { i32 (...)**, i32, %"struct.std::basic_ios<char,std::char_traits<char> >" }
-	%"struct.std::basic_ostream<char,std::char_traits<char> >" = type { i32 (...)**, %"struct.std::basic_ios<char,std::char_traits<char> >" }
-	%"struct.std::basic_streambuf<char,std::char_traits<char> >" = type { i32 (...)**, i8*, i8*, i8*, i8*, i8*, i8*, %"struct.std::locale" }
-	%"struct.std::ctype<char>" = type { %"struct.std::locale::facet", i32*, i8, i32*, i32*, i32*, i8, [256 x i8], [256 x i8], i8 }
-	%"struct.std::ios_base" = type { i32 (...)**, i32, i32, i32, i32, i32, %"struct.std::ios_base::_Callback_list"*, %struct.__sbuf, [8 x %struct.__sbuf], i32, %struct.__sbuf*, %"struct.std::locale" }
-	%"struct.std::ios_base::_Callback_list" = type { %"struct.std::ios_base::_Callback_list"*, void (i32, %"struct.std::ios_base"*, i32)*, i32, i32 }
-	%"struct.std::locale" = type { %"struct.std::locale::_Impl"* }
-	%"struct.std::locale::_Impl" = type { i32, %"struct.std::locale::facet"**, i32, %"struct.std::locale::facet"**, i8** }
-	%"struct.std::locale::facet" = type { i32 (...)**, i32 }
-@.str80 = external constant [7 x i8]		; <[7 x i8]*> [#uses=1]
-@.str81 = external constant [11 x i8]		; <[11 x i8]*> [#uses=1]
-
-define fastcc void @_ZN7mrScene4ReadERSi(%struct.mrScene* %this, %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces) {
-entry:
-	%tmp6.i.i8288 = invoke i8* @_Znam( i32 12 )
-			to label %_ZN8ggStringC1Ei.exit unwind label %lpad		; <i8*> [#uses=0]
-
-_ZN8ggStringC1Ei.exit:		; preds = %entry
-	%tmp6.i.i8995 = invoke i8* @_Znam( i32 12 )
-			to label %_ZN8ggStringC1Ei.exit96 unwind label %lpad3825		; <i8*> [#uses=0]
-
-_ZN8ggStringC1Ei.exit96:		; preds = %_ZN8ggStringC1Ei.exit
-	%tmp6.i.i97103 = invoke i8* @_Znam( i32 12 )
-			to label %_ZN8ggStringC1Ei.exit104 unwind label %lpad3829		; <i8*> [#uses=0]
-
-_ZN8ggStringC1Ei.exit104:		; preds = %_ZN8ggStringC1Ei.exit96
-	%tmp6.i.i105111 = invoke i8* @_Znam( i32 12 )
-			to label %_ZN8ggStringC1Ei.exit112 unwind label %lpad3833		; <i8*> [#uses=0]
-
-_ZN8ggStringC1Ei.exit112:		; preds = %_ZN8ggStringC1Ei.exit104
-	%tmp6.i.i122128 = invoke i8* @_Znam( i32 12 )
-			to label %_ZN8ggStringC1Ei.exit129 unwind label %lpad3837		; <i8*> [#uses=0]
-
-_ZN8ggStringC1Ei.exit129:		; preds = %_ZN8ggStringC1Ei.exit112
-	%tmp6.i.i132138 = invoke i8* @_Znam( i32 12 )
-			to label %_ZN8ggStringC1Ei.exit139 unwind label %lpad3841		; <i8*> [#uses=0]
-
-_ZN8ggStringC1Ei.exit139:		; preds = %_ZN8ggStringC1Ei.exit129
-	%tmp295 = invoke i8* @_Znwm( i32 16 )
-			to label %invcont294 unwind label %lpad3845		; <i8*> [#uses=0]
-
-invcont294:		; preds = %_ZN8ggStringC1Ei.exit139
-	%tmp10.i.i141 = invoke i8* @_Znam( i32 16 )
-			to label %_ZN13mrSurfaceListC1Ev.exit unwind label %lpad3849		; <i8*> [#uses=0]
-
-_ZN13mrSurfaceListC1Ev.exit:		; preds = %invcont294
-	%tmp3.i148 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZStrsIcSt11char_traitsIcEERSt13basic_istreamIT_T0_ES6_PS3_( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, i8* null )
-			to label %tmp3.i.noexc unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-tmp3.i.noexc:		; preds = %_ZN13mrSurfaceListC1Ev.exit
-	%tmp15.i149 = invoke i8* @_ZNKSt9basic_iosIcSt11char_traitsIcEEcvPvEv( %"struct.std::basic_ios<char,std::char_traits<char> >"* null )
-			to label %tmp15.i.noexc unwind label %lpad3845		; <i8*> [#uses=0]
-
-tmp15.i.noexc:		; preds = %tmp3.i.noexc
-	br i1 false, label %bb308, label %bb.i
-
-bb.i:		; preds = %tmp15.i.noexc
-	ret void
-
-bb308:		; preds = %tmp15.i.noexc
-	br i1 false, label %bb3743.preheader, label %bb315
-
-bb3743.preheader:		; preds = %bb308
-	%tmp16.i3862 = getelementptr %struct.ggPoint3* null, i32 0, i32 0, i32 0		; <double*> [#uses=1]
-	%tmp16.i3859 = getelementptr %struct.ggPoint3* null, i32 0, i32 0, i32 0		; <double*> [#uses=3]
-	br label %bb3743
-
-bb315:		; preds = %bb308
-	ret void
-
-bb333:		; preds = %invcont3758, %invcont335
-	%tmp3.i167180 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZStrsIcSt11char_traitsIcEERSt13basic_istreamIT_T0_ES6_PS3_( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, i8* null )
-			to label %tmp3.i167.noexc unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-tmp3.i167.noexc:		; preds = %bb333
-	%tmp15.i182 = invoke i8* @_ZNKSt9basic_iosIcSt11char_traitsIcEEcvPvEv( %"struct.std::basic_ios<char,std::char_traits<char> >"* null )
-			to label %tmp15.i.noexc181 unwind label %lpad3845		; <i8*> [#uses=0]
-
-tmp15.i.noexc181:		; preds = %tmp3.i167.noexc
-	br i1 false, label %invcont335, label %bb.i178
-
-bb.i178:		; preds = %tmp15.i.noexc181
-	ret void
-
-invcont335:		; preds = %tmp15.i.noexc181
-	br i1 false, label %bb3743, label %bb333
-
-bb345:		; preds = %invcont3758
-	br i1 false, label %bb353, label %bb360
-
-bb353:		; preds = %bb345
-	%tmp356 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERd( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, double* null )
-			to label %bb3743 unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-bb360:		; preds = %bb345
-	br i1 false, label %bb368, label %bb374
-
-bb368:		; preds = %bb360
-	%tmp373 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERd( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, double* null )
-			to label %bb3743 unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-bb374:		; preds = %bb360
-	br i1 false, label %bb396, label %bb421
-
-bb396:		; preds = %bb374
-	ret void
-
-bb421:		; preds = %bb374
-	br i1 false, label %bb429, label %bb530
-
-bb429:		; preds = %bb421
-	ret void
-
-bb530:		; preds = %bb421
-	br i1 false, label %bb538, label %bb673
-
-bb538:		; preds = %bb530
-	ret void
-
-bb673:		; preds = %bb530
-	br i1 false, label %bb681, label %bb778
-
-bb681:		; preds = %bb673
-	ret void
-
-bb778:		; preds = %bb673
-	br i1 false, label %bb786, label %bb891
-
-bb786:		; preds = %bb778
-	ret void
-
-bb891:		; preds = %bb778
-	br i1 false, label %bb899, label %bb998
-
-bb899:		; preds = %bb891
-	ret void
-
-bb998:		; preds = %bb891
-	br i1 false, label %bb1168, label %bb1190
-
-bb1168:		; preds = %bb998
-	ret void
-
-bb1190:		; preds = %bb998
-	br i1 false, label %bb1198, label %bb1220
-
-bb1198:		; preds = %bb1190
-	ret void
-
-bb1220:		; preds = %bb1190
-	br i1 false, label %bb1228, label %bb1250
-
-bb1228:		; preds = %bb1220
-	ret void
-
-bb1250:		; preds = %bb1220
-	br i1 false, label %bb1258, label %bb1303
-
-bb1258:		; preds = %bb1250
-	ret void
-
-bb1303:		; preds = %bb1250
-	br i1 false, label %bb1311, label %bb1366
-
-bb1311:		; preds = %bb1303
-	ret void
-
-bb1366:		; preds = %bb1303
-	br i1 false, label %bb1374, label %bb1432
-
-bb1374:		; preds = %bb1366
-	ret void
-
-bb1432:		; preds = %bb1366
-	br i1 false, label %bb1440, label %bb1495
-
-bb1440:		; preds = %bb1432
-	ret void
-
-bb1495:		; preds = %bb1432
-	br i1 false, label %bb1503, label %bb1561
-
-bb1503:		; preds = %bb1495
-	ret void
-
-bb1561:		; preds = %bb1495
-	br i1 false, label %bb1569, label %bb1624
-
-bb1569:		; preds = %bb1561
-	ret void
-
-bb1624:		; preds = %bb1561
-	br i1 false, label %bb1632, label %bb1654
-
-bb1632:		; preds = %bb1624
-	store double 0.000000e+00, double* %tmp16.i3859, align 8
-	%tmp3.i38383852 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZStrsIcSt11char_traitsIcEERSt13basic_istreamIT_T0_ES6_PS3_( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, i8* null )
-			to label %tmp3.i3838.noexc unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-tmp3.i3838.noexc:		; preds = %bb1632
-	%tmp15.i38473853 = invoke i8* @_ZNKSt9basic_iosIcSt11char_traitsIcEEcvPvEv( %"struct.std::basic_ios<char,std::char_traits<char> >"* null )
-			to label %tmp15.i3847.noexc unwind label %lpad3845		; <i8*> [#uses=0]
-
-tmp15.i3847.noexc:		; preds = %tmp3.i3838.noexc
-	br i1 false, label %invcont1634, label %bb.i3850
-
-bb.i3850:		; preds = %tmp15.i3847.noexc
-	ret void
-
-invcont1634:		; preds = %tmp15.i3847.noexc
-	%tmp3.i38173831 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZStrsIcSt11char_traitsIcEERSt13basic_istreamIT_T0_ES6_PS3_( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, i8* null )
-			to label %tmp3.i3817.noexc unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-tmp3.i3817.noexc:		; preds = %invcont1634
-	%tmp15.i38263832 = invoke i8* @_ZNKSt9basic_iosIcSt11char_traitsIcEEcvPvEv( %"struct.std::basic_ios<char,std::char_traits<char> >"* null )
-			to label %tmp15.i3826.noexc unwind label %lpad3845		; <i8*> [#uses=0]
-
-tmp15.i3826.noexc:		; preds = %tmp3.i3817.noexc
-	br i1 false, label %invcont1636, label %bb.i3829
-
-bb.i3829:		; preds = %tmp15.i3826.noexc
-	ret void
-
-invcont1636:		; preds = %tmp15.i3826.noexc
-	%tmp8.i38083811 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERd( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, double* %tmp16.i3862 )
-			to label %tmp8.i3808.noexc unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=1]
-
-tmp8.i3808.noexc:		; preds = %invcont1636
-	%tmp9.i38093812 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERd( %"struct.std::basic_istream<char,std::char_traits<char> >"* %tmp8.i38083811, double* null )
-			to label %tmp9.i3809.noexc unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=1]
-
-tmp9.i3809.noexc:		; preds = %tmp8.i3808.noexc
-	%tmp10.i38103813 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERd( %"struct.std::basic_istream<char,std::char_traits<char> >"* %tmp9.i38093812, double* null )
-			to label %invcont1638 unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-invcont1638:		; preds = %tmp9.i3809.noexc
-	%tmp8.i37983801 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERd( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, double* %tmp16.i3859 )
-			to label %tmp8.i3798.noexc unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=1]
-
-tmp8.i3798.noexc:		; preds = %invcont1638
-	%tmp9.i37993802 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERd( %"struct.std::basic_istream<char,std::char_traits<char> >"* %tmp8.i37983801, double* null )
-			to label %tmp9.i3799.noexc unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=1]
-
-tmp9.i3799.noexc:		; preds = %tmp8.i3798.noexc
-	%tmp10.i38003803 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERd( %"struct.std::basic_istream<char,std::char_traits<char> >"* %tmp9.i37993802, double* null )
-			to label %invcont1640 unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-invcont1640:		; preds = %tmp9.i3799.noexc
-	%tmp3.i3778 = load double* %tmp16.i3859, align 8		; <double> [#uses=1]
-	%tmp1643 = invoke i8* @_Znwm( i32 76 )
-			to label %invcont1642 unwind label %lpad3845		; <i8*> [#uses=0]
-
-invcont1642:		; preds = %invcont1640
-	%tmp18.i3770 = fsub double %tmp3.i3778, 0.000000e+00		; <double> [#uses=0]
-	invoke fastcc void @_ZN7mrScene9AddObjectEP9mrSurfaceRK8ggStringS4_i( %struct.mrScene* %this, %struct.ggBRDF* null, %struct.ggString* null, %struct.ggString* null, i32 0 )
-			to label %bb3743 unwind label %lpad3845
-
-bb1654:		; preds = %bb1624
-	br i1 false, label %bb1662, label %bb1693
-
-bb1662:		; preds = %bb1654
-	%tmp3.i37143728 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZStrsIcSt11char_traitsIcEERSt13basic_istreamIT_T0_ES6_PS3_( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, i8* null )
-			to label %tmp3.i3714.noexc unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-tmp3.i3714.noexc:		; preds = %bb1662
-	%tmp15.i37233729 = invoke i8* @_ZNKSt9basic_iosIcSt11char_traitsIcEEcvPvEv( %"struct.std::basic_ios<char,std::char_traits<char> >"* null )
-			to label %tmp15.i3723.noexc unwind label %lpad3845		; <i8*> [#uses=0]
-
-tmp15.i3723.noexc:		; preds = %tmp3.i3714.noexc
-	ret void
-
-bb1693:		; preds = %bb1654
-	br i1 false, label %bb1701, label %bb1745
-
-bb1701:		; preds = %bb1693
-	%tmp3.i36493663 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZStrsIcSt11char_traitsIcEERSt13basic_istreamIT_T0_ES6_PS3_( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, i8* null )
-			to label %tmp3.i3649.noexc unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-tmp3.i3649.noexc:		; preds = %bb1701
-	ret void
-
-bb1745:		; preds = %bb1693
-	br i1 false, label %bb1753, label %bb1797
-
-bb1753:		; preds = %bb1745
-	ret void
-
-bb1797:		; preds = %bb1745
-	br i1 false, label %bb1805, label %bb1847
-
-bb1805:		; preds = %bb1797
-	ret void
-
-bb1847:		; preds = %bb1797
-	br i1 false, label %bb1855, label %bb1897
-
-bb1855:		; preds = %bb1847
-	%tmp3.i34633477 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZStrsIcSt11char_traitsIcEERSt13basic_istreamIT_T0_ES6_PS3_( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, i8* null )
-			to label %tmp3.i3463.noexc unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-tmp3.i3463.noexc:		; preds = %bb1855
-	%tmp15.i34723478 = invoke i8* @_ZNKSt9basic_iosIcSt11char_traitsIcEEcvPvEv( %"struct.std::basic_ios<char,std::char_traits<char> >"* null )
-			to label %tmp15.i3472.noexc unwind label %lpad3845		; <i8*> [#uses=0]
-
-tmp15.i3472.noexc:		; preds = %tmp3.i3463.noexc
-	br i1 false, label %invcont1857, label %bb.i3475
-
-bb.i3475:		; preds = %tmp15.i3472.noexc
-	invoke fastcc void @_ZN8ggStringaSEPKc( %struct.ggString* null, i8* null )
-			to label %invcont1857 unwind label %lpad3845
-
-invcont1857:		; preds = %bb.i3475, %tmp15.i3472.noexc
-	%tmp1860 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERd( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, double* null )
-			to label %invcont1859 unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=1]
-
-invcont1859:		; preds = %invcont1857
-	%tmp1862 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERd( %"struct.std::basic_istream<char,std::char_traits<char> >"* %tmp1860, double* null )
-			to label %invcont1861 unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=1]
-
-invcont1861:		; preds = %invcont1859
-	%tmp1864 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERd( %"struct.std::basic_istream<char,std::char_traits<char> >"* %tmp1862, double* null )
-			to label %invcont1863 unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=1]
-
-invcont1863:		; preds = %invcont1861
-	%tmp1866 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERd( %"struct.std::basic_istream<char,std::char_traits<char> >"* %tmp1864, double* null )
-			to label %invcont1865 unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=1]
-
-invcont1865:		; preds = %invcont1863
-	%tmp1868 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERd( %"struct.std::basic_istream<char,std::char_traits<char> >"* %tmp1866, double* null )
-			to label %invcont1867 unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-invcont1867:		; preds = %invcont1865
-	%tmp1881 = invoke i8 @_ZNKSt9basic_iosIcSt11char_traitsIcEE4goodEv( %"struct.std::basic_ios<char,std::char_traits<char> >"* null ) zeroext 
-			to label %invcont1880 unwind label %lpad3845		; <i8> [#uses=0]
-
-invcont1880:		; preds = %invcont1867
-	%tmp1883 = invoke i8* @_Znwm( i32 24 )
-			to label %invcont1882 unwind label %lpad3845		; <i8*> [#uses=0]
-
-invcont1882:		; preds = %invcont1880
-	invoke fastcc void @_ZN7mrScene9AddObjectEP9mrSurfaceRK8ggStringS4_i( %struct.mrScene* %this, %struct.ggBRDF* null, %struct.ggString* null, %struct.ggString* null, i32 0 )
-			to label %bb3743 unwind label %lpad3845
-
-bb1897:		; preds = %bb1847
-	br i1 false, label %bb1905, label %bb1947
-
-bb1905:		; preds = %bb1897
-	ret void
-
-bb1947:		; preds = %bb1897
-	br i1 false, label %bb1955, label %bb2000
-
-bb1955:		; preds = %bb1947
-	ret void
-
-bb2000:		; preds = %bb1947
-	br i1 false, label %bb2008, label %bb2053
-
-bb2008:		; preds = %bb2000
-	ret void
-
-bb2053:		; preds = %bb2000
-	br i1 false, label %bb2061, label %bb2106
-
-bb2061:		; preds = %bb2053
-	%tmp3.i32433257 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZStrsIcSt11char_traitsIcEERSt13basic_istreamIT_T0_ES6_PS3_( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, i8* null )
-			to label %tmp3.i3243.noexc unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-tmp3.i3243.noexc:		; preds = %bb2061
-	%tmp15.i32523258 = invoke i8* @_ZNKSt9basic_iosIcSt11char_traitsIcEEcvPvEv( %"struct.std::basic_ios<char,std::char_traits<char> >"* null )
-			to label %bb.i3255 unwind label %lpad3845		; <i8*> [#uses=0]
-
-bb.i3255:		; preds = %tmp3.i3243.noexc
-	invoke fastcc void @_ZN8ggStringaSEPKc( %struct.ggString* null, i8* null )
-			to label %invcont2063 unwind label %lpad3845
-
-invcont2063:		; preds = %bb.i3255
-	ret void
-
-bb2106:		; preds = %bb2053
-	%tmp7.i3214 = call i32 @strcmp( i8* %tmp5.i161, i8* getelementptr ([7 x i8]* @.str80, i32 0, i32 0) ) nounwind readonly 		; <i32> [#uses=0]
-	br i1 false, label %bb2114, label %bb2136
-
-bb2114:		; preds = %bb2106
-	%tmp3.i31923206 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZStrsIcSt11char_traitsIcEERSt13basic_istreamIT_T0_ES6_PS3_( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, i8* null )
-			to label %tmp3.i3192.noexc unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-tmp3.i3192.noexc:		; preds = %bb2114
-	%tmp15.i32013207 = invoke i8* @_ZNKSt9basic_iosIcSt11char_traitsIcEEcvPvEv( %"struct.std::basic_ios<char,std::char_traits<char> >"* null )
-			to label %tmp15.i3201.noexc unwind label %lpad3845		; <i8*> [#uses=0]
-
-tmp15.i3201.noexc:		; preds = %tmp3.i3192.noexc
-	br i1 false, label %invcont2116, label %bb.i3204
-
-bb.i3204:		; preds = %tmp15.i3201.noexc
-	ret void
-
-invcont2116:		; preds = %tmp15.i3201.noexc
-	%tmp3.i31713185 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZStrsIcSt11char_traitsIcEERSt13basic_istreamIT_T0_ES6_PS3_( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, i8* null )
-			to label %tmp3.i3171.noexc unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-tmp3.i3171.noexc:		; preds = %invcont2116
-	%tmp15.i31803186 = invoke i8* @_ZNKSt9basic_iosIcSt11char_traitsIcEEcvPvEv( %"struct.std::basic_ios<char,std::char_traits<char> >"* null )
-			to label %tmp15.i3180.noexc unwind label %lpad3845		; <i8*> [#uses=0]
-
-tmp15.i3180.noexc:		; preds = %tmp3.i3171.noexc
-	br i1 false, label %invcont2118, label %bb.i3183
-
-bb.i3183:		; preds = %tmp15.i3180.noexc
-	ret void
-
-invcont2118:		; preds = %tmp15.i3180.noexc
-	%tmp8.i31623165 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERd( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, double* null )
-			to label %tmp8.i3162.noexc unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=1]
-
-tmp8.i3162.noexc:		; preds = %invcont2118
-	%tmp9.i31633166 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERd( %"struct.std::basic_istream<char,std::char_traits<char> >"* %tmp8.i31623165, double* null )
-			to label %tmp9.i3163.noexc unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=1]
-
-tmp9.i3163.noexc:		; preds = %tmp8.i3162.noexc
-	%tmp10.i31643167 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERd( %"struct.std::basic_istream<char,std::char_traits<char> >"* %tmp9.i31633166, double* null )
-			to label %invcont2120 unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-invcont2120:		; preds = %tmp9.i3163.noexc
-	%tmp2123 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERd( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, double* null )
-			to label %invcont2122 unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-invcont2122:		; preds = %invcont2120
-	%tmp2125 = invoke i8* @_Znwm( i32 36 )
-			to label %invcont2124 unwind label %lpad3845		; <i8*> [#uses=0]
-
-invcont2124:		; preds = %invcont2122
-	invoke fastcc void @_ZN7mrScene9AddObjectEP9mrSurfaceRK8ggStringS4_i( %struct.mrScene* %this, %struct.ggBRDF* null, %struct.ggString* null, %struct.ggString* null, i32 0 )
-			to label %bb3743 unwind label %lpad3845
-
-bb2136:		; preds = %bb2106
-	%tmp7.i3128 = call i32 @strcmp( i8* %tmp5.i161, i8* getelementptr ([11 x i8]* @.str81, i32 0, i32 0) ) nounwind readonly 		; <i32> [#uses=0]
-	br i1 false, label %bb2144, label %bb3336
-
-bb2144:		; preds = %bb2136
-	%tmp6.i.i31173123 = invoke i8* @_Znam( i32 12 )
-			to label %_ZN8ggStringC1Ei.exit3124 unwind label %lpad3845		; <i8*> [#uses=0]
-
-_ZN8ggStringC1Ei.exit3124:		; preds = %bb2144
-	%tmp3.i30983112 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZStrsIcSt11char_traitsIcEERSt13basic_istreamIT_T0_ES6_PS3_( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, i8* null )
-			to label %tmp3.i3098.noexc unwind label %lpad3921		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-tmp3.i3098.noexc:		; preds = %_ZN8ggStringC1Ei.exit3124
-	%tmp15.i31073113 = invoke i8* @_ZNKSt9basic_iosIcSt11char_traitsIcEEcvPvEv( %"struct.std::basic_ios<char,std::char_traits<char> >"* null )
-			to label %tmp15.i3107.noexc unwind label %lpad3921		; <i8*> [#uses=0]
-
-tmp15.i3107.noexc:		; preds = %tmp3.i3098.noexc
-	br i1 false, label %invcont2147, label %bb.i3110
-
-bb.i3110:		; preds = %tmp15.i3107.noexc
-	ret void
-
-invcont2147:		; preds = %tmp15.i3107.noexc
-	%tmp2161 = invoke i8 @_ZNKSt9basic_iosIcSt11char_traitsIcEE4goodEv( %"struct.std::basic_ios<char,std::char_traits<char> >"* null ) zeroext 
-			to label %invcont2160 unwind label %lpad3921		; <i8> [#uses=0]
-
-invcont2160:		; preds = %invcont2147
-	%tmp4.i30933094 = invoke fastcc %struct.ggSpectrum* @_ZN5ggBSTI10ggSpectrumE4findERK8ggString3( %"struct.ggBSTNode<ggSpectrum>"* null, %struct.ggString* null )
-			to label %invcont2164 unwind label %lpad3921		; <%struct.ggSpectrum*> [#uses=0]
-
-invcont2164:		; preds = %invcont2160
-	br i1 false, label %bb2170, label %bb2181
-
-bb2170:		; preds = %invcont2164
-	ret void
-
-bb2181:		; preds = %invcont2164
-	invoke fastcc void @_ZN8ggStringD1Ev( %struct.ggString* null )
-			to label %bb3743 unwind label %lpad3845
-
-bb3336:		; preds = %bb2136
-	br i1 false, label %bb3344, label %bb3734
-
-bb3344:		; preds = %bb3336
-	%tmp6.i.i773779 = invoke i8* @_Znam( i32 12 )
-			to label %_ZN8ggStringC1Ei.exit780 unwind label %lpad3845		; <i8*> [#uses=0]
-
-_ZN8ggStringC1Ei.exit780:		; preds = %bb3344
-	%tmp6.i.i765771 = invoke i8* @_Znam( i32 12 )
-			to label %_ZN8ggStringC1Ei.exit772 unwind label %lpad4025		; <i8*> [#uses=0]
-
-_ZN8ggStringC1Ei.exit772:		; preds = %_ZN8ggStringC1Ei.exit780
-	%tmp3.i746760 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZStrsIcSt11char_traitsIcEERSt13basic_istreamIT_T0_ES6_PS3_( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, i8* null )
-			to label %tmp3.i746.noexc unwind label %lpad4029		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-tmp3.i746.noexc:		; preds = %_ZN8ggStringC1Ei.exit772
-	%tmp15.i755761 = invoke i8* @_ZNKSt9basic_iosIcSt11char_traitsIcEEcvPvEv( %"struct.std::basic_ios<char,std::char_traits<char> >"* null )
-			to label %tmp15.i755.noexc unwind label %lpad4029		; <i8*> [#uses=0]
-
-tmp15.i755.noexc:		; preds = %tmp3.i746.noexc
-	br i1 false, label %invcont3348, label %bb.i758
-
-bb.i758:		; preds = %tmp15.i755.noexc
-	ret void
-
-invcont3348:		; preds = %tmp15.i755.noexc
-	%tmp3.i726740 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZStrsIcSt11char_traitsIcEERSt13basic_istreamIT_T0_ES6_PS3_( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, i8* null )
-			to label %tmp3.i726.noexc unwind label %lpad4029		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-tmp3.i726.noexc:		; preds = %invcont3348
-	%tmp15.i735741 = invoke i8* @_ZNKSt9basic_iosIcSt11char_traitsIcEEcvPvEv( %"struct.std::basic_ios<char,std::char_traits<char> >"* null )
-			to label %tmp15.i735.noexc unwind label %lpad4029		; <i8*> [#uses=0]
-
-tmp15.i735.noexc:		; preds = %tmp3.i726.noexc
-	br i1 false, label %bb3458, label %bb.i738
-
-bb.i738:		; preds = %tmp15.i735.noexc
-	ret void
-
-bb3458:		; preds = %tmp15.i735.noexc
-	br i1 false, label %bb3466, label %bb3491
-
-bb3466:		; preds = %bb3458
-	%tmp3469 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERd( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, double* null )
-			to label %invcont3468 unwind label %lpad4029		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=1]
-
-invcont3468:		; preds = %bb3466
-	%tmp3471 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERd( %"struct.std::basic_istream<char,std::char_traits<char> >"* %tmp3469, double* null )
-			to label %invcont3470 unwind label %lpad4029		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=1]
-
-invcont3470:		; preds = %invcont3468
-	%tmp3473 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERi( %"struct.std::basic_istream<char,std::char_traits<char> >"* %tmp3471, i32* null )
-			to label %invcont3472 unwind label %lpad4029		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-invcont3472:		; preds = %invcont3470
-	%tmp3475 = invoke i8* @_Znwm( i32 7196 )
-			to label %invcont3474 unwind label %lpad4029		; <i8*> [#uses=1]
-
-invcont3474:		; preds = %invcont3472
-	invoke fastcc void @_ZN13ggSolidNoise3C1Ev( %struct.ggSolidNoise3* null )
-			to label %_ZN22ggCoverageSolidTextureC1Eddi.exit unwind label %lpad4045
-
-_ZN22ggCoverageSolidTextureC1Eddi.exit:		; preds = %invcont3474
-	%tmp34823483 = bitcast i8* %tmp3475 to %struct.ggBRDF*		; <%struct.ggBRDF*> [#uses=2]
-	invoke fastcc void @_ZN5ggBSTI14ggSolidTextureE17InsertIntoSubtreeERK8ggStringPS0_RP9ggBSTNodeIS0_E( %"struct.ggBST<ggSolidTexture>"* null, %struct.ggString* null, %struct.ggBRDF* %tmp34823483, %"struct.ggBSTNode<ggSolidTexture>"** null )
-			to label %bb3662 unwind label %lpad4029
-
-bb3491:		; preds = %bb3458
-	ret void
-
-bb3662:		; preds = %_ZN22ggCoverageSolidTextureC1Eddi.exit
-	invoke fastcc void @_ZN8ggStringD1Ev( %struct.ggString* null )
-			to label %invcont3663 unwind label %lpad4025
-
-invcont3663:		; preds = %bb3662
-	invoke fastcc void @_ZN8ggStringD1Ev( %struct.ggString* null )
-			to label %bb3743 unwind label %lpad3845
-
-bb3734:		; preds = %bb3336
-	ret void
-
-bb3743:		; preds = %invcont3663, %bb2181, %invcont2124, %invcont1882, %invcont1642, %bb368, %bb353, %invcont335, %bb3743.preheader
-	%tex1.3 = phi %struct.ggBRDF* [ undef, %bb3743.preheader ], [ %tex1.3, %bb368 ], [ %tex1.3, %invcont1642 ], [ %tex1.3, %invcont1882 ], [ %tex1.3, %invcont2124 ], [ %tex1.3, %bb2181 ], [ %tex1.3, %invcont335 ], [ %tmp34823483, %invcont3663 ], [ %tex1.3, %bb353 ]		; <%struct.ggBRDF*> [#uses=7]
-	%tmp3.i312325 = invoke %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZStrsIcSt11char_traitsIcEERSt13basic_istreamIT_T0_ES6_PS3_( %"struct.std::basic_istream<char,std::char_traits<char> >"* %surfaces, i8* null )
-			to label %tmp3.i312.noexc unwind label %lpad3845		; <%"struct.std::basic_istream<char,std::char_traits<char> >"*> [#uses=0]
-
-tmp3.i312.noexc:		; preds = %bb3743
-	%tmp15.i327 = invoke i8* @_ZNKSt9basic_iosIcSt11char_traitsIcEEcvPvEv( %"struct.std::basic_ios<char,std::char_traits<char> >"* null )
-			to label %tmp15.i.noexc326 unwind label %lpad3845		; <i8*> [#uses=0]
-
-tmp15.i.noexc326:		; preds = %tmp3.i312.noexc
-	br i1 false, label %invcont3745, label %bb.i323
-
-bb.i323:		; preds = %tmp15.i.noexc326
-	ret void
-
-invcont3745:		; preds = %tmp15.i.noexc326
-	%tmp3759 = invoke i8* @_ZNKSt9basic_iosIcSt11char_traitsIcEEcvPvEv( %"struct.std::basic_ios<char,std::char_traits<char> >"* null )
-			to label %invcont3758 unwind label %lpad3845		; <i8*> [#uses=0]
-
-invcont3758:		; preds = %invcont3745
-	%tmp5.i161 = getelementptr %"struct.ggString::StringRep"* null, i32 0, i32 2, i32 0		; <i8*> [#uses=2]
-	br i1 false, label %bb333, label %bb345
-
-lpad:		; preds = %entry
-	ret void
-
-lpad3825:		; preds = %_ZN8ggStringC1Ei.exit
-	ret void
-
-lpad3829:		; preds = %_ZN8ggStringC1Ei.exit96
-	ret void
-
-lpad3833:		; preds = %_ZN8ggStringC1Ei.exit104
-	ret void
-
-lpad3837:		; preds = %_ZN8ggStringC1Ei.exit112
-	ret void
-
-lpad3841:		; preds = %_ZN8ggStringC1Ei.exit129
-	ret void
-
-lpad3845:		; preds = %invcont3745, %tmp3.i312.noexc, %bb3743, %invcont3663, %bb3344, %bb2181, %bb2144, %invcont2124, %invcont2122, %invcont2120, %tmp9.i3163.noexc, %tmp8.i3162.noexc, %invcont2118, %tmp3.i3171.noexc, %invcont2116, %tmp3.i3192.noexc, %bb2114, %bb.i3255, %tmp3.i3243.noexc, %bb2061, %invcont1882, %invcont1880, %invcont1867, %invcont1865, %invcont1863, %invcont1861, %invcont1859, %invcont1857, %bb.i3475, %tmp3.i3463.noexc, %bb1855, %bb1701, %tmp3.i3714.noexc, %bb1662, %invcont1642, %invcont1640, %tmp9.i3799.noexc, %tmp8.i3798.noexc, %invcont1638, %tmp9.i3809.noexc, %tmp8.i3808.noexc, %invcont1636, %tmp3.i3817.noexc, %invcont1634, %tmp3.i3838.noexc, %bb1632, %bb368, %bb353, %tmp3.i167.noexc, %bb333, %tmp3.i.noexc, %_ZN13mrSurfaceListC1Ev.exit, %_ZN8ggStringC1Ei.exit139
-	ret void
-
-lpad3849:		; preds = %invcont294
-	ret void
-
-lpad3921:		; preds = %invcont2160, %invcont2147, %tmp3.i3098.noexc, %_ZN8ggStringC1Ei.exit3124
-	ret void
-
-lpad4025:		; preds = %bb3662, %_ZN8ggStringC1Ei.exit780
-	ret void
-
-lpad4029:		; preds = %_ZN22ggCoverageSolidTextureC1Eddi.exit, %invcont3472, %invcont3470, %invcont3468, %bb3466, %tmp3.i726.noexc, %invcont3348, %tmp3.i746.noexc, %_ZN8ggStringC1Ei.exit772
-	ret void
-
-lpad4045:		; preds = %invcont3474
-	ret void
-}
-
-declare fastcc void @_ZN8ggStringD1Ev(%struct.ggString*)
-
-declare i8* @_Znam(i32)
-
-declare fastcc void @_ZN8ggStringaSEPKc(%struct.ggString*, i8*)
-
-declare i32 @strcmp(i8*, i8*) nounwind readonly 
-
-declare %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERi(%"struct.std::basic_istream<char,std::char_traits<char> >"*, i32*)
-
-declare i8* @_Znwm(i32)
-
-declare i8* @_ZNKSt9basic_iosIcSt11char_traitsIcEEcvPvEv(%"struct.std::basic_ios<char,std::char_traits<char> >"*)
-
-declare %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZNSirsERd(%"struct.std::basic_istream<char,std::char_traits<char> >"*, double*)
-
-declare %"struct.std::basic_istream<char,std::char_traits<char> >"* @_ZStrsIcSt11char_traitsIcEERSt13basic_istreamIT_T0_ES6_PS3_(%"struct.std::basic_istream<char,std::char_traits<char> >"*, i8*)
-
-declare fastcc void @_ZN13ggSolidNoise3C1Ev(%struct.ggSolidNoise3*)
-
-declare i8 @_ZNKSt9basic_iosIcSt11char_traitsIcEE4goodEv(%"struct.std::basic_ios<char,std::char_traits<char> >"*) zeroext 
-
-declare fastcc %struct.ggSpectrum* @_ZN5ggBSTI10ggSpectrumE4findERK8ggString3(%"struct.ggBSTNode<ggSpectrum>"*, %struct.ggString*)
-
-declare fastcc void @_ZN5ggBSTI14ggSolidTextureE17InsertIntoSubtreeERK8ggStringPS0_RP9ggBSTNodeIS0_E(%"struct.ggBST<ggSolidTexture>"*, %struct.ggString*, %struct.ggBRDF*, %"struct.ggBSTNode<ggSolidTexture>"**)
-
-declare fastcc void @_ZN7mrScene9AddObjectEP9mrSurfaceRK8ggStringS4_i(%struct.mrScene*, %struct.ggBRDF*, %struct.ggString*, %struct.ggString*, i32)
diff --git a/test/CodeGen/X86/2008-02-25-X86-64-CoalescerBug.ll b/test/CodeGen/X86/2008-02-25-X86-64-CoalescerBug.ll
index 6615b8c..fd9c35e 100644
--- a/test/CodeGen/X86/2008-02-25-X86-64-CoalescerBug.ll
+++ b/test/CodeGen/X86/2008-02-25-X86-64-CoalescerBug.ll
@@ -4,7 +4,7 @@
 	%struct.YY = type { i64 }
 	%struct.ZZ = type opaque
 
-define i8 @f(%struct.XX*** %fontMap, %struct.XX* %uen) signext  {
+define signext i8 @f(%struct.XX*** %fontMap, %struct.XX* %uen)   {
 entry:
 	%tmp45 = add i16 0, 1		; <i16> [#uses=2]
 	br i1 false, label %bb124, label %bb53
diff --git a/test/CodeGen/X86/2008-03-13-TwoAddrPassCrash.ll b/test/CodeGen/X86/2008-03-13-TwoAddrPassCrash.ll
index c6ba22e..19d49b2 100644
--- a/test/CodeGen/X86/2008-03-13-TwoAddrPassCrash.ll
+++ b/test/CodeGen/X86/2008-03-13-TwoAddrPassCrash.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=x86
 
-define i16 @t(i32 %depth) signext nounwind  {
+define signext i16 @t(i32 %depth)  nounwind  {
 entry:
 	br i1 false, label %bb74, label %bb
 bb:		; preds = %entry
diff --git a/test/CodeGen/X86/2008-04-02-unnamedEH.ll b/test/CodeGen/X86/2008-04-02-unnamedEH.ll
index 27bbbaa..ab8ec80 100644
--- a/test/CodeGen/X86/2008-04-02-unnamedEH.ll
+++ b/test/CodeGen/X86/2008-04-02-unnamedEH.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc < %s -disable-cfi | FileCheck %s
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin8"
 
diff --git a/test/CodeGen/X86/2008-04-16-ReMatBug.ll b/test/CodeGen/X86/2008-04-16-ReMatBug.ll
index bfe8ef5..109069e 100644
--- a/test/CodeGen/X86/2008-04-16-ReMatBug.ll
+++ b/test/CodeGen/X86/2008-04-16-ReMatBug.ll
@@ -6,7 +6,7 @@
 	%struct.pthread_mutex_t = type { i32, [40 x i8] }
 @iodbcdm_global_lock = external global %struct.pthread_mutex_t		; <%struct.pthread_mutex_t*> [#uses=1]
 
-define i16 @SQLDriversW(i8* %henv, i16 zeroext  %fDir, i32* %szDrvDesc, i16 signext  %cbDrvDescMax, i16* %pcbDrvDesc, i32* %szDrvAttr, i16 signext  %cbDrvAttrMax, i16* %pcbDrvAttr) signext nounwind  {
+define i16 @SQLDriversW(i8* %henv, i16 zeroext  %fDir, i32* %szDrvDesc, i16 signext  %cbDrvDescMax, i16* %pcbDrvDesc, i32* %szDrvAttr, i16 signext  %cbDrvAttrMax, i16* %pcbDrvAttr) nounwind  {
 entry:
 	%tmp12 = bitcast i8* %henv to %struct.GENV_t*		; <%struct.GENV_t*> [#uses=1]
 	br i1 true, label %bb28, label %bb
@@ -23,7 +23,7 @@ bb74:		; preds = %bb37
 bb92:		; preds = %bb74, %bb37
 	%tmp95180 = shl i16 %cbDrvAttrMax, 2		; <i16> [#uses=1]
 	%tmp100178 = shl i16 %cbDrvDescMax, 2		; <i16> [#uses=1]
-	%tmp113 = tail call i16 @SQLDrivers_Internal( i8* %henv, i16 zeroext  %fDir, i8* null, i16 signext  %tmp100178, i16* %pcbDrvDesc, i8* null, i16 signext  %tmp95180, i16* %pcbDrvAttr, i8 zeroext  87 ) signext nounwind 		; <i16> [#uses=1]
+	%tmp113 = tail call i16 @SQLDrivers_Internal( i8* %henv, i16 zeroext  %fDir, i8* null, i16 signext  %tmp100178, i16* %pcbDrvDesc, i8* null, i16 signext  %tmp95180, i16* %pcbDrvAttr, i8 zeroext  87 )  nounwind 		; <i16> [#uses=1]
 	br i1 false, label %done, label %bb137
 bb137:		; preds = %bb92
 	ret i16 0
@@ -41,6 +41,6 @@ bb167:		; preds = %done
 
 declare i32 @pthread_mutex_unlock(%struct.pthread_mutex_t*)
 
-declare i16 @SQLDrivers_Internal(i8*, i16 zeroext , i8*, i16 signext , i16*, i8*, i16 signext , i16*, i8 zeroext ) signext nounwind 
+declare i16 @SQLDrivers_Internal(i8*, i16 zeroext , i8*, i16 signext , i16*, i8*, i16 signext , i16*, i8 zeroext )  nounwind 
 
 declare void @trace_SQLDriversW(i32, i32, i8*, i16 zeroext , i32*, i16 signext , i16*, i32*, i16 signext , i16*)
diff --git a/test/CodeGen/X86/2008-04-17-CoalescerBug.ll b/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
index ac48285..77720aa 100644
--- a/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
+++ b/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
@@ -75,7 +75,7 @@ bb5334:		; preds = %bb3314
 bb5484:		; preds = %bb3314
 	ret void
 bb5657:		; preds = %bb3314
-	%tmp5661 = invoke i16 @_ZNK10wxDateTime12GetDayOfYearERKNS_8TimeZoneE( %struct.wxDateTime* %this, %"struct.wxDateTime::TimeZone"* %tz ) zeroext 
+	%tmp5661 = invoke zeroext i16 @_ZNK10wxDateTime12GetDayOfYearERKNS_8TimeZoneE( %struct.wxDateTime* %this, %"struct.wxDateTime::TimeZone"* %tz )  
 			to label %invcont5660 unwind label %lpad		; <i16> [#uses=0]
 invcont5660:		; preds = %bb5657
 	ret void
@@ -120,7 +120,7 @@ invcont5814:		; preds = %bb448.i8694, %bb265.i8606
 	invoke void (%struct.wxString*, i32*, ...)* @_ZN8wxString6FormatEPKwz( %struct.wxString* noalias sret  null, i32* null, i32 %tmp58165817 )
 			to label %invcont5831 unwind label %lpad
 invcont5831:		; preds = %invcont5814
-	%tmp5862 = invoke i8 @_ZN12wxStringBase10ConcatSelfEmPKwm( %struct.wxStringBase* null, i32 0, i32* null, i32 0 ) zeroext 
+	%tmp5862 = invoke zeroext  i8 @_ZN12wxStringBase10ConcatSelfEmPKwm( %struct.wxStringBase* null, i32 0, i32* null, i32 0 ) 
 			to label %bb7834 unwind label %lpad8185		; <i8> [#uses=0]
 bb5968:		; preds = %bb3314
 	invoke void (%struct.wxString*, i32*, ...)* @_ZN8wxString6FormatEPKwz( %struct.wxString* noalias sret  null, i32* null, i32 0 )
@@ -158,11 +158,11 @@ lpad8185:		; preds = %invcont5831
 
 declare void @_Z10wxOnAssertPKwiPKcS0_S0_(i32*, i32, i8*, i32*, i32*)
 
-declare i8 @_ZN12wxStringBase10ConcatSelfEmPKwm(%struct.wxStringBase*, i32, i32*, i32) zeroext 
+declare zeroext  i8 @_ZN12wxStringBase10ConcatSelfEmPKwm(%struct.wxStringBase*, i32, i32*, i32) 
 
 declare %struct.tm* @gmtime_r(i32*, %struct.tm*)
 
-declare i16 @_ZNK10wxDateTime12GetDayOfYearERKNS_8TimeZoneE(%struct.wxDateTime*, %"struct.wxDateTime::TimeZone"*) zeroext 
+declare zeroext  i16 @_ZNK10wxDateTime12GetDayOfYearERKNS_8TimeZoneE(%struct.wxDateTime*, %"struct.wxDateTime::TimeZone"*) 
 
 declare %struct.wxStringBase* @_ZN12wxStringBase6appendEmw(%struct.wxStringBase*, i32, i32)
 
diff --git a/test/CodeGen/X86/2008-07-11-SpillerBug.ll b/test/CodeGen/X86/2008-07-11-SpillerBug.ll
deleted file mode 100644
index dee7415..0000000
--- a/test/CodeGen/X86/2008-07-11-SpillerBug.ll
+++ /dev/null
@@ -1,52 +0,0 @@
-; RUN: llc < %s -march=x86 -relocation-model=static -disable-fp-elim -post-RA-scheduler=false -asm-verbose=0 | FileCheck %s
-; PR2536
-
-; CHECK: andl    $65534, %
-; CHECK-NEXT: movl %
-; CHECK-NEXT: movzwl
-
-@g_5 = external global i16		; <i16*> [#uses=2]
-@g_107 = external global i16		; <i16*> [#uses=1]
-@g_229 = external global i32		; <i32*> [#uses=1]
-@g_227 = external global i16		; <i16*> [#uses=1]
-
-define i32 @func_54(i32 %p_55, i16 zeroext  %p_56) nounwind  {
-entry:
-	load i16* @g_5, align 2		; <i16>:0 [#uses=1]
-	zext i16 %0 to i32		; <i32>:1 [#uses=1]
-	%.mask = and i32 %1, 65534		; <i32> [#uses=1]
-	icmp eq i32 %.mask, 0		; <i1>:2 [#uses=1]
-	load i32* @g_229, align 4		; <i32>:3 [#uses=1]
-	load i16* @g_227, align 2		; <i16>:4 [#uses=1]
-	icmp eq i16 %4, 0		; <i1>:5 [#uses=1]
-	load i16* @g_5, align 2		; <i16>:6 [#uses=1]
-	br label %bb
-
-bb:		; preds = %bb7.preheader, %entry
-	%indvar4 = phi i32 [ 0, %entry ], [ %indvar.next5, %bb7.preheader ]		; <i32> [#uses=1]
-	%p_56_addr.1.reg2mem.0 = phi i16 [ %p_56, %entry ], [ %p_56_addr.0, %bb7.preheader ]		; <i16> [#uses=2]
-	br i1 %2, label %bb7.preheader, label %bb5
-
-bb5:		; preds = %bb
-	store i16 %6, i16* @g_107, align 2
-	br label %bb7.preheader
-
-bb7.preheader:		; preds = %bb5, %bb
-	icmp eq i16 %p_56_addr.1.reg2mem.0, 0		; <i1>:7 [#uses=1]
-	%.0 = select i1 %7, i32 1, i32 %3		; <i32> [#uses=1]
-	urem i32 1, %.0		; <i32>:8 [#uses=1]
-	icmp eq i32 %8, 0		; <i1>:9 [#uses=1]
-	%.not = xor i1 %9, true		; <i1> [#uses=1]
-	%.not1 = xor i1 %5, true		; <i1> [#uses=1]
-	%brmerge = or i1 %.not, %.not1		; <i1> [#uses=1]
-	%iftmp.6.0 = select i1 %brmerge, i32 3, i32 0		; <i32> [#uses=1]
-	mul i32 %iftmp.6.0, %3		; <i32>:10 [#uses=1]
-	icmp eq i32 %10, 0		; <i1>:11 [#uses=1]
-	%p_56_addr.0 = select i1 %11, i16 %p_56_addr.1.reg2mem.0, i16 1		; <i16> [#uses=1]
-	%indvar.next5 = add i32 %indvar4, 1		; <i32> [#uses=2]
-	%exitcond6 = icmp eq i32 %indvar.next5, 17		; <i1> [#uses=1]
-	br i1 %exitcond6, label %bb25, label %bb
-
-bb25:		; preds = %bb7.preheader
-	ret i32 1
-}
diff --git a/test/CodeGen/X86/2008-08-23-X86-64AsmBug.ll b/test/CodeGen/X86/2008-08-23-X86-64AsmBug.ll
deleted file mode 100644
index ce9e389..0000000
--- a/test/CodeGen/X86/2008-08-23-X86-64AsmBug.ll
+++ /dev/null
@@ -1,59 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | grep movd | count 1
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | grep movq
-; PR2677
-
-
-	%struct.Bigint = type { %struct.Bigint*, i32, i32, i32, i32, [1 x i32] }
-
-define double @_Z7qstrtodPKcPS0_Pb(i8* %s00, i8** %se, i8* %ok) nounwind {
-entry:
-	br label %bb163
-
-bb151:		; preds = %entry
-	br label %bb163
-
-bb163:		; preds = %bb151, %entry
-	%tmp366 = load double* null, align 8		; <double> [#uses=1]
-	%tmp368 = fmul double %tmp366, 0.000000e+00		; <double> [#uses=1]
-	%tmp368226 = bitcast double %tmp368 to i64		; <i64> [#uses=1]
-	br label %bb5.i
-
-bb5.i:		; preds = %bb5.i57.i, %bb163
-	%b.0.i = phi %struct.Bigint* [ null, %bb163 ]		; <%struct.Bigint*> [#uses=1]
-	%tmp3.i7.i728 = load i32* null, align 4		; <i32> [#uses=1]
-	br label %bb.i27.i
-
-bb.i27.i:		; preds = %bb.i27.i, %bb5.i
-	%tmp23.i20.i = lshr i32 0, 16		; <i32> [#uses=1]
-	br label %bb5.i57.i
-
-bb5.i57.i:		; preds = %bb.i27.i
-	%tmp50.i35.i = load i32* null, align 4		; <i32> [#uses=1]
-	%tmp51.i36.i = add i32 %tmp50.i35.i, 1		; <i32> [#uses=2]
-	%tmp2.i.i37.i = shl i32 1, %tmp51.i36.i		; <i32> [#uses=2]
-	%tmp4.i.i38.i = shl i32 %tmp2.i.i37.i, 2		; <i32> [#uses=1]
-	%tmp7.i.i39.i = add i32 %tmp4.i.i38.i, 28		; <i32> [#uses=1]
-	%tmp8.i.i40.i = malloc i8, i32 %tmp7.i.i39.i		; <i8*> [#uses=1]
-	%tmp9.i.i41.i = bitcast i8* %tmp8.i.i40.i to %struct.Bigint*		; <%struct.Bigint*> [#uses=2]
-	store i32 %tmp51.i36.i, i32* null, align 8
-	store i32 %tmp2.i.i37.i, i32* null, align 4
-	free %struct.Bigint* %b.0.i
-	store i32 %tmp23.i20.i, i32* null, align 4
-	%tmp74.i61.i = add i32 %tmp3.i7.i728, 1		; <i32> [#uses=1]
-	store i32 %tmp74.i61.i, i32* null, align 4
-	br label %bb7.i
-
-bb7.i:		; preds = %bb5.i57.i
-	%tmp514 = load i32* null, align 4		; <i32> [#uses=1]
-	%tmp515 = sext i32 %tmp514 to i64		; <i64> [#uses=1]
-	%tmp516 = shl i64 %tmp515, 2		; <i64> [#uses=1]
-	%tmp517 = add i64 %tmp516, 8		; <i64> [#uses=1]
-	%tmp519 = getelementptr %struct.Bigint* %tmp9.i.i41.i, i32 0, i32 3		; <i32*> [#uses=1]
-	%tmp523 = bitcast i32* %tmp519 to i8*		; <i8*> [#uses=1]
-	call void @llvm.memcpy.i64( i8* null, i8* %tmp523, i64 %tmp517, i32 1 )
-	%tmp524136 = bitcast i64 %tmp368226 to double		; <double> [#uses=1]
-	store double %tmp524136, double* null
-	unreachable
-}
-
-declare void @llvm.memcpy.i64(i8*, i8*, i64, i32) nounwind
diff --git a/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll b/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll
index b92c789..1d27fc5 100644
--- a/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll
+++ b/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll
@@ -1,5 +1,5 @@
 ; Check that eh_return & unwind_init were properly lowered
-; RUN: llc < %s | grep %ebp | count 7
+; RUN: llc < %s | grep %ebp | count 9
 ; RUN: llc < %s | grep %ecx | count 5
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll b/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll
index 00ab735..d423bfc 100644
--- a/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll
+++ b/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll
@@ -1,5 +1,5 @@
 ; Check that eh_return & unwind_init were properly lowered
-; RUN: llc < %s | grep %rbp | count 5
+; RUN: llc < %s | grep %rbp | count 7
 ; RUN: llc < %s | grep %rcx | count 3
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
diff --git a/test/CodeGen/X86/2008-09-18-inline-asm-2.ll b/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
index 947a1f1..dfd165c 100644
--- a/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
+++ b/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
@@ -1,12 +1,32 @@
-; RUN: llc < %s -march=x86 -regalloc=linearscan | grep "#%ebp %edi %ebx 8(%esi) %eax %dl"
-; RUN: llc < %s -march=x86 -regalloc=fast       | grep "#%ebx %esi %edi 8(%ebp) %eax %dl"
-; RUN: llc < %s -march=x86 -regalloc=basic      | grep "#%ebp %esi %edx 8(%edi) %eax %bl"
-; RUN: llc < %s -march=x86 -regalloc=greedy     | grep "#%edx %edi %ebp 8(%esi) %eax %bl"
+; RUN: llc < %s -march=x86 -regalloc=linearscan | FileCheck %s
+; RUN: llc < %s -march=x86 -regalloc=fast       | FileCheck %s
+; RUN: llc < %s -march=x86 -regalloc=basic      | FileCheck %s
+; RUN: llc < %s -march=x86 -regalloc=greedy     | FileCheck %s
 
-; The 1st, 2nd, 3rd and 5th registers above must all be different.  The registers
+; The 1st, 2nd, 3rd and 5th registers must all be different.  The registers
 ; referenced in the 4th and 6th operands must not be the same as the 1st or 5th
-; operand.  There are many combinations that work; this is what llc puts out now.
-; ModuleID = '<stdin>'
+; operand.
+;
+; CHECK: 1st=[[A1:%...]]
+; CHECK-NOT: [[A1]]
+; CHECK: 2nd=[[A2:%...]]
+; CHECK-NOT: [[A1]]
+; CHECK-NOT: [[A2]]
+; CHECK: 3rd=[[A3:%...]]
+; CHECK-NOT: [[A1]]
+; CHECK-NOT: [[A2]]
+; CHECK-NOT: [[A3]]
+; CHECK: 5th=[[A5:%...]]
+; CHECK-NOT: [[A1]]
+; CHECK-NOT; [[A5]]
+; CHECK: =4th
+
+; The 6th operand is an 8-bit register, and it mustn't alias the 1st and 5th.
+; CHECK: 1%e[[S1:.]]x
+; CHECK: 5%e[[S5:.]]x
+; CHECK-NOT: %[[S1]]
+; CHECK-NOT: %[[S5]]
+
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin8"
 	%struct.foo = type { i32, i32, i8* }
@@ -19,7 +39,7 @@ entry:
 	%3 = load i32* %0, align 4		; <i32> [#uses=1]
 	%4 = load i32* %1, align 4		; <i32> [#uses=1]
 	%5 = load i8* %state, align 1		; <i8> [#uses=1]
-	%asmtmp = tail call { i32, i32, i32, i32 } asm sideeffect "#$0 $1 $2 $3 $4 $5", "=&r,=r,=r,=*m,=&q,=*imr,1,2,*m,5,~{dirflag},~{fpsr},~{flags},~{cx}"(i8** %2, i8* %state, i32 %3, i32 %4, i8** %2, i8 %5) nounwind		; <{ i32, i32, i32, i32 }> [#uses=3]
+	%asmtmp = tail call { i32, i32, i32, i32 } asm sideeffect "#1st=$0 $1 2nd=$1 $2 3rd=$2 $4 5th=$4 $3=4th 1$0 1%eXx 5$4 5%eXx 6th=$5", "=&r,=r,=r,=*m,=&q,=*imr,1,2,*m,5,~{dirflag},~{fpsr},~{flags},~{cx}"(i8** %2, i8* %state, i32 %3, i32 %4, i8** %2, i8 %5) nounwind		; <{ i32, i32, i32, i32 }> [#uses=3]
 	%asmresult = extractvalue { i32, i32, i32, i32 } %asmtmp, 0		; <i32> [#uses=1]
 	%asmresult1 = extractvalue { i32, i32, i32, i32 } %asmtmp, 1		; <i32> [#uses=1]
 	store i32 %asmresult1, i32* %0
diff --git a/test/CodeGen/X86/2008-09-25-sseregparm-1.ll b/test/CodeGen/X86/2008-09-25-sseregparm-1.ll
index c92a8f4..fc3e35e 100644
--- a/test/CodeGen/X86/2008-09-25-sseregparm-1.ll
+++ b/test/CodeGen/X86/2008-09-25-sseregparm-1.ll
@@ -2,11 +2,11 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse2 | grep fld | count 2
 ; check 'inreg' attribute for sse_regparm
 
-define double @foo1() inreg nounwind {
+define inreg double @foo1()  nounwind {
   ret double 1.0
 }
 
-define float @foo2() inreg nounwind {
+define inreg float @foo2()  nounwind {
   ret float 1.0
 }
 
diff --git a/test/CodeGen/X86/2008-12-12-PrivateEHSymbol.ll b/test/CodeGen/X86/2008-12-12-PrivateEHSymbol.ll
index e97b63d..2e27811 100644
--- a/test/CodeGen/X86/2008-12-12-PrivateEHSymbol.ll
+++ b/test/CodeGen/X86/2008-12-12-PrivateEHSymbol.ll
@@ -1,7 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu | grep ^.L_Z1fv.eh
-; RUN: llc < %s -march=x86    -mtriple=i686-unknown-linux-gnu | grep ^.L_Z1fv.eh
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin9 | grep ^__Z1fv.eh
-; RUN: llc < %s -march=x86    -mtriple=i386-apple-darwin9 | grep ^__Z1fv.eh
+; RUN: llc < %s -disable-cfi -march=x86-64 -mtriple=x86_64-apple-darwin9 | grep ^__Z1fv.eh
+; RUN: llc < %s -disable-cfi -march=x86    -mtriple=i386-apple-darwin9 | grep ^__Z1fv.eh
 
 define void @_Z1fv() {
 entry:
diff --git a/test/CodeGen/X86/2008-12-19-EarlyClobberBug.ll b/test/CodeGen/X86/2008-12-19-EarlyClobberBug.ll
index 5eba9b9..75e0b8a 100644
--- a/test/CodeGen/X86/2008-12-19-EarlyClobberBug.ll
+++ b/test/CodeGen/X86/2008-12-19-EarlyClobberBug.ll
@@ -4,7 +4,7 @@
 
 ; CHECK:         ## InlineAsm End
 ; CHECK-NEXT: BB0_2:
-; CHECK-NEXT:    movl	%esi, %eax
+; CHECK-NEXT:    {{movl	%esi, %eax|addl	%edi, %esi}}
 
 
 @"\01LC" = internal constant [7 x i8] c"n0=%d\0A\00"		; <[7 x i8]*> [#uses=1]
diff --git a/test/CodeGen/X86/2009-01-29-LocalRegAllocBug.ll b/test/CodeGen/X86/2009-01-29-LocalRegAllocBug.ll
deleted file mode 100644
index 35fac0c..0000000
--- a/test/CodeGen/X86/2009-01-29-LocalRegAllocBug.ll
+++ /dev/null
@@ -1,38 +0,0 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin9.6 -regalloc=fast -disable-fp-elim
-; rdar://6538384
-
-	%struct.FILE = type { i8*, i32, i32, i16, i16, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, %struct.__sFILEX*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 }
-	%struct.Lit = type { i32 }
-	%struct.StreamBuffer = type { %struct.FILE*, [1048576 x i8], i32, i32 }
-	%struct.__sFILEX = type opaque
-	%struct.__sbuf = type { i8*, i32 }
-
-declare fastcc i32 @_Z8parseIntI12StreamBufferEiRT_(%struct.StreamBuffer*)
-
-declare i8* @llvm.eh.exception() nounwind
-
-define i32 @main(i32 %argc, i8** nocapture %argv) noreturn {
-entry:
-	%0 = invoke fastcc i32 @_Z8parseIntI12StreamBufferEiRT_(%struct.StreamBuffer* null)
-			to label %bb1.i16.i.i unwind label %lpad.i.i		; <i32> [#uses=0]
-
-bb1.i16.i.i:		; preds = %entry
-	br i1 false, label %bb.i.i.i.i, label %_ZN3vecI3LitE4pushERKS0_.exit.i.i.i
-
-bb.i.i.i.i:		; preds = %bb1.i16.i.i
-	br label %_ZN3vecI3LitE4pushERKS0_.exit.i.i.i
-
-_ZN3vecI3LitE4pushERKS0_.exit.i.i.i:		; preds = %bb.i.i.i.i, %bb1.i16.i.i
-	%lits.i.i.0.0 = phi %struct.Lit* [ null, %bb1.i16.i.i ], [ null, %bb.i.i.i.i ]		; <%struct.Lit*> [#uses=1]
-	%1 = invoke fastcc i32 @_Z8parseIntI12StreamBufferEiRT_(%struct.StreamBuffer* null)
-			to label %.noexc21.i.i unwind label %lpad.i.i		; <i32> [#uses=0]
-
-.noexc21.i.i:		; preds = %_ZN3vecI3LitE4pushERKS0_.exit.i.i.i
-	unreachable
-
-lpad.i.i:		; preds = %_ZN3vecI3LitE4pushERKS0_.exit.i.i.i, %entry
-	%lits.i.i.0.3 = phi %struct.Lit* [ %lits.i.i.0.0, %_ZN3vecI3LitE4pushERKS0_.exit.i.i.i ], [ null, %entry ]		; <%struct.Lit*> [#uses=1]
-	%eh_ptr.i.i = call i8* @llvm.eh.exception()		; <i8*> [#uses=0]
-	free %struct.Lit* %lits.i.i.0.3
-	unreachable
-}
diff --git a/test/CodeGen/X86/2009-02-20-PreAllocSplit-Crash.ll b/test/CodeGen/X86/2009-02-20-PreAllocSplit-Crash.ll
deleted file mode 100644
index aba4bfc..0000000
--- a/test/CodeGen/X86/2009-02-20-PreAllocSplit-Crash.ll
+++ /dev/null
@@ -1,71 +0,0 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-apple-darwin8 -pre-alloc-split -regalloc=linearscan
-
-define i32 @main() nounwind {
-bb4.i.thread:
-	br label %bb5.i4
-
-bb16:		; preds = %bb111.i
-	%phitmp = add i32 %indvar.reg2mem.4, 1		; <i32> [#uses=2]
-	switch i32 %indvar.reg2mem.4, label %bb100.i [
-		i32 0, label %bb5.i4
-		i32 1, label %bb5.i4
-		i32 2, label %bb5.i4
-		i32 5, label %bb.i14.i
-		i32 6, label %bb.i14.i
-		i32 7, label %bb.i14.i
-	]
-
-bb5.i4:		; preds = %bb16, %bb16, %bb16, %bb4.i.thread
-	br i1 false, label %bb102.i, label %bb103.i
-
-bb.i14.i:		; preds = %bb16, %bb16, %bb16
-	%0 = malloc [600 x i32]		; <[600 x i32]*> [#uses=0]
-	%1 = icmp eq i32 %phitmp, 7		; <i1> [#uses=1]
-	%tl.0.i = select i1 %1, float 1.000000e+02, float 1.000000e+00		; <float> [#uses=1]
-	%2 = icmp eq i32 %phitmp, 8		; <i1> [#uses=1]
-	%tu.0.i = select i1 %2, float 1.000000e+02, float 1.000000e+00		; <float> [#uses=1]
-	br label %bb30.i
-
-bb30.i:		; preds = %bb36.i, %bb.i14.i
-	%i.1173.i = phi i32 [ 0, %bb.i14.i ], [ %indvar.next240.i, %bb36.i ]		; <i32> [#uses=3]
-	%3 = icmp eq i32 0, %i.1173.i		; <i1> [#uses=1]
-	br i1 %3, label %bb33.i, label %bb34.i
-
-bb33.i:		; preds = %bb30.i
-	store float %tl.0.i, float* null, align 4
-	br label %bb36.i
-
-bb34.i:		; preds = %bb30.i
-	%4 = icmp eq i32 0, %i.1173.i		; <i1> [#uses=1]
-	br i1 %4, label %bb35.i, label %bb36.i
-
-bb35.i:		; preds = %bb34.i
-	store float %tu.0.i, float* null, align 4
-	br label %bb36.i
-
-bb36.i:		; preds = %bb35.i, %bb34.i, %bb33.i
-	%indvar.next240.i = add i32 %i.1173.i, 1		; <i32> [#uses=1]
-	br label %bb30.i
-
-bb100.i:		; preds = %bb16
-	ret i32 0
-
-bb102.i:		; preds = %bb5.i4
-	br label %bb103.i
-
-bb103.i:		; preds = %bb102.i, %bb5.i4
-	%indvar.reg2mem.4 = phi i32 [ 0, %bb5.i4 ], [ 0, %bb102.i ]		; <i32> [#uses=2]
-	%n.0.reg2mem.1.i = phi i32 [ 0, %bb102.i ], [ 0, %bb5.i4 ]		; <i32> [#uses=1]
-	%5 = icmp eq i32 0, 0		; <i1> [#uses=1]
-	br i1 %5, label %bb111.i, label %bb108.i
-
-bb108.i:		; preds = %bb103.i
-	ret i32 0
-
-bb111.i:		; preds = %bb103.i
-	%6 = icmp sgt i32 %n.0.reg2mem.1.i, 7		; <i1> [#uses=1]
-	br i1 %6, label %bb16, label %bb112.i
-
-bb112.i:		; preds = %bb111.i
-	unreachable
-}
diff --git a/test/CodeGen/X86/2009-03-13-PHIElimBug.ll b/test/CodeGen/X86/2009-03-13-PHIElimBug.ll
index 2853930..45fc269 100644
--- a/test/CodeGen/X86/2009-03-13-PHIElimBug.ll
+++ b/test/CodeGen/X86/2009-03-13-PHIElimBug.ll
@@ -28,5 +28,5 @@ lpad:		; preds = %cont, %entry
 }
 
 ; CHECK: call{{.*}}f
-; CHECK-NEXT: Ltmp0:
-; CHECK-NEXT: movl %eax, %esi
+; CHECK: movl %eax, %esi
+; CHECK: call{{.*}}g
diff --git a/test/CodeGen/X86/2009-10-08-MachineLICMBug.ll b/test/CodeGen/X86/2009-10-08-MachineLICMBug.ll
deleted file mode 100644
index 91c5440..0000000
--- a/test/CodeGen/X86/2009-10-08-MachineLICMBug.ll
+++ /dev/null
@@ -1,264 +0,0 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=pic -stats |& grep {machine-licm} | grep 2
-; rdar://7274692
-
-%0 = type { [125 x i32] }
-%1 = type { i32 }
-%struct..5sPragmaType = type { i8*, i32 }
-%struct.AggInfo = type { i8, i8, i32, %struct.ExprList*, i32, %struct.AggInfo_col*, i32, i32, i32, %struct.AggInfo_func*, i32, i32 }
-%struct.AggInfo_col = type { %struct.Table*, i32, i32, i32, i32, %struct.Expr* }
-%struct.AggInfo_func = type { %struct.Expr*, %struct.FuncDef*, i32, i32 }
-%struct.AuxData = type { i8*, void (i8*)* }
-%struct.Bitvec = type { i32, i32, i32, %0 }
-%struct.BtCursor = type { %struct.Btree*, %struct.BtShared*, %struct.BtCursor*, %struct.BtCursor*, i32 (i8*, i32, i8*, i32, i8*)*, i8*, i32, %struct.MemPage*, i32, %struct.CellInfo, i8, i8, i8*, i64, i32, i8, i32* }
-%struct.BtLock = type { %struct.Btree*, i32, i8, %struct.BtLock* }
-%struct.BtShared = type { %struct.Pager*, %struct.sqlite3*, %struct.BtCursor*, %struct.MemPage*, i8, i8, i8, i8, i8, i8, i8, i8, i32, i16, i16, i32, i32, i32, i32, i8, i32, i8*, void (i8*)*, %struct.sqlite3_mutex*, %struct.BusyHandler, i32, %struct.BtShared*, %struct.BtLock*, %struct.Btree* }
-%struct.Btree = type { %struct.sqlite3*, %struct.BtShared*, i8, i8, i8, i32, %struct.Btree*, %struct.Btree* }
-%struct.BtreeMutexArray = type { i32, [11 x %struct.Btree*] }
-%struct.BusyHandler = type { i32 (i8*, i32)*, i8*, i32 }
-%struct.CellInfo = type { i8*, i64, i32, i32, i16, i16, i16, i16 }
-%struct.CollSeq = type { i8*, i8, i8, i8*, i32 (i8*, i32, i8*, i32, i8*)*, void (i8*)* }
-%struct.Column = type { i8*, %struct.Expr*, i8*, i8*, i8, i8, i8, i8 }
-%struct.Context = type { i64, i32, %struct.Fifo }
-%struct.CountCtx = type { i64 }
-%struct.Cursor = type { %struct.BtCursor*, i32, i64, i64, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i64, %struct.Btree*, i32, i8*, i64, i8*, %struct.KeyInfo*, i32, i64, %struct.sqlite3_vtab_cursor*, %struct.sqlite3_module*, i32, i32, i32*, i32*, i8* }
-%struct.Db = type { i8*, %struct.Btree*, i8, i8, i8*, void (i8*)*, %struct.Schema* }
-%struct.DbPage = type { %struct.Pager*, i32, %struct.DbPage*, %struct.DbPage*, %struct.PagerLruLink, %struct.DbPage*, i8, i8, i8, i8, i8, i16, %struct.DbPage*, %struct.DbPage*, i8* }
-%struct.Expr = type { i8, i8, i16, %struct.CollSeq*, %struct.Expr*, %struct.Expr*, %struct.ExprList*, %struct..5sPragmaType, %struct..5sPragmaType, i32, i32, %struct.AggInfo*, i32, i32, %struct.Select*, %struct.Table*, i32 }
-%struct.ExprList = type { i32, i32, i32, %struct.ExprList_item* }
-%struct.ExprList_item = type { %struct.Expr*, i8*, i8, i8, i8 }
-%struct.FILE = type { i8*, i32, i32, i16, i16, %struct..5sPragmaType, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct..5sPragmaType, %struct.__sFILEX*, i32, [3 x i8], [1 x i8], %struct..5sPragmaType, i32, i64 }
-%struct.FKey = type { %struct.Table*, %struct.FKey*, i8*, %struct.FKey*, i32, %struct.sColMap*, i8, i8, i8, i8 }
-%struct.Fifo = type { i32, %struct.FifoPage*, %struct.FifoPage* }
-%struct.FifoPage = type { i32, i32, i32, %struct.FifoPage*, [1 x i64] }
-%struct.FuncDef = type { i16, i8, i8, i8, i8*, %struct.FuncDef*, void (%struct.sqlite3_context*, i32, %struct.Mem**)*, void (%struct.sqlite3_context*, i32, %struct.Mem**)*, void (%struct.sqlite3_context*)*, [1 x i8] }
-%struct.Hash = type { i8, i8, i32, i32, %struct.HashElem*, %struct._ht* }
-%struct.HashElem = type { %struct.HashElem*, %struct.HashElem*, i8*, i8*, i32 }
-%struct.IdList = type { %struct..5sPragmaType*, i32, i32 }
-%struct.Index = type { i8*, i32, i32*, i32*, %struct.Table*, i32, i8, i8, i8*, %struct.Index*, %struct.Schema*, i8*, i8** }
-%struct.KeyInfo = type { %struct.sqlite3*, i8, i8, i8, i32, i8*, [1 x %struct.CollSeq*] }
-%struct.Mem = type { %struct.CountCtx, double, %struct.sqlite3*, i8*, i32, i16, i8, i8, void (i8*)* }
-%struct.MemPage = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i16, i16, i16, i16, i16, i16, [5 x %struct._OvflCell], %struct.BtShared*, i8*, %struct.DbPage*, i32, %struct.MemPage* }
-%struct.Module = type { %struct.sqlite3_module*, i8*, i8*, void (i8*)* }
-%struct.Op = type { i8, i8, i8, i8, i32, i32, i32, %1 }
-%struct.Pager = type { %struct.sqlite3_vfs*, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, %struct.Bitvec*, %struct.Bitvec*, i8*, i8*, i8*, i8*, %struct.sqlite3_file*, %struct.sqlite3_file*, %struct.sqlite3_file*, %struct.BusyHandler*, %struct.PagerLruList, %struct.DbPage*, %struct.DbPage*, %struct.DbPage*, i64, i64, i64, i64, i64, i32, void (%struct.DbPage*, i32)*, void (%struct.DbPage*, i32)*, i32, %struct.DbPage**, i8*, [16 x i8] }
-%struct.PagerLruLink = type { %struct.DbPage*, %struct.DbPage* }
-%struct.PagerLruList = type { %struct.DbPage*, %struct.DbPage*, %struct.DbPage* }
-%struct.Schema = type { i32, %struct.Hash, %struct.Hash, %struct.Hash, %struct.Hash, %struct.Table*, i8, i8, i16, i32, %struct.sqlite3* }
-%struct.Select = type { %struct.ExprList*, i8, i8, i8, i8, i8, i8, i8, %struct.SrcList*, %struct.Expr*, %struct.ExprList*, %struct.Expr*, %struct.ExprList*, %struct.Select*, %struct.Select*, %struct.Select*, %struct.Expr*, %struct.Expr*, i32, i32, [3 x i32] }
-%struct.SrcList = type { i16, i16, [1 x %struct.SrcList_item] }
-%struct.SrcList_item = type { i8*, i8*, i8*, %struct.Table*, %struct.Select*, i8, i8, i32, %struct.Expr*, %struct.IdList*, i64 }
-%struct.Table = type { i8*, i32, %struct.Column*, i32, %struct.Index*, i32, %struct.Select*, i32, %struct.Trigger*, %struct.FKey*, i8*, %struct.Expr*, i32, i8, i8, i8, i8, i8, i8, i8, %struct.Module*, %struct.sqlite3_vtab*, i32, i8**, %struct.Schema* }
-%struct.Trigger = type { i8*, i8*, i8, i8, %struct.Expr*, %struct.IdList*, %struct..5sPragmaType, %struct.Schema*, %struct.Schema*, %struct.TriggerStep*, %struct.Trigger* }
-%struct.TriggerStep = type { i32, i32, %struct.Trigger*, %struct.Select*, %struct..5sPragmaType, %struct.Expr*, %struct.ExprList*, %struct.IdList*, %struct.TriggerStep*, %struct.TriggerStep* }
-%struct.Vdbe = type { %struct.sqlite3*, %struct.Vdbe*, %struct.Vdbe*, i32, i32, %struct.Op*, i32, i32, i32*, %struct.Mem**, %struct.Mem*, i32, %struct.Cursor**, i32, %struct.Mem*, i8**, i32, i32, i32, %struct.Mem*, i32, i32, %struct.Fifo, i32, i32, %struct.Context*, i32, i32, i32, i32, i32, [25 x i32], i32, i32, i8**, i8*, %struct.Mem*, i8, i8, i8, i8, i8, i8, i32, i64, i32, %struct.BtreeMutexArray, i32, i8*, i32 }
-%struct.VdbeFunc = type { %struct.FuncDef*, i32, [1 x %struct.AuxData] }
-%struct._OvflCell = type { i8*, i16 }
-%struct._RuneCharClass = type { [14 x i8], i32 }
-%struct._RuneEntry = type { i32, i32, i32, i32* }
-%struct._RuneLocale = type { [8 x i8], [32 x i8], i32 (i8*, i32, i8**)*, i32 (i32, i8*, i32, i8**)*, i32, [256 x i32], [256 x i32], [256 x i32], %struct._RuneRange, %struct._RuneRange, %struct._RuneRange, i8*, i32, i32, %struct._RuneCharClass* }
-%struct._RuneRange = type { i32, %struct._RuneEntry* }
-%struct.__sFILEX = type opaque
-%struct._ht = type { i32, %struct.HashElem* }
-%struct.callback_data = type { %struct.sqlite3*, i32, i32, %struct.FILE*, i32, i32, i32, i8*, [20 x i8], [100 x i32], [100 x i32], [20 x i8], %struct.previous_mode_data, [1024 x i8], i8* }
-%struct.previous_mode_data = type { i32, i32, i32, [100 x i32] }
-%struct.sColMap = type { i32, i8* }
-%struct.sqlite3 = type { %struct.sqlite3_vfs*, i32, %struct.Db*, i32, i32, i32, i32, i8, i8, i8, i8, i32, %struct.CollSeq*, i64, i64, i32, i32, i32, %struct.sqlite3_mutex*, %struct.sqlite3InitInfo, i32, i8**, %struct.Vdbe*, i32, void (i8*, i8*)*, i8*, void (i8*, i8*, i64)*, i8*, i8*, i32 (i8*)*, i8*, void (i8*)*, i8*, void (i8*, i32, i8*, i8*, i64)*, void (i8*, %struct.sqlite3*, i32, i8*)*, void (i8*, %struct.sqlite3*, i32, i8*)*, i8*, %struct.Mem*, i8*, i8*, %union.anon, i32 (i8*, i32, i8*, i8*, i8*, i8*)*, i8*, i32 (i8*)*, i8*, i32, %struct.Hash, %struct.Table*, %struct.sqlite3_vtab**, i32, %struct.Hash, %struct.Hash, %struct.BusyHandler, i32, [2 x %struct.Db], i8 }
-%struct.sqlite3InitInfo = type { i32, i32, i8 }
-%struct.sqlite3_context = type { %struct.FuncDef*, %struct.VdbeFunc*, %struct.Mem, %struct.Mem*, i32, %struct.CollSeq* }
-%struct.sqlite3_file = type { %struct.sqlite3_io_methods* }
-%struct.sqlite3_index_constraint = type { i32, i8, i8, i32 }
-%struct.sqlite3_index_constraint_usage = type { i32, i8 }
-%struct.sqlite3_index_info = type { i32, %struct.sqlite3_index_constraint*, i32, %struct.sqlite3_index_constraint_usage*, %struct.sqlite3_index_constraint_usage*, i32, i8*, i32, i32, double }
-%struct.sqlite3_io_methods = type { i32, i32 (%struct.sqlite3_file*)*, i32 (%struct.sqlite3_file*, i8*, i32, i64)*, i32 (%struct.sqlite3_file*, i8*, i32, i64)*, i32 (%struct.sqlite3_file*, i64)*, i32 (%struct.sqlite3_file*, i32)*, i32 (%struct.sqlite3_file*, i64*)*, i32 (%struct.sqlite3_file*, i32)*, i32 (%struct.sqlite3_file*, i32)*, i32 (%struct.sqlite3_file*)*, i32 (%struct.sqlite3_file*, i32, i8*)*, i32 (%struct.sqlite3_file*)*, i32 (%struct.sqlite3_file*)* }
-%struct.sqlite3_module = type { i32, i32 (%struct.sqlite3*, i8*, i32, i8**, %struct.sqlite3_vtab**, i8**)*, i32 (%struct.sqlite3*, i8*, i32, i8**, %struct.sqlite3_vtab**, i8**)*, i32 (%struct.sqlite3_vtab*, %struct.sqlite3_index_info*)*, i32 (%struct.sqlite3_vtab*)*, i32 (%struct.sqlite3_vtab*)*, i32 (%struct.sqlite3_vtab*, %struct.sqlite3_vtab_cursor**)*, i32 (%struct.sqlite3_vtab_cursor*)*, i32 (%struct.sqlite3_vtab_cursor*, i32, i8*, i32, %struct.Mem**)*, i32 (%struct.sqlite3_vtab_cursor*)*, i32 (%struct.sqlite3_vtab_cursor*)*, i32 (%struct.sqlite3_vtab_cursor*, %struct.sqlite3_context*, i32)*, i32 (%struct.sqlite3_vtab_cursor*, i64*)*, i32 (%struct.sqlite3_vtab*, i32, %struct.Mem**, i64*)*, i32 (%struct.sqlite3_vtab*)*, i32 (%struct.sqlite3_vtab*)*, i32 (%struct.sqlite3_vtab*)*, i32 (%struct.sqlite3_vtab*)*, i32 (%struct.sqlite3_vtab*, i32, i8*, void (%struct.sqlite3_context*, i32, %struct.Mem**)**, i8**)*, i32 (%struct.sqlite3_vtab*, i8*)* }
-%struct.sqlite3_mutex = type opaque
-%struct.sqlite3_vfs = type { i32, i32, i32, %struct.sqlite3_vfs*, i8*, i8*, i32 (%struct.sqlite3_vfs*, i8*, %struct.sqlite3_file*, i32, i32*)*, i32 (%struct.sqlite3_vfs*, i8*, i32)*, i32 (%struct.sqlite3_vfs*, i8*, i32)*, i32 (%struct.sqlite3_vfs*, i32, i8*)*, i32 (%struct.sqlite3_vfs*, i8*, i32, i8*)*, i8* (%struct.sqlite3_vfs*, i8*)*, void (%struct.sqlite3_vfs*, i32, i8*)*, i8* (%struct.sqlite3_vfs*, i8*, i8*)*, void (%struct.sqlite3_vfs*, i8*)*, i32 (%struct.sqlite3_vfs*, i32, i8*)*, i32 (%struct.sqlite3_vfs*, i32)*, i32 (%struct.sqlite3_vfs*, double*)* }
-%struct.sqlite3_vtab = type { %struct.sqlite3_module*, i32, i8* }
-%struct.sqlite3_vtab_cursor = type { %struct.sqlite3_vtab* }
-%union.anon = type { double }
-
-@_DefaultRuneLocale = external global %struct._RuneLocale ; <%struct._RuneLocale*> [#uses=2]
-@__stderrp = external global %struct.FILE*        ; <%struct.FILE**> [#uses=1]
-@.str10 = internal constant [16 x i8] c"Out of memory!\0A\00", align 1 ; <[16 x i8]*> [#uses=1]
-@llvm.used = appending global [1 x i8*] [i8* bitcast (void (%struct.callback_data*, i8*)* @set_table_name to i8*)], section "llvm.metadata" ; <[1 x i8*]*> [#uses=0]
-
-define fastcc void @set_table_name(%struct.callback_data* nocapture %p, i8* %zName) nounwind ssp {
-entry:
-  %0 = getelementptr inbounds %struct.callback_data* %p, i32 0, i32 7 ; <i8**> [#uses=3]
-  %1 = load i8** %0, align 4                      ; <i8*> [#uses=2]
-  %2 = icmp eq i8* %1, null                       ; <i1> [#uses=1]
-  br i1 %2, label %bb1, label %bb
-
-bb:                                               ; preds = %entry
-  free i8* %1
-  store i8* null, i8** %0, align 4
-  br label %bb1
-
-bb1:                                              ; preds = %bb, %entry
-  %3 = icmp eq i8* %zName, null                   ; <i1> [#uses=1]
-  br i1 %3, label %return, label %bb2
-
-bb2:                                              ; preds = %bb1
-  %4 = load i8* %zName, align 1                   ; <i8> [#uses=2]
-  %5 = zext i8 %4 to i32                          ; <i32> [#uses=2]
-  %6 = icmp sgt i8 %4, -1                         ; <i1> [#uses=1]
-  br i1 %6, label %bb.i.i, label %bb1.i.i
-
-bb.i.i:                                           ; preds = %bb2
-  %7 = getelementptr inbounds %struct._RuneLocale* @_DefaultRuneLocale, i32 0, i32 5, i32 %5 ; <i32*> [#uses=1]
-  %8 = load i32* %7, align 4                      ; <i32> [#uses=1]
-  %9 = and i32 %8, 256                            ; <i32> [#uses=1]
-  br label %isalpha.exit
-
-bb1.i.i:                                          ; preds = %bb2
-  %10 = tail call i32 @__maskrune(i32 %5, i32 256) nounwind ; <i32> [#uses=1]
-  br label %isalpha.exit
-
-isalpha.exit:                                     ; preds = %bb1.i.i, %bb.i.i
-  %storemerge.in.in.i.i = phi i32 [ %9, %bb.i.i ], [ %10, %bb1.i.i ] ; <i32> [#uses=1]
-  %storemerge.in.i.i = icmp eq i32 %storemerge.in.in.i.i, 0 ; <i1> [#uses=1]
-  br i1 %storemerge.in.i.i, label %bb3, label %bb5
-
-bb3:                                              ; preds = %isalpha.exit
-  %11 = load i8* %zName, align 1                  ; <i8> [#uses=2]
-  %12 = icmp eq i8 %11, 95                        ; <i1> [#uses=1]
-  br i1 %12, label %bb5, label %bb12.preheader
-
-bb5:                                              ; preds = %bb3, %isalpha.exit
-  %.pre = load i8* %zName, align 1                ; <i8> [#uses=1]
-  br label %bb12.preheader
-
-bb12.preheader:                                   ; preds = %bb5, %bb3
-  %13 = phi i8 [ %.pre, %bb5 ], [ %11, %bb3 ]     ; <i8> [#uses=1]
-  %needQuote.1.ph = phi i32 [ 0, %bb5 ], [ 1, %bb3 ] ; <i32> [#uses=2]
-  %14 = icmp eq i8 %13, 0                         ; <i1> [#uses=1]
-  br i1 %14, label %bb13, label %bb7
-
-bb7:                                              ; preds = %bb11, %bb12.preheader
-  %i.011 = phi i32 [ %tmp17, %bb11 ], [ 0, %bb12.preheader ] ; <i32> [#uses=2]
-  %n.110 = phi i32 [ %26, %bb11 ], [ 0, %bb12.preheader ] ; <i32> [#uses=3]
-  %needQuote.19 = phi i32 [ %needQuote.0, %bb11 ], [ %needQuote.1.ph, %bb12.preheader ] ; <i32> [#uses=2]
-  %scevgep16 = getelementptr i8* %zName, i32 %i.011 ; <i8*> [#uses=2]
-  %tmp17 = add i32 %i.011, 1                      ; <i32> [#uses=2]
-  %scevgep18 = getelementptr i8* %zName, i32 %tmp17 ; <i8*> [#uses=1]
-  %15 = load i8* %scevgep16, align 1              ; <i8> [#uses=2]
-  %16 = zext i8 %15 to i32                        ; <i32> [#uses=2]
-  %17 = icmp sgt i8 %15, -1                       ; <i1> [#uses=1]
-  br i1 %17, label %bb.i.i2, label %bb1.i.i3
-
-bb.i.i2:                                          ; preds = %bb7
-  %18 = getelementptr inbounds %struct._RuneLocale* @_DefaultRuneLocale, i32 0, i32 5, i32 %16 ; <i32*> [#uses=1]
-  %19 = load i32* %18, align 4                    ; <i32> [#uses=1]
-  %20 = and i32 %19, 1280                         ; <i32> [#uses=1]
-  br label %isalnum.exit
-
-bb1.i.i3:                                         ; preds = %bb7
-  %21 = tail call i32 @__maskrune(i32 %16, i32 1280) nounwind ; <i32> [#uses=1]
-  br label %isalnum.exit
-
-isalnum.exit:                                     ; preds = %bb1.i.i3, %bb.i.i2
-  %storemerge.in.in.i.i4 = phi i32 [ %20, %bb.i.i2 ], [ %21, %bb1.i.i3 ] ; <i32> [#uses=1]
-  %storemerge.in.i.i5 = icmp eq i32 %storemerge.in.in.i.i4, 0 ; <i1> [#uses=1]
-  br i1 %storemerge.in.i.i5, label %bb8, label %bb11
-
-bb8:                                              ; preds = %isalnum.exit
-  %22 = load i8* %scevgep16, align 1              ; <i8> [#uses=2]
-  %23 = icmp eq i8 %22, 95                        ; <i1> [#uses=1]
-  br i1 %23, label %bb11, label %bb9
-
-bb9:                                              ; preds = %bb8
-  %24 = icmp eq i8 %22, 39                        ; <i1> [#uses=1]
-  %25 = zext i1 %24 to i32                        ; <i32> [#uses=1]
-  %.n.1 = add i32 %n.110, %25                     ; <i32> [#uses=1]
-  br label %bb11
-
-bb11:                                             ; preds = %bb9, %bb8, %isalnum.exit
-  %needQuote.0 = phi i32 [ 1, %bb9 ], [ %needQuote.19, %isalnum.exit ], [ %needQuote.19, %bb8 ] ; <i32> [#uses=2]
-  %n.0 = phi i32 [ %.n.1, %bb9 ], [ %n.110, %isalnum.exit ], [ %n.110, %bb8 ] ; <i32> [#uses=1]
-  %26 = add nsw i32 %n.0, 1                       ; <i32> [#uses=2]
-  %27 = load i8* %scevgep18, align 1              ; <i8> [#uses=1]
-  %28 = icmp eq i8 %27, 0                         ; <i1> [#uses=1]
-  br i1 %28, label %bb13, label %bb7
-
-bb13:                                             ; preds = %bb11, %bb12.preheader
-  %n.1.lcssa = phi i32 [ 0, %bb12.preheader ], [ %26, %bb11 ] ; <i32> [#uses=2]
-  %needQuote.1.lcssa = phi i32 [ %needQuote.1.ph, %bb12.preheader ], [ %needQuote.0, %bb11 ] ; <i32> [#uses=1]
-  %29 = add nsw i32 %n.1.lcssa, 2                 ; <i32> [#uses=1]
-  %30 = icmp eq i32 %needQuote.1.lcssa, 0         ; <i1> [#uses=3]
-  %n.1. = select i1 %30, i32 %n.1.lcssa, i32 %29  ; <i32> [#uses=1]
-  %31 = add nsw i32 %n.1., 1                      ; <i32> [#uses=1]
-  %32 = malloc i8, i32 %31                        ; <i8*> [#uses=7]
-  store i8* %32, i8** %0, align 4
-  %33 = icmp eq i8* %32, null                     ; <i1> [#uses=1]
-  br i1 %33, label %bb16, label %bb17
-
-bb16:                                             ; preds = %bb13
-  %34 = load %struct.FILE** @__stderrp, align 4   ; <%struct.FILE*> [#uses=1]
-  %35 = bitcast %struct.FILE* %34 to i8*          ; <i8*> [#uses=1]
-  %36 = tail call i32 @"\01_fwrite$UNIX2003"(i8* getelementptr inbounds ([16 x i8]* @.str10, i32 0, i32 0), i32 1, i32 15, i8* %35) nounwind ; <i32> [#uses=0]
-  tail call void @exit(i32 1) noreturn nounwind
-  unreachable
-
-bb17:                                             ; preds = %bb13
-  br i1 %30, label %bb23.preheader, label %bb18
-
-bb18:                                             ; preds = %bb17
-  store i8 39, i8* %32, align 4
-  br label %bb23.preheader
-
-bb23.preheader:                                   ; preds = %bb18, %bb17
-  %n.3.ph = phi i32 [ 1, %bb18 ], [ 0, %bb17 ]    ; <i32> [#uses=2]
-  %37 = load i8* %zName, align 1                  ; <i8> [#uses=1]
-  %38 = icmp eq i8 %37, 0                         ; <i1> [#uses=1]
-  br i1 %38, label %bb24, label %bb20
-
-bb20:                                             ; preds = %bb22, %bb23.preheader
-  %storemerge18 = phi i32 [ %tmp, %bb22 ], [ 0, %bb23.preheader ] ; <i32> [#uses=2]
-  %n.37 = phi i32 [ %n.4, %bb22 ], [ %n.3.ph, %bb23.preheader ] ; <i32> [#uses=3]
-  %scevgep = getelementptr i8* %zName, i32 %storemerge18 ; <i8*> [#uses=1]
-  %tmp = add i32 %storemerge18, 1                 ; <i32> [#uses=2]
-  %scevgep15 = getelementptr i8* %zName, i32 %tmp ; <i8*> [#uses=1]
-  %39 = load i8* %scevgep, align 1                ; <i8> [#uses=2]
-  %40 = getelementptr inbounds i8* %32, i32 %n.37 ; <i8*> [#uses=1]
-  store i8 %39, i8* %40, align 1
-  %41 = add nsw i32 %n.37, 1                      ; <i32> [#uses=2]
-  %42 = icmp eq i8 %39, 39                        ; <i1> [#uses=1]
-  br i1 %42, label %bb21, label %bb22
-
-bb21:                                             ; preds = %bb20
-  %43 = getelementptr inbounds i8* %32, i32 %41   ; <i8*> [#uses=1]
-  store i8 39, i8* %43, align 1
-  %44 = add nsw i32 %n.37, 2                      ; <i32> [#uses=1]
-  br label %bb22
-
-bb22:                                             ; preds = %bb21, %bb20
-  %n.4 = phi i32 [ %44, %bb21 ], [ %41, %bb20 ]   ; <i32> [#uses=2]
-  %45 = load i8* %scevgep15, align 1              ; <i8> [#uses=1]
-  %46 = icmp eq i8 %45, 0                         ; <i1> [#uses=1]
-  br i1 %46, label %bb24, label %bb20
-
-bb24:                                             ; preds = %bb22, %bb23.preheader
-  %n.3.lcssa = phi i32 [ %n.3.ph, %bb23.preheader ], [ %n.4, %bb22 ] ; <i32> [#uses=3]
-  br i1 %30, label %bb26, label %bb25
-
-bb25:                                             ; preds = %bb24
-  %47 = getelementptr inbounds i8* %32, i32 %n.3.lcssa ; <i8*> [#uses=1]
-  store i8 39, i8* %47, align 1
-  %48 = add nsw i32 %n.3.lcssa, 1                 ; <i32> [#uses=1]
-  br label %bb26
-
-bb26:                                             ; preds = %bb25, %bb24
-  %n.5 = phi i32 [ %48, %bb25 ], [ %n.3.lcssa, %bb24 ] ; <i32> [#uses=1]
-  %49 = getelementptr inbounds i8* %32, i32 %n.5  ; <i8*> [#uses=1]
-  store i8 0, i8* %49, align 1
-  ret void
-
-return:                                           ; preds = %bb1
-  ret void
-}
-
-declare i32 @"\01_fwrite$UNIX2003"(i8*, i32, i32, i8*)
-
-declare void @exit(i32) noreturn nounwind
-
-declare i32 @__maskrune(i32, i32)
diff --git a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
index 848af82..2fceab6 100644
--- a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
@@ -1,8 +1,10 @@
-; RUN: llc -march=x86-64 -O2 < %s | FileCheck %s
-; RUN: llc -march=x86-64 -O2 -regalloc=basic < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux -O2 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux -O2 -regalloc=basic < %s | FileCheck %s
 ; Test to check .debug_loc support. This test case emits many debug_loc entries.
 
 ; CHECK: Loc expr size
+; CHECK-NEXT: .short
+; CHECK-NEXT: .Ltmp
 ; CHECK-NEXT: DW_OP_reg
 
 %0 = type { double }
diff --git a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
index ac26def..7909d27 100644
--- a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -O2 < %s | FileCheck %s
 ; RUN: llc -O2 -regalloc=basic < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-darwin"
+target triple = "x86_64-apple-darwin10"
 
 %struct.a = type { i32, %struct.a* }
 
@@ -68,9 +68,15 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 ; CHECK: Ldebug_loc0:
 ; CHECK-NEXT: .quad   Lfunc_begin0
 ; CHECK-NEXT: .quad   [[LABEL]]
-; CHECK-NEXT: .short  1
+; CHECK-NEXT: Lset{{.*}} = Ltmp{{.*}}-Ltmp{{.*}}               ## Loc expr size
+; CHECK-NEXT: .short  Lset{{.*}}
+; CHECK-NEXT: Ltmp{{.*}}:
 ; CHECK-NEXT: .byte   85
+; CHECK-NEXT: Ltmp{{.*}}:
 ; CHECK-NEXT: .quad   [[LABEL]]
 ; CHECK-NEXT: .quad   [[CLOBBER]]
-; CHECK-NEXT: .short  1
+; CHECK-NEXT: Lset{{.*}} = Ltmp{{.*}}-Ltmp{{.*}}               ## Loc expr size
+; CHECK-NEXT: .short  Lset{{.*}}
+; CHECK-NEXT: Ltmp{{.*}}:
 ; CHECK-NEXT: .byte   83
+; CHECK-NEXT: Ltmp{{.*}}:
+\ No newline at end of file
diff --git a/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll b/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll
index 6db3ce1..bb1db59 100644
--- a/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll
+++ b/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll
@@ -22,7 +22,7 @@ bb:
 ; it is.
 ;
 ; CHECK: # %bb
-; CHECK: addq $64036, %rdi
+; CHECK: leaq	64036(%rdx), %rdi
 ; CHECK: rep;stosl
 
   %tmp5 = bitcast i32* %tmp4 to i8*
diff --git a/test/CodeGen/X86/2010-08-04-StackVariable.ll b/test/CodeGen/X86/2010-08-04-StackVariable.ll
index edfd1b8..ba36fe7 100644
--- a/test/CodeGen/X86/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/X86/2010-08-04-StackVariable.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -mtriple=x86_64-apple-darwin < %s | grep DW_OP_fbreg
-; Use DW_OP_fbreg in variable's location expression if the variable is in a stack slot.
+; RUN: llc -O0 -mtriple=x86_64-apple-darwin < %s | grep DW_OP_breg7
+; Use DW_OP_breg7 in variable's location expression if the variable is in a stack slot.
 
 %struct.SVal = type { i8*, i32 }
 
diff --git a/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll b/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
index f12001c..eaede30 100644
--- a/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
+++ b/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
@@ -18,7 +18,7 @@ entry:
   ret i32 0
 }
 
-; CHECK: movq	___stack_chk_guard@GOTPCREL(%rip), %rax
+; CHECK: movq	___stack_chk_guard@GOTPCREL(%rip)
 ; CHECK: movb   38(%rsp), [[R0:%.+]]
 ; CHECK: movb   8(%rsp), [[R1:%.+]]
 ; CHECK: movb   [[R1]], 8(%rsp)
diff --git a/test/CodeGen/Generic/2011-02-12-shuffle.ll b/test/CodeGen/X86/2011-02-12-shuffle.ll
index b4d56d1..b4d56d1 100644
--- a/test/CodeGen/Generic/2011-02-12-shuffle.ll
+++ b/test/CodeGen/X86/2011-02-12-shuffle.ll
diff --git a/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll b/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll
new file mode 100644
index 0000000..07b1971
--- /dev/null
+++ b/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll
@@ -0,0 +1,65 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=yonah | FileCheck %s
+; Reduced from JavaScriptCore
+
+%"class.JSC::CodeLocationCall" = type { [8 x i8] }
+%"class.JSC::JSGlobalData" = type { [4 x i8] }
+%"class.JSC::FunctionPtr" = type { i8* }
+%"class.JSC::Structure" = type { [4 x i8] }
+%"class.JSC::UString" = type { i8* }
+%"class.JSC::JSString" = type { [16 x i8], i32, %"class.JSC::UString", i32 }
+
+declare hidden fastcc void @_ZN3JSCL23returnToThrowTrampolineEPNS_12JSGlobalDataENS_16ReturnAddressPtrERS2_(%"class.JSC::JSGlobalData"* nocapture, i8*, %"class.JSC::FunctionPtr"* nocapture) nounwind noinline ssp
+
+; Avoid hoisting the test above loads or copies
+; CHECK: %entry
+; CHECK: cmpq
+; CHECK-NOT: mov
+; CHECK: jb
+define i32 @cti_op_eq(i8** nocapture %args) nounwind ssp {
+entry:
+  %0 = load i8** null, align 8
+  %tmp13 = bitcast i8* %0 to %"class.JSC::CodeLocationCall"*
+  %tobool.i.i.i = icmp ugt i8* undef, inttoptr (i64 281474976710655 to i8*)
+  %or.cond.i = and i1 %tobool.i.i.i, undef
+  br i1 %or.cond.i, label %if.then.i, label %if.end.i
+
+if.then.i:                                        ; preds = %entry
+  br i1 undef, label %if.then.i.i.i, label %_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit
+
+if.then.i.i.i:                                    ; preds = %if.then.i
+  %conv.i.i.i.i = trunc i64 undef to i32
+  br label %_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit
+
+if.end.i:                                         ; preds = %entry
+  br i1 undef, label %land.rhs.i121.i, label %_ZNK3JSC7JSValue8isStringEv.exit122.i
+
+land.rhs.i121.i:                                  ; preds = %if.end.i
+  %tmp.i.i117.i = load %"class.JSC::Structure"** undef, align 8
+  br label %_ZNK3JSC7JSValue8isStringEv.exit122.i
+
+_ZNK3JSC7JSValue8isStringEv.exit122.i:            ; preds = %land.rhs.i121.i, %if.end.i
+  %brmerge.i = or i1 undef, false
+  %or.cond = or i1 false, %brmerge.i
+  br i1 %or.cond, label %_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit, label %if.then.i92.i
+
+if.then.i92.i:                                    ; preds = %_ZNK3JSC7JSValue8isStringEv.exit122.i
+  tail call void @_ZNK3JSC8JSString11resolveRopeEPNS_9ExecStateE(%"class.JSC::JSString"* undef, %"class.JSC::CodeLocationCall"* %tmp13) nounwind
+  unreachable
+
+_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit: ; preds = %_ZNK3JSC7JSValue8isStringEv.exit122.i, %if.then.i.i.i, %if.then.i
+
+  %1 = load i8** undef, align 8
+  br i1 undef, label %do.end39, label %do.body27
+
+do.body27:                                        ; preds = %_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit
+  %tmp30 = bitcast i8* %1 to %"class.JSC::JSGlobalData"*
+  %2 = getelementptr inbounds i8** %args, i64 -1
+  %3 = bitcast i8** %2 to %"class.JSC::FunctionPtr"*
+  tail call fastcc void @_ZN3JSCL23returnToThrowTrampolineEPNS_12JSGlobalDataENS_16ReturnAddressPtrERS2_(%"class.JSC::JSGlobalData"* %tmp30, i8* undef, %"class.JSC::FunctionPtr"* %3)
+  unreachable
+
+do.end39:                                         ; preds = %_ZN3JSC7JSValue19equalSlowCaseInlineEPNS_9ExecStateES0_S0_.exit
+  ret i32 undef
+}
+
+declare void @_ZNK3JSC8JSString11resolveRopeEPNS_9ExecStateE(%"class.JSC::JSString"*, %"class.JSC::CodeLocationCall"*)
diff --git a/test/CodeGen/X86/2011-05-09-loaduse.ll b/test/CodeGen/X86/2011-05-09-loaduse.ll
new file mode 100644
index 0000000..8673d74
--- /dev/null
+++ b/test/CodeGen/X86/2011-05-09-loaduse.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+
+;CHECK: test
+;CHECK-not: pshufd
+;CHECK: ret
+define float @test(<4 x float>* %A) nounwind {
+entry:
+  %T = load <4 x float>* %A
+  %R = extractelement <4 x float> %T, i32 3
+  store <4 x float><float 0.0, float 0.0, float 0.0, float 0.0>, <4 x float>* %A
+  ret float %R
+}
+
diff --git a/test/CodeGen/X86/2011-05-26-UnreachableBlockElim.ll b/test/CodeGen/X86/2011-05-26-UnreachableBlockElim.ll
new file mode 100644
index 0000000..0f18f09
--- /dev/null
+++ b/test/CodeGen/X86/2011-05-26-UnreachableBlockElim.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -verify-coalescing
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.6.0"
+
+%struct.attrib = type { i32, i32 }
+%struct.dfa = type { [80 x i8], i32, %struct.state*, i32, i32, %struct.attrib*, i32, i32 }
+%struct.state = type { i32, [4 x i32] }
+
+@aux_temp = external global %struct.dfa, align 8
+
+declare i64 @llvm.objectsize.i64(i8*, i1) nounwind readnone
+
+declare void @__memset_chk() nounwind
+
+define void @dfa_add_string() nounwind uwtable ssp {
+entry:
+  br label %if.end.i
+
+if.end.i:                                         ; preds = %entry
+  %idxprom.i = add i64 0, 1
+  br i1 undef, label %land.end.thread.i, label %land.end.i
+
+land.end.thread.i:                                ; preds = %if.end.i
+  %0 = call i64 @llvm.objectsize.i64(i8* undef, i1 false) nounwind
+  %cmp1710.i = icmp eq i64 %0, -1
+  br i1 %cmp1710.i, label %cond.false156.i, label %cond.true138.i
+
+land.end.i:                                       ; preds = %if.end.i
+  %1 = call i64 @llvm.objectsize.i64(i8* undef, i1 false) nounwind
+  %cmp17.i = icmp eq i64 %1, -1
+  br i1 %cmp17.i, label %cond.false156.i, label %cond.true138.i
+
+cond.true138.i:                                   ; preds = %for.end.i, %land.end.thread.i
+  call void @__memset_chk() nounwind
+  br label %cond.end166.i
+
+cond.false156.i:                                  ; preds = %for.end.i, %land.end.thread.i
+  %idxprom1114.i = phi i64 [ undef, %land.end.thread.i ], [ %idxprom.i, %land.end.i ]
+  call void @__memset_chk() nounwind
+  br label %cond.end166.i
+
+cond.end166.i:                                    ; preds = %cond.false156.i, %cond.true138.i
+  %idxprom1113.i = phi i64 [ %idxprom1114.i, %cond.false156.i ], [ undef, %cond.true138.i ]
+  %tmp235.i = load %struct.state** getelementptr inbounds (%struct.dfa* @aux_temp, i64 0, i32 2), align 8, !tbaa !0
+  %att.i = getelementptr inbounds %struct.state* %tmp235.i, i64 %idxprom1113.i, i32 0
+  store i32 0, i32* %att.i, align 4, !tbaa !3
+  ret void
+}
+
+!0 = metadata !{metadata !"any pointer", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
+!3 = metadata !{metadata !"int", metadata !1}
diff --git a/test/CodeGen/X86/2011-05-27-CrossClassCoalescing.ll b/test/CodeGen/X86/2011-05-27-CrossClassCoalescing.ll
new file mode 100644
index 0000000..c595bba
--- /dev/null
+++ b/test/CodeGen/X86/2011-05-27-CrossClassCoalescing.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -verify-coalescing
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.6.0"
+
+@bit_count = external constant [256 x i32], align 16
+
+define fastcc void @unate_intersect() nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc.i
+  br label %do.body.i
+
+do.body.i:                                        ; preds = %do.body.i, %for.body
+  %exitcond149 = icmp eq i64 undef, undef
+  br i1 %exitcond149, label %land.lhs.true, label %do.body.i
+
+land.lhs.true:                                    ; preds = %do.body.i
+  br label %for.body.i
+
+for.body.i:                                       ; preds = %for.inc.i, %if.then
+  %tmp3524.i = phi i32 [ 0, %land.lhs.true ], [ %tmp351.i, %for.inc.i ]
+  %tmp6.i12 = load i32* undef, align 4
+  br i1 undef, label %for.inc.i, label %if.then.i17
+
+if.then.i17:                                      ; preds = %for.body.i
+  %shr.i14 = lshr i32 %tmp6.i12, 8
+  %and14.i = and i32 %shr.i14, 255
+  %idxprom15.i = zext i32 %and14.i to i64
+  %arrayidx16.i = getelementptr inbounds [256 x i32]* @bit_count, i64 0, i64 %idxprom15.i
+  %tmp17.i15 = load i32* %arrayidx16.i, align 4
+  %add.i = add i32 0, %tmp3524.i
+  %add24.i = add i32 %add.i, %tmp17.i15
+  %add31.i = add i32 %add24.i, 0
+  %add33.i = add i32 %add31.i, 0
+  br label %for.inc.i
+
+for.inc.i:                                        ; preds = %if.then.i17, %for.body.i
+  %tmp351.i = phi i32 [ %add33.i, %if.then.i17 ], [ %tmp3524.i, %for.body.i ]
+  br label %for.body.i
+}
diff --git a/test/CodeGen/X86/2011-05-31-movmsk.ll b/test/CodeGen/X86/2011-05-31-movmsk.ll
new file mode 100644
index 0000000..2b54d5c
--- /dev/null
+++ b/test/CodeGen/X86/2011-05-31-movmsk.ll
@@ -0,0 +1,79 @@
+; RUN: llc -mcpu=core2 < %s | FileCheck %s
+; ModuleID = '<stdin>'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.6.6"
+
+%0 = type { double }
+%union.anon = type { float }
+
+define i32 @double_signbit(double %d1) nounwind uwtable readnone ssp {
+entry:
+  %__x.addr.i = alloca double, align 8
+  %__u.i = alloca %0, align 8
+  %0 = bitcast double* %__x.addr.i to i8*
+  %1 = bitcast %0* %__u.i to i8*
+  store double %d1, double* %__x.addr.i, align 8
+  %__f.i = getelementptr inbounds %0* %__u.i, i64 0, i32 0
+  store double %d1, double* %__f.i, align 8
+  %tmp = bitcast double %d1 to i64
+; CHECK-NOT: shr
+; CHECK: movmskpd
+; CHECK-NEXT: and
+  %tmp1 = lshr i64 %tmp, 63
+  %shr.i = trunc i64 %tmp1 to i32
+  ret i32 %shr.i
+}
+
+define i32 @double_add_signbit(double %d1, double %d2) nounwind uwtable readnone ssp {
+entry:
+  %__x.addr.i = alloca double, align 8
+  %__u.i = alloca %0, align 8
+  %add = fadd double %d1, %d2
+  %0 = bitcast double* %__x.addr.i to i8*
+  %1 = bitcast %0* %__u.i to i8*
+  store double %add, double* %__x.addr.i, align 8
+  %__f.i = getelementptr inbounds %0* %__u.i, i64 0, i32 0
+  store double %add, double* %__f.i, align 8
+  %tmp = bitcast double %add to i64
+; CHECK-NOT: shr
+; CHECK: movmskpd
+; CHECK-NEXT: and
+  %tmp1 = lshr i64 %tmp, 63
+  %shr.i = trunc i64 %tmp1 to i32
+  ret i32 %shr.i
+}
+
+define i32 @float_signbit(float %f1) nounwind uwtable readnone ssp {
+entry:
+  %__x.addr.i = alloca float, align 4
+  %__u.i = alloca %union.anon, align 4
+  %0 = bitcast float* %__x.addr.i to i8*
+  %1 = bitcast %union.anon* %__u.i to i8*
+  store float %f1, float* %__x.addr.i, align 4
+  %__f.i = getelementptr inbounds %union.anon* %__u.i, i64 0, i32 0
+  store float %f1, float* %__f.i, align 4
+  %2 = bitcast float %f1 to i32
+; CHECK-NOT: shr
+; CHECK: movmskps
+; CHECK-NEXT: and
+  %shr.i = lshr i32 %2, 31
+  ret i32 %shr.i
+}
+
+define i32 @float_add_signbit(float %f1, float %f2) nounwind uwtable readnone ssp {
+entry:
+  %__x.addr.i = alloca float, align 4
+  %__u.i = alloca %union.anon, align 4
+  %add = fadd float %f1, %f2
+  %0 = bitcast float* %__x.addr.i to i8*
+  %1 = bitcast %union.anon* %__u.i to i8*
+  store float %add, float* %__x.addr.i, align 4
+  %__f.i = getelementptr inbounds %union.anon* %__u.i, i64 0, i32 0
+  store float %add, float* %__f.i, align 4
+  %2 = bitcast float %add to i32
+; CHECK-NOT: shr
+; CHECK: movmskps
+; CHECK-NEXT: and
+  %shr.i = lshr i32 %2, 31
+  ret i32 %shr.i
+}
diff --git a/test/CodeGen/X86/2011-06-01-fildll.ll b/test/CodeGen/X86/2011-06-01-fildll.ll
new file mode 100644
index 0000000..3a0b05f
--- /dev/null
+++ b/test/CodeGen/X86/2011-06-01-fildll.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=x86 | FileCheck %s
+; ModuleID = '<stdin>'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+target triple = "i386-apple-macosx10.6.6"
+
+define float @f(i64* nocapture %x) nounwind readonly ssp {
+entry:
+; CHECK: movl
+; CHECK-NOT: movl
+  %tmp1 = load i64* %x, align 4
+; CHECK: fildll
+  %conv = sitofp i64 %tmp1 to float
+  %add = fadd float %conv, 1.000000e+00
+  ret float %add
+}
diff --git a/test/CodeGen/X86/2011-06-03-x87chain.ll b/test/CodeGen/X86/2011-06-03-x87chain.ll
new file mode 100644
index 0000000..bf7f583
--- /dev/null
+++ b/test/CodeGen/X86/2011-06-03-x87chain.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -march=x86 -mattr=+sse | FileCheck %s
+
+define float @chainfail1(i64* nocapture %a, i64* nocapture %b, i32 %x, i32 %y, float* nocapture %f) nounwind uwtable noinline ssp {
+entry:
+  %tmp1 = load i64* %a, align 8
+; Insure x87 ops are properly chained, order preserved.
+; CHECK: fildll
+  %conv = sitofp i64 %tmp1 to float
+; CHECK: fstps
+  store float %conv, float* %f, align 4
+; CHECK: idivl
+  %div = sdiv i32 %x, %y
+  %conv5 = sext i32 %div to i64
+  store i64 %conv5, i64* %b, align 8
+  ret float %conv
+}
+
+define float @chainfail2(i64* nocapture %a, i64* nocapture %b, i32 %x, i32 %y, float* nocapture %f) nounwind uwtable noinline ssp {
+entry:
+; CHECK: movl $0,
+  store i64 0, i64* %b, align 8
+  %mul = mul nsw i32 %y, %x
+  %sub = add nsw i32 %mul, -1
+  %idxprom = sext i32 %sub to i64
+  %arrayidx = getelementptr inbounds i64* %a, i64 %idxprom
+  %tmp4 = load i64* %arrayidx, align 8
+; CHECK: fildll
+  %conv = sitofp i64 %tmp4 to float
+  store float %conv, float* %f, align 4
+  ret float %conv
+}
diff --git a/test/CodeGen/X86/2011-06-06-fgetsign80bit.ll b/test/CodeGen/X86/2011-06-06-fgetsign80bit.ll
new file mode 100644
index 0000000..d934148
--- /dev/null
+++ b/test/CodeGen/X86/2011-06-06-fgetsign80bit.ll
@@ -0,0 +1,8 @@
+; RUN: llc -march=x86-64 < %s
+define i32 @signbitl(x86_fp80 %x) nounwind uwtable readnone {
+entry:
+  %tmp4 = bitcast x86_fp80 %x to i80
+  %tmp4.lobit = lshr i80 %tmp4, 79
+  %tmp = trunc i80 %tmp4.lobit to i32
+  ret i32 %tmp
+}
diff --git a/test/CodeGen/X86/2011-06-12-FastAllocSpill.ll b/test/CodeGen/X86/2011-06-12-FastAllocSpill.ll
new file mode 100644
index 0000000..a51dad0
--- /dev/null
+++ b/test/CodeGen/X86/2011-06-12-FastAllocSpill.ll
@@ -0,0 +1,52 @@
+; RUN: llc < %s -O0 -disable-fp-elim -relocation-model=pic -stats |& FileCheck %s
+;
+; This test should not cause any spilling with RAFast.
+;
+; CHECK: Number of copies coalesced
+; CHECK-NOT: Number of stores added
+;
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+%0 = type { i64, i64, i8*, i8* }
+%1 = type opaque
+%2 = type opaque
+%3 = type <{ i8*, i32, i32, void (%4*)*, i8*, i64 }>
+%4 = type { i8**, i32, i32, i8**, %5*, i64 }
+%5 = type { i64, i64 }
+%6 = type { i8*, i32, i32, i8*, %5* }
+
+@0 = external hidden constant %0
+
+define hidden void @f() ssp {
+bb:
+  %tmp5 = alloca i64, align 8
+  %tmp6 = alloca void ()*, align 8
+  %tmp7 = alloca %3, align 8
+  store i64 0, i64* %tmp5, align 8
+  br label %bb8
+
+bb8:                                              ; preds = %bb23, %bb
+  %tmp15 = getelementptr inbounds %3* %tmp7, i32 0, i32 4
+  store i8* bitcast (%0* @0 to i8*), i8** %tmp15
+  %tmp16 = bitcast %3* %tmp7 to void ()*
+  store void ()* %tmp16, void ()** %tmp6, align 8
+  %tmp17 = load void ()** %tmp6, align 8
+  %tmp18 = bitcast void ()* %tmp17 to %6*
+  %tmp19 = getelementptr inbounds %6* %tmp18, i32 0, i32 3
+  %tmp20 = bitcast %6* %tmp18 to i8*
+  %tmp21 = load i8** %tmp19
+  %tmp22 = bitcast i8* %tmp21 to void (i8*)*
+  call void %tmp22(i8* %tmp20)
+  br label %bb23
+
+bb23:                                             ; preds = %bb8
+  %tmp24 = load i64* %tmp5, align 8
+  %tmp25 = add i64 %tmp24, 1
+  store i64 %tmp25, i64* %tmp5, align 8
+  %tmp26 = icmp ult i64 %tmp25, 10
+  br i1 %tmp26, label %bb8, label %bb27
+
+bb27:                                             ; preds = %bb23
+  ret void
+}
diff --git a/test/CodeGen/X86/2011-06-14-PreschedRegalias.ll b/test/CodeGen/X86/2011-06-14-PreschedRegalias.ll
new file mode 100644
index 0000000..0532375
--- /dev/null
+++ b/test/CodeGen/X86/2011-06-14-PreschedRegalias.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -march=x86-64 -stress-sched | FileCheck %s
+; REQUIRES: Asserts
+; Test interference between physreg aliases during preRAsched.
+; mul wants an operand in AL, but call clobbers it.
+
+define i8 @f(i8 %v1, i8 %v2) nounwind {
+entry:
+; CHECK: callq
+; CHECK: movb %{{.*}}, %al
+; CHECK: mulb
+; CHECK: mulb
+        %rval = tail call i8 @bar() nounwind
+        %m1 = mul i8 %v1, %v2
+        %m2 = mul i8 %m1, %rval
+        ret i8 %m2
+}
+
+declare i8 @bar()
diff --git a/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll b/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll
new file mode 100644
index 0000000..445fc01
--- /dev/null
+++ b/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll
@@ -0,0 +1,45 @@
+; RUN: llc -mcpu=i686 -mattr=+mmx < %s | FileCheck %s
+; ModuleID = 'tq.c'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+target triple = "i386-apple-macosx10.6.6"
+
+%0 = type { x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx }
+
+define i32 @pixman_fill_mmx(i32* nocapture %bits, i32 %stride, i32 %bpp, i32 %x, i32 %y, i32 %width, i32 %height, i32 %xor) nounwind ssp {
+entry:
+  %conv = zext i32 %xor to i64
+  %shl = shl nuw i64 %conv, 32
+  %or = or i64 %shl, %conv
+  %0 = bitcast i64 %or to x86_mmx
+; CHECK:      movq [[MMXR:%mm[0-7],]] {{%mm[0-7]}}
+; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
+; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
+; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
+; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
+; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
+; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}}
+  %1 = tail call %0 asm "movq\09\09$7,\09$0\0Amovq\09\09$7,\09$1\0Amovq\09\09$7,\09$2\0Amovq\09\09$7,\09$3\0Amovq\09\09$7,\09$4\0Amovq\09\09$7,\09$5\0Amovq\09\09$7,\09$6\0A", "=&y,=&y,=&y,=&y,=&y,=&y,=y,y,~{dirflag},~{fpsr},~{flags}"(x86_mmx %0) nounwind, !srcloc !0
+  %asmresult = extractvalue %0 %1, 0
+  %asmresult6 = extractvalue %0 %1, 1
+  %asmresult7 = extractvalue %0 %1, 2
+  %asmresult8 = extractvalue %0 %1, 3
+  %asmresult9 = extractvalue %0 %1, 4
+  %asmresult10 = extractvalue %0 %1, 5
+  %asmresult11 = extractvalue %0 %1, 6
+; CHECK:      movq {{%mm[0-7]}},
+; CHECK-NEXT: movq {{%mm[0-7]}},
+; CHECK-NEXT: movq {{%mm[0-7]}},
+; CHECK-NEXT: movq {{%mm[0-7]}},
+; CHECK-NEXT: movq {{%mm[0-7]}},
+; CHECK-NEXT: movq {{%mm[0-7]}},
+; CHECK-NEXT: movq {{%mm[0-7]}},
+; CHECK-NEXT: movq {{%mm[0-7]}},
+  tail call void asm sideeffect "movq\09$1,\09  ($0)\0Amovq\09$2,\09 8($0)\0Amovq\09$3,\0916($0)\0Amovq\09$4,\0924($0)\0Amovq\09$5,\0932($0)\0Amovq\09$6,\0940($0)\0Amovq\09$7,\0948($0)\0Amovq\09$8,\0956($0)\0A", "r,y,y,y,y,y,y,y,y,~{memory},~{dirflag},~{fpsr},~{flags}"(i8* undef, x86_mmx %0, x86_mmx %asmresult, x86_mmx %asmresult6, x86_mmx %asmresult7, x86_mmx %asmresult8, x86_mmx %asmresult9, x86_mmx %asmresult10, x86_mmx %asmresult11) nounwind, !srcloc !1
+  tail call void @llvm.x86.mmx.emms() nounwind
+  ret i32 1
+}
+
+declare void @llvm.x86.mmx.emms() nounwind
+
+!0 = metadata !{i32 888, i32 917, i32 945, i32 973, i32 1001, i32 1029, i32 1057}
+!1 = metadata !{i32 1390, i32 1430, i32 1469, i32 1508, i32 1547, i32 1586, i32 1625, i32 1664}
diff --git a/test/CodeGen/X86/2011-06-19-QuicksortCoalescerBug.ll b/test/CodeGen/X86/2011-06-19-QuicksortCoalescerBug.ll
new file mode 100644
index 0000000..08178a3
--- /dev/null
+++ b/test/CodeGen/X86/2011-06-19-QuicksortCoalescerBug.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -verify-coalescing
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.7.0"
+
+define void @Quicksort(i32* %a, i32 %l, i32 %r) nounwind ssp {
+entry:
+  br label %tailrecurse
+
+tailrecurse:                                      ; preds = %do.cond, %entry
+  %l.tr = phi i32 [ %l, %entry ], [ %i.1, %do.cond ]
+  %r.tr = phi i32 [ %r, %entry ], [ %l.tr, %do.cond ]
+  %idxprom12 = sext i32 %r.tr to i64
+  %arrayidx14 = getelementptr inbounds i32* %a, i64 %idxprom12
+  br label %do.body
+
+do.body:                                          ; preds = %do.cond, %tailrecurse
+  %i.0 = phi i32 [ %l.tr, %tailrecurse ], [ %i.1, %do.cond ]
+  %add7 = add nsw i32 %i.0, 1
+  %cmp = icmp sgt i32 %add7, %r.tr
+  br i1 %cmp, label %do.cond, label %if.then
+
+if.then:                                          ; preds = %do.body
+  store i32 %add7, i32* %arrayidx14, align 4
+  %add16 = add i32 %i.0, 2
+  br label %do.cond
+
+do.cond:                                          ; preds = %do.body, %if.then
+  %i.1 = phi i32 [ %add16, %if.then ], [ %add7, %do.body ]
+  %cmp19 = icmp sgt i32 %i.1, %r.tr
+  br i1 %cmp19, label %tailrecurse, label %do.body
+}
diff --git a/test/CodeGen/X86/3dnow-intrinsics.ll b/test/CodeGen/X86/3dnow-intrinsics.ll
new file mode 100644
index 0000000..0b27bf2
--- /dev/null
+++ b/test/CodeGen/X86/3dnow-intrinsics.ll
@@ -0,0 +1,297 @@
+; RUN: llc < %s -march=x86 -mattr=+3dnow | FileCheck %s
+
+define <8 x i8> @test_pavgusb(x86_mmx %a.coerce, x86_mmx %b.coerce) nounwind readnone {
+; CHECK: pavgusb
+entry:
+  %0 = bitcast x86_mmx %a.coerce to <8 x i8>
+  %1 = bitcast x86_mmx %b.coerce to <8 x i8>
+  %2 = bitcast <8 x i8> %0 to x86_mmx
+  %3 = bitcast <8 x i8> %1 to x86_mmx
+  %4 = call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %2, x86_mmx %3)
+  %5 = bitcast x86_mmx %4 to <8 x i8>
+  ret <8 x i8> %5
+}
+
+declare x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x i32> @test_pf2id(<2 x float> %a) nounwind readnone {
+; CHECK: pf2id
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx %0)
+  %2 = bitcast x86_mmx %1 to <2 x i32>
+  ret <2 x i32> %2
+}
+
+declare x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfacc(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; CHECK: pfacc
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfadd(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; CHECK: pfadd
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x i32> @test_pfcmpeq(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; CHECK: pfcmpeq
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  ret <2 x i32> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x i32> @test_pfcmpge(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; CHECK: pfcmpge
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  ret <2 x i32> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x i32> @test_pfcmpgt(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; CHECK: pfcmpgt
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  ret <2 x i32> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfmax(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; CHECK: pfmax
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfmin(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; CHECK: pfmin
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfmul(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; CHECK: pfmul
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfrcp(<2 x float> %a) nounwind readnone {
+; CHECK: pfrcp
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx %0)
+  %2 = bitcast x86_mmx %1 to <2 x float>
+  ret <2 x float> %2
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfrcpit1(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; CHECK: pfrcpit1
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfrcpit2(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; CHECK: pfrcpit2
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfrsqrt(<2 x float> %a) nounwind readnone {
+; CHECK: pfrsqrt
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx %0)
+  %2 = bitcast x86_mmx %1 to <2 x float>
+  ret <2 x float> %2
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfrsqit1(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; CHECK: pfrsqit1
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfsub(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; CHECK: pfsub
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfsubr(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; CHECK: pfsubr
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pi2fd(x86_mmx %a.coerce) nounwind readnone {
+; CHECK: pi2fd
+entry:
+  %0 = bitcast x86_mmx %a.coerce to <2 x i32>
+  %1 = bitcast <2 x i32> %0 to x86_mmx
+  %2 = call x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx) nounwind readnone
+
+define <4 x i16> @test_pmulhrw(x86_mmx %a.coerce, x86_mmx %b.coerce) nounwind readnone {
+; CHECK: pmulhrw
+entry:
+  %0 = bitcast x86_mmx %a.coerce to <4 x i16>
+  %1 = bitcast x86_mmx %b.coerce to <4 x i16>
+  %2 = bitcast <4 x i16> %0 to x86_mmx
+  %3 = bitcast <4 x i16> %1 to x86_mmx
+  %4 = call x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx %2, x86_mmx %3)
+  %5 = bitcast x86_mmx %4 to <4 x i16>
+  ret <4 x i16> %5
+}
+
+declare x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x i32> @test_pf2iw(<2 x float> %a) nounwind readnone {
+; CHECK: pf2iw
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx %0)
+  %2 = bitcast x86_mmx %1 to <2 x i32>
+  ret <2 x i32> %2
+}
+
+declare x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfnacc(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; CHECK: pfnacc
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfpnacc(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; CHECK: pfpnacc
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pi2fw(x86_mmx %a.coerce) nounwind readnone {
+; CHECK: pi2fw
+entry:
+  %0 = bitcast x86_mmx %a.coerce to <2 x i32>
+  %1 = bitcast <2 x i32> %0 to x86_mmx
+  %2 = call x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx) nounwind readnone
+
+define <2 x float> @test_pswapdsf(<2 x float> %a) nounwind readnone {
+; CHECK: pswapd
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx %0)
+  %2 = bitcast x86_mmx %1 to <2 x float>
+  ret <2 x float> %2
+}
+
+define <2 x i32> @test_pswapdsi(<2 x i32> %a) nounwind readnone {
+; CHECK: pswapd
+entry:
+  %0 = bitcast <2 x i32> %a to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx %0)
+  %2 = bitcast x86_mmx %1 to <2 x i32>
+  ret <2 x i32> %2
+}
+
+declare x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx) nounwind readnone
diff --git a/test/CodeGen/X86/4char-promote.ll b/test/CodeGen/X86/4char-promote.ll
new file mode 100644
index 0000000..386057f
--- /dev/null
+++ b/test/CodeGen/X86/4char-promote.ll
@@ -0,0 +1,17 @@
+; A test for checking PR 9623
+;RUN: llc -march=x86-64 -mcpu=corei7 -promote-elements < %s | FileCheck %s
+
+target triple = "x86_64-apple-darwin"
+
+; CHECK:  pmulld 
+; CHECK:  paddd  
+; CHECK:  movdqa 
+
+define <4 x i8> @foo(<4 x i8> %x, <4 x i8> %y) {
+entry:
+ %binop = mul <4 x i8> %x, %y
+ %binop6 = add <4 x i8> %binop, %x
+ ret <4 x i8> %binop6
+}
+
+
diff --git a/test/CodeGen/X86/9601.ll b/test/CodeGen/X86/9601.ll
new file mode 100644
index 0000000..cd65a03
--- /dev/null
+++ b/test/CodeGen/X86/9601.ll
@@ -0,0 +1,12 @@
+; RUN:  llc < %s -mtriple=x86_64-unknown-linux-gnu
+; PR9601
+; Previously we'd crash trying to put a 32-bit float into a constraint
+; for a normal 'r' register.
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test() {
+entry:
+  %0 = call float asm sideeffect "xchg $0, $1", "=r,*m,0,~{memory},~{dirflag},~{fpsr},~{flags}"(i32* undef, float 2.000000e+00) nounwind
+  unreachable
+}
diff --git a/test/CodeGen/X86/GC/simple_ocaml.ll b/test/CodeGen/X86/GC/simple_ocaml.ll
deleted file mode 100644
index f765dc0..0000000
--- a/test/CodeGen/X86/GC/simple_ocaml.ll
+++ /dev/null
@@ -1,42 +0,0 @@
-; RUN: llc < %s | grep caml.*__frametable
-; RUN: llc < %s -march=x86 | grep {movl	.0}
-
-%struct.obj = type { i8*, %struct.obj* }
-
-define %struct.obj* @fun(%struct.obj* %head) gc "ocaml" {
-entry:
-	%gcroot.0 = alloca i8*
-	%gcroot.1 = alloca i8*
-	
-	call void @llvm.gcroot(i8** %gcroot.0, i8* null)
-	call void @llvm.gcroot(i8** %gcroot.1, i8* null)
-	
-	%local.0 = bitcast i8** %gcroot.0 to %struct.obj**
-	%local.1 = bitcast i8** %gcroot.1 to %struct.obj**
-
-	store %struct.obj* %head, %struct.obj** %local.0
-	br label %bb.loop
-bb.loop:
-	%t0 = load %struct.obj** %local.0
-	%t1 = getelementptr %struct.obj* %t0, i32 0, i32 1
-	%t2 = bitcast %struct.obj* %t0 to i8*
-	%t3 = bitcast %struct.obj** %t1 to i8**
-	%t4 = call i8* @llvm.gcread(i8* %t2, i8** %t3)
-	%t5 = bitcast i8* %t4 to %struct.obj*
-	%t6 = icmp eq %struct.obj* %t5, null
-	br i1 %t6, label %bb.loop, label %bb.end
-bb.end:
-	%t7 = malloc %struct.obj
-	store %struct.obj* %t7, %struct.obj** %local.1
-	%t8 = bitcast %struct.obj* %t7 to i8*
-	%t9 = load %struct.obj** %local.0
-	%t10 = getelementptr %struct.obj* %t9, i32 0, i32 1
-	%t11 = bitcast %struct.obj* %t9 to i8*
-	%t12 = bitcast %struct.obj** %t10 to i8**
-	call void @llvm.gcwrite(i8* %t8, i8* %t11, i8** %t12)
-	ret %struct.obj* %t7
-}
-
-declare void @llvm.gcroot(i8** %value, i8* %tag)
-declare void @llvm.gcwrite(i8* %value, i8* %obj, i8** %field)
-declare i8* @llvm.gcread(i8* %obj, i8** %field)
diff --git a/test/CodeGen/X86/abi-isel.ll b/test/CodeGen/X86/abi-isel.ll
index 7535e07..5068d29 100644
--- a/test/CodeGen/X86/abi-isel.ll
+++ b/test/CodeGen/X86/abi-isel.ll
@@ -12,17 +12,6 @@
 ; RUN: llc < %s -asm-verbose=0 -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=dynamic-no-pic -code-model=small | FileCheck %s -check-prefix=DARWIN-64-DYNAMIC
 ; RUN: llc < %s -asm-verbose=0 -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=pic -code-model=small | FileCheck %s -check-prefix=DARWIN-64-PIC
 
-; RUN: llc < %s -asm-verbose=0 -regalloc=basic -mtriple=i686-unknown-linux-gnu -march=x86 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=LINUX-32-STATIC
-; RUN: llc < %s -asm-verbose=0 -regalloc=basic -mtriple=i686-unknown-linux-gnu -march=x86 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=LINUX-32-PIC
-; RUN: llc < %s -asm-verbose=0 -regalloc=basic -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=LINUX-64-STATIC
-; RUN: llc < %s -asm-verbose=0 -regalloc=basic -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=pic -code-model=small | FileCheck %s -check-prefix=LINUX-64-PIC
-; RUN: llc < %s -asm-verbose=0 -regalloc=basic -mtriple=i686-apple-darwin -march=x86 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=DARWIN-32-STATIC
-; RUN: llc < %s -asm-verbose=0 -regalloc=basic -mtriple=i686-apple-darwin -march=x86 -relocation-model=dynamic-no-pic -code-model=small | FileCheck %s -check-prefix=DARWIN-32-DYNAMIC
-; RUN: llc < %s -asm-verbose=0 -regalloc=basic -mtriple=i686-apple-darwin -march=x86 -relocation-model=pic -code-model=small | FileCheck %s -check-prefix=DARWIN-32-PIC
-; RUN: llc < %s -asm-verbose=0 -regalloc=basic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=DARWIN-64-STATIC
-; RUN: llc < %s -asm-verbose=0 -regalloc=basic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=dynamic-no-pic -code-model=small | FileCheck %s -check-prefix=DARWIN-64-DYNAMIC
-; RUN: llc < %s -asm-verbose=0 -regalloc=basic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=pic -code-model=small | FileCheck %s -check-prefix=DARWIN-64-PIC
-
 @src = external global [131072 x i32]
 @dst = external global [131072 x i32]
 @xsrc = external global [32 x i32]
diff --git a/test/CodeGen/X86/add-of-carry.ll b/test/CodeGen/X86/add-of-carry.ll
index f924ec8..a4abccb 100644
--- a/test/CodeGen/X86/add-of-carry.ll
+++ b/test/CodeGen/X86/add-of-carry.ll
@@ -4,9 +4,9 @@
 define i32 @test1(i32 %sum, i32 %x) nounwind readnone ssp {
 entry:
 ; CHECK: test1:
-; CHECK:	sbbl	%ecx, %ecx
+; CHECK: cmpl %ecx, %eax
 ; CHECK-NOT: addl
-; CHECK: subl	%ecx, %eax
+; CHECK: adcl $0, %eax
   %add4 = add i32 %x, %sum
   %cmp = icmp ult i32 %add4, %x
   %inc = zext i1 %cmp to i32
@@ -18,8 +18,7 @@ entry:
 ; CHECK: test2:
 ; CHECK: movl
 ; CHECK-NEXT: addl
-; CHECK-NEXT: sbbl
-; CHECK-NEXT: subl
+; CHECK-NEXT: adcl $0
 ; CHECK-NEXT: ret
 define i32 @test2(i32 %sum, i32 %x) nounwind readnone ssp {
 entry:
diff --git a/test/CodeGen/X86/add.ll b/test/CodeGen/X86/add.ll
index b95e5b5..7bf527a 100644
--- a/test/CodeGen/X86/add.ll
+++ b/test/CodeGen/X86/add.ll
@@ -1,6 +1,8 @@
 ; RUN: llc < %s -march=x86 | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-linux -join-physregs | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-win32 -join-physregs | FileCheck %s -check-prefix=X64
+
+; Some of these tests depend on -join-physregs to commute instructions.
 
 ; The immediate can be encoded in a smaller way if the
 ; instruction is a sub instead of an add.
diff --git a/test/CodeGen/X86/aliases.ll b/test/CodeGen/X86/aliases.ll
index 3ed3bd6..f920279 100644
--- a/test/CodeGen/X86/aliases.ll
+++ b/test/CodeGen/X86/aliases.ll
@@ -1,6 +1,4 @@
 ; RUN: llc < %s -mtriple=i686-pc-linux-gnu -asm-verbose=false -o %t
-; RUN: grep { = } %t   | count 16
-; RUN: grep set %t   | count 18
 ; RUN: grep globl %t | count 6
 ; RUN: grep weak %t  | count 1
 ; RUN: grep hidden %t | count 1
diff --git a/test/CodeGen/X86/alignment.ll b/test/CodeGen/X86/alignment.ll
index 9678e6d..7e91115 100644
--- a/test/CodeGen/X86/alignment.ll
+++ b/test/CodeGen/X86/alignment.ll
@@ -6,7 +6,7 @@
 
 ; CHECK:	.bss
 ; CHECK:	.globl	GlobalA
-; CHECK:	.align	16
+; CHECK:	.align	8
 ; CHECK: GlobalA:
 ; CHECK:	.zero	384
 
@@ -15,12 +15,12 @@
 ; PR6921
 @GlobalB = common global { [384 x i8] } zeroinitializer, align 8
 
-; CHECK: 	.comm	GlobalB,384,16
+; CHECK: 	.comm	GlobalB,384,8
 
 
 @GlobalC = common global { [384 x i8] } zeroinitializer, align 2
 
-; CHECK: 	.comm	GlobalC,384,16
+; CHECK: 	.comm	GlobalC,384,2
 
 
 
diff --git a/test/CodeGen/X86/andimm8.ll b/test/CodeGen/X86/andimm8.ll
index 640237d..a3dc85f 100644
--- a/test/CodeGen/X86/andimm8.ll
+++ b/test/CodeGen/X86/andimm8.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux-gnu -show-mc-encoding | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux-gnu -show-mc-encoding -join-physregs | FileCheck %s
 
 ; PR8365
 ; CHECK: andl	$-64, %edi              # encoding: [0x83,0xe7,0xc0]
diff --git a/test/CodeGen/X86/asm-label.ll b/test/CodeGen/X86/asm-label.ll
new file mode 100644
index 0000000..1fc6e2e
--- /dev/null
+++ b/test/CodeGen/X86/asm-label.ll
@@ -0,0 +1,40 @@
+; RUN: llc -mtriple=x86_64-apple-darwin10 -O0 < %s | FileCheck %s
+
+; test that we print a label that we use. We had a bug where
+; we would print the jump, but not the label because it was considered
+; a fall through.
+
+; CHECK:        jmp     LBB0_9
+; CHECK: LBB0_9:                                 ## %cleanup
+
+define void @foo()  {
+entry:
+  br i1 undef, label %land.lhs.true, label %if.end11
+
+land.lhs.true:                                    ; preds = %entry
+  br i1 undef, label %if.then, label %if.end11
+
+if.then:                                          ; preds = %land.lhs.true
+  br i1 undef, label %if.then9, label %if.end
+
+if.then9:                                         ; preds = %if.then
+  br label %cleanup
+
+if.end:                                           ; preds = %if.then
+  br label %cleanup
+
+cleanup:                                          ; preds = %if.end, %if.then9
+  switch i32 undef, label %unreachable [
+    i32 0, label %cleanup.cont
+    i32 1, label %if.end11
+  ]
+
+cleanup.cont:                                     ; preds = %cleanup
+  br label %if.end11
+
+if.end11:                                         ; preds = %cleanup.cont, %cleanup, %land.lhs.true, %entry
+  ret void
+
+unreachable:                                      ; preds = %cleanup
+  unreachable
+}
diff --git a/test/CodeGen/X86/asm-label2.ll b/test/CodeGen/X86/asm-label2.ll
new file mode 100644
index 0000000..0b5de34
--- /dev/null
+++ b/test/CodeGen/X86/asm-label2.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple=x86_64-apple-darwin10 -O0 < %s | FileCheck %s
+
+; test that we print a label that we use. We had a bug where
+; we would print the jump, but not the label because it was considered
+; a fall through.
+
+; CHECK:        jmp     LBB0_1
+; CHECK: LBB0_1:
+
+define void @foobar()  {
+entry:
+  invoke void @_zed()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %entry
+  ret void
+
+lpad:                                             ; preds = %entry
+  unreachable
+}
+
+declare void @_zed() ssp align 2
diff --git a/test/CodeGen/X86/avx-128.ll b/test/CodeGen/X86/avx-128.ll
index 2bd3b5d..c29cb5d 100644
--- a/test/CodeGen/X86/avx-128.ll
+++ b/test/CodeGen/X86/avx-128.ll
@@ -10,3 +10,13 @@ entry:
   ret void
 }
 
+define void @fpext() nounwind uwtable {
+entry:
+  %f = alloca float, align 4
+  %d = alloca double, align 8
+  %tmp = load float* %f, align 4
+  ; CHECK: vcvtss2sd
+  %conv = fpext float %tmp to double
+  store double %conv, double* %d, align 8
+  ret void
+}
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index 6c32396..5201688 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -247,7 +247,7 @@ declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind
 
 define <16 x i8> @test_x86_sse2_loadu_dq(i8* %a0) {
   ; CHECK: movl
-  ; CHECK: vmovdqu
+  ; CHECK: vmovups
   %res = call <16 x i8> @llvm.x86.sse2.loadu.dq(i8* %a0) ; <<16 x i8>> [#uses=1]
   ret <16 x i8> %res
 }
@@ -256,7 +256,7 @@ declare <16 x i8> @llvm.x86.sse2.loadu.dq(i8*) nounwind readonly
 
 define <2 x double> @test_x86_sse2_loadu_pd(i8* %a0) {
   ; CHECK: movl
-  ; CHECK: vmovupd
+  ; CHECK: vmovups
   %res = call <2 x double> @llvm.x86.sse2.loadu.pd(i8* %a0) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
 }
diff --git a/test/CodeGen/X86/basic-promote-integers.ll b/test/CodeGen/X86/basic-promote-integers.ll
new file mode 100644
index 0000000..c80f2b0
--- /dev/null
+++ b/test/CodeGen/X86/basic-promote-integers.ll
@@ -0,0 +1,98 @@
+; Test that vectors are scalarized/lowered correctly
+; (with both legalization methods).
+; RUN: llc -march=x86 -promote-elements < %s
+; RUN: llc -march=x86                   < %s
+
+; A simple test to check copyToParts and copyFromParts.
+
+define <4 x i64> @test_param_0(<4 x i64> %A, <2 x i32> %B, <4 x i8> %C)  {
+   ret <4 x i64> %A
+}
+
+define <2 x i32> @test_param_1(<4 x i64> %A, <2 x i32> %B, <4 x i8> %C)  {
+   ret <2 x i32> %B
+}
+
+define <4 x i8> @test_param_2(<4 x i64> %A, <2 x i32> %B, <4 x i8> %C)  {
+   ret <4 x i8> %C
+}
+
+; Simple tests to check arithmetic and vector operations on types which need to
+; be legalized (no loads/stores to/from memory here).
+
+define <4 x i64> @test_arith_0(<4 x i64> %A, <2 x i32> %B, <4 x i8> %C)  {
+   %K = add <4 x i64> %A, <i64 0, i64 1, i64 3, i64 9>
+   ret <4 x i64> %K
+}
+
+define <2 x i32> @test_arith_1(<4 x i64> %A, <2 x i32> %B, <4 x i8> %C)  {
+   %K = add <2 x i32> %B, <i32 0, i32 1>
+   ret <2 x i32> %K
+}
+
+define <4 x i8> @test_arith_2(<4 x i64> %A, <2 x i32> %B, <4 x i8> %C)  {
+   %K = add <4 x i8> %C, <i8 0, i8 1, i8 3, i8 9>
+   ret <4 x i8> %K
+}
+
+define i8 @test_arith_3(<4 x i64> %A, <2 x i32> %B, <4 x i8> %C)  {
+   %K = add <4 x i8> %C, <i8 0, i8 1, i8 3, i8 9>
+   %Y = extractelement <4 x i8> %K, i32 1
+   ret i8 %Y
+}
+
+define <4 x i8> @test_arith_4(<4 x i64> %A, <2 x i32> %B, <4 x i8> %C)  {
+   %Y = insertelement <4 x i8> %C, i8 1, i32 0
+   ret <4 x i8> %Y
+}
+
+define <4 x i32> @test_arith_5(<4 x i64> %A, <2 x i32> %B, <4 x i32> %C)  {
+   %Y = insertelement <4 x i32> %C, i32 1, i32 0
+   ret <4 x i32> %Y
+}
+
+define <4 x i32> @test_arith_6(<4 x i64> %A, <2 x i32> %B, <4 x i32> %C)  {
+   %F = extractelement <2 x i32> %B, i32 1
+   %Y = insertelement <4 x i32> %C, i32 %F, i32 0
+   ret <4 x i32> %Y
+}
+
+define <4 x i64> @test_arith_7(<4 x i64> %A, <2 x i32> %B, <4 x i32> %C)  {
+   %F = extractelement <2 x i32> %B, i32 1
+   %W = zext i32 %F to i64
+   %Y = insertelement <4 x i64> %A, i64 %W, i32 0
+   ret <4 x i64> %Y
+}
+
+define i64 @test_arith_8(<4 x i64> %A, <2 x i32> %B, <4 x i32> %C)  {
+   %F = extractelement <2 x i32> %B, i32 1
+   %W = zext i32 %F to i64
+   %T = add i64 %W , 11
+   ret i64 %T
+}
+
+define <4 x i64> @test_arith_9(<4 x i64> %A, <2 x i32> %B, <4 x i16> %C)  {
+   %T = add <4 x i16> %C, %C
+   %F0 = extractelement <4 x i16> %T, i32 0
+   %F1 = extractelement <4 x i16> %T, i32 1
+   %W0 = zext i16 %F0 to i64
+   %W1 = zext i16 %F1 to i64
+   %Y0 = insertelement <4 x i64> %A,  i64 %W0, i32 0
+   %Y1 = insertelement <4 x i64> %Y0, i64 %W1, i32 2
+   ret <4 x i64> %Y1
+}
+
+define <4 x i16> @test_arith_10(<4 x i64> %A, <2 x i32> %B, <4 x i32> %C)  {
+   %F = bitcast <2 x i32> %B to <4 x i16>
+   %T = add <4 x i16> %F , <i16 0, i16 1, i16 2, i16 3>
+   ret <4 x i16> %T
+}
+
+
+; Simple tests to check saving/loading from memory
+define <4 x i16> @test_mem_0(<4 x i64> %A, <2 x i32> %B, <4 x i32> %C)  {
+   %F = bitcast <2 x i32> %B to <4 x i16>
+   %T = add <4 x i16> %F , <i16 0, i16 1, i16 2, i16 3>
+   ret <4 x i16> %T
+}
+
diff --git a/test/CodeGen/X86/bool-zext.ll b/test/CodeGen/X86/bool-zext.ll
index d2c30c6..3558376 100644
--- a/test/CodeGen/X86/bool-zext.ll
+++ b/test/CodeGen/X86/bool-zext.ll
@@ -1,8 +1,12 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN64
 
-; CHECK: @bar1
-; CHECK: movzbl
-; CHECK: callq
+; X64: @bar1
+; X64: movzbl
+; X64: jmp
+; WIN64: @bar1
+; WIN64: movzbl
+; WIN64: callq
 define void @bar1(i1 zeroext %v1) nounwind ssp {
 entry:
   %conv = zext i1 %v1 to i32
@@ -10,9 +14,12 @@ entry:
   ret void
 }
 
-; CHECK: @bar2
-; CHECK-NOT: movzbl
-; CHECK: callq
+; X64: @bar2
+; X64-NOT: movzbl
+; X64: jmp
+; WIN64: @bar2
+; WIN64-NOT: movzbl
+; WIN64: callq
 define void @bar2(i8 zeroext %v1) nounwind ssp {
 entry:
   %conv = zext i8 %v1 to i32
@@ -20,11 +27,16 @@ entry:
   ret void
 }
 
-; CHECK: @bar3
-; CHECK: callq
-; CHECK-NOT: movzbl
-; CHECK-NOT: and
-; CHECK: ret
+; X64: @bar3
+; X64: callq
+; X64-NOT: movzbl
+; X64-NOT: and
+; X64: ret
+; WIN64: @bar3
+; WIN64: callq
+; WIN64-NOT: movzbl
+; WIN64-NOT: and
+; WIN64: ret
 define zeroext i1 @bar3() nounwind ssp {
 entry:
   %call = call i1 @foo2() nounwind
diff --git a/test/CodeGen/X86/byval-align.ll b/test/CodeGen/X86/byval-align.ll
new file mode 100644
index 0000000..c62a181
--- /dev/null
+++ b/test/CodeGen/X86/byval-align.ll
@@ -0,0 +1,59 @@
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+%struct.S = type { i32}
+
+@.str = private constant [10 x i8] c"ptr = %p\0A\00", align 1 ; <[10 x i8]*> [#uses=1]
+@.str1 = private constant [8 x i8] c"Failed \00", align 1 ; <[8 x i8]*> [#uses=1]
+@.str2 = private constant [2 x i8] c"0\00", align 1 ; <[2 x i8]*> [#uses=1]
+@.str3 = private constant [7 x i8] c"test.c\00", align 1 ; <[7 x i8]*> [#uses=1]
+@__PRETTY_FUNCTION__.2067 = internal constant [13 x i8] c"aligned_func\00" ; <[13 x i8]*> [#uses=1]
+
+define void @aligned_func(%struct.S* byval align 64 %obj) nounwind {
+entry:
+  %ptr = alloca i8*                               ; <i8**> [#uses=3]
+  %p = alloca i64                                 ; <i64*> [#uses=3]
+  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+  %obj1 = bitcast %struct.S* %obj to i8*          ; <i8*> [#uses=1]
+  store i8* %obj1, i8** %ptr, align 8
+  %0 = load i8** %ptr, align 8                    ; <i8*> [#uses=1]
+  %1 = ptrtoint i8* %0 to i64                     ; <i64> [#uses=1]
+  store i64 %1, i64* %p, align 8
+  %2 = load i8** %ptr, align 8                    ; <i8*> [#uses=1]
+  %3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8]* @.str, i64 0, i64 0), i8* %2) nounwind ; <i32> [#uses=0]
+  %4 = load i64* %p, align 8                      ; <i64> [#uses=1]
+  %5 = and i64 %4, 140737488355264                ; <i64> [#uses=1]
+  %6 = load i64* %p, align 8                      ; <i64> [#uses=1]
+  %7 = icmp ne i64 %5, %6                         ; <i1> [#uses=1]
+  br i1 %7, label %bb, label %bb2
+
+bb:                                               ; preds = %entry
+  %8 = call i32 @puts(i8* getelementptr inbounds ([8 x i8]* @.str1, i64 0, i64 0)) nounwind ; <i32> [#uses=0]
+  call void @__assert_fail(i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0), i8* getelementptr inbounds ([7 x i8]* @.str3, i64 0, i64 0), i32 18, i8* getelementptr inbounds ([13 x i8]* @__PRETTY_FUNCTION__.2067, i64 0, i64 0)) noreturn nounwind
+  unreachable
+
+bb2:                                              ; preds = %entry
+  br label %return
+
+return:                                           ; preds = %bb2
+  ret void
+}
+
+declare i32 @printf(i8*, ...) nounwind
+
+declare i32 @puts(i8*)
+
+declare void @__assert_fail(i8*, i8*, i32, i8*) noreturn nounwind
+
+define void @main() nounwind {
+entry:
+; CHECK: main
+; CHECK: andq    $-64, %rsp
+  %s1 = alloca %struct.S                          ; <%struct.S*> [#uses=4]
+  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+  %0 = getelementptr inbounds %struct.S* %s1, i32 0, i32 0 ; <i32*> [#uses=1]
+  store i32 1, i32* %0, align 4
+  call void @aligned_func(%struct.S* byval align 64 %s1) nounwind
+  br label %return
+
+return:                                           ; preds = %entry
+  ret void
+}
diff --git a/test/CodeGen/X86/byval2.ll b/test/CodeGen/X86/byval2.ll
index 03a9f0f..196efe5 100644
--- a/test/CodeGen/X86/byval2.ll
+++ b/test/CodeGen/X86/byval2.ll
@@ -37,8 +37,8 @@ entry:
 	store i64 %b, i64* %tmp2, align 16
 	%tmp4 = getelementptr %struct.s* %d, i32 0, i32 2
 	store i64 %c, i64* %tmp4, align 16
-	call void @f( %struct.s* %d byval)
-	call void @f( %struct.s* %d byval)
+	call void @f( %struct.s*byval %d )
+	call void @f( %struct.s*byval %d )
 	ret void
 }
 
diff --git a/test/CodeGen/X86/byval3.ll b/test/CodeGen/X86/byval3.ll
index 8d5bb6d..f3b125c 100644
--- a/test/CodeGen/X86/byval3.ll
+++ b/test/CodeGen/X86/byval3.ll
@@ -45,8 +45,8 @@ entry:
         store i32 %a5, i32* %tmp8, align 16
         %tmp10 = getelementptr %struct.s* %d, i32 0, i32 5
         store i32 %a6, i32* %tmp10, align 16
-        call void @f( %struct.s* %d byval)
-        call void @f( %struct.s* %d byval)
+        call void @f( %struct.s* byval %d)
+        call void @f( %struct.s* byval %d)
         ret void
 }
 
diff --git a/test/CodeGen/X86/byval4.ll b/test/CodeGen/X86/byval4.ll
index ae1a79a..b7a4aa3 100644
--- a/test/CodeGen/X86/byval4.ll
+++ b/test/CodeGen/X86/byval4.ll
@@ -51,8 +51,8 @@ entry:
         store i16 %a5, i16* %tmp8, align 16
         %tmp10 = getelementptr %struct.s* %a, i32 0, i32 5
         store i16 %a6, i16* %tmp10, align 16
-        call void @f( %struct.s* %a byval )
-        call void @f( %struct.s* %a byval )
+        call void @f( %struct.s* byval %a )
+        call void @f( %struct.s* byval %a )
         ret void
 }
 
diff --git a/test/CodeGen/X86/byval5.ll b/test/CodeGen/X86/byval5.ll
index a376709..dca0936 100644
--- a/test/CodeGen/X86/byval5.ll
+++ b/test/CodeGen/X86/byval5.ll
@@ -59,8 +59,8 @@ entry:
         store i8 %a5, i8* %tmp8, align 8
         %tmp10 = getelementptr %struct.s* %a, i32 0, i32 5
         store i8 %a6, i8* %tmp10, align 8
-        call void @f( %struct.s* %a byval )
-        call void @f( %struct.s* %a byval )
+        call void @f( %struct.s* byval %a )
+        call void @f( %struct.s* byval %a )
         ret void
 }
 
diff --git a/test/CodeGen/X86/byval7.ll b/test/CodeGen/X86/byval7.ll
index 686ed9c..98a26e4 100644
--- a/test/CodeGen/X86/byval7.ll
+++ b/test/CodeGen/X86/byval7.ll
@@ -9,7 +9,6 @@ entry:
 ; CHECK: main:
 ; CHECK: movl $1, (%esp)
 ; CHECK: leal 16(%esp), %edi
-; CHECK: movl $36, %ecx
 ; CHECK: leal 160(%esp), %esi
 ; CHECK: rep;movsl
 	%s = alloca %struct.S		; <%struct.S*> [#uses=2]
diff --git a/test/CodeGen/X86/call-push.ll b/test/CodeGen/X86/call-push.ll
index 02cbccc..8cca10c 100644
--- a/test/CodeGen/X86/call-push.ll
+++ b/test/CodeGen/X86/call-push.ll
@@ -27,3 +27,19 @@ UnifiedReturnBlock:             ; preds = %entry
 }
 
 declare i32 @f(%struct.decode_t*)
+
+
+; There should be no store for the undef operand.
+
+; CHECK: _test2:
+; CHECK-NOT: 8(%esp)
+; CHECK: 4(%esp)
+; CHECK-NOT: 8(%esp)
+; CHECK: calll 
+declare i32 @foo(i32, i32, i32)
+
+define void @test2() nounwind {
+entry:
+  %call = call i32 @foo(i32 8, i32 6, i32 undef)
+  ret void
+}
diff --git a/test/CodeGen/X86/clz.ll b/test/CodeGen/X86/clz.ll
index 623ac75..d76fab4 100644
--- a/test/CodeGen/X86/clz.ll
+++ b/test/CodeGen/X86/clz.ll
@@ -31,3 +31,18 @@ entry:
 }
 
 declare i16 @llvm.ctlz.i16(i16) nounwind readnone 
+
+; Don't generate the cmovne when the source is known non-zero (and bsr would
+; not set ZF).
+; rdar://9490949
+
+define i32 @t4(i32 %n) nounwind {
+entry:
+; CHECK: t4:
+; CHECK: bsrl
+; CHECK-NOT: cmov
+; CHECK: ret
+  %or = or i32 %n, 1
+  %tmp1 = tail call i32 @llvm.ctlz.i32(i32 %or)
+  ret i32 %tmp1
+}
diff --git a/test/CodeGen/X86/coalescer-commute2.ll b/test/CodeGen/X86/coalescer-commute2.ll
index 7306920..6e5c1cf 100644
--- a/test/CodeGen/X86/coalescer-commute2.ll
+++ b/test/CodeGen/X86/coalescer-commute2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -join-physregs | FileCheck %s
 ; CHECK-NOT:     mov
 ; CHECK:     paddw
 ; CHECK-NOT:     mov
diff --git a/test/CodeGen/X86/dbg-const-int.ll b/test/CodeGen/X86/dbg-const-int.ll
new file mode 100644
index 0000000..bfc96f1
--- /dev/null
+++ b/test/CodeGen/X86/dbg-const-int.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s - | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.6.7"
+; Radar 9511391
+
+;CHECK:         .byte   4                       ## DW_AT_const_value
+define i32 @foo() nounwind uwtable readnone optsize ssp {
+entry:
+  tail call void @llvm.dbg.value(metadata !8, i64 0, metadata !6), !dbg !9
+  ret i32 42, !dbg !10
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+!llvm.dbg.sp = !{!1}
+!llvm.dbg.lv.foo = !{!6}
+
+!0 = metadata !{i32 589841, i32 0, i32 12, metadata !"a.c", metadata !"/private/tmp", metadata !"clang version 3.0 (trunk 132191)", i1 true, i1 true, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{i32 589870, i32 0, metadata !2, metadata !"foo", metadata !"foo", metadata !"", metadata !2, i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 true, i32 ()* @foo, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{i32 589865, metadata !"a.c", metadata !"/private/tmp", metadata !0} ; [ DW_TAG_file_type ]
+!3 = metadata !{i32 589845, metadata !2, metadata !"", metadata !2, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 589860, metadata !0, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{i32 590080, metadata !7, metadata !"i", metadata !2, i32 2, metadata !5, i32 0} ; [ DW_TAG_auto_variable ]
+!7 = metadata !{i32 589835, metadata !1, i32 1, i32 11, metadata !2, i32 0} ; [ DW_TAG_lexical_block ]
+!8 = metadata !{i32 42}
+!9 = metadata !{i32 2, i32 12, metadata !7, null}
+!10 = metadata !{i32 3, i32 2, metadata !7, null}
diff --git a/test/CodeGen/X86/dbg-const.ll b/test/CodeGen/X86/dbg-const.ll
new file mode 100644
index 0000000..5a51eb8
--- /dev/null
+++ b/test/CodeGen/X86/dbg-const.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s - | FileCheck %s
+target triple = "x86_64-apple-darwin10.0.0"
+
+;CHECK:        ## DW_OP_constu
+;CHECK-NEXT:  .byte	42
+define i32 @foobar() nounwind readonly noinline ssp {
+entry:
+  %call = tail call i32 @bar(), !dbg !11
+  tail call void @llvm.dbg.value(metadata !8, i64 0, metadata !6), !dbg !9
+  %call2 = tail call i32 @bar(), !dbg !11
+  tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !6), !dbg !11
+  %add = add nsw i32 %call2, %call, !dbg !12
+  ret i32 %add, !dbg !10
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare i32 @bar() nounwind readnone
+
+!llvm.dbg.sp = !{!0}
+!llvm.dbg.lv.foobar = !{!6}
+
+!0 = metadata !{i32 524334, i32 0, metadata !1, metadata !"foobar", metadata !"foobar", metadata !"foobar", metadata !1, i32 12, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 ()* @foobar}
+!1 = metadata !{i32 524329, metadata !"mu.c", metadata !"/private/tmp", metadata !2}
+!2 = metadata !{i32 524305, i32 0, i32 12, metadata !"mu.c", metadata !"/private/tmp", metadata !"clang version 2.9 (trunk 114183)", i1 true, i1 true, metadata !"", i32 0}
+!3 = metadata !{i32 524309, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null}
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 524324, metadata !1, metadata !"int", metadata !1, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
+!6 = metadata !{i32 524544, metadata !7, metadata !"j", metadata !1, i32 15, metadata !5}
+!7 = metadata !{i32 524299, metadata !0, i32 12, i32 52, metadata !1, i32 0}
+!8 = metadata !{i32 42}
+!9 = metadata !{i32 15, i32 12, metadata !7, null}
+!10 = metadata !{i32 23, i32 3, metadata !7, null}
+!11 = metadata !{i32 17, i32 3, metadata !7, null}
+!12 = metadata !{i32 18, i32 3, metadata !7, null}
diff --git a/test/CodeGen/X86/dbg-declare-arg.ll b/test/CodeGen/X86/dbg-declare-arg.ll
new file mode 100644
index 0000000..367c1ef
--- /dev/null
+++ b/test/CodeGen/X86/dbg-declare-arg.ll
@@ -0,0 +1,123 @@
+; RUN: llc -O0 -fast-isel=false < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.6.7"
+;Radar 9321650
+
+;CHECK: ##DEBUG_VALUE: my_a 
+
+%class.A = type { i32, i32, i32, i32 }
+
+define void @_Z3fooi(%class.A* sret %agg.result, i32 %i) ssp {
+entry:
+  %i.addr = alloca i32, align 4
+  %j = alloca i32, align 4
+  %nrvo = alloca i1
+  %cleanup.dest.slot = alloca i32
+  store i32 %i, i32* %i.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !26), !dbg !27
+  call void @llvm.dbg.declare(metadata !{i32* %j}, metadata !28), !dbg !30
+  store i32 0, i32* %j, align 4, !dbg !31
+  %tmp = load i32* %i.addr, align 4, !dbg !32
+  %cmp = icmp eq i32 %tmp, 42, !dbg !32
+  br i1 %cmp, label %if.then, label %if.end, !dbg !32
+
+if.then:                                          ; preds = %entry
+  %tmp1 = load i32* %i.addr, align 4, !dbg !33
+  %add = add nsw i32 %tmp1, 1, !dbg !33
+  store i32 %add, i32* %j, align 4, !dbg !33
+  br label %if.end, !dbg !35
+
+if.end:                                           ; preds = %if.then, %entry
+  store i1 false, i1* %nrvo, !dbg !36
+  call void @llvm.dbg.declare(metadata !{%class.A* %agg.result}, metadata !37), !dbg !39
+  %tmp2 = load i32* %j, align 4, !dbg !40
+  %x = getelementptr inbounds %class.A* %agg.result, i32 0, i32 0, !dbg !40
+  store i32 %tmp2, i32* %x, align 4, !dbg !40
+  store i1 true, i1* %nrvo, !dbg !41
+  store i32 1, i32* %cleanup.dest.slot
+  %nrvo.val = load i1* %nrvo, !dbg !42
+  br i1 %nrvo.val, label %nrvo.skipdtor, label %nrvo.unused, !dbg !42
+
+nrvo.unused:                                      ; preds = %if.end
+  call void @_ZN1AD1Ev(%class.A* %agg.result), !dbg !42
+  br label %nrvo.skipdtor, !dbg !42
+
+nrvo.skipdtor:                                    ; preds = %nrvo.unused, %if.end
+  ret void, !dbg !42
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+define linkonce_odr void @_ZN1AD1Ev(%class.A* %this) unnamed_addr ssp align 2 {
+entry:
+  %this.addr = alloca %class.A*, align 8
+  store %class.A* %this, %class.A** %this.addr, align 8
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !43), !dbg !44
+  %this1 = load %class.A** %this.addr
+  call void @_ZN1AD2Ev(%class.A* %this1)
+  ret void, !dbg !45
+}
+
+define linkonce_odr void @_ZN1AD2Ev(%class.A* %this) unnamed_addr nounwind ssp align 2 {
+entry:
+  %this.addr = alloca %class.A*, align 8
+  store %class.A* %this, %class.A** %this.addr, align 8
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !46), !dbg !47
+  %this1 = load %class.A** %this.addr
+  %x = getelementptr inbounds %class.A* %this1, i32 0, i32 0, !dbg !48
+  store i32 1, i32* %x, align 4, !dbg !48
+  ret void, !dbg !48
+}
+
+!llvm.dbg.sp = !{!0, !10, !14, !19, !22, !25}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata !"~A", metadata !"~A", metadata !"", metadata !3, i32 2, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589826, metadata !2, metadata !"A", metadata !3, i32 2, i64 128, i64 32, i32 0, i32 0, null, metadata !4, i32 0, null, null} ; [ DW_TAG_class_type ]
+!2 = metadata !{i32 589841, i32 0, i32 4, metadata !"a.cc", metadata !"/private/tmp", metadata !"clang version 3.0 (trunk 130127)", i1 true, i1 false, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589865, metadata !"a.cc", metadata !"/private/tmp", metadata !2} ; [ DW_TAG_file_type ]
+!4 = metadata !{metadata !5, metadata !7, metadata !8, metadata !9, metadata !0, metadata !10, metadata !14}
+!5 = metadata !{i32 589837, metadata !3, metadata !"x", metadata !3, i32 2, i64 32, i64 32, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
+!6 = metadata !{i32 589860, metadata !2, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!7 = metadata !{i32 589837, metadata !3, metadata !"y", metadata !3, i32 2, i64 32, i64 32, i64 32, i32 0, metadata !6} ; [ DW_TAG_member ]
+!8 = metadata !{i32 589837, metadata !3, metadata !"z", metadata !3, i32 2, i64 32, i64 32, i64 64, i32 0, metadata !6} ; [ DW_TAG_member ]
+!9 = metadata !{i32 589837, metadata !3, metadata !"o", metadata !3, i32 2, i64 32, i64 32, i64 96, i32 0, metadata !6} ; [ DW_TAG_member ]
+!10 = metadata !{i32 589870, i32 0, metadata !1, metadata !"A", metadata !"A", metadata !"", metadata !3, i32 2, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null} ; [ DW_TAG_subprogram ]
+!11 = metadata !{i32 589845, metadata !3, metadata !"", metadata !3, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!12 = metadata !{null, metadata !13}
+!13 = metadata !{i32 589839, metadata !2, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !1} ; [ DW_TAG_pointer_type ]
+!14 = metadata !{i32 589870, i32 0, metadata !1, metadata !"A", metadata !"A", metadata !"", metadata !3, i32 2, metadata !15, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null} ; [ DW_TAG_subprogram ]
+!15 = metadata !{i32 589845, metadata !3, metadata !"", metadata !3, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !16, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!16 = metadata !{null, metadata !13, metadata !17}
+!17 = metadata !{i32 589840, metadata !2, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !18} ; [ DW_TAG_reference_type ]
+!18 = metadata !{i32 589862, metadata !2, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !1} ; [ DW_TAG_const_type ]
+!19 = metadata !{i32 589870, i32 0, metadata !3, metadata !"foo", metadata !"foo", metadata !"_Z3fooi", metadata !3, i32 4, metadata !20, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void (%class.A*, i32)* @_Z3fooi, null, null} ; [ DW_TAG_subprogram ]
+!20 = metadata !{i32 589845, metadata !3, metadata !"", metadata !3, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !21, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!21 = metadata !{metadata !1}
+!22 = metadata !{i32 589870, i32 0, metadata !3, metadata !"~A", metadata !"~A", metadata !"_ZN1AD1Ev", metadata !3, i32 2, metadata !23, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void (%class.A*)* @_ZN1AD1Ev, null, null} ; [ DW_TAG_subprogram ]
+!23 = metadata !{i32 589845, metadata !3, metadata !"", metadata !3, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !24, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!24 = metadata !{null}
+!25 = metadata !{i32 589870, i32 0, metadata !3, metadata !"~A", metadata !"~A", metadata !"_ZN1AD2Ev", metadata !3, i32 2, metadata !23, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void (%class.A*)* @_ZN1AD2Ev, null, null} ; [ DW_TAG_subprogram ]
+!26 = metadata !{i32 590081, metadata !19, metadata !"i", metadata !3, i32 16777220, metadata !6, i32 0} ; [ DW_TAG_arg_variable ]
+!27 = metadata !{i32 4, i32 11, metadata !19, null}
+!28 = metadata !{i32 590080, metadata !29, metadata !"j", metadata !3, i32 5, metadata !6, i32 0} ; [ DW_TAG_auto_variable ]
+!29 = metadata !{i32 589835, metadata !19, i32 4, i32 14, metadata !3, i32 0} ; [ DW_TAG_lexical_block ]
+!30 = metadata !{i32 5, i32 7, metadata !29, null}
+!31 = metadata !{i32 5, i32 12, metadata !29, null}
+!32 = metadata !{i32 6, i32 3, metadata !29, null}
+!33 = metadata !{i32 7, i32 5, metadata !34, null}
+!34 = metadata !{i32 589835, metadata !29, i32 6, i32 16, metadata !3, i32 1} ; [ DW_TAG_lexical_block ]
+!35 = metadata !{i32 8, i32 3, metadata !34, null}
+!36 = metadata !{i32 9, i32 9, metadata !29, null}
+!37 = metadata !{i32 590080, metadata !29, metadata !"my_a", metadata !3, i32 9, metadata !38, i32 0} ; [ DW_TAG_auto_variable ]
+!38 = metadata !{i32 589840, metadata !2, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !1} ; [ DW_TAG_reference_type ]
+!39 = metadata !{i32 9, i32 5, metadata !29, null}
+!40 = metadata !{i32 10, i32 3, metadata !29, null}
+!41 = metadata !{i32 11, i32 3, metadata !29, null}
+!42 = metadata !{i32 12, i32 1, metadata !29, null}
+!43 = metadata !{i32 590081, metadata !22, metadata !"this", metadata !3, i32 16777218, metadata !13, i32 64} ; [ DW_TAG_arg_variable ]
+!44 = metadata !{i32 2, i32 47, metadata !22, null}
+!45 = metadata !{i32 2, i32 61, metadata !22, null}
+!46 = metadata !{i32 590081, metadata !25, metadata !"this", metadata !3, i32 16777218, metadata !13, i32 64} ; [ DW_TAG_arg_variable ]
+!47 = metadata !{i32 2, i32 47, metadata !25, null}
+!48 = metadata !{i32 2, i32 54, metadata !49, null}
+!49 = metadata !{i32 589835, metadata !25, i32 2, i32 52, metadata !3, i32 2} ; [ DW_TAG_lexical_block ]
diff --git a/test/CodeGen/X86/dbg-file-name.ll b/test/CodeGen/X86/dbg-file-name.ll
index e7d5f92..3a849aa 100644
--- a/test/CodeGen/X86/dbg-file-name.ll
+++ b/test/CodeGen/X86/dbg-file-name.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple x86_64-apple-darwin10.0.0  < %s | FileCheck %s
 
 ; Radar 8884898
-; CHECK: file	1 "/Users/manav/one/two/simple.c"
+; CHECK: file	1 "/Users/manav/one/two{{/|\\\\}}simple.c"
 
 declare i32 @printf(i8*, ...) nounwind
 
diff --git a/test/CodeGen/X86/dbg-merge-loc-entry.ll b/test/CodeGen/X86/dbg-merge-loc-entry.ll
index 76b93dd..afe1729 100644
--- a/test/CodeGen/X86/dbg-merge-loc-entry.ll
+++ b/test/CodeGen/X86/dbg-merge-loc-entry.ll
@@ -6,8 +6,11 @@ target triple = "x86_64-apple-darwin8"
 ;CHECK: Ldebug_loc0:
 ;CHECK-NEXT:	.quad	Lfunc_begin0
 ;CHECK-NEXT:	.quad	L
-;CHECK-NEXT:	.short	1                       ## Loc expr size
+;CHECK-NEXT: Lset{{.*}} = Ltmp{{.*}}-Ltmp{{.*}}          ## Loc expr size
+;CHECK-NEXT:    .short  Lset
+;CHECK-NEXT: Ltmp
 ;CHECK-NEXT:	.byte	85                      ## DW_OP_reg5
+;CHECK-NEXT: Ltmp7
 ;CHECK-NEXT:	.quad	0
 ;CHECK-NEXT:	.quad	0
 
diff --git a/test/CodeGen/X86/dbg-prolog-end.ll b/test/CodeGen/X86/dbg-prolog-end.ll
new file mode 100644
index 0000000..81303bb
--- /dev/null
+++ b/test/CodeGen/X86/dbg-prolog-end.ll
@@ -0,0 +1,55 @@
+; RUN: llc -O0 < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.6.7"
+
+;CHECK: .loc	1 2 11 prologue_end
+define i32 @foo(i32 %i) nounwind ssp {
+entry:
+  %i.addr = alloca i32, align 4
+  %j = alloca i32, align 4
+  store i32 %i, i32* %i.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !7), !dbg !8
+  call void @llvm.dbg.declare(metadata !{i32* %j}, metadata !9), !dbg !11
+  store i32 2, i32* %j, align 4, !dbg !12
+  %tmp = load i32* %j, align 4, !dbg !13
+  %inc = add nsw i32 %tmp, 1, !dbg !13
+  store i32 %inc, i32* %j, align 4, !dbg !13
+  %tmp1 = load i32* %j, align 4, !dbg !14
+  %tmp2 = load i32* %i.addr, align 4, !dbg !14
+  %add = add nsw i32 %tmp1, %tmp2, !dbg !14
+  store i32 %add, i32* %j, align 4, !dbg !14
+  %tmp3 = load i32* %j, align 4, !dbg !15
+  ret i32 %tmp3, !dbg !15
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+define i32 @main() nounwind ssp {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  %call = call i32 @foo(i32 21), !dbg !16
+  ret i32 %call, !dbg !16
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.dbg.sp = !{!1, !6}
+
+!0 = metadata !{i32 589841, i32 0, i32 12, metadata !"/tmp/a.c", metadata !"/private/tmp", metadata !"clang version 3.0 (trunk 131100)", i1 true, i1 false, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{i32 589870, i32 0, metadata !2, metadata !"foo", metadata !"foo", metadata !"", metadata !2, i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i32 (i32)* @foo, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{i32 589865, metadata !"/tmp/a.c", metadata !"/private/tmp", metadata !0} ; [ DW_TAG_file_type ]
+!3 = metadata !{i32 589845, metadata !2, metadata !"", metadata !2, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 589860, metadata !0, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{i32 589870, i32 0, metadata !2, metadata !"main", metadata !"main", metadata !"", metadata !2, i32 7, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, i32 ()* @main, null, null} ; [ DW_TAG_subprogram ]
+!7 = metadata !{i32 590081, metadata !1, metadata !"i", metadata !2, i32 16777217, metadata !5, i32 0} ; [ DW_TAG_arg_variable ]
+!8 = metadata !{i32 1, i32 13, metadata !1, null}
+!9 = metadata !{i32 590080, metadata !10, metadata !"j", metadata !2, i32 2, metadata !5, i32 0} ; [ DW_TAG_auto_variable ]
+!10 = metadata !{i32 589835, metadata !1, i32 1, i32 16, metadata !2, i32 0} ; [ DW_TAG_lexical_block ]
+!11 = metadata !{i32 2, i32 6, metadata !10, null}
+!12 = metadata !{i32 2, i32 11, metadata !10, null}
+!13 = metadata !{i32 3, i32 2, metadata !10, null}
+!14 = metadata !{i32 4, i32 2, metadata !10, null}
+!15 = metadata !{i32 5, i32 2, metadata !10, null}
+!16 = metadata !{i32 8, i32 2, metadata !17, null}
+!17 = metadata !{i32 589835, metadata !6, i32 7, i32 12, metadata !2, i32 1} ; [ DW_TAG_lexical_block ]
diff --git a/test/CodeGen/X86/dbg-value-dag-combine.ll b/test/CodeGen/X86/dbg-value-dag-combine.ll
new file mode 100644
index 0000000..b115bf4
--- /dev/null
+++ b/test/CodeGen/X86/dbg-value-dag-combine.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+; PR 9817
+
+
+declare  <4 x i32> @__amdil_get_global_id_int()
+declare  void @llvm.dbg.value(metadata , i64 , metadata )
+define void @__OpenCL_test_kernel(i32 addrspace(1)* %ip) nounwind {
+entry:
+  call void @llvm.dbg.value(metadata !{i32 addrspace(1)* %ip}, i64 0, metadata
+!7), !dbg !8
+  %0 = call <4 x i32> @__amdil_get_global_id_int() nounwind
+  %1 = extractelement <4 x i32> %0, i32 0
+  call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !9), !dbg !11
+  call void @llvm.dbg.value(metadata !12, i64 0, metadata !13), !dbg !14
+  %tmp2 = load i32 addrspace(1)* %ip, align 4, !dbg !15
+  %tmp3 = add i32 0, %tmp2, !dbg !15
+; CHECK:  ##DEBUG_VALUE: idx <- EAX+0
+  call void @llvm.dbg.value(metadata !{i32 %tmp3}, i64 0, metadata !13), !dbg
+!15
+  %arrayidx = getelementptr i32 addrspace(1)* %ip, i32 %1, !dbg !16
+  store i32 %tmp3, i32 addrspace(1)* %arrayidx, align 4, !dbg !16
+  ret void, !dbg !17
+}
+!llvm.dbg.sp = !{!0}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata
+!"__OpenCL_test_kernel", metadata !"__OpenCL_test_kernel", metadata
+!"__OpenCL_test_kernel", metadata !1, i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589865, metadata !"OCL6368.tmp.cl", metadata !"E:\5CUsers\5Cmvillmow.AMD\5CAppData\5CLocal\5CTemp", metadata !2} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 589841, i32 0, i32 1, metadata !"OCL6368.tmp.cl", metadata !"E:\5CUsers\5Cmvillmow.AMD\5CAppData\5CLocal\5CTemp", metadata !"clc", i1 true, i1 false, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{null, metadata !5}
+!5 = metadata !{i32 589839, metadata !2, metadata !"", null, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !6} ; [ DW_TAG_pointer_type ]
+!6 = metadata !{i32 589860, metadata !2, metadata !"unsigned int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
+!7 = metadata !{i32 590081, metadata !0, metadata !"ip", metadata !1, i32 1, metadata !5, i32 0} ; [ DW_TAG_arg_variable ]
+!8 = metadata !{i32 1, i32 42, metadata !0, null}
+!9 = metadata !{i32 590080, metadata !10, metadata !"gid", metadata !1, i32 3, metadata !6, i32 0} ; [ DW_TAG_auto_variable ]
+!10 = metadata !{i32 589835, metadata !0, i32 2, i32 1, metadata !1, i32 0} ; [ DW_TAG_lexical_block ]
+!11 = metadata !{i32 3, i32 41, metadata !10, null}
+!12 = metadata !{i32 0}
+!13 = metadata !{i32 590080, metadata !10, metadata !"idx", metadata !1, i32 4, metadata !6, i32 0} ; [ DW_TAG_auto_variable ]
+!14 = metadata !{i32 4, i32 20, metadata !10, null}
+!15 = metadata !{i32 5, i32 15, metadata !10, null}
+!16 = metadata !{i32 6, i32 18, metadata !10, null}
+!17 = metadata !{i32 7, i32 1, metadata !0, null}
+
diff --git a/test/CodeGen/X86/dbg-value-isel.ll b/test/CodeGen/X86/dbg-value-isel.ll
new file mode 100644
index 0000000..d1a9e57
--- /dev/null
+++ b/test/CodeGen/X86/dbg-value-isel.ll
@@ -0,0 +1,102 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+; PR 9879
+
+; CHECK: ##DEBUG_VALUE: tid <-
+%0 = type { i8*, i8*, i8*, i8*, i32 }
+
+@sgv = internal addrspace(2) constant [1 x i8] zeroinitializer
+@fgv = internal addrspace(2) constant [1 x i8] zeroinitializer
+@lvgv = internal constant [0 x i8*] zeroinitializer
+@llvm.global.annotations = appending global [1 x %0] [%0 { i8* bitcast (void (i32 addrspace(1)*)* @__OpenCL_nbt02_kernel to i8*), i8* bitcast ([1 x i8] addrspace(2)* @sgv to i8*), i8* bitcast ([1 x i8] addrspace(2)* @fgv to i8*), i8* bitcast ([0 x i8*]* @lvgv to i8*), i32 0 }], section "llvm.metadata"
+
+define void @__OpenCL_nbt02_kernel(i32 addrspace(1)* %ip) nounwind {
+entry:
+  call void @llvm.dbg.value(metadata !{i32 addrspace(1)* %ip}, i64 0, metadata !8), !dbg !9
+  %0 = call <4 x i32> @__amdil_get_local_id_int() nounwind
+  %1 = extractelement <4 x i32> %0, i32 0
+  br label %2
+
+; <label>:2                                       ; preds = %entry
+  %3 = phi i32 [ %1, %entry ]
+  br label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = phi i32 [ %3, %2 ]
+  br label %get_local_id.exit
+
+get_local_id.exit:                                ; preds = %4
+  %6 = phi i32 [ %5, %4 ]
+  call void @llvm.dbg.value(metadata !{i32 %6}, i64 0, metadata !10), !dbg !12
+  %7 = call <4 x i32> @__amdil_get_global_id_int() nounwind
+  %8 = extractelement <4 x i32> %7, i32 0
+  br label %9
+
+; <label>:9                                       ; preds = %get_local_id.exit
+  %10 = phi i32 [ %8, %get_local_id.exit ]
+  br label %11
+
+; <label>:11                                      ; preds = %9
+  %12 = phi i32 [ %10, %9 ]
+  br label %get_global_id.exit
+
+get_global_id.exit:                               ; preds = %11
+  %13 = phi i32 [ %12, %11 ]
+  call void @llvm.dbg.value(metadata !{i32 %13}, i64 0, metadata !13), !dbg !14
+  %14 = call <4 x i32> @__amdil_get_local_size_int() nounwind
+  %15 = extractelement <4 x i32> %14, i32 0
+  br label %16
+
+; <label>:16                                      ; preds = %get_global_id.exit
+  %17 = phi i32 [ %15, %get_global_id.exit ]
+  br label %18
+
+; <label>:18                                      ; preds = %16
+  %19 = phi i32 [ %17, %16 ]
+  br label %get_local_size.exit
+
+get_local_size.exit:                              ; preds = %18
+  %20 = phi i32 [ %19, %18 ]
+  call void @llvm.dbg.value(metadata !{i32 %20}, i64 0, metadata !15), !dbg !16
+  %tmp5 = add i32 %6, %13, !dbg !17
+  %tmp7 = add i32 %tmp5, %20, !dbg !17
+  store i32 %tmp7, i32 addrspace(1)* %ip, align 4, !dbg !17
+  br label %return, !dbg !17
+
+return:                                           ; preds = %get_local_size.exit
+  ret void, !dbg !18
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+declare <4 x i32> @__amdil_get_local_size_int() nounwind
+
+declare <4 x i32> @__amdil_get_local_id_int() nounwind
+
+declare <4 x i32> @__amdil_get_global_id_int() nounwind
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.sp = !{!0}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata !"__OpenCL_nbt02_kernel", metadata !"__OpenCL_nbt02_kernel", metadata !"__OpenCL_nbt02_kernel", metadata !1, i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589865, metadata !"OCLlLwTXZ.cl", metadata !"/tmp", metadata !2} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 589841, i32 0, i32 1, metadata !"OCLlLwTXZ.cl", metadata !"/tmp", metadata !"clc", i1 true, i1 false, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{null, metadata !5}
+!5 = metadata !{i32 589839, metadata !2, metadata !"", null, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !6} ; [ DW_TAG_pointer_type ]
+!6 = metadata !{i32 589846, metadata !2, metadata !"uint", metadata !1, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ]
+!7 = metadata !{i32 589860, metadata !2, metadata !"unsigned int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
+!8 = metadata !{i32 590081, metadata !0, metadata !"ip", metadata !1, i32 1, metadata !5, i32 0} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{i32 1, i32 32, metadata !0, null}
+!10 = metadata !{i32 590080, metadata !11, metadata !"tid", metadata !1, i32 3, metadata !6, i32 0} ; [ DW_TAG_auto_variable ]
+!11 = metadata !{i32 589835, metadata !0, i32 2, i32 1, metadata !1, i32 1} ; [ DW_TAG_lexical_block ]
+!12 = metadata !{i32 5, i32 24, metadata !11, null}
+!13 = metadata !{i32 590080, metadata !11, metadata !"gid", metadata !1, i32 3, metadata !6, i32 0} ; [ DW_TAG_auto_variable ]
+!14 = metadata !{i32 6, i32 25, metadata !11, null}
+!15 = metadata !{i32 590080, metadata !11, metadata !"lsz", metadata !1, i32 3, metadata !6, i32 0} ; [ DW_TAG_auto_variable ]
+!16 = metadata !{i32 7, i32 26, metadata !11, null}
+!17 = metadata !{i32 9, i32 24, metadata !11, null}
+!18 = metadata !{i32 10, i32 1, metadata !0, null}
+
diff --git a/test/CodeGen/X86/dbg-value-range.ll b/test/CodeGen/X86/dbg-value-range.ll
index 67e3eee..28d873b 100644
--- a/test/CodeGen/X86/dbg-value-range.ll
+++ b/test/CodeGen/X86/dbg-value-range.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=x86_64-apple-darwin < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-apple-darwin -regalloc=basic < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin10 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin10 -regalloc=basic -join-physregs < %s | FileCheck %s
 
 %struct.a = type { i32 }
 
@@ -53,7 +53,10 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 ;CHECK:Ldebug_loc0:
 ;CHECK-NEXT:	.quad
 ;CHECK-NEXT:	.quad	[[CLOBBER]]
-;CHECK-NEXT:	.short	1
+;CHECK-NEXT: Lset{{.*}} = Ltmp{{.*}}-Ltmp{{.*}}
+;CHECK-NEXT:    .short  Lset
+;CHECK-NEXT: Ltmp
 ;CHECK-NEXT:	.byte	85
+;CHECK-NEXT: Ltmp
 ;CHECK-NEXT:	.quad	0
 ;CHECK-NEXT:	.quad	0
diff --git a/test/CodeGen/X86/div8.ll b/test/CodeGen/X86/div8.ll
new file mode 100644
index 0000000..0825f79
--- /dev/null
+++ b/test/CodeGen/X86/div8.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s | FileCheck %s
+; ModuleID = '8div.c'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.6.6"
+
+define signext i8 @test_div(i8 %dividend, i8 %divisor) nounwind ssp {
+entry:
+  %dividend.addr = alloca i8, align 2
+  %divisor.addr = alloca i8, align 1
+  %quotient = alloca i8, align 1
+  store i8 %dividend, i8* %dividend.addr, align 2
+  store i8 %divisor, i8* %divisor.addr, align 1
+  %tmp = load i8* %dividend.addr, align 2
+  %tmp1 = load i8* %divisor.addr, align 1
+; Insist on i8->i32 zero extension, even though divb demands only i16:
+; CHECK: movzbl {{.*}}%eax
+; CHECK: divb
+  %div = udiv i8 %tmp, %tmp1
+  store i8 %div, i8* %quotient, align 1
+  %tmp4 = load i8* %quotient, align 1
+  ret i8 %tmp4
+}
diff --git a/test/CodeGen/X86/eh_frame.ll b/test/CodeGen/X86/eh_frame.ll
new file mode 100644
index 0000000..3b792b2
--- /dev/null
+++ b/test/CodeGen/X86/eh_frame.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple x86_64-unknown-linux-gnu | FileCheck -check-prefix=STATIC %s
+; RUN: llc < %s -mtriple x86_64-unknown-linux-gnu -relocation-model=pic | FileCheck -check-prefix=PIC %s
+
+@__FRAME_END__ = constant [1 x i32] zeroinitializer, section ".eh_frame"
+
+@foo = external global i32
+@bar1 = constant i8* bitcast (i32* @foo to i8*), section "my_bar1", align 8
+
+
+; STATIC: .section	.eh_frame,"a",@progbits
+; STATIC: .section	my_bar1,"a",@progbits
+
+; PIC:	.section	.eh_frame,"a",@progbits
+; PIC:	.section	my_bar1,"aw",@progbits
diff --git a/test/CodeGen/X86/empty-functions.ll b/test/CodeGen/X86/empty-functions.ll
index b303cd1..874c53a 100644
--- a/test/CodeGen/X86/empty-functions.ll
+++ b/test/CodeGen/X86/empty-functions.ll
@@ -6,10 +6,24 @@ entry:
   unreachable
 }
 ; CHECK-NO-FP:     _func:
-; CHECK-NO-FP-NOT: movq %rsp, %rbp
+; CHECK-NO-FP-NEXT: :
+; CHECK-NO-FP-NEXT: .cfi_startproc
 ; CHECK-NO-FP:     nop
+; CHECK-NO-FP-NEXT: :
+; CHECK-NO-FP-NEXT: .cfi_endproc
 
 ; CHECK-FP:      _func:
-; CHECK-FP:      movq %rsp, %rbp
-; CHECK-FP-NEXT: Ltmp1:
-; CHECK-FP:      nop
+; CHECK-FP-NEXT: :
+; CHECK-FP-NEXT: .cfi_startproc
+; CHECK-FP-NEXT: :
+; CHECK-FP-NEXT: pushq %rbp
+; CHECK-FP-NEXT: :
+; CHECK-FP-NEXT: .cfi_def_cfa_offset 16
+; CHECK-FP-NEXT: :
+; CHECK-FP-NEXT: .cfi_offset %rbp, -16
+; CHECK-FP-NEXT: movq %rsp, %rbp
+; CHECK-FP-NEXT: :
+; CHECK-FP-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-FP-NEXT: nop
+; CHECK-FP-NEXT: :
+; CHECK-FP-NEXT: .cfi_endproc
diff --git a/test/CodeGen/X86/fast-isel-agg-constant.ll b/test/CodeGen/X86/fast-isel-agg-constant.ll
new file mode 100644
index 0000000..ce0dff7
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-agg-constant.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=x86-64 -O0 | FileCheck %s
+; Make sure fast-isel doesn't screw up aggregate constants.
+; (Failing out is okay, as long as we don't miscompile.)
+
+%bar = type { i32 }
+
+define i32 @foo()  {
+  %tmp = extractvalue %bar { i32 3 }, 0
+  ret i32 %tmp
+; CHECK: movl $3, %eax
+}
diff --git a/test/CodeGen/X86/fast-isel-call.ll b/test/CodeGen/X86/fast-isel-call.ll
index 5fcdbbb..3159741 100644
--- a/test/CodeGen/X86/fast-isel-call.ll
+++ b/test/CodeGen/X86/fast-isel-call.ll
@@ -1,6 +1,8 @@
-; RUN: llc < %s -fast-isel -march=x86 | grep and
+; RUN: llc < %s -O0 -fast-isel-abort -march=x86 | FileCheck %s
 
-define i32 @t() nounwind {
+%struct.s = type {i32, i32, i32}
+
+define i32 @test1() nounwind {
 tak:
 	%tmp = call i1 @foo()
 	br i1 %tmp, label %BB1, label %BB2
@@ -8,6 +10,46 @@ BB1:
 	ret i32 1
 BB2:
 	ret i32 0
+; CHECK: test1:
+; CHECK: calll
+; CHECK-NEXT: testb	$1
 }
+declare zeroext i1 @foo()  nounwind
+
+declare void @foo2(%struct.s* byval)
+
+define void @test2(%struct.s* %d) nounwind {
+  call void @foo2(%struct.s* byval %d )
+  ret void
+; CHECK: test2:
+; CHECK: movl	(%eax)
+; CHECK: movl {{.*}}, (%esp)
+; CHECK: movl	4(%eax)
+; CHECK: movl {{.*}}, 4(%esp)
+; CHECK: movl	8(%eax)
+; CHECK: movl {{.*}}, 8(%esp)
+}
+
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
 
-declare i1 @foo() zeroext nounwind
+define void @test3(i8* %a) {
+  call void @llvm.memset.p0i8.i32(i8* %a, i8 0, i32 100, i32 1, i1 false)
+  ret void
+; CHECK: test3:
+; CHECK:   movl	{{.*}}, (%esp)
+; CHECK:   movl	$0, 4(%esp)
+; CHECK:   movl	$100, 8(%esp)
+; CHECK:   calll {{.*}}memset
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+
+define void @test4(i8* %a, i8* %b) {
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %b, i32 100, i32 1, i1 false)
+  ret void
+; CHECK: test4:
+; CHECK:   movl	{{.*}}, (%esp)
+; CHECK:   movl	{{.*}}, 4(%esp)
+; CHECK:   movl	$100, 8(%esp)
+; CHECK:   calll {{.*}}memcpy
+}
diff --git a/test/CodeGen/X86/fast-isel-extract.ll b/test/CodeGen/X86/fast-isel-extract.ll
new file mode 100644
index 0000000..f63396e
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-extract.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -mtriple x86_64-apple-darwin11 -O0 -fast-isel-abort | FileCheck %s
+
+%struct.x = type { i64, i64 }
+%addovf = type { i32, i1 }
+declare %struct.x @f()
+
+define void @test1(i64*) nounwind ssp {
+  %2 = tail call %struct.x @f() nounwind
+  %3 = extractvalue %struct.x %2, 0
+  %4 = add i64 %3, 10
+  store i64 %4, i64* %0
+  ret void
+; CHECK: test1:
+; CHECK: callq _f
+; CHECK-NEXT: addq	$10, %rax
+}
+
+define void @test2(i64*) nounwind ssp {
+  %2 = tail call %struct.x @f() nounwind
+  %3 = extractvalue %struct.x %2, 1
+  %4 = add i64 %3, 10
+  store i64 %4, i64* %0
+  ret void
+; CHECK: test2:
+; CHECK: callq _f
+; CHECK-NEXT: addq	$10, %rdx
+}
+
+declare %addovf @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
+
+define void @test3(i32 %x, i32 %y, i32* %z) {
+  %r = call %addovf @llvm.sadd.with.overflow.i32(i32 %x, i32 %y)
+  %sum = extractvalue %addovf %r, 0
+  %sum3 = mul i32 %sum, 3
+  %bit = extractvalue %addovf %r, 1
+  br i1 %bit, label %then, label %end
+  
+then:
+  store i32 %sum3, i32* %z
+  br label %end
+
+end:
+  ret void
+; CHECK: test3
+; CHECK: addl
+; CHECK: seto %al
+; CHECK: testb $1, %al
+}
diff --git a/test/CodeGen/X86/fast-isel-fneg.ll b/test/CodeGen/X86/fast-isel-fneg.ll
index 5ffd48b..f42a4a2 100644
--- a/test/CodeGen/X86/fast-isel-fneg.ll
+++ b/test/CodeGen/X86/fast-isel-fneg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -fast-isel -fast-isel-abort -march=x86-64 | FileCheck %s
+; RUN: llc < %s -fast-isel -fast-isel-abort -mtriple=x86_64-apple-darwin10 | FileCheck %s
 ; RUN: llc < %s -fast-isel -march=x86 -mattr=+sse2 | grep xor | count 2
 
 ; CHECK: doo:
diff --git a/test/CodeGen/X86/fast-isel-gep.ll b/test/CodeGen/X86/fast-isel-gep.ll
index 48abfd0..1a2e34e 100644
--- a/test/CodeGen/X86/fast-isel-gep.ll
+++ b/test/CodeGen/X86/fast-isel-gep.ll
@@ -24,7 +24,7 @@ define i32 @test2(i64 %t3, i32* %t1) nounwind {
        %t15 = load i32* %t9            ; <i32> [#uses=1]
        ret i32 %t15
 ; X32: test2:
-; X32:  	movl	(%edx,%ecx,4), %eax
+; X32:  	movl	(%edx,%ecx,4), %e
 ; X32:  	ret
 
 ; X64: test2:
diff --git a/test/CodeGen/X86/fast-isel-i1.ll b/test/CodeGen/X86/fast-isel-i1.ll
index d066578..bea18a1 100644
--- a/test/CodeGen/X86/fast-isel-i1.ll
+++ b/test/CodeGen/X86/fast-isel-i1.ll
@@ -1,19 +1,41 @@
-; RUN: llc < %s -march=x86 -fast-isel | grep {andb	\$1, %}
+; RUN: llc < %s -mtriple=i686-apple-darwin10 -fast-isel -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort | FileCheck %s
 
-declare i64 @bar(i64)
+declare i32 @test1a(i32)
 
-define i32 @foo(i64 %x) nounwind {
-	%y = add i64 %x, -3		; <i64> [#uses=1]
-	%t = call i64 @bar(i64 %y)		; <i64> [#uses=1]
-	%s = mul i64 %t, 77		; <i64> [#uses=1]
-	%z = trunc i64 %s to i1		; <i1> [#uses=1]
+define i32 @test1(i32 %x) nounwind {
+; CHECK: test1:
+; CHECK: andb $1, %
+	%y = add i32 %x, -3
+	%t = call i32 @test1a(i32 %y)
+	%s = mul i32 %t, 77
+	%z = trunc i32 %s to i1
 	br label %next
 
 next:		; preds = %0
-	%u = zext i1 %z to i32		; <i32> [#uses=1]
-	%v = add i32 %u, 1999		; <i32> [#uses=1]
+	%u = zext i1 %z to i32
+	%v = add i32 %u, 1999
 	br label %exit
 
 exit:		; preds = %next
 	ret i32 %v
 }
+
+define void @test2(i8* %a) nounwind {
+entry:
+; CHECK: test2:
+; CHECK: movb {{.*}} %al
+; CHECK-NEXT: xorb $1, %al
+; CHECK-NEXT: testb $1
+  %tmp = load i8* %a, align 1
+  %tobool = trunc i8 %tmp to i1
+  %tobool2 = xor i1 %tobool, true
+  br i1 %tobool2, label %if.then, label %if.end
+
+if.then:
+  call void @test2(i8* null)
+  br label %if.end
+
+if.end:
+  ret void
+}
diff --git a/test/CodeGen/X86/fast-isel-ret-ext.ll b/test/CodeGen/X86/fast-isel-ret-ext.ll
new file mode 100644
index 0000000..fd768cb
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-ret-ext.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s  -O0 -fast-isel-abort -mtriple i686-apple-darwin10 | FileCheck %s
+; RUN: llc < %s  -O0 -fast-isel-abort -mtriple x86_64-apple-darwin10 | FileCheck %s
+
+define zeroext i8 @test1(i32 %y) nounwind {
+  %conv = trunc i32 %y to i8
+  ret i8 %conv
+  ; CHECK: test1:
+  ; CHECK: movzbl {{.*}}, %eax
+}
+
+define signext i8 @test2(i32 %y) nounwind {
+  %conv = trunc i32 %y to i8
+  ret i8 %conv
+  ; CHECK: test2:
+  ; CHECK: movsbl {{.*}}, %eax
+}
+
+define zeroext i16 @test3(i32 %y) nounwind {
+  %conv = trunc i32 %y to i16
+  ret i16 %conv
+  ; CHECK: test3:
+  ; CHECK: movzwl {{.*}}, %eax
+}
+
+define signext i16 @test4(i32 %y) nounwind {
+  %conv = trunc i32 %y to i16
+  ret i16 %conv
+  ; CHECK: test4:
+  ; CHECK: movswl {{.*}}, %eax
+}
+
+define zeroext i1 @test5(i32 %y) nounwind {
+  %conv = trunc i32 %y to i1
+  ret i1 %conv
+  ; CHECK: test5:
+  ; CHECK: andb $1
+  ; CHECK: movzbl {{.*}}, %eax
+}
diff --git a/test/CodeGen/X86/fast-isel-shift-imm.ll b/test/CodeGen/X86/fast-isel-shift-imm.ll
deleted file mode 100644
index 5c62c18..0000000
--- a/test/CodeGen/X86/fast-isel-shift-imm.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; RUN: llc < %s -march=x86 -O0 | grep {sarl	\$80, %e}
-; PR3242
-
-define void @foo(i32 %x, i32* %p) nounwind {
-  %y = ashr i32 %x, 50000
-  store i32 %y, i32* %p
-  ret void
-}
diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll
new file mode 100644
index 0000000..c4afc10
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-x86-64.ll
@@ -0,0 +1,262 @@
+; RUN: llc < %s  -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+; Make sure that fast-isel folds the immediate into the binop even though it
+; is non-canonical.
+define i32 @test1(i32 %i) nounwind ssp {
+  %and = and i32 8, %i
+  ret i32 %and
+}
+
+; CHECK: test1:
+; CHECK: andl	$8, 
+
+
+; rdar://9289512 - The load should fold into the compare.
+define void @test2(i64 %x) nounwind ssp {
+entry:
+  %x.addr = alloca i64, align 8
+  store i64 %x, i64* %x.addr, align 8
+  %tmp = load i64* %x.addr, align 8
+  %cmp = icmp sgt i64 %tmp, 42
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+; CHECK: test2:
+; CHECK: movq	%rdi, -8(%rsp)
+; CHECK: cmpq	$42, -8(%rsp)
+}
+
+
+
+
+@G = external global i32
+define i64 @test3() nounwind {
+  %A = ptrtoint i32* @G to i64
+  ret i64 %A
+; CHECK: test3:
+; CHECK: movq _G@GOTPCREL(%rip), %rax
+; CHECK-NEXT: ret
+}
+
+
+
+; rdar://9289558
+@rtx_length = external global [153 x i8]
+
+define i32 @test4(i64 %idxprom9) nounwind {
+  %arrayidx10 = getelementptr inbounds [153 x i8]* @rtx_length, i32 0, i64 %idxprom9
+  %tmp11 = load i8* %arrayidx10, align 1
+  %conv = zext i8 %tmp11 to i32
+  ret i32 %conv
+
+; CHECK: test4:
+; CHECK: movq	_rtx_length@GOTPCREL(%rip), %rax
+; CHECK-NEXT: movzbl	(%rax,%rdi), %eax
+; CHECK-NEXT: ret
+}
+
+
+; PR3242 - Out of range shifts should not be folded by fastisel.
+define void @test5(i32 %x, i32* %p) nounwind {
+  %y = ashr i32 %x, 50000
+  store i32 %y, i32* %p
+  ret void
+
+; CHECK: test5:
+; CHECK: movl	$50000, %ecx
+; CHECK: sarl	%cl, %edi
+; CHECK: ret
+}
+
+; rdar://9289501 - fast isel should fold trivial multiplies to shifts.
+define i64 @test6(i64 %x) nounwind ssp {
+entry:
+  %mul = mul nsw i64 %x, 8
+  ret i64 %mul
+
+; CHECK: test6:
+; CHECK: leaq	(,%rdi,8), %rax
+}
+
+define i32 @test7(i32 %x) nounwind ssp {
+entry:
+  %mul = mul nsw i32 %x, 8
+  ret i32 %mul
+; CHECK: test7:
+; CHECK: leal	(,%rdi,8), %eax
+}
+
+
+; rdar://9289507 - folding of immediates into 64-bit operations.
+define i64 @test8(i64 %x) nounwind ssp {
+entry:
+  %add = add nsw i64 %x, 7
+  ret i64 %add
+
+; CHECK: test8:
+; CHECK: addq	$7, %rdi
+}
+
+define i64 @test9(i64 %x) nounwind ssp {
+entry:
+  %add = mul nsw i64 %x, 7
+  ret i64 %add
+; CHECK: test9:
+; CHECK: imulq	$7, %rdi, %rax
+}
+
+; rdar://9297011 - Don't reject udiv by a power of 2.
+define i32 @test10(i32 %X) nounwind {
+  %Y = udiv i32 %X, 8
+  ret i32 %Y
+; CHECK: test10:
+; CHECK: shrl	$3, 
+}
+
+define i32 @test11(i32 %X) nounwind {
+  %Y = sdiv exact i32 %X, 8
+  ret i32 %Y
+; CHECK: test11:
+; CHECK: sarl	$3, 
+}
+
+
+; rdar://9297006 - Trunc to bool.
+define void @test12(i8 %tmp) nounwind ssp noredzone {
+entry:
+  %tobool = trunc i8 %tmp to i1
+  br i1 %tobool, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @test12(i8 0) noredzone
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+; CHECK: test12:
+; CHECK: testb	$1,
+; CHECK-NEXT: je L
+; CHECK-NEXT: movl $0, %edi
+; CHECK-NEXT: callq
+}
+
+declare void @test13f(i1 %X)
+
+define void @test13() nounwind {
+  call void @test13f(i1 0)
+  ret void
+; CHECK: test13:
+; CHECK: movl $0, %edi
+; CHECK-NEXT: callq
+}
+
+
+
+; rdar://9297003 - fast isel bails out on all functions taking bools
+define void @test14(i8 %tmp) nounwind ssp noredzone {
+entry:
+  %tobool = trunc i8 %tmp to i1
+  call void @test13f(i1 zeroext %tobool) noredzone
+  ret void
+; CHECK: test14:
+; CHECK: andb	$1, 
+; CHECK: callq
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
+
+; rdar://9289488 - fast-isel shouldn't bail out on llvm.memcpy
+define void @test15(i8* %a, i8* %b) nounwind {
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 4, i32 4, i1 false)
+  ret void
+; CHECK: test15:
+; CHECK-NEXT: movl	(%rsi), %eax
+; CHECK-NEXT: movl	%eax, (%rdi)
+; CHECK-NEXT: ret
+}
+
+; Handling for varargs calls
+declare void @test16callee(...) nounwind
+define void @test16() nounwind {
+; CHECK: test16:
+; CHECK: movl $1, %edi
+; CHECK: movb $0, %al
+; CHECK: callq _test16callee
+  call void (...)* @test16callee(i32 1)
+  br label %block2
+
+block2:
+; CHECK: movabsq $1
+; CHECK: cvtsi2sdq {{.*}} %xmm0
+; CHECK: movb $1, %al
+; CHECK: callq _test16callee
+  call void (...)* @test16callee(double 1.000000e+00)
+  ret void
+}
+
+
+declare void @foo() unnamed_addr ssp align 2
+
+; Verify that we don't fold the load into the compare here.  That would move it
+; w.r.t. the call.
+define i32 @test17(i32 *%P) ssp nounwind {
+entry:
+  %tmp = load i32* %P
+  %cmp = icmp ne i32 %tmp, 5
+  call void @foo()
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  ret i32 1
+
+if.else:                                          ; preds = %entry
+  ret i32 2
+; CHECK: test17:
+; CHECK: movl	(%rdi), %eax
+; CHECK: callq _foo
+; CHECK: cmpl	$5, %eax
+; CHECK-NEXT: je 
+}
+
+; Check that 0.0 is materialized using pxor
+define void @test18(float* %p1) {
+  store float 0.0, float* %p1
+  ret void
+; CHECK: test18:
+; CHECK: pxor
+}
+define void @test19(double* %p1) {
+  store double 0.0, double* %p1
+  ret void
+; CHECK: test19:
+; CHECK: pxor
+}
+
+; Check that we fast-isel sret
+%struct.a = type { i64, i64, i64 }
+define void @test20() nounwind ssp {
+entry:
+  %tmp = alloca %struct.a, align 8
+  call void @test20sret(%struct.a* sret %tmp)
+  ret void
+; CHECK: test20:
+; CHECK: leaq (%rsp), %rdi
+; CHECK: callq _test20sret
+}
+declare void @test20sret(%struct.a* sret)
+
+; Check that -0.0 is not materialized using pxor
+define void @test21(double* %p1) {
+  store double -0.0, double* %p1
+  ret void
+; CHECK: test21:
+; CHECK-NOT: pxor
+; CHECK: movsd	LCPI
+}
+\ No newline at end of file
diff --git a/test/CodeGen/X86/fast-isel-x86.ll b/test/CodeGen/X86/fast-isel-x86.ll
index 56aeb3a..19972f7 100644
--- a/test/CodeGen/X86/fast-isel-x86.ll
+++ b/test/CodeGen/X86/fast-isel-x86.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 -relocation-model=pic < %s
+; RUN: llc -fast-isel -O0 -mtriple=i386-apple-darwin10 -relocation-model=pic < %s | FileCheck %s
 
 ; This should use flds to set the return value.
 ; CHECK: test0:
@@ -31,3 +31,18 @@ define i32 @test2() nounwind {
   %t = load i32* @HHH
   ret i32 %t
 }
+
+; Check that we fast-isel sret, and handle the callee-pops behavior correctly.
+%struct.a = type { i64, i64, i64 }
+define void @test3() nounwind ssp {
+entry:
+  %tmp = alloca %struct.a, align 8
+  call void @test3sret(%struct.a* sret %tmp)
+  ret void
+; CHECK: test3:
+; CHECK: subl $44
+; CHECK: leal 16(%esp)
+; CHECK: calll _test3sret
+; CHECK: addl $40
+}
+declare void @test3sret(%struct.a* sret)
diff --git a/test/CodeGen/X86/fast-isel.ll b/test/CodeGen/X86/fast-isel.ll
index 177c06b..8391860 100644
--- a/test/CodeGen/X86/fast-isel.ll
+++ b/test/CodeGen/X86/fast-isel.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -fast-isel -fast-isel-abort -march=x86 -mattr=sse2
-; RUN: llc < %s -fast-isel -fast-isel-abort -march=x86-64
+; RUN: llc < %s -fast-isel -fast-isel-abort -mtriple=x86_64-apple-darwin10
 
 ; This tests very minimal fast-isel functionality.
 
@@ -20,13 +20,14 @@ fast:
   %t6 = add i32 %t5, 2
   %t7 = getelementptr i32* %y, i32 1
   %t8 = getelementptr i32* %t7, i32 %t6
+  call void asm sideeffect "hello world", ""()
   br label %exit
 
 exit:
   ret i32* %t8
 }
 
-define double @bar(double* %p, double* %q) nounwind {
+define void @bar(double* %p, double* %q) nounwind {
 entry:
   %r = load double* %p
   %s = load double* %q
@@ -40,7 +41,8 @@ fast:
   br label %exit
 
 exit:
-  ret double %t3
+  store double %t3, double* %q
+  ret void
 }
 
 define i32 @cast() nounwind {
@@ -67,24 +69,28 @@ define i8* @inttoptr_i32(i32 %p) nounwind {
   ret i8* %t
 }
 
-define i8 @trunc_i32_i8(i32 %x) signext nounwind  {
+define void @trunc_i32_i8(i32 %x, i8* %p) nounwind  {
 	%tmp1 = trunc i32 %x to i8
-	ret i8 %tmp1
+	store i8 %tmp1, i8* %p
+	ret void
 }
 
-define i8 @trunc_i16_i8(i16 signext %x) signext nounwind  {
+define void @trunc_i16_i8(i16 signext %x, i8* %p) nounwind  {
 	%tmp1 = trunc i16 %x to i8
-	ret i8 %tmp1
+	store i8 %tmp1, i8* %p
+	ret void
 }
 
-define i8 @shl_i8(i8 %a, i8 %c) nounwind {
-       %tmp = shl i8 %a, %c
-       ret i8 %tmp
+define void @shl_i8(i8 %a, i8 %c, i8* %p) nounwind {
+  %tmp = shl i8 %a, %c
+  store i8 %tmp, i8* %p
+  ret void
 }
 
-define i8 @mul_i8(i8 %a) nounwind {
-       %tmp = mul i8 %a, 17
-       ret i8 %tmp
+define void @mul_i8(i8 %a, i8* %p) nounwind {
+  %tmp = mul i8 %a, 17
+  store i8 %tmp, i8* %p
+  ret void
 }
 
 define void @load_store_i1(i1* %p, i1* %q) nounwind {
@@ -92,3 +98,13 @@ define void @load_store_i1(i1* %p, i1* %q) nounwind {
   store i1 %t, i1* %q
   ret void
 }
+
+
+@crash_test1x = external global <2 x i32>, align 8
+
+define void @crash_test1() nounwind ssp {
+  %tmp = load <2 x i32>* @crash_test1x, align 8
+  %neg = xor <2 x i32> %tmp, <i32 -1, i32 -1>
+  ret void
+}
+
diff --git a/test/CodeGen/X86/fold-xmm-zero.ll b/test/CodeGen/X86/fold-xmm-zero.ll
new file mode 100644
index 0000000..b4eeb40
--- /dev/null
+++ b/test/CodeGen/X86/fold-xmm-zero.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=i386-apple-macosx10.6.7 -mattr=+sse2 | FileCheck %s
+
+; Simple test to make sure folding for special constants (like float zero)
+; isn't completely broken.
+
+; CHECK: divss	LCPI0
+
+%0 = type { float, float, float, float, float, float, float, float }
+
+define void @f() nounwind ssp {
+entry:
+  %0 = tail call %0 asm sideeffect "foo", "={xmm0},={xmm1},={xmm2},={xmm3},={xmm4},={xmm5},={xmm6},={xmm7},0,1,2,3,4,5,6,7,~{dirflag},~{fpsr},~{flags}"(float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00) nounwind
+  %asmresult = extractvalue %0 %0, 0
+  %asmresult8 = extractvalue %0 %0, 1
+  %asmresult9 = extractvalue %0 %0, 2
+  %asmresult10 = extractvalue %0 %0, 3
+  %asmresult11 = extractvalue %0 %0, 4
+  %asmresult12 = extractvalue %0 %0, 5
+  %asmresult13 = extractvalue %0 %0, 6
+  %asmresult14 = extractvalue %0 %0, 7
+  %div = fdiv float %asmresult, 0.000000e+00
+  %1 = tail call %0 asm sideeffect "bar", "={xmm0},={xmm1},={xmm2},={xmm3},={xmm4},={xmm5},={xmm6},={xmm7},0,1,2,3,4,5,6,7,~{dirflag},~{fpsr},~{flags}"(float %div, float %asmresult8, float %asmresult9, float %asmresult10, float %asmresult11, float %asmresult12, float %asmresult13, float %asmresult14) nounwind
+  %asmresult24 = extractvalue %0 %1, 0
+  %asmresult25 = extractvalue %0 %1, 1
+  %asmresult26 = extractvalue %0 %1, 2
+  %asmresult27 = extractvalue %0 %1, 3
+  %asmresult28 = extractvalue %0 %1, 4
+  %asmresult29 = extractvalue %0 %1, 5
+  %asmresult30 = extractvalue %0 %1, 6
+  %asmresult31 = extractvalue %0 %1, 7
+  %div33 = fdiv float %asmresult24, 0.000000e+00
+  %2 = tail call %0 asm sideeffect "baz", "={xmm0},={xmm1},={xmm2},={xmm3},={xmm4},={xmm5},={xmm6},={xmm7},0,1,2,3,4,5,6,7,~{dirflag},~{fpsr},~{flags}"(float %div33, float %asmresult25, float %asmresult26, float %asmresult27, float %asmresult28, float %asmresult29, float %asmresult30, float %asmresult31) nounwind
+  ret void
+}
diff --git a/test/CodeGen/X86/fold-zext-trunc.ll b/test/CodeGen/X86/fold-zext-trunc.ll
new file mode 100644
index 0000000..f901ad2
--- /dev/null
+++ b/test/CodeGen/X86/fold-zext-trunc.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s | FileCheck %s
+; PR9055
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
+target triple = "i686-pc-linux-gnu"
+
+%struct.S0 = type { i32, [2 x i8], [2 x i8], [4 x i8] }
+
+@g_98 = common global %struct.S0 zeroinitializer, align 4
+
+define void @foo() nounwind {
+; CHECK: movzbl
+; CHECK-NOT: movzbl
+; CHECK: calll
+entry:
+  %tmp17 = load i8* getelementptr inbounds (%struct.S0* @g_98, i32 0, i32 1, i32 0), align 4
+  %tmp54 = zext i8 %tmp17 to i32
+  %foo = load i32* bitcast (i8* getelementptr inbounds (%struct.S0* @g_98, i32 0, i32 1, i32 0) to i32*), align 4
+  %conv.i = trunc i32 %foo to i8
+  tail call void @func_12(i32 %tmp54, i8 zeroext %conv.i) nounwind
+  ret void
+}
+
+declare void @func_12(i32, i8 zeroext)
diff --git a/test/CodeGen/X86/fp-stack-compare.ll b/test/CodeGen/X86/fp-stack-compare.ll
index b216914..f3998b6 100644
--- a/test/CodeGen/X86/fp-stack-compare.ll
+++ b/test/CodeGen/X86/fp-stack-compare.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=x86 -mcpu=i386 | grep {fucompi.*st.\[12\]}
+; RUN: llc < %s -march=x86 -mcpu=i386 | FileCheck %s
 ; PR1012
 
 define float @foo(float* %col.2.0) {
-        %tmp = load float* %col.2.0             ; <float> [#uses=3]
-        %tmp16 = fcmp olt float %tmp, 0.000000e+00              ; <i1> [#uses=1]
-        %tmp20 = fsub float -0.000000e+00, %tmp          ; <float> [#uses=1]
-        %iftmp.2.0 = select i1 %tmp16, float %tmp20, float %tmp         ; <float> [#uses=1]
-        ret float %iftmp.2.0
+; CHECK: fucompi
+  %tmp = load float* %col.2.0
+  %tmp16 = fcmp olt float %tmp, 0.000000e+00
+  %tmp20 = fsub float -0.000000e+00, %tmp
+  %iftmp.2.0 = select i1 %tmp16, float %tmp20, float %tmp
+  ret float %iftmp.2.0
 }
-
diff --git a/test/CodeGen/X86/fp-trunc.ll b/test/CodeGen/X86/fp-trunc.ll
index 4fe78ec..170637a 100644
--- a/test/CodeGen/X86/fp-trunc.ll
+++ b/test/CodeGen/X86/fp-trunc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-avx
+; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | FileCheck %s
 
 define <1 x float> @test1(<1 x double> %x) nounwind {
 ; CHECK: cvtsd2ss
diff --git a/test/CodeGen/X86/hidden-vis-pic.ll b/test/CodeGen/X86/hidden-vis-pic.ll
index ba130a2..67be3d0 100644
--- a/test/CodeGen/X86/hidden-vis-pic.ll
+++ b/test/CodeGen/X86/hidden-vis-pic.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin9 -relocation-model=pic -disable-fp-elim -unwind-tables | FileCheck %s
+; RUN: llc < %s -disable-cfi -mtriple=i386-apple-darwin9 -relocation-model=pic -disable-fp-elim | FileCheck %s
 
 
 
@@ -26,7 +26,7 @@ entry:
 
 @.str = private constant [12 x i8] c"hello world\00", align 1 ; <[12 x i8]*> [#uses=1]
 
-define hidden void @func() nounwind ssp {
+define hidden void @func() nounwind ssp uwtable {
 entry:
   %0 = call i32 @puts(i8* getelementptr inbounds ([12 x i8]* @.str, i64 0, i64 0)) nounwind ; <i32> [#uses=0]
   br label %return
@@ -37,7 +37,7 @@ return:                                           ; preds = %entry
 
 declare i32 @puts(i8*)
 
-define hidden i32 @main() nounwind ssp {
+define hidden i32 @main() nounwind ssp uwtable {
 entry:
   %retval = alloca i32                            ; <i32*> [#uses=1]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
diff --git a/test/CodeGen/X86/hoist-common.ll b/test/CodeGen/X86/hoist-common.ll
new file mode 100644
index 0000000..72e17c0
--- /dev/null
+++ b/test/CodeGen/X86/hoist-common.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx  | FileCheck %s
+
+; Common "xorb al, al" instruction in the two successor blocks should be
+; moved to the entry block above the test + je.
+
+; rdar://9145558
+
+define zeroext i1 @t(i32 %c) nounwind ssp {
+entry:
+; CHECK: t:
+; CHECK: xorb %al, %al
+; CHECK: test
+; CHECK: je
+  %tobool = icmp eq i32 %c, 0
+  br i1 %tobool, label %return, label %if.then
+
+if.then:
+; CHECK: callq
+  %call = tail call zeroext i1 (...)* @foo() nounwind
+  br label %return
+
+return:
+; CHECK: ret
+  %retval.0 = phi i1 [ %call, %if.then ], [ false, %entry ]
+  ret i1 %retval.0
+}
+
+declare zeroext i1 @foo(...)
diff --git a/test/CodeGen/X86/inline-asm-error.ll b/test/CodeGen/X86/inline-asm-error.ll
new file mode 100644
index 0000000..29c5ae5
--- /dev/null
+++ b/test/CodeGen/X86/inline-asm-error.ll
@@ -0,0 +1,17 @@
+; RUN: not llc -march x86 -regalloc=fast       < %s 2> %t1
+; RUN: not llc -march x86 -regalloc=basic      < %s 2> %t2
+; RUN: not llc -march x86 -regalloc=greedy     < %s 2> %t3
+; RUN: FileCheck %s < %t1
+; RUN: FileCheck %s < %t2
+; RUN: FileCheck %s < %t3
+
+; The register allocator must fail on this function, and it should print the
+; inline asm in the diagnostic.
+; CHECK: LLVM ERROR: Ran out of registers during register allocation!
+; CHECK: INLINEASM <es:hello world>
+
+define void @f(i32 %x0, i32 %x1, i32 %x2, i32 %x3, i32 %x4, i32 %x5, i32 %x6, i32 %x7, i32 %x8, i32 %x9) nounwind ssp {
+entry:
+  tail call void asm sideeffect "hello world", "r,r,r,r,r,r,r,r,r,r,~{dirflag},~{fpsr},~{flags}"(i32 %x0, i32 %x1, i32 %x2, i32 %x3, i32 %x4, i32 %x5, i32 %x6, i32 %x7, i32 %x8, i32 %x9) nounwind
+  ret void
+}
diff --git a/test/CodeGen/X86/isint.ll b/test/CodeGen/X86/isint.ll
index 507a328..4a98e63 100644
--- a/test/CodeGen/X86/isint.ll
+++ b/test/CodeGen/X86/isint.ll
@@ -1,17 +1,15 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 > %t
-; RUN: not grep cmp %t
-; RUN: not grep xor %t
-; RUN: grep jne %t | count 1
-; RUN: grep jp %t | count 1
-; RUN: grep setnp %t | count 1
-; RUN: grep sete %t | count 1
-; RUN: grep and %t | count 1
-; RUN: grep cvt %t | count 4
+; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
 
 define i32 @isint_return(double %d) nounwind {
+; CHECK-NOT: xor
+; CHECK: cvt
   %i = fptosi double %d to i32
+; CHECK-NEXT: cvt
   %e = sitofp i32 %i to double
+; CHECK: cmpeqsd
   %c = fcmp oeq double %d, %e
+; CHECK-NEXT: movd
+; CHECK-NEXT: andl
   %z = zext i1 %c to i32
   ret i32 %z
 }
@@ -19,9 +17,14 @@ define i32 @isint_return(double %d) nounwind {
 declare void @foo()
 
 define void @isint_branch(double %d) nounwind {
+; CHECK: cvt
   %i = fptosi double %d to i32
+; CHECK-NEXT: cvt
   %e = sitofp i32 %i to double
+; CHECK: ucomisd
   %c = fcmp oeq double %d, %e
+; CHECK-NEXT: jne
+; CHECK-NEXT: jp
   br i1 %c, label %true, label %false
 true:
   call void @foo()
diff --git a/test/CodeGen/X86/lea-3.ll b/test/CodeGen/X86/lea-3.ll
index 040c5c2..c439ee1 100644
--- a/test/CodeGen/X86/lea-3.ll
+++ b/test/CodeGen/X86/lea-3.ll
@@ -14,7 +14,7 @@ define i32 @test(i32 %a) {
         ret i32 %tmp2
 }
 
-;; TODO!  LEA instead of shift + copy.
+; CHECK: leaq (,[[A0]],8), %rax
 define i64 @test3(i64 %a) {
         %tmp2 = shl i64 %a, 3
         ret i64 %tmp2
diff --git a/test/CodeGen/X86/lock-inst-encoding.ll b/test/CodeGen/X86/lock-inst-encoding.ll
index 03468e2..2d10fbc 100644
--- a/test/CodeGen/X86/lock-inst-encoding.ll
+++ b/test/CodeGen/X86/lock-inst-encoding.ll
@@ -4,10 +4,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "x86_64-apple-darwin10.0.0"
 
 ; CHECK: f0:
-; CHECK: addq %rax, (%rdi)
-; CHECK: # encoding: [0xf0,0x48,0x01,0x07]
+; CHECK: addq %{{.*}}, ({{.*}}){{.*}}encoding: [0xf0,
 ; CHECK: ret
-define void @f0(i64* %a0) {
+define void @f0(i64* %a0) nounwind {
   %t0 = and i64 1, 1
   call void @llvm.memory.barrier(i1 true, i1 true, i1 true, i1 true, i1 true) nounwind
   %1 = call i64 @llvm.atomic.load.add.i64.p0i64(i64* %a0, i64 %t0) nounwind
diff --git a/test/CodeGen/X86/lsr-interesting-step.ll b/test/CodeGen/X86/lsr-interesting-step.ll
index 4b7050b..d1de051 100644
--- a/test/CodeGen/X86/lsr-interesting-step.ll
+++ b/test/CodeGen/X86/lsr-interesting-step.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -march=x86-64 -relocation-model=static -mtriple=x86_64-unknown-linux-gnu
+; RUN: llc < %s -march=x86-64 -relocation-model=static -mtriple=x86_64-unknown-linux-gnu -asm-verbose=0 | FileCheck %s
 
 ; The inner loop should require only one add (and no leas either).
 ; rdar://8100380
 
-; CHECK:      BB0_4:
+; CHECK:      BB0_3:
 ; CHECK-NEXT:   movb    $0, flags(%rdx)
 ; CHECK-NEXT:   addq    %rcx, %rdx
 ; CHECK-NEXT:   cmpq    $8192, %rdx
diff --git a/test/CodeGen/X86/lsr-loop-exit-cond.ll b/test/CodeGen/X86/lsr-loop-exit-cond.ll
index d33cc3a..938023f 100644
--- a/test/CodeGen/X86/lsr-loop-exit-cond.ll
+++ b/test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -1,4 +1,3 @@
-; XFAIL: *
 ; RUN: llc -march=x86-64 < %s | FileCheck %s
 
 ; CHECK: decq
diff --git a/test/CodeGen/X86/lsr-overflow.ll b/test/CodeGen/X86/lsr-overflow.ll
index 5bc4f7e..09c1c07 100644
--- a/test/CodeGen/X86/lsr-overflow.ll
+++ b/test/CodeGen/X86/lsr-overflow.ll
@@ -25,3 +25,21 @@ __ABContainsLabel.exit:
   %cmp = icmp eq i64 %indvar, 9223372036854775807
   ret i1 %cmp
 }
+
+define void @func_37() noreturn nounwind readonly {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc8, %entry
+  %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.inc8 ]
+  %sub.i = add i64 undef, %indvar
+  %cmp.i = icmp eq i64 %sub.i, -9223372036854775808
+  br i1 undef, label %for.inc8, label %for.cond4
+
+for.cond4:                                        ; preds = %for.cond4, %for.body
+  br label %for.cond4
+
+for.inc8:                                         ; preds = %for.body
+  %indvar.next = add i64 %indvar, 1
+  br label %for.body
+}
diff --git a/test/CodeGen/X86/movntdq-no-avx.ll b/test/CodeGen/X86/movntdq-no-avx.ll
new file mode 100644
index 0000000..8b7e6ef
--- /dev/null
+++ b/test/CodeGen/X86/movntdq-no-avx.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+; Test that we produce a movntdq, not a vmovntdq
+; CHECK-NOT: vmovntdq
+
+define void @test(<2 x i64>* nocapture %a, <2 x i64> %b) nounwind optsize {
+entry:
+  store <2 x i64> %b, <2 x i64>* %a, align 16, !nontemporal !0
+  ret void
+}
+
+!0 = metadata !{i32 1}
diff --git a/test/CodeGen/X86/narrow-shl-cst.ll b/test/CodeGen/X86/narrow-shl-cst.ll
new file mode 100644
index 0000000..a404f34
--- /dev/null
+++ b/test/CodeGen/X86/narrow-shl-cst.ll
@@ -0,0 +1,101 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+; PR5039
+
+define i32 @test1(i32 %x) nounwind {
+  %and = shl i32 %x, 10
+  %shl = and i32 %and, 31744
+  ret i32 %shl
+; CHECK: test1:
+; CHECK: andl $31
+; CHECK: shll $10
+}
+
+define i32 @test2(i32 %x) nounwind {
+  %or = shl i32 %x, 10
+  %shl = or i32 %or, 31744
+  ret i32 %shl
+; CHECK: test2:
+; CHECK: orl $31
+; CHECK: shll $10
+}
+
+define i32 @test3(i32 %x) nounwind {
+  %xor = shl i32 %x, 10
+  %shl = xor i32 %xor, 31744
+  ret i32 %shl
+; CHECK: test3:
+; CHECK: xorl $31
+; CHECK: shll $10
+}
+
+define i64 @test4(i64 %x) nounwind {
+  %and = shl i64 %x, 40
+  %shl = and i64 %and, 264982302294016
+  ret i64 %shl
+; CHECK: test4:
+; CHECK: andq $241
+; CHECK: shlq $40
+}
+
+define i64 @test5(i64 %x) nounwind {
+  %and = shl i64 %x, 40
+  %shl = and i64 %and, 34084860461056
+  ret i64 %shl
+; CHECK: test5:
+; CHECK: andq $31
+; CHECK: shlq $40
+}
+
+define i64 @test6(i64 %x) nounwind {
+  %and = shl i64 %x, 32
+  %shl = and i64 %and, -281474976710656
+  ret i64 %shl
+; CHECK: test6:
+; CHECK: andq $-65536
+; CHECK: shlq $32
+}
+
+define i64 @test7(i64 %x) nounwind {
+  %or = shl i64 %x, 40
+  %shl = or i64 %or, 264982302294016
+  ret i64 %shl
+; CHECK: test7:
+; CHECK: orq $241
+; CHECK: shlq $40
+}
+
+define i64 @test8(i64 %x) nounwind {
+  %or = shl i64 %x, 40
+  %shl = or i64 %or, 34084860461056
+  ret i64 %shl
+; CHECK: test8:
+; CHECK: orq $31
+; CHECK: shlq $40
+}
+
+define i64 @test9(i64 %x) nounwind {
+  %xor = shl i64 %x, 40
+  %shl = xor i64 %xor, 264982302294016
+  ret i64 %shl
+; CHECK: test9:
+; CHECK: orq $241
+; CHECK: shlq $40
+}
+
+define i64 @test10(i64 %x) nounwind {
+  %xor = shl i64 %x, 40
+  %shl = xor i64 %xor, 34084860461056
+  ret i64 %shl
+; CHECK: test10:
+; CHECK: xorq $31
+; CHECK: shlq $40
+}
+
+define i64 @test11(i64 %x) nounwind {
+  %xor = shl i64 %x, 33
+  %shl = xor i64 %xor, -562949953421312
+  ret i64 %shl
+; CHECK: test11:
+; CHECK: xorq $-65536
+; CHECK: shlq $33
+}
diff --git a/test/CodeGen/X86/no-cfi.ll b/test/CodeGen/X86/no-cfi.ll
new file mode 100644
index 0000000..f9985d4
--- /dev/null
+++ b/test/CodeGen/X86/no-cfi.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -disable-cfi | FileCheck --check-prefix=STATIC %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -disable-cfi -relocation-model=pic | FileCheck --check-prefix=PIC %s
+
+; STATIC:      .ascii   "zPLR"
+; STATIC:      .byte   3
+; STATIC-NEXT: .long   __gxx_personality_v0
+; STATIC-NEXT: .byte   3
+; STATIC-NEXT: .byte   3
+
+; PIC:      .ascii   "zPLR"
+; PIC:      .byte   155
+; PIC-NEXT: .L
+; PIC-NEXT: .long   DW.ref.__gxx_personality_v0-.L
+; PIC-NEXT: .byte   27
+; PIC-NEXT: .byte   27
+
+
+define void @bar() {
+entry:
+  %call = invoke i32 @foo()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+  ret void
+
+lpad:
+  %exn = call i8* @llvm.eh.exception() nounwind
+  %eh.selector = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* null) nounwind
+  ret void
+}
+
+declare i32 @foo()
+
+declare i8* @llvm.eh.exception() nounwind readonly
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i32 @llvm.eh.selector(i8*, i8*, ...) nounwind
diff --git a/test/CodeGen/X86/non-lazy-bind.ll b/test/CodeGen/X86/non-lazy-bind.ll
new file mode 100644
index 0000000..f729658
--- /dev/null
+++ b/test/CodeGen/X86/non-lazy-bind.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=x86_64-apple-darwin < %s | FileCheck %s
+
+declare void @lazy() nonlazybind
+declare void @not()
+
+; CHECK: foo:
+; CHECK:  callq _not
+; CHECK:  callq *_lazy@GOTPCREL(%rip)
+define void @foo() nounwind {
+  call void @not()
+  call void @lazy()
+  ret void
+}
+
+; CHECK: tail_call_regular:
+; CHECK:   jmp _not
+define void @tail_call_regular() nounwind {
+  tail call void @not()
+  ret void
+}
+
+; CHECK: tail_call_eager:
+; CHECK:   jmpq *_lazy@GOTPCREL(%rip)
+define void @tail_call_eager() nounwind {
+  tail call void @lazy()
+  ret void
+}
diff --git a/test/CodeGen/X86/nontemporal.ll b/test/CodeGen/X86/nontemporal.ll
new file mode 100644
index 0000000..1d09535
--- /dev/null
+++ b/test/CodeGen/X86/nontemporal.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+
+define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E) {
+; CHECK: movntps
+  %cast = bitcast i8* %B to <4 x float>*
+  store <4 x float> %A, <4 x float>* %cast, align 16, !nontemporal !0
+; CHECK: movntdq
+  %cast1 = bitcast i8* %B to <2 x i64>*
+  store <2 x i64> %E, <2 x i64>* %cast1, align 16, !nontemporal !0
+; CHECK: movntpd
+  %cast2 = bitcast i8* %B to <2 x double>*
+  store <2 x double> %C, <2 x double>* %cast2, align 16, !nontemporal !0
+; CHECK: movnti
+  %cast3 = bitcast i8* %B to i32*
+  store i32 %D, i32* %cast3, align 16, !nontemporal !0
+  ret void
+}
+
+!0 = metadata !{i32 1}
diff --git a/test/CodeGen/X86/opt-ext-uses.ll b/test/CodeGen/X86/opt-ext-uses.ll
index fa2aef5..72fb38b 100644
--- a/test/CodeGen/X86/opt-ext-uses.ll
+++ b/test/CodeGen/X86/opt-ext-uses.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=x86 | grep movw | count 1
 
-define i16 @t() signext  {
+define signext i16 @t()   {
 entry:
         %tmp180 = load i16* null, align 2               ; <i16> [#uses=3]
         %tmp180181 = sext i16 %tmp180 to i32            ; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/optimize-max-3.ll b/test/CodeGen/X86/optimize-max-3.ll
index e35eb70..e42aa9d 100644
--- a/test/CodeGen/X86/optimize-max-3.ll
+++ b/test/CodeGen/X86/optimize-max-3.ll
@@ -41,14 +41,13 @@ for.end:                                          ; preds = %for.body, %entry
 
 ;      CHECK:         jle
 ;  CHECK-NOT:         cmov
-;      CHECK:         xorl    {{%edi, %edi|%ecx, %ecx}}
+;      CHECK:         xorl    {{%edi, %edi|%ecx, %ecx|%eax, %eax}}
 ; CHECK-NEXT:         align
 ; CHECK-NEXT: BB1_2:
-; CHECK-NEXT:         callq
+; CHECK:              callq
 ; CHECK-NEXT:         incl    [[BX:%[a-z0-9]+]]
 ; CHECK-NEXT:         cmpl    [[R14:%[a-z0-9]+]], [[BX]]
-; CHECK-NEXT:         movq    %rax, %r{{di|cx}}
-; CHECK-NEXT:         jl
+; CHECK:              jl
 
 define void @_Z18GenerateStatusPagei(i32 %jobs_to_display) nounwind {
 entry:
diff --git a/test/CodeGen/X86/peep-setb.ll b/test/CodeGen/X86/peep-setb.ll
new file mode 100644
index 0000000..0bab789
--- /dev/null
+++ b/test/CodeGen/X86/peep-setb.ll
@@ -0,0 +1,82 @@
+; RUN: llc -march=x86-64 < %s | FileCheck %s
+
+define i8 @test1(i8 %a, i8 %b) nounwind {
+  %cmp = icmp ult i8 %a, %b
+  %cond = zext i1 %cmp to i8
+  %add = add i8 %cond, %b
+  ret i8 %add
+; CHECK: test1:
+; CHECK: adcb $0
+}
+
+define i32 @test2(i32 %a, i32 %b) nounwind {
+  %cmp = icmp ult i32 %a, %b
+  %cond = zext i1 %cmp to i32
+  %add = add i32 %cond, %b
+  ret i32 %add
+; CHECK: test2:
+; CHECK: adcl $0
+}
+
+define i64 @test3(i64 %a, i64 %b) nounwind {
+  %cmp = icmp ult i64 %a, %b
+  %conv = zext i1 %cmp to i64
+  %add = add i64 %conv, %b
+  ret i64 %add
+; CHECK: test3:
+; CHECK: adcq $0
+}
+
+define i8 @test4(i8 %a, i8 %b) nounwind {
+  %cmp = icmp ult i8 %a, %b
+  %cond = zext i1 %cmp to i8
+  %sub = sub i8 %b, %cond
+  ret i8 %sub
+; CHECK: test4:
+; CHECK: sbbb $0
+}
+
+define i32 @test5(i32 %a, i32 %b) nounwind {
+  %cmp = icmp ult i32 %a, %b
+  %cond = zext i1 %cmp to i32
+  %sub = sub i32 %b, %cond
+  ret i32 %sub
+; CHECK: test5:
+; CHECK: sbbl $0
+}
+
+define i64 @test6(i64 %a, i64 %b) nounwind {
+  %cmp = icmp ult i64 %a, %b
+  %conv = zext i1 %cmp to i64
+  %sub = sub i64 %b, %conv
+  ret i64 %sub
+; CHECK: test6:
+; CHECK: sbbq $0
+}
+
+define i8 @test7(i8 %a, i8 %b) nounwind {
+  %cmp = icmp ult i8 %a, %b
+  %cond = sext i1 %cmp to i8
+  %sub = sub i8 %b, %cond
+  ret i8 %sub
+; CHECK: test7:
+; CHECK: adcb $0
+}
+
+define i32 @test8(i32 %a, i32 %b) nounwind {
+  %cmp = icmp ult i32 %a, %b
+  %cond = sext i1 %cmp to i32
+  %sub = sub i32 %b, %cond
+  ret i32 %sub
+; CHECK: test8:
+; CHECK: adcl $0
+}
+
+define i64 @test9(i64 %a, i64 %b) nounwind {
+  %cmp = icmp ult i64 %a, %b
+  %conv = sext i1 %cmp to i64
+  %sub = sub i64 %b, %conv
+  ret i64 %sub
+; CHECK: test9:
+; CHECK: adcq $0
+}
diff --git a/test/CodeGen/X86/personality.ll b/test/CodeGen/X86/personality.ll
index 705e489..e952a9b 100644
--- a/test/CodeGen/X86/personality.ll
+++ b/test/CodeGen/X86/personality.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -disable-cgp-branch-opts | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mtriple=i386-apple-darwin9 -disable-cgp-branch-opts | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -disable-cfi -mtriple=x86_64-apple-darwin9 -disable-cgp-branch-opts | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -disable-cfi -mtriple=i386-apple-darwin9 -disable-cgp-branch-opts | FileCheck %s -check-prefix=X32
 ; PR1632
 
 define void @_Z1fv() {
@@ -38,13 +38,15 @@ declare void @__gxx_personality_v0()
 
 declare void @__cxa_end_catch()
 
-; X64: Leh_frame_common_begin0:
-; X64: .long	___gxx_personality_v0@GOTPCREL+4
+; X64:      zPLR
+; X64:      .byte 155
+; X64-NEXT: .long	___gxx_personality_v0@GOTPCREL+4
 
-; X32: Leh_frame_common_begin0:
-; X32: .long	L___gxx_personality_v0$non_lazy_ptr-
-; ....
+; X32:        .section	__IMPORT,__pointers,non_lazy_symbol_pointers
+; X32-NEXT: L___gxx_personality_v0$non_lazy_ptr:
+; X32-NEXT:   .indirect_symbol ___gxx_personality_v0
 
-; X32: .section	__IMPORT,__pointers,non_lazy_symbol_pointers
-; X32: L___gxx_personality_v0$non_lazy_ptr:
-; X32:   .indirect_symbol ___gxx_personality_v0
+; X32:      zPLR
+; X32:      .byte 155
+; X32-NEXT: :
+; X32-NEXT: .long	L___gxx_personality_v0$non_lazy_ptr-
diff --git a/test/CodeGen/X86/phys_subreg_coalesce-2.ll b/test/CodeGen/X86/phys_subreg_coalesce-2.ll
index 13e804d..02c519f 100644
--- a/test/CodeGen/X86/phys_subreg_coalesce-2.ll
+++ b/test/CodeGen/X86/phys_subreg_coalesce-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep mov | count 4
+; RUN: llc < %s -march=x86 | FileCheck %s
 ; PR2659
 
 define i32 @binomial(i32 %n, i32 %k) nounwind {
@@ -12,7 +12,8 @@ forcond.preheader:		; preds = %entry
 
 ifthen:		; preds = %entry
 	ret i32 0
-
+; CHECK: forbody
+; CHECK-NOT: mov
 forbody:		; preds = %forbody, %forcond.preheader
 	%indvar = phi i32 [ 0, %forcond.preheader ], [ %divisor.02, %forbody ]		; <i32> [#uses=3]
 	%accumulator.01 = phi i32 [ 1, %forcond.preheader ], [ %div, %forbody ]		; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/phys_subreg_coalesce-3.ll b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
index f23669e..4162015 100644
--- a/test/CodeGen/X86/phys_subreg_coalesce-3.ll
+++ b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
@@ -1,6 +1,11 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -join-physregs | FileCheck %s
 ; rdar://5571034
 
+; This requires physreg joining, %vreg13 is live everywhere:
+; 304L		%CL<def> = COPY %vreg13:sub_8bit; GR32_ABCD:%vreg13
+; 320L		%vreg15<def> = COPY %vreg19; GR32:%vreg15 GR32_NOSP:%vreg19
+; 336L		%vreg15<def> = SAR32rCL %vreg15, %EFLAGS<imp-def,dead>, %CL<imp-use,kill>; GR32:%vreg15
+
 define void @foo(i32* nocapture %quadrant, i32* nocapture %ptr, i32 %bbSize, i32 %bbStart, i32 %shifts) nounwind ssp {
 ; CHECK: foo:
 entry:
diff --git a/test/CodeGen/X86/pic.ll b/test/CodeGen/X86/pic.ll
index dc5fcd7..fb60ac2 100644
--- a/test/CodeGen/X86/pic.ll
+++ b/test/CodeGen/X86/pic.ll
@@ -79,8 +79,8 @@ entry:
 ; LINUX-NEXT: .L3$pb:
 ; LINUX: 	popl
 ; LINUX: 	addl	$_GLOBAL_OFFSET_TABLE_+(.L{{.*}}-.L3$pb), %[[REG3:e..]]
-; LINUX: 	movl	pfoo@GOT(%[[REG3]]),
 ; LINUX: 	calll	afoo@PLT
+; LINUX: 	movl	pfoo@GOT(%[[REG3]]),
 ; LINUX: 	calll	*
 }
 
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index bf5229a..d8ed4c0 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1,7 +1,9 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -stack-alignment=16 > %t
+; RUN: llc < %s -march=x86 -mattr=sse41 -stack-alignment=16 -join-physregs > %t
 ; RUN: grep pmul %t | count 12
 ; RUN: grep mov %t | count 11
 
+; The f() arguments in %xmm0 and %xmm1 cause an extra movdqa without -join-physregs.
+
 define <4 x i32> @a(<4 x i32> %i) nounwind  {
         %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
         ret <4 x i32> %A
diff --git a/test/CodeGen/X86/pr10068.ll b/test/CodeGen/X86/pr10068.ll
new file mode 100644
index 0000000..8829c5d
--- /dev/null
+++ b/test/CodeGen/X86/pr10068.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=x86
+
+define void @foobar() {
+entry:
+  %sub.i = trunc i64 undef to i32
+  %shr80.i = ashr i32 %sub.i, 16
+  %add82.i = add nsw i32 %shr80.i, 1
+  %notlhs.i = icmp slt i32 %shr80.i, undef
+  %notrhs.i = icmp sgt i32 %add82.i, -1
+  %or.cond.not.i = and i1 %notrhs.i, %notlhs.i
+  %cmp154.i = icmp slt i32 0, undef
+  %or.cond406.i = and i1 %or.cond.not.i, %cmp154.i
+  %or.cond406.not.i = xor i1 %or.cond406.i, true
+  %or.cond407.i = or i1 undef, %or.cond406.not.i
+  br i1 %or.cond407.i, label %if.then158.i, label %if.end163.i
+
+if.then158.i:
+  ret void
+
+if.end163.i:                                      ; preds = %if.end67.i
+  ret void
+}
diff --git a/test/CodeGen/X86/pr2659.ll b/test/CodeGen/X86/pr2659.ll
index 54d043d..5dab5c9 100644
--- a/test/CodeGen/X86/pr2659.ll
+++ b/test/CodeGen/X86/pr2659.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin9.4.0 | grep movl | count 4
-; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin9.4.0 | FileCheck %s
+; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin9.4.0 -disable-branch-fold | FileCheck %s
 ; PR2659
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
@@ -18,7 +17,12 @@ forcond.preheader:              ; preds = %entry
 ; CHECK: movl $1
 ; CHECK-NOT: xorl
 ; CHECK-NOT: movl
-; CHECK-NEXT: je
+; CHECK-NOT: LBB
+; CHECK: jne
+
+; There should be no moves required in the for loop body.
+; CHECK: %forbody
+; CHECK-NOT: mov
 
 ifthen:         ; preds = %entry
   ret i32 0
diff --git a/test/CodeGen/X86/pr9127.ll b/test/CodeGen/X86/pr9127.ll
index 9b251f5..ba92c77 100644
--- a/test/CodeGen/X86/pr9127.ll
+++ b/test/CodeGen/X86/pr9127.ll
@@ -10,4 +10,4 @@ entry:
 }
 
 ; test that the load is folded.
-; CHECK: ucomisd	(%{{rdi|rdx}}), %xmm0
+; CHECK: cmpeqsd	(%{{rdi|rdx}}), %xmm0
diff --git a/test/CodeGen/X86/pr9743.ll b/test/CodeGen/X86/pr9743.ll
new file mode 100644
index 0000000..6597c23
--- /dev/null
+++ b/test/CodeGen/X86/pr9743.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -disable-fp-elim -asm-verbose=0 | FileCheck %s
+
+define void @f() {
+  ret void
+}
+
+; CHECK:       .cfi_startproc
+; CHECK-NEXT:  pushq
+; CHECK-NEXT: :
+; CHECK-NEXT:  .cfi_def_cfa_offset 16
+; CHECK-NEXT: :
+; CHECK-NEXT:  .cfi_offset %rbp, -16
+; CHECK-NEXT:  movq    %rsp, %rbp
+; CHECK-NEXT: :
+; CHECK-NEXT:  .cfi_def_cfa_register %rbp
+; CHECK-NEXT:  popq    %rbp
+; CHECK-NEXT:  ret
diff --git a/test/CodeGen/X86/prefetch.ll b/test/CodeGen/X86/prefetch.ll
index 48d2673..ebe11a5 100644
--- a/test/CodeGen/X86/prefetch.ll
+++ b/test/CodeGen/X86/prefetch.ll
@@ -6,11 +6,11 @@ entry:
 ; CHECK: prefetcht1
 ; CHECK: prefetcht0
 ; CHECK: prefetchnta
-	tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 1 )
-	tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 2 )
-	tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3 )
-	tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 0 )
+	tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 1, i32 1 )
+	tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 2, i32 1 )
+	tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 1 )
+	tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 0, i32 1 )
 	ret void
 }
 
-declare void @llvm.prefetch(i8*, i32, i32) nounwind 
+declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind 
diff --git a/test/CodeGen/X86/promote-i16.ll b/test/CodeGen/X86/promote-i16.ll
index 101bb29..3c91d74 100644
--- a/test/CodeGen/X86/promote-i16.ll
+++ b/test/CodeGen/X86/promote-i16.ll
@@ -3,9 +3,19 @@
 define signext i16 @foo(i16 signext %x) nounwind {
 entry:
 ; CHECK: foo:
-; CHECK: movzwl 4(%esp), %eax
+; CHECK-NOT: movzwl
+; CHECK: movswl 4(%esp), %eax
 ; CHECK: xorl $21998, %eax
-; CHECK: movswl %ax, %eax
   %0 = xor i16 %x, 21998
   ret i16 %0
 }
+
+define signext i16 @bar(i16 signext %x) nounwind {
+entry:
+; CHECK: bar:
+; CHECK-NOT: movzwl
+; CHECK: movswl 4(%esp), %eax
+; CHECK: xorl $-10770, %eax
+  %0 = xor i16 %x, 54766
+  ret i16 %0
+}
diff --git a/test/CodeGen/X86/ret-mmx.ll b/test/CodeGen/X86/ret-mmx.ll
index 04b57dd..865e147 100644
--- a/test/CodeGen/X86/ret-mmx.ll
+++ b/test/CodeGen/X86/ret-mmx.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -mattr=+mmx,+sse2 | FileCheck %s
 ; rdar://6602459
 
 @g_v1di = external global <1 x i64>
@@ -8,19 +8,32 @@ entry:
 	%call = call <1 x i64> @return_v1di()		; <<1 x i64>> [#uses=0]
 	store <1 x i64> %call, <1 x i64>* @g_v1di
         ret void
+; CHECK: t1:
+; CHECK: callq
+; CHECK-NEXT: movq	_g_v1di
+; CHECK-NEXT: movq	%rax,
 }
 
 declare <1 x i64> @return_v1di()
 
 define <1 x i64> @t2() nounwind {
 	ret <1 x i64> <i64 1>
+; CHECK: t2:
+; CHECK: movl	$1
+; CHECK-NEXT: ret
 }
 
 define <2 x i32> @t3() nounwind {
 	ret <2 x i32> <i32 1, i32 0>
+; CHECK: t3:
+; CHECK: movl $1
+; CHECK: movd {{.*}}, %xmm0
 }
 
 define double @t4() nounwind {
 	ret double bitcast (<2 x i32> <i32 1, i32 0> to double)
+; CHECK: t4:
+; CHECK: movl $1
+; CHECK: movd {{.*}}, %xmm0
 }
 
diff --git a/test/CodeGen/X86/setoeq.ll b/test/CodeGen/X86/setoeq.ll
index 4a9c1ba..aa2f0af 100644
--- a/test/CodeGen/X86/setoeq.ll
+++ b/test/CodeGen/X86/setoeq.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=x86  | grep set | count 2
-; RUN: llc < %s -march=x86  | grep and
+; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
 
 define zeroext i8 @t(double %x) nounwind readnone {
 entry:
@@ -7,5 +6,16 @@ entry:
 	%1 = sitofp i32 %0 to double		; <double> [#uses=1]
 	%2 = fcmp oeq double %1, %x		; <i1> [#uses=1]
 	%retval12 = zext i1 %2 to i8		; <i8> [#uses=1]
+; CHECK: cmpeqsd
+	ret i8 %retval12
+}
+
+define zeroext i8 @u(double %x) nounwind readnone {
+entry:
+	%0 = fptosi double %x to i32		; <i32> [#uses=1]
+	%1 = sitofp i32 %0 to double		; <double> [#uses=1]
+	%2 = fcmp une double %1, %x		; <i1> [#uses=1]
+	%retval12 = zext i1 %2 to i8		; <i8> [#uses=1]
+; CHECK: cmpneqsd
 	ret i8 %retval12
 }
diff --git a/test/CodeGen/X86/sext-trunc.ll b/test/CodeGen/X86/sext-trunc.ll
index 2eaf425..22b3791 100644
--- a/test/CodeGen/X86/sext-trunc.ll
+++ b/test/CodeGen/X86/sext-trunc.ll
@@ -3,7 +3,7 @@
 ; RUN: not grep movz %t
 ; RUN: not grep and %t
 
-define i8 @foo(i16 signext  %x) signext nounwind  {
+define signext  i8 @foo(i16 signext  %x) nounwind  {
 	%retval56 = trunc i16 %x to i8
 	ret i8 %retval56
 }
diff --git a/test/CodeGen/X86/shift-pair.ll b/test/CodeGen/X86/shift-pair.ll
new file mode 100644
index 0000000..24ba1fc
--- /dev/null
+++ b/test/CodeGen/X86/shift-pair.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+
+define i64 @test(i64 %A) {
+; CHECK: @test
+; CHECK: shrq $54
+; CHECK: andq $1020
+; CHECK: ret
+    %B = lshr i64 %A, 56
+    %C = shl i64 %B, 2
+    ret i64 %C
+}
diff --git a/test/CodeGen/X86/shl_undef.ll b/test/CodeGen/X86/shl_undef.ll
new file mode 100644
index 0000000..54b74cc
--- /dev/null
+++ b/test/CodeGen/X86/shl_undef.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -O1 -mtriple=i386-apple-darwin | FileCheck %s
+;
+; Interesting test case where %tmp1220 = xor i32 %tmp862, %tmp592 and
+; %tmp1676 = xor i32 %tmp1634, %tmp1530 have zero demanded bits after
+; DAGCombiner optimization pass.  These are changed to undef and in turn
+; the successor shl(s) become shl undef, 1.  This pattern then matches
+; shl x, 1 -> add x, x.  add undef, undef doesn't guarentee the low
+; order bit is zero and is incorrect.
+;
+; See rdar://9453156 and rdar://9487392.
+;
+
+; CHECK-NOT: shl
+define i32 @foo(i8* %a0, i32* %a2) nounwind {
+entry:
+  %tmp0 = alloca i8
+  %tmp1 = alloca i32
+  store i8 1, i8* %tmp0
+  %tmp921.i7845 = load i8* %a0, align 1
+  %tmp309 = xor i8 %tmp921.i7845, 104
+  %tmp592 = zext i8 %tmp309 to i32
+  %tmp862 = xor i32 1293461297, %tmp592
+  %tmp1220 = xor i32 %tmp862, %tmp592
+  %tmp1506 = shl i32 %tmp1220, 1
+  %tmp1530 = sub i32 %tmp592, %tmp1506
+  %tmp1557 = sub i32 %tmp1530, 542767629
+  %tmp1607 = and i32 %tmp1557, 1
+  store i32 %tmp1607, i32* %tmp1
+  %tmp1634 = and i32 %tmp1607, 2080309246
+  %tmp1676 = xor i32 %tmp1634, %tmp1530
+  %tmp1618 = shl i32 %tmp1676, 1
+  %tmp1645 = sub i32 %tmp862, %tmp1618
+  %tmp1697 = and i32 %tmp1645, 1
+  store i32 %tmp1697, i32* %a2
+  ret i32 %tmp1607
+}
+
+; CHECK-NOT: shl
+; shl undef, 0 -> undef
+define i32 @foo2_undef() nounwind {
+entry:
+  %tmp2 = shl i32 undef, 0;
+  ret i32 %tmp2
+}
+
+; CHECK-NOT: shl
+; shl undef, x -> 0
+define i32 @foo1_undef(i32* %a0) nounwind {
+entry:
+  %tmp1 = load i32* %a0, align 1
+  %tmp2 = shl i32 undef, %tmp1;
+  ret i32 %tmp2
+}
diff --git a/test/CodeGen/X86/shrink-compare.ll b/test/CodeGen/X86/shrink-compare.ll
new file mode 100644
index 0000000..8d4b07f
--- /dev/null
+++ b/test/CodeGen/X86/shrink-compare.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+
+declare void @bar()
+
+define void @test1(i32* nocapture %X) nounwind {
+entry:
+  %tmp1 = load i32* %X, align 4
+  %and = and i32 %tmp1, 255
+  %cmp = icmp eq i32 %and, 47
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+; CHECK: test1:
+; CHECK: cmpb $47, (%{{rdi|rcx}})
+}
+
+define void @test2(i32 %X) nounwind {
+entry:
+  %and = and i32 %X, 255
+  %cmp = icmp eq i32 %and, 47
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+; CHECK: test2:
+; CHECK: cmpb $47, %{{dil|cl}}
+}
diff --git a/test/CodeGen/X86/sibcall.ll b/test/CodeGen/X86/sibcall.ll
index de2a81e..4a98efb 100644
--- a/test/CodeGen/X86/sibcall.ll
+++ b/test/CodeGen/X86/sibcall.ll
@@ -198,7 +198,7 @@ declare i32 @foo6(i32, i32, %struct.t* byval align 4)
 
 ; rdar://r7717598
 %struct.ns = type { i32, i32 }
-%struct.cp = type { float, float }
+%struct.cp = type { float, float, float, float, float }
 
 define %struct.ns* @t13(%struct.cp* %yy) nounwind ssp {
 ; 32: t13:
@@ -229,7 +229,7 @@ entry:
 ; 64: t14:
 ; 64: movq 32(%rdi)
 ; 64-NOT: movq 16(%rdi)
-; 64: jmpq *16(%rdi)
+; 64: jmpq *16({{%rdi|%rax}})
   %0 = getelementptr inbounds %struct.__block_literal_2* %.block_descriptor, i64 0, i32 5 ; <void ()**> [#uses=1]
   %1 = load void ()** %0, align 8                 ; <void ()*> [#uses=2]
   %2 = bitcast void ()* %1 to %struct.__block_literal_1* ; <%struct.__block_literal_1*> [#uses=1]
diff --git a/test/CodeGen/X86/smul-with-overflow-2.ll b/test/CodeGen/X86/smul-with-overflow-2.ll
deleted file mode 100644
index 7c23adb..0000000
--- a/test/CodeGen/X86/smul-with-overflow-2.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; RUN: llc < %s -march=x86 | grep mul | count 1
-; RUN: llc < %s -march=x86 | grep add | count 3
-
-define i32 @t1(i32 %a, i32 %b) nounwind readnone {
-entry:
-        %tmp0 = add i32 %b, %a
-	%tmp1 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %tmp0, i32 2)
-	%tmp2 = extractvalue { i32, i1 } %tmp1, 0
-	ret i32 %tmp2
-}
-
-define i32 @t2(i32 %a, i32 %b) nounwind readnone {
-entry:
-        %tmp0 = add i32 %b, %a
-	%tmp1 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %tmp0, i32 4)
-	%tmp2 = extractvalue { i32, i1 } %tmp1, 0
-	ret i32 %tmp2
-}
-
-declare { i32, i1 } @llvm.smul.with.overflow.i32(i32, i32) nounwind
diff --git a/test/CodeGen/X86/smul-with-overflow-3.ll b/test/CodeGen/X86/smul-with-overflow-3.ll
deleted file mode 100644
index 49c31f5..0000000
--- a/test/CodeGen/X86/smul-with-overflow-3.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: llc < %s -march=x86 | grep {jno} | count 1
-
-@ok = internal constant [4 x i8] c"%d\0A\00"
-@no = internal constant [4 x i8] c"no\0A\00"
-
-define i1 @func1(i32 %v1, i32 %v2) nounwind {
-entry:
-  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
-  %sum = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  br i1 %obit, label %overflow, label %normal
-
-overflow:
-  %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8]* @no, i32 0, i32 0) ) nounwind
-  ret i1 false
-
-normal:
-  %t1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
-  ret i1 true
-}
-
-declare i32 @printf(i8*, ...) nounwind
-declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32)
diff --git a/test/CodeGen/X86/smul-with-overflow.ll b/test/CodeGen/X86/smul-with-overflow.ll
index 6d125e4..7c2e247 100644
--- a/test/CodeGen/X86/smul-with-overflow.ll
+++ b/test/CodeGen/X86/smul-with-overflow.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -march=x86 | grep {jo} | count 1
+; RUN: llc < %s -march=x86 | FileCheck %s
 
 @ok = internal constant [4 x i8] c"%d\0A\00"
 @no = internal constant [4 x i8] c"no\0A\00"
 
-define i1 @func1(i32 %v1, i32 %v2) nounwind {
+define i1 @test1(i32 %v1, i32 %v2) nounwind {
 entry:
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %sum = extractvalue {i32, i1} %t, 0
@@ -17,7 +17,53 @@ normal:
 overflow:
   %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8]* @no, i32 0, i32 0) ) nounwind
   ret i1 false
+; CHECK: test1:
+; CHECK: imull
+; CHECK-NEXT: jo
+}
+
+define i1 @test2(i32 %v1, i32 %v2) nounwind {
+entry:
+  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+  %sum = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  br i1 %obit, label %overflow, label %normal
+
+overflow:
+  %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8]* @no, i32 0, i32 0) ) nounwind
+  ret i1 false
+
+normal:
+  %t1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
+  ret i1 true
+; CHECK: test2:
+; CHECK: imull
+; CHECK-NEXT: jno
 }
 
 declare i32 @printf(i8*, ...) nounwind
 declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32)
+
+define i32 @test3(i32 %a, i32 %b) nounwind readnone {
+entry:
+	%tmp0 = add i32 %b, %a
+	%tmp1 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %tmp0, i32 2)
+	%tmp2 = extractvalue { i32, i1 } %tmp1, 0
+	ret i32 %tmp2
+; CHECK: test3:
+; CHECK: addl
+; CHECK-NEXT: addl
+; CHECK-NEXT: ret
+}
+
+define i32 @test4(i32 %a, i32 %b) nounwind readnone {
+entry:
+	%tmp0 = add i32 %b, %a
+	%tmp1 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %tmp0, i32 4)
+	%tmp2 = extractvalue { i32, i1 } %tmp1, 0
+	ret i32 %tmp2
+; CHECK: test4:
+; CHECK: addl
+; CHECK: mull
+; CHECK-NEXT: ret
+}
diff --git a/test/CodeGen/X86/sse-minmax.ll b/test/CodeGen/X86/sse-minmax.ll
index 348121a..ff0af25 100644
--- a/test/CodeGen/X86/sse-minmax.ll
+++ b/test/CodeGen/X86/sse-minmax.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86-64 -asm-verbose=false | FileCheck %s
-; RUN: llc < %s -march=x86-64 -asm-verbose=false -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck -check-prefix=UNSAFE %s
-; RUN: llc < %s -march=x86-64 -asm-verbose=false -enable-no-nans-fp-math | FileCheck -check-prefix=FINITE %s
+; RUN: llc < %s -march=x86-64 -asm-verbose=false -join-physregs | FileCheck %s
+; RUN: llc < %s -march=x86-64 -asm-verbose=false -join-physregs -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck -check-prefix=UNSAFE %s
+; RUN: llc < %s -march=x86-64 -asm-verbose=false -join-physregs -enable-no-nans-fp-math | FileCheck -check-prefix=FINITE %s
 
 ; Some of these patterns can be matched as SSE min or max. Some of
 ; then can be matched provided that the operands are swapped.
@@ -12,6 +12,9 @@
 ; y_ : use -0.0 instead of %y
 ; _inverse : swap the arms of the select.
 
+; Some of these tests depend on -join-physregs commuting instructions to
+; eliminate copies.
+
 ; CHECK:      ogt:
 ; CHECK-NEXT: maxsd %xmm1, %xmm0
 ; CHECK-NEXT: ret
diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll
index 8e72f13..8c2e58d 100644
--- a/test/CodeGen/X86/sse3.ll
+++ b/test/CodeGen/X86/sse3.ll
@@ -62,11 +62,10 @@ define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind {
 	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 7, i32 2, i32 3, i32 1, i32 5, i32 6, i32 5 >
 	ret <8 x i16> %tmp
 ; X64: t4:
-; X64: 	pextrw	$7, %xmm0, %eax
-; X64: 	pshufhw	$100, %xmm0, %xmm1
-; X64: 	pinsrw	$1, %eax, %xmm1
-; X64: 	pextrw	$1, %xmm0, %eax
-; X64: 	movdqa	%xmm1, %xmm0
+; X64: 	pextrw	$7, [[XMM0:%xmm[0-9]+]], %eax
+; X64: 	pshufhw	$100, [[XMM0]], [[XMM1:%xmm[0-9]+]]
+; X64: 	pinsrw	$1, %eax, [[XMM1]]
+; X64: 	pextrw	$1, [[XMM0]], %eax
 ; X64: 	pinsrw	$4, %eax, %xmm0
 ; X64: 	ret
 }
@@ -251,13 +250,13 @@ entry:
         %tmp9 = shufflevector <16 x i8> %tmp8, <16 x i8> %T0,  <16 x i32> < i32 0, i32 1, i32 2, i32 17,  i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
         ret <16 x i8> %tmp9
 ; X64: 	t16:
-; X64: 		pinsrw	$0, %eax, %xmm1
-; X64: 		pextrw	$8, %xmm0, %eax
-; X64: 		pinsrw	$1, %eax, %xmm1
-; X64: 		pextrw	$1, %xmm1, %ecx
-; X64: 		movd	%xmm1, %edx
-; X64: 		pinsrw	$0, %edx, %xmm1
-; X64: 		pinsrw	$1, %eax, %xmm0
+; X64: 		pinsrw	$0, %eax, [[X1:%xmm[0-9]+]]
+; X64: 		pextrw	$8, [[X0:%xmm[0-9]+]], %eax
+; X64: 		pinsrw	$1, %eax, [[X1]]
+; X64: 		pextrw	$1, [[X1]], %ecx
+; X64: 		movd	[[X1]], %edx
+; X64: 		pinsrw	$0, %edx, %xmm
+; X64: 		pinsrw	$1, %eax, %xmm
 ; X64: 		ret
 }
 
diff --git a/test/CodeGen/X86/sse42.ll b/test/CodeGen/X86/sse42.ll
index 1723909..c787523 100644
--- a/test/CodeGen/X86/sse42.ll
+++ b/test/CodeGen/X86/sse42.ll
@@ -1,38 +1,39 @@
 ; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse42 | FileCheck %s -check-prefix=X32
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse42 | FileCheck %s -check-prefix=X64
 
-declare i32 @llvm.x86.sse42.crc32.8(i32, i8) nounwind
-declare i32 @llvm.x86.sse42.crc32.16(i32, i16) nounwind
-declare i32 @llvm.x86.sse42.crc32.32(i32, i32) nounwind
+declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
+declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
+declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
 
-define i32 @crc32_8(i32 %a, i8 %b) nounwind {
-  %tmp = call i32 @llvm.x86.sse42.crc32.8(i32 %a, i8 %b)
+define i32 @crc32_32_8(i32 %a, i8 %b) nounwind {
+  %tmp = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a, i8 %b)
   ret i32 %tmp
-; X32: _crc32_8:
+; X32: _crc32_32_8:
 ; X32:     crc32b   8(%esp), %eax
 
-; X64: _crc32_8:
-; X64:     crc32b   %sil, %eax
+; X64: _crc32_32_8:
+; X64:     crc32b   %sil,
 }
 
 
-define i32 @crc32_16(i32 %a, i16 %b) nounwind {
-  %tmp = call i32 @llvm.x86.sse42.crc32.16(i32 %a, i16 %b)
+define i32 @crc32_32_16(i32 %a, i16 %b) nounwind {
+  %tmp = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a, i16 %b)
   ret i32 %tmp
-; X32: _crc32_16:
+; X32: _crc32_32_16:
 ; X32:     crc32w   8(%esp), %eax
 
-; X64: _crc32_16:
-; X64:     crc32w   %si, %eax
+; X64: _crc32_32_16:
+; X64:     crc32w   %si,
 }
 
 
-define i32 @crc32_32(i32 %a, i32 %b) nounwind {
-  %tmp = call i32 @llvm.x86.sse42.crc32.32(i32 %a, i32 %b)
+define i32 @crc32_32_32(i32 %a, i32 %b) nounwind {
+  %tmp = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a, i32 %b)
   ret i32 %tmp
-; X32: _crc32_32:
+; X32: _crc32_32_32:
 ; X32:     crc32l   8(%esp), %eax
 
-; X64: _crc32_32:
-; X64:     crc32l   %esi, %eax
+; X64: _crc32_32_32:
+; X64:     crc32l   %esi,
 }
+
diff --git a/test/CodeGen/X86/sse42_64.ll b/test/CodeGen/X86/sse42_64.ll
new file mode 100644
index 0000000..8b3a69b
--- /dev/null
+++ b/test/CodeGen/X86/sse42_64.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse42 | FileCheck %s -check-prefix=X64
+
+declare i64 @llvm.x86.sse42.crc32.64.8(i64, i8) nounwind
+declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind
+
+define i64 @crc32_64_8(i64 %a, i8 %b) nounwind {
+  %tmp = call i64 @llvm.x86.sse42.crc32.64.8(i64 %a, i8 %b)
+  ret i64 %tmp
+
+; X64: _crc32_64_8:
+; X64:     crc32b   %sil,
+}
+
+define i64 @crc32_64_64(i64 %a, i64 %b) nounwind {
+  %tmp = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a, i64 %b)
+  ret i64 %tmp
+
+; X64: _crc32_64_64:
+; X64:     crc32q   %rsi,
+}
+
diff --git a/test/CodeGen/X86/sse_reload_fold.ll b/test/CodeGen/X86/sse_reload_fold.ll
index 02399c4..a57fa58 100644
--- a/test/CodeGen/X86/sse_reload_fold.ll
+++ b/test/CodeGen/X86/sse_reload_fold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+64bit,+sse3 -print-failed-fuse-candidates |& FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+64bit,+sse3 -print-failed-fuse-candidates -regalloc=basic |& FileCheck %s
 ; CHECK: fail
 ; CHECK-NOT: fail
 
@@ -117,7 +117,16 @@ define <2 x double> @d8(<2 x double> %f) {
   ret <2 x double> %t
 }
 
-; This one should fail to fuse.
+; This one should fail to fuse, but -regalloc=greedy isn't even trying. Instead
+; it produces:
+;   callq	test_vd
+;   movapd	(%rsp), %xmm1           # 16-byte Reload
+;   hsubpd	%xmm0, %xmm1
+;   movapd	%xmm1, %xmm0
+;   addq	$24, %rsp
+;   ret
+; RABasic still tries to fold this one.
+
 define <2 x double> @z0(<2 x double> %f) {
   %y = call <2 x double> @test_vd(<2 x double> %f)
   %t = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %f, <2 x double> %y)
diff --git a/test/CodeGen/X86/tail-opts.ll b/test/CodeGen/X86/tail-opts.ll
index 424bd21..d6c16ca 100644
--- a/test/CodeGen/X86/tail-opts.ll
+++ b/test/CodeGen/X86/tail-opts.ll
@@ -109,15 +109,15 @@ altret:
 
 ; CHECK: dont_merge_oddly:
 ; CHECK-NOT:   ret
-; CHECK:        ucomiss %xmm1, %xmm2
+; CHECK:        ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
 ; CHECK-NEXT:   jbe .LBB2_3
-; CHECK-NEXT:   ucomiss %xmm0, %xmm1
+; CHECK-NEXT:   ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
 ; CHECK-NEXT:   ja .LBB2_4
 ; CHECK-NEXT: .LBB2_2:
 ; CHECK-NEXT:   movb $1, %al
 ; CHECK-NEXT:   ret
 ; CHECK-NEXT: .LBB2_3:
-; CHECK-NEXT:   ucomiss %xmm0, %xmm2
+; CHECK-NEXT:   ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
 ; CHECK-NEXT:   jbe .LBB2_2
 ; CHECK-NEXT: .LBB2_4:
 ; CHECK-NEXT:   xorb %al, %al
@@ -412,9 +412,9 @@ return:
 ; can fall-through into the ret and the other side has to branch anyway.
 
 ; CHECK: TESTE:
-; CHECK: imulq
-; CHECK-NEXT: LBB8_2:
-; CHECK-NEXT: ret
+; CHECK: ret
+; CHECK-NOT: ret
+; CHECK: size TESTE
 
 define i64 @TESTE(i64 %parami, i64 %paraml) nounwind readnone {
 entry:
diff --git a/test/CodeGen/X86/tail-threshold.ll b/test/CodeGen/X86/tail-threshold.ll
new file mode 100644
index 0000000..f2296a0
--- /dev/null
+++ b/test/CodeGen/X86/tail-threshold.ll
@@ -0,0 +1,44 @@
+; RUN: llc -mtriple=x86_64-pc-linux-gnu -tail-merge-threshold 2 < %s | FileCheck %s
+
+; Test that we still do some merging if a block has more than
+; tail-merge-threshold predecessors.
+
+; CHECK: 	callq	bar
+; CHECK:	callq	bar
+; CHECK:	callq	bar
+; CHECK-NOT:    callq
+
+declare void @bar()
+
+define void @foo(i32 %xxx) {
+entry:
+  switch i32 %xxx, label %bb4 [
+    i32 0, label %bb0
+    i32 1, label %bb1
+    i32 2, label %bb2
+    i32 3, label %bb3
+  ]
+
+bb0:
+  call void @bar()
+  br label %bb5
+
+bb1:
+ call void @bar()
+ br label %bb5
+
+bb2:
+  call void @bar()
+  br label %bb5
+
+bb3:
+  call void @bar()
+  br label %bb5
+
+bb4:
+  call void @bar()
+  br label %bb5
+
+bb5:
+  ret void
+}
diff --git a/test/CodeGen/X86/tailcallbyval.ll b/test/CodeGen/X86/tailcallbyval.ll
index 7002560..03d6f94 100644
--- a/test/CodeGen/X86/tailcallbyval.ll
+++ b/test/CodeGen/X86/tailcallbyval.ll
@@ -13,6 +13,6 @@ entry:
 
 define  fastcc i32 @tailcaller(%struct.s* byval %a) nounwind {
 entry:
-        %tmp4 = tail call fastcc i32 @tailcallee(%struct.s* %a byval)
+        %tmp4 = tail call fastcc i32 @tailcallee(%struct.s* byval %a )
         ret i32 %tmp4
 }
diff --git a/test/CodeGen/X86/tailcallbyval64.ll b/test/CodeGen/X86/tailcallbyval64.ll
index 1b1efe7..7ecf379 100644
--- a/test/CodeGen/X86/tailcallbyval64.ll
+++ b/test/CodeGen/X86/tailcallbyval64.ll
@@ -37,6 +37,6 @@ define  fastcc i64 @tailcaller(i64 %b, %struct.s* byval %a) {
 entry:
         %tmp2 = getelementptr %struct.s* %a, i32 0, i32 1
         %tmp3 = load i64* %tmp2, align 8
-        %tmp4 = tail call fastcc i64 @tailcallee(%struct.s* %a byval, i64 %tmp3, i64 %b, i64 7, i64 13, i64 17)
+        %tmp4 = tail call fastcc i64 @tailcallee(%struct.s* byval %a , i64 %tmp3, i64 %b, i64 7, i64 13, i64 17)
         ret i64 %tmp4
 }
diff --git a/test/CodeGen/X86/tailcallstack64.ll b/test/CodeGen/X86/tailcallstack64.ll
index 060ce0f..c18c7aa 100644
--- a/test/CodeGen/X86/tailcallstack64.ll
+++ b/test/CodeGen/X86/tailcallstack64.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -tailcallopt -mtriple=x86_64-win32 -post-RA-scheduler=true | FileCheck %s
 
 ; FIXME: Redundant unused stack allocation could be eliminated.
-; CHECK: subq  ${{24|72}}, %rsp
+; CHECK: subq  ${{24|72|80}}, %rsp
 
 ; Check that lowered arguments on the stack do not overwrite each other.
 ; Add %in1 %p1 to a different temporary register (%eax).
diff --git a/test/CodeGen/X86/test-nofold.ll b/test/CodeGen/X86/test-nofold.ll
index f1063dc..97db1b3 100644
--- a/test/CodeGen/X86/test-nofold.ll
+++ b/test/CodeGen/X86/test-nofold.ll
@@ -2,10 +2,10 @@
 ; rdar://5752025
 
 ; We want:
-;      CHECK: movl	4(%esp), %ecx
-; CHECK-NEXT: andl	$15, %ecx
-; CHECK-NEXT: movl	$42, %eax
-; CHECK-NEXT: cmovel	%ecx, %eax
+;      CHECK: movl	$42, %ecx
+; CHECK-NEXT: movl	4(%esp), %eax
+; CHECK-NEXT: andl	$15, %eax
+; CHECK-NEXT: cmovnel	%ecx, %eax
 ; CHECK-NEXT: ret
 ;
 ; We don't want:
diff --git a/test/CodeGen/X86/trunc-to-bool.ll b/test/CodeGen/X86/trunc-to-bool.ll
index 6062084..92b6859 100644
--- a/test/CodeGen/X86/trunc-to-bool.ll
+++ b/test/CodeGen/X86/trunc-to-bool.ll
@@ -3,7 +3,7 @@
 ; value and as the operand of a branch.
 ; RUN: llc < %s -march=x86 | FileCheck %s
 
-define i1 @test1(i32 %X) zeroext nounwind {
+define zeroext i1 @test1(i32 %X)  nounwind {
     %Y = trunc i32 %X to i1
     ret i1 %Y
 }
diff --git a/test/CodeGen/X86/umul-with-overflow.ll b/test/CodeGen/X86/umul-with-overflow.ll
index c997661..e5858de 100644
--- a/test/CodeGen/X86/umul-with-overflow.ll
+++ b/test/CodeGen/X86/umul-with-overflow.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=x86 | FileCheck %s
 
 declare {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-define i1 @a(i32 %x) zeroext nounwind {
+define zeroext i1 @a(i32 %x)  nounwind {
   %res = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %x, i32 3)
   %obil = extractvalue {i32, i1} %res, 1
   ret i1 %obil
@@ -12,3 +12,27 @@ define i1 @a(i32 %x) zeroext nounwind {
 ; CHECK: movzbl	%al, %eax
 ; CHECK: ret
 }
+
+define i32 @test2(i32 %a, i32 %b) nounwind readnone {
+entry:
+	%tmp0 = add i32 %b, %a
+	%tmp1 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %tmp0, i32 2)
+	%tmp2 = extractvalue { i32, i1 } %tmp1, 0
+	ret i32 %tmp2
+; CHECK: test2:
+; CHECK: addl
+; CHECK-NEXT: addl
+; CHECK-NEXT: ret
+}
+
+define i32 @test3(i32 %a, i32 %b) nounwind readnone {
+entry:
+	%tmp0 = add i32 %b, %a
+	%tmp1 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %tmp0, i32 4)
+	%tmp2 = extractvalue { i32, i1 } %tmp1, 0
+	ret i32 %tmp2
+; CHECK: test3:
+; CHECK: addl
+; CHECK: mull
+; CHECK-NEXT: ret
+}
diff --git a/test/CodeGen/X86/unaligned-load.ll b/test/CodeGen/X86/unaligned-load.ll
index 6a493c0..9f70489 100644
--- a/test/CodeGen/X86/unaligned-load.ll
+++ b/test/CodeGen/X86/unaligned-load.ll
@@ -29,8 +29,8 @@ return:
 declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
 
 ; CORE2: .section
-; CORE2: .align  4
+; CORE2: .align  3
 ; CORE2-NEXT: _.str1:
 ; CORE2-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING"
-; CORE2: .align 4
+; CORE2: .align 3
 ; CORE2-NEXT: _.str3:
diff --git a/test/CodeGen/X86/undef-label.ll b/test/CodeGen/X86/undef-label.ll
new file mode 100644
index 0000000..1afd935
--- /dev/null
+++ b/test/CodeGen/X86/undef-label.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+
+; This is a case where we would incorrectly conclude that LBB0_1 could only
+; be reached via fall through and would therefore omit the label.
+
+; CHECK:      jne     .LBB0_1
+; CHECK-NEXT: jnp     .LBB0_3
+; CHECK-NEXT: .LBB0_1:
+
+define void @xyz() {
+entry:
+  br i1 fcmp oeq (double fsub (double undef, double undef), double 0.000000e+00), label %bar, label %foo
+
+foo:
+  br i1 fcmp ogt (double fdiv (double fsub (double fmul (double undef, double undef), double fsub (double undef, double undef)), double fmul (double undef, double undef)), double 1.0), label %foo, label %bar
+
+bar:
+  ret void
+}
diff --git a/test/CodeGen/X86/use-add-flags.ll b/test/CodeGen/X86/use-add-flags.ll
index 8fbbd39..a0448ec 100644
--- a/test/CodeGen/X86/use-add-flags.ll
+++ b/test/CodeGen/X86/use-add-flags.ll
@@ -7,10 +7,10 @@
 ; Use the flags on the add.
 
 ; CHECK: test1:
-;      CHECK: addl    (%r[[A0:di|cx]]), {{%esi|%edx}}
-; CHECK-NEXT: movl    {{%edx|%r8d}}, %eax
-; CHECK-NEXT: cmovnsl {{%ecx|%r9d}}, %eax
-; CHECK-NEXT: ret
+;     CHECK: addl
+; CHECK-NOT: test
+;     CHECK: cmovnsl
+;     CHECK: ret
 
 define i32 @test1(i32* %x, i32 %y, i32 %a, i32 %b) nounwind {
 	%tmp2 = load i32* %x, align 4		; <i32> [#uses=1]
@@ -42,7 +42,7 @@ false:
 ; Do use the flags result of the and here, since the and has another use.
 
 ; CHECK: test3:
-;      CHECK: andl    $16, %e[[A0]]
+;      CHECK: andl    $16, %e
 ; CHECK-NEXT: jne
 
 define void @test3(i32 %x) nounwind {
diff --git a/test/CodeGen/X86/vararg_tailcall.ll b/test/CodeGen/X86/vararg_tailcall.ll
new file mode 100644
index 0000000..73d80eb
--- /dev/null
+++ b/test/CodeGen/X86/vararg_tailcall.ll
@@ -0,0 +1,98 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN64
+
+@.str = private unnamed_addr constant [5 x i8] c"%ld\0A\00"
+@sel = external global i8*
+@sel3 = external global i8*
+@sel4 = external global i8*
+@sel5 = external global i8*
+@sel6 = external global i8*
+@sel7 = external global i8*
+
+; X64: @foo
+; X64: jmp
+; WIN64: @foo
+; WIN64: callq
+define void @foo(i64 %arg) nounwind optsize ssp noredzone {
+entry:
+  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i64 0, i64 0), i64 %arg) nounwind optsize noredzone
+  ret void
+}
+
+declare i32 @printf(i8*, ...) optsize noredzone
+
+; X64: @bar
+; X64: jmp
+; WIN64: @bar
+; WIN64: jmp
+define void @bar(i64 %arg) nounwind optsize ssp noredzone {
+entry:
+  tail call void @bar2(i8* getelementptr inbounds ([5 x i8]* @.str, i64 0, i64 0), i64 %arg) nounwind optsize noredzone
+  ret void
+}
+
+declare void @bar2(i8*, i64) optsize noredzone
+
+; X64: @foo2
+; X64: jmp
+; WIN64: @foo2
+; WIN64: callq
+define i8* @foo2(i8* %arg) nounwind optsize ssp noredzone {
+entry:
+  %tmp1 = load i8** @sel, align 8, !tbaa !0
+  %call = tail call i8* (i8*, i8*, ...)* @x2(i8* %arg, i8* %tmp1) nounwind optsize noredzone
+  ret i8* %call
+}
+
+declare i8* @x2(i8*, i8*, ...) optsize noredzone
+
+; X64: @foo6
+; X64: jmp
+; WIN64: @foo6
+; WIN64: callq
+define i8* @foo6(i8* %arg1, i8* %arg2) nounwind optsize ssp noredzone {
+entry:
+  %tmp2 = load i8** @sel3, align 8, !tbaa !0
+  %tmp3 = load i8** @sel4, align 8, !tbaa !0
+  %tmp4 = load i8** @sel5, align 8, !tbaa !0
+  %tmp5 = load i8** @sel6, align 8, !tbaa !0
+  %call = tail call i8* (i8*, i8*, i8*, ...)* @x3(i8* %arg1, i8* %arg2, i8* %tmp2, i8* %tmp3, i8* %tmp4, i8* %tmp5) nounwind optsize noredzone
+  ret i8* %call
+}
+
+declare i8* @x3(i8*, i8*, i8*, ...) optsize noredzone
+
+; X64: @foo7
+; X64: callq
+; WIN64: @foo7
+; WIN64: callq
+define i8* @foo7(i8* %arg1, i8* %arg2) nounwind optsize ssp noredzone {
+entry:
+  %tmp2 = load i8** @sel3, align 8, !tbaa !0
+  %tmp3 = load i8** @sel4, align 8, !tbaa !0
+  %tmp4 = load i8** @sel5, align 8, !tbaa !0
+  %tmp5 = load i8** @sel6, align 8, !tbaa !0
+  %tmp6 = load i8** @sel7, align 8, !tbaa !0
+  %call = tail call i8* (i8*, i8*, i8*, i8*, i8*, i8*, i8*, ...)* @x7(i8* %arg1, i8* %arg2, i8* %tmp2, i8* %tmp3, i8* %tmp4, i8* %tmp5, i8* %tmp6) nounwind optsize noredzone
+  ret i8* %call
+}
+
+declare i8* @x7(i8*, i8*, i8*, i8*, i8*, i8*, i8*, ...) optsize noredzone
+
+; X64: @foo8
+; X64: callq
+; WIN64: @foo8
+; WIN64: callq
+define i8* @foo8(i8* %arg1, i8* %arg2) nounwind optsize ssp noredzone {
+entry:
+  %tmp2 = load i8** @sel3, align 8, !tbaa !0
+  %tmp3 = load i8** @sel4, align 8, !tbaa !0
+  %tmp4 = load i8** @sel5, align 8, !tbaa !0
+  %tmp5 = load i8** @sel6, align 8, !tbaa !0
+  %call = tail call i8* (i8*, i8*, i8*, ...)* @x3(i8* %arg1, i8* %arg2, i8* %tmp2, i8* %tmp3, i8* %tmp4, i8* %tmp5, i32 48879, i32 48879) nounwind optsize noredzone
+  ret i8* %call
+}
+
+!0 = metadata !{metadata !"any pointer", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/CodeGen/X86/vec_extract-sse4.ll b/test/CodeGen/X86/vec_extract-sse4.ll
index dab5dd1..f487654 100644
--- a/test/CodeGen/X86/vec_extract-sse4.ll
+++ b/test/CodeGen/X86/vec_extract-sse4.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -march=x86 -mattr=+sse41 -o %t
-; RUN: grep extractps   %t | count 1
-; RUN: grep pextrd      %t | count 1
+; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse41 -o %t
+; RUN: not grep extractps   %t
+; RUN: not grep pextrd      %t
 ; RUN: not grep pshufd  %t
-; RUN: not grep movss   %t
+; RUN: grep movss   %t | count 2
 
 define void @t1(float* %R, <4 x float>* %P1) nounwind {
 	%X = load <4 x float>* %P1
diff --git a/test/CodeGen/X86/vec_extract.ll b/test/CodeGen/X86/vec_extract.ll
index b013730..2c8796b 100644
--- a/test/CodeGen/X86/vec_extract.ll
+++ b/test/CodeGen/X86/vec_extract.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse41 -o %t
-; RUN: grep movss    %t | count 3
+; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2,-sse41 -o %t
+; RUN: grep movss    %t | count 4
 ; RUN: grep movhlps  %t | count 1
-; RUN: grep pshufd   %t | count 1
+; RUN: not grep pshufd   %t 
 ; RUN: grep unpckhpd %t | count 1
 
 define void @test1(<4 x float>* %F, float* %f) nounwind {
diff --git a/test/CodeGen/X86/vec_shuffle-16.ll b/test/CodeGen/X86/vec_shuffle-16.ll
index 2ee87fe..06f38ed 100644
--- a/test/CodeGen/X86/vec_shuffle-16.ll
+++ b/test/CodeGen/X86/vec_shuffle-16.ll
@@ -1,8 +1,9 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse,-sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse
 ; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse2
 
+; sse:  t1:
+; sse2: t1:
 define <4 x float> @t1(<4 x float> %a, <4 x float> %b) nounwind  {
-; sse: movaps
 ; sse: shufps
 ; sse2: pshufd
 ; sse2-NEXT: ret
@@ -10,6 +11,8 @@ define <4 x float> @t1(<4 x float> %a, <4 x float> %b) nounwind  {
         ret <4 x float> %tmp1
 }
 
+; sse:  t2:
+; sse2: t2:
 define <4 x float> @t2(<4 x float> %A, <4 x float> %B) nounwind {
 ; sse: shufps
 ; sse2: pshufd
@@ -18,8 +21,9 @@ define <4 x float> @t2(<4 x float> %A, <4 x float> %B) nounwind {
 	ret <4 x float> %tmp
 }
 
+; sse:  t3:
+; sse2: t3:
 define <4 x float> @t3(<4 x float> %A, <4 x float> %B) nounwind {
-; sse: movaps
 ; sse: shufps
 ; sse2: pshufd
 ; sse2-NEXT: ret
@@ -27,7 +31,10 @@ define <4 x float> @t3(<4 x float> %A, <4 x float> %B) nounwind {
 	ret <4 x float> %tmp
 }
 
+; sse:  t4:
+; sse2: t4:
 define <4 x float> @t4(<4 x float> %A, <4 x float> %B) nounwind {
+
 ; sse: shufps
 ; sse2: pshufd
 ; sse2-NEXT: ret
diff --git a/test/CodeGen/X86/vec_uint_to_fp.ll b/test/CodeGen/X86/vec_uint_to_fp.ll
index 39e7d71..fe7fa2f 100644
--- a/test/CodeGen/X86/vec_uint_to_fp.ll
+++ b/test/CodeGen/X86/vec_uint_to_fp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=sandybridge | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck %s
 
 ; Test that we are not lowering uinttofp to scalars
 define <4 x float> @test1(<4 x i32> %A) nounwind {
diff --git a/test/CodeGen/X86/visibility2.ll b/test/CodeGen/X86/visibility2.ll
new file mode 100644
index 0000000..72ea733
--- /dev/null
+++ b/test/CodeGen/X86/visibility2.ll
@@ -0,0 +1,18 @@
+; This test case ensures that when the visibility of a global declaration is 
+; emitted they are not treated as definitions.  Test case for r132825.
+; Fixes <rdar://problem/9429892>.
+;
+; RUN: llc -mtriple=x86_64-apple-darwin %s -o - | FileCheck %s
+
+@foo_private_extern_str = external hidden global i8*
+
+define void @foo1() nounwind ssp {
+entry:
+  %tmp = load i8** @foo_private_extern_str, align 8
+  call void @foo3(i8* %tmp)
+  ret void
+}
+
+declare void @foo3(i8*)
+
+; CHECK-NOT: .private_extern
diff --git a/test/CodeGen/X86/widen_load-0.ll b/test/CodeGen/X86/widen_load-0.ll
index 82c8252..c91627c 100644
--- a/test/CodeGen/X86/widen_load-0.ll
+++ b/test/CodeGen/X86/widen_load-0.ll
@@ -4,15 +4,15 @@
 
 ; Both loads should happen before either store.
 
-; CHECK: movl  (%rdi), %eax
-; CHECK: movl  (%rsi), %ecx
-; CHECK: movl  %ecx, (%rdi)
-; CHECK: movl  %eax, (%rsi)
+; CHECK: movl  (%rdi), %[[R1:...]]
+; CHECK: movl  (%rsi), %[[R2:...]]
+; CHECK: movl  %[[R2]], (%rdi)
+; CHECK: movl  %[[R1]], (%rsi)
 
-; WIN64: movl  (%rcx), %eax
-; WIN64: movl  (%rdx), %esi
-; WIN64: movl  %esi, (%rcx)
-; WIN64: movl  %eax, (%rdx)
+; WIN64: movl  (%rcx), %[[R1:...]]
+; WIN64: movl  (%rdx), %[[R2:...]]
+; WIN64: movl  %[[R2]], (%rcx)
+; WIN64: movl  %[[R1]], (%rdx)
 
 define void @short2_int_swap(<2 x i16>* nocapture %b, i32* nocapture %c) nounwind {
 entry:
diff --git a/test/CodeGen/X86/win64_alloca_dynalloca.ll b/test/CodeGen/X86/win64_alloca_dynalloca.ll
index cbd38da..e39d007 100644
--- a/test/CodeGen/X86/win64_alloca_dynalloca.ll
+++ b/test/CodeGen/X86/win64_alloca_dynalloca.ll
@@ -1,9 +1,12 @@
-; RUN: llc < %s -mtriple=x86_64-mingw32     | FileCheck %s -check-prefix=M64
-; RUN: llc < %s -mtriple=x86_64-win32       | FileCheck %s -check-prefix=W64
-; RUN: llc < %s -mtriple=x86_64-win32-macho | FileCheck %s -check-prefix=EFI
+; RUN: llc < %s -join-physregs -mtriple=x86_64-mingw32     | FileCheck %s -check-prefix=M64
+; RUN: llc < %s -join-physregs -mtriple=x86_64-win32       | FileCheck %s -check-prefix=W64
+; RUN: llc < %s -join-physregs -mtriple=x86_64-win32-macho | FileCheck %s -check-prefix=EFI
 ; PR8777
 ; PR8778
 
+; Passing the same value in two registers creates a false interference that
+; only -join-physregs resolves. It could also be handled by a parallel copy.
+
 define i64 @foo(i64 %n, i64 %x) nounwind {
 entry:
 
@@ -40,9 +43,9 @@ entry:
 ; W64: subq  %rax, %rsp
 ; W64: movq  %rsp, %rax
 
-; EFI: leaq  15(%rcx), [[R1:%r..]]
+; EFI: leaq  15(%rcx), [[R1:%r.*]]
 ; EFI: andq  $-16, [[R1]]
-; EFI: movq  %rsp, [[R64:%r..]]
+; EFI: movq  %rsp, [[R64:%r.*]]
 ; EFI: subq  [[R1]], [[R64]]
 ; EFI: movq  [[R64]], %rsp
 
diff --git a/test/CodeGen/X86/x86-64-and-mask.ll b/test/CodeGen/X86/x86-64-and-mask.ll
index 2465f23..07ccb23 100644
--- a/test/CodeGen/X86/x86-64-and-mask.ll
+++ b/test/CodeGen/X86/x86-64-and-mask.ll
@@ -39,7 +39,7 @@ define void @ccc(i64 %x) nounwind {
 
 ; This requires a mov and a 64-bit and.
 ; CHECK: ddd:
-; CHECK: movabsq $4294967296, %rax
+; CHECK: movabsq $4294967296, %r
 ; CHECK: andq %rax, %rdi
 
 define void @ddd(i64 %x) nounwind {
diff --git a/test/CodeGen/X86/x86-64-extend-shift.ll b/test/CodeGen/X86/x86-64-extend-shift.ll
index 6852785..6ebaeee 100644
--- a/test/CodeGen/X86/x86-64-extend-shift.ll
+++ b/test/CodeGen/X86/x86-64-extend-shift.ll
@@ -2,7 +2,7 @@
 ; Formerly there were two shifts.
 
 define i64 @baz(i32 %A) nounwind {
-; CHECK:  shlq  $49, %rax
+; CHECK:  shlq  $49, %r
         %tmp1 = shl i32 %A, 17
         %tmp2 = zext i32 %tmp1 to i64
         %tmp3 = shl i64 %tmp2, 32
diff --git a/test/CodeGen/X86/x86-64-malloc.ll b/test/CodeGen/X86/x86-64-malloc.ll
deleted file mode 100644
index 4aa0ec3..0000000
--- a/test/CodeGen/X86/x86-64-malloc.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
-; CHECK: shll $3, {{%edi|%ecx}}
-; PR3829
-; The generated code should multiply by 3 (sizeof i8*) as an i32,
-; not as an i64!
-
-define i8** @test(i32 %sz) {
-	%sub = add i32 %sz, 536870911		; <i32> [#uses=1]
-	%call = malloc i8*, i32 %sub		; <i8**> [#uses=1]
-	ret i8** %call
-}
diff --git a/test/CodeGen/X86/x86-64-shortint.ll b/test/CodeGen/X86/x86-64-shortint.ll
index 7f96543..cbf6588 100644
--- a/test/CodeGen/X86/x86-64-shortint.ll
+++ b/test/CodeGen/X86/x86-64-shortint.ll
@@ -5,7 +5,7 @@ target triple = "x86_64-apple-darwin8"
 
 
 define void @bar(i16 zeroext  %A) {
-        tail call void @foo( i16 %A signext  )
+        tail call void @foo( i16 signext %A   )
         ret void
 }
 declare void @foo(i16 signext )
diff --git a/test/CodeGen/X86/x86-shifts.ll b/test/CodeGen/X86/x86-shifts.ll
new file mode 100644
index 0000000..fdf68f9
--- /dev/null
+++ b/test/CodeGen/X86/x86-shifts.ll
@@ -0,0 +1,142 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+
+; Splat patterns below
+
+
+define <4 x i32> @shl4(<4 x i32> %A) nounwind {
+entry:
+; CHECK:      shl4
+; CHECK:      pslld
+; CHECK-NEXT: pslld
+  %B = shl <4 x i32> %A,  < i32 2, i32 2, i32 2, i32 2>
+  %C = shl <4 x i32> %A,  < i32 1, i32 1, i32 1, i32 1>
+  %K = xor <4 x i32> %B, %C
+  ret <4 x i32> %K
+}
+
+define <4 x i32> @shr4(<4 x i32> %A) nounwind {
+entry:
+; CHECK:      shr4
+; CHECK:      psrld
+; CHECK-NEXT: psrld
+  %B = lshr <4 x i32> %A,  < i32 2, i32 2, i32 2, i32 2>
+  %C = lshr <4 x i32> %A,  < i32 1, i32 1, i32 1, i32 1>
+  %K = xor <4 x i32> %B, %C
+  ret <4 x i32> %K
+}
+
+define <4 x i32> @sra4(<4 x i32> %A) nounwind {
+entry:
+; CHECK:      sra4
+; CHECK:      psrad
+; CHECK-NEXT: psrad
+  %B = ashr <4 x i32> %A,  < i32 2, i32 2, i32 2, i32 2>
+  %C = ashr <4 x i32> %A,  < i32 1, i32 1, i32 1, i32 1>
+  %K = xor <4 x i32> %B, %C
+  ret <4 x i32> %K
+}
+
+define <2 x i64> @shl2(<2 x i64> %A) nounwind {
+entry:
+; CHECK:      shl2
+; CHECK:      psllq
+; CHECK-NEXT: psllq
+  %B = shl <2 x i64> %A,  < i64 2, i64 2>
+  %C = shl <2 x i64> %A,  < i64 9, i64 9>
+  %K = xor <2 x i64> %B, %C
+  ret <2 x i64> %K
+}
+
+define <2 x i64> @shr2(<2 x i64> %A) nounwind {
+entry:
+; CHECK:      shr2
+; CHECK:      psrlq
+; CHECK-NEXT: psrlq
+  %B = lshr <2 x i64> %A,  < i64 8, i64 8>
+  %C = lshr <2 x i64> %A,  < i64 1, i64 1>
+  %K = xor <2 x i64> %B, %C
+  ret <2 x i64> %K
+}
+
+
+define <8 x i16> @shl8(<8 x i16> %A) nounwind {
+entry:
+; CHECK:      shl8
+; CHECK:      psllw
+; CHECK-NEXT: psllw
+  %B = shl <8 x i16> %A,  < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+  %C = shl <8 x i16> %A,  < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %K = xor <8 x i16> %B, %C
+  ret <8 x i16> %K
+}
+
+define <8 x i16> @shr8(<8 x i16> %A) nounwind {
+entry:
+; CHECK:      shr8
+; CHECK:      psrlw
+; CHECK-NEXT: psrlw
+  %B = lshr <8 x i16> %A,  < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+  %C = lshr <8 x i16> %A,  < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %K = xor <8 x i16> %B, %C
+  ret <8 x i16> %K
+}
+
+define <8 x i16> @sra8(<8 x i16> %A) nounwind {
+entry:
+; CHECK:      sra8
+; CHECK:      psraw
+; CHECK-NEXT: psraw
+  %B = ashr <8 x i16> %A,  < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+  %C = ashr <8 x i16> %A,  < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %K = xor <8 x i16> %B, %C
+  ret <8 x i16> %K
+}
+
+; non splat test
+
+
+define <8 x i16> @sll8_nosplat(<8 x i16> %A) nounwind {
+entry:
+; CHECK: sll8_nosplat
+; CHECK-NOT: psll
+; CHECK-NOT: psll
+  %B = shl <8 x i16> %A,  < i16 1, i16 2, i16 3, i16 6, i16 2, i16 2, i16 2, i16 2>
+  %C = shl <8 x i16> %A,  < i16 9, i16 7, i16 5, i16 1, i16 4, i16 1, i16 1, i16 1>
+  %K = xor <8 x i16> %B, %C
+  ret <8 x i16> %K
+}
+
+
+define <2 x i64> @shr2_nosplat(<2 x i64> %A) nounwind {
+entry:
+; CHECK: shr2_nosplat
+; CHECK-NOT:  psrlq
+; CHECK-NOT:  psrlq
+  %B = lshr <2 x i64> %A,  < i64 8, i64 1>
+  %C = lshr <2 x i64> %A,  < i64 1, i64 0>
+  %K = xor <2 x i64> %B, %C
+  ret <2 x i64> %K
+}
+
+
+; Other shifts
+
+define <2 x i32> @shl2_other(<2 x i32> %A) nounwind {
+entry:
+; CHECK: shl2_other
+; CHECK-not:      psllq
+  %B = shl <2 x i32> %A,  < i32 2, i32 2>
+  %C = shl <2 x i32> %A,  < i32 9, i32 9>
+  %K = xor <2 x i32> %B, %C
+  ret <2 x i32> %K
+}
+
+define <2 x i32> @shr2_other(<2 x i32> %A) nounwind {
+entry:
+; CHECK: shr2_other
+; CHECK-NOT:      psrlq
+  %B = lshr <2 x i32> %A,  < i32 8, i32 8>
+  %C = lshr <2 x i32> %A,  < i32 1, i32 1>
+  %K = xor <2 x i32> %B, %C
+  ret <2 x i32> %K
+}
diff --git a/test/CodeGen/X86/xor.ll b/test/CodeGen/X86/xor.ll
index b90d81a..178c59d 100644
--- a/test/CodeGen/X86/xor.ll
+++ b/test/CodeGen/X86/xor.ll
@@ -29,9 +29,8 @@ entry:
         ret i32 %tmp4
         
 ; X64: test3:
-; X64:	notl	[[A1:%esi|%edx]]
-; X64:	andl	[[A0:%edi|%ecx]], [[A1]]
-; X64:	movl	[[A1]], %eax
+; X64:	notl
+; X64:	andl
 ; X64:	shrl	%eax
 ; X64:	ret
 
@@ -139,7 +138,7 @@ entry:
   %t2 = add i32 %t1, -1
   ret i32 %t2
 ; X64: test8:
-; X64:   notl %eax
+; X64:   notl {{%eax|%edi|%ecx}}
 ; X32: test8:
 ; X32:   notl %eax
 }
diff --git a/test/CodeGen/X86/zext-fold.ll b/test/CodeGen/X86/zext-fold.ll
new file mode 100644
index 0000000..b3f5cdb
--- /dev/null
+++ b/test/CodeGen/X86/zext-fold.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -march=x86 | FileCheck %s
+
+;; Simple case
+define i32 @test1(i8 %x) nounwind readnone {
+  %A = and i8 %x, -32
+  %B = zext i8 %A to i32
+  ret i32 %B
+}
+; CHECK: test1
+; CHECK: movzbl
+; CHECK-NEXT: andl {{.*}}224
+
+;; Multiple uses of %x but easily extensible. 
+define i32 @test2(i8 %x) nounwind readnone {
+  %A = and i8 %x, -32
+  %B = zext i8 %A to i32
+  %C = or i8 %x, 63
+  %D = zext i8 %C to i32
+  %E = add i32 %B, %D
+  ret i32 %E
+}
+; CHECK: test2
+; CHECK: movzbl
+; CHECK: orl $63
+; CHECK: andl $224
+
+declare void @use(i32, i8)
+
+;; Multiple uses of %x where we shouldn't extend the load.
+define void @test3(i8 %x) nounwind readnone {
+  %A = and i8 %x, -32
+  %B = zext i8 %A to i32
+  call void @use(i32 %B, i8 %x)
+  ret void
+}
+; CHECK: test3
+; CHECK: movzbl 16(%esp), [[REGISTER:%e[a-z]{2}]]
+; CHECK-NEXT: movl [[REGISTER]], 4(%esp)
+; CHECK-NEXT: andl $224, [[REGISTER]]
+; CHECK-NEXT: movl [[REGISTER]], (%esp)
+; CHECK-NEXT: call{{.*}}use
diff --git a/test/CodeGen/XCore/bitrev.ll b/test/CodeGen/XCore/bitrev.ll
deleted file mode 100644
index 09202d3..0000000
--- a/test/CodeGen/XCore/bitrev.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; RUN: llc < %s -march=xcore > %t1.s
-; RUN: grep bitrev %t1.s | count 1 
-declare i32 @llvm.xcore.bitrev(i32)
-
-define i32 @test(i32 %val) {
-	%result = call i32 @llvm.xcore.bitrev(i32 %val)
-	ret i32 %result
-}
diff --git a/test/CodeGen/XCore/misc-intrinsics.ll b/test/CodeGen/XCore/misc-intrinsics.ll
new file mode 100644
index 0000000..f504a2e
--- /dev/null
+++ b/test/CodeGen/XCore/misc-intrinsics.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -march=xcore | FileCheck %s
+%0 = type { i32, i32 }
+
+declare i32 @llvm.xcore.bitrev(i32)
+declare i32 @llvm.xcore.crc32(i32, i32, i32)
+declare %0 @llvm.xcore.crc8(i32, i32, i32)
+
+define i32 @bitrev(i32 %val) {
+; CHECK: bitrev:
+; CHECK: bitrev r0, r0
+	%result = call i32 @llvm.xcore.bitrev(i32 %val)
+	ret i32 %result
+}
+
+define i32 @crc32(i32 %crc, i32 %data, i32 %poly) {
+; CHECK: crc32:
+; CHECK: crc32 r0, r1, r2
+	%result = call i32 @llvm.xcore.crc32(i32 %crc, i32 %data, i32 %poly)
+	ret i32 %result
+}
+
+define %0 @crc8(i32 %crc, i32 %data, i32 %poly) {
+; CHECK: crc8:
+; CHECK: crc8 r0, r1, r1, r2
+	%result = call %0 @llvm.xcore.crc8(i32 %crc, i32 %data, i32 %poly)
+	ret %0 %result
+}
diff --git a/test/CodeGen/XCore/mul64.ll b/test/CodeGen/XCore/mul64.ll
index 77c6b42..3d373b1 100644
--- a/test/CodeGen/XCore/mul64.ll
+++ b/test/CodeGen/XCore/mul64.ll
@@ -9,7 +9,7 @@ entry:
 }
 ; CHECK: umul_lohi:
 ; CHECK: ldc [[REG:r[0-9]+]], 0
-; CHECK-NEXT: lmul r1, r0, r1, r0, [[REG]], [[REG]]
+; CHECK-NEXT: lmul {{.*}}, [[REG]], [[REG]]
 ; CHECK-NEXT: retsp 0
 
 define i64 @smul_lohi(i32 %a, i32 %b) {
@@ -23,9 +23,7 @@ entry:
 ; CHECK: ldc
 ; CHECK-NEXT: mov
 ; CHECK-NEXT: maccs
-; CHECK-NEXT: mov r0,
-; CHECK-NEXT: mov r1,
-; CHECK-NEXT: retsp 0
+; CHECK: retsp 0
 
 define i64 @mul64(i64 %a, i64 %b) {
 entry:
@@ -37,7 +35,6 @@ entry:
 ; CHECK-NEXT: lmul
 ; CHECK-NEXT: mul
 ; CHECK-NEXT: lmul
-; CHECK-NEXT: mov r0,
 
 define i64 @mul64_2(i64 %a, i32 %b) {
 entry:
@@ -50,4 +47,4 @@ entry:
 ; CHECK-NEXT: lmul
 ; CHECK-NEXT: mul
 ; CHECK-NEXT: add r1,
-; CHECK-NEXT: retsp 0
+; CHECK: retsp 0
diff --git a/test/DebugInfo/2010-04-13-PubType.ll b/test/DebugInfo/2010-04-13-PubType.ll
index 371169f..db7bb0a 100644
--- a/test/DebugInfo/2010-04-13-PubType.ll
+++ b/test/DebugInfo/2010-04-13-PubType.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -O0 -asm-verbose < %s > %t
 ; RUN: grep "External Name" %t | grep -v X
 ; RUN: grep "External Name" %t | grep Y | count 1
-; Test to check type with no defintion is listed in pubtypes section.
+; Test to check type with no definition is listed in pubtypes section.
 %struct.X = type opaque
 %struct.Y = type { i32 }
 
diff --git a/test/DebugInfo/X86/debug_frame.ll b/test/DebugInfo/X86/debug_frame.ll
new file mode 100644
index 0000000..d273d73
--- /dev/null
+++ b/test/DebugInfo/X86/debug_frame.ll
@@ -0,0 +1,18 @@
+; RUN: llc %s -mtriple=i686-pc-linux-gnu -o - | FileCheck %s
+
+; Test that we produce a .debug_frame, not an .eh_frame
+
+; CHECK: .cfi_sections .debug_frame
+
+define void @f() nounwind {
+entry:
+  ret void
+}
+
+!llvm.dbg.sp = !{!0}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata !"f", metadata !"f", metadata !"", metadata !1, i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, void ()* @f, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589865, metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/llvm/build", metadata !2} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 589841, i32 0, i32 12, metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/llvm/build", metadata !"clang version 3.0 ()", i1 true, i1 true, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{null}
diff --git a/test/DebugInfo/X86/dg.exp b/test/DebugInfo/X86/dg.exp
new file mode 100644
index 0000000..7b7bd4e
--- /dev/null
+++ b/test/DebugInfo/X86/dg.exp
@@ -0,0 +1,5 @@
+load_lib llvm.exp
+
+if { [llvm_supports_target X86] } {
+  RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll}]]
+}
diff --git a/test/DebugInfo/X86/eh_symbol.ll b/test/DebugInfo/X86/eh_symbol.ll
new file mode 100644
index 0000000..a87afed
--- /dev/null
+++ b/test/DebugInfo/X86/eh_symbol.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=i386-apple-macosx -disable-cfi %s -o - | FileCheck %s
+
+; test that we don't produce foo.eh symbols is a debug_frame section.
+; CHECK-NOT: .globl	_f.eh
+
+define i32 @f() nounwind readnone optsize {
+entry:
+  ret i32 42
+}
+
+!llvm.dbg.sp = !{!0}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata !"f", metadata !"f", metadata !"", metadata !1, i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 ()* @f, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589865, metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/tmpfs/build", metadata !2} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 589841, i32 0, i32 12, metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/tmpfs/build", metadata !"clang version 3.0 ()", i1 true, i1 true, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 589860, metadata !2, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
diff --git a/test/DebugInfo/X86/pr9951.ll b/test/DebugInfo/X86/pr9951.ll
new file mode 100644
index 0000000..7716cd7
--- /dev/null
+++ b/test/DebugInfo/X86/pr9951.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple x86_64-apple-darwin10.0.0 -disable-cfi %s -o - | FileCheck %s
+
+define i32 @f() nounwind {
+entry:
+  ret i32 42
+}
+
+!llvm.dbg.sp = !{!0}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata !"f", metadata !"f", metadata !"", metadata !1, i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i32 ()* @f, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589865, metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/llvm/build-rust2", metadata !2} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 589841, i32 0, i32 12, metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/llvm/build-rust2", metadata !"clang version 3.0 ()", i1 true, i1 false, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 589860, metadata !2, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+
+
+; CHECK:      _f:                                     ## @f
+; CHECK-NEXT: Ltmp0:
+
+; CHECK:      Ltmp9 = (Ltmp3-Ltmp2)-0
+; CHECK-NEXT:	.long	Ltmp9
+; CHECK-NEXT:	.quad	Ltmp0
diff --git a/test/DebugInfo/X86/stmt-list.ll b/test/DebugInfo/X86/stmt-list.ll
new file mode 100644
index 0000000..145649b
--- /dev/null
+++ b/test/DebugInfo/X86/stmt-list.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple x86_64-pc-linux-gnu < %s | FileCheck %s
+
+; CHECK:      .section        .debug_line,"",@progbits
+; CHECK-NEXT: .Lsection_line:
+
+; CHECK:      .long   .Lsection_line          # DW_AT_stmt_list
+
+define void @f() {
+entry:
+  ret void
+}
+
+!llvm.dbg.sp = !{!0}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata !"f", metadata !"f", metadata !"", metadata !1, i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, void ()* @f, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589865, metadata !"test2.c", metadata !"/home/espindola/llvm", metadata !2} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 589841, i32 0, i32 12, metadata !"test2.c", metadata !"/home/espindola/llvm", metadata !"clang version 3.0 ()", i1 true, i1 true, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{null}
diff --git a/test/DebugInfo/X86/subreg.ll b/test/DebugInfo/X86/subreg.ll
new file mode 100644
index 0000000..1c4456f
--- /dev/null
+++ b/test/DebugInfo/X86/subreg.ll
@@ -0,0 +1,27 @@
+; RUN: llc %s -mtriple=x86_64-pc-linux-gnu -O0 -o - | FileCheck %s
+
+; We are testing that a value in a 16 bit register gets reported as
+; being in its superregister.
+; FIXME: There should be a DW_OP_bit_piece too.
+
+; CHECK: .byte   80                      # DW_OP_reg0
+
+define i16 @f(i16 signext %zzz) nounwind {
+entry:
+  call void @llvm.dbg.value(metadata !{i16 %zzz}, i64 0, metadata !0)
+  %conv = sext i16 %zzz to i32, !dbg !7
+  %conv1 = trunc i32 %conv to i16
+  ret i16 %conv1
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!0 = metadata !{i32 590081, metadata !1, metadata !"zzz", metadata !2, i32 16777219, metadata !6, i32 0} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{i32 589870, i32 0, metadata !2, metadata !"f", metadata !"f", metadata !"", metadata !2, i32 3, metadata !4, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i16 (i16)* @f, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{i32 589865, metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/tmpfs/build", metadata !3} ; [ DW_TAG_file_type ]
+!3 = metadata !{i32 589841, i32 0, i32 12, metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/tmpfs/build", metadata !"clang version 3.0 ()", i1 true, i1 false, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{i32 589845, metadata !2, metadata !"", metadata !2, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !5, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!5 = metadata !{null}
+!6 = metadata !{i32 589860, metadata !3, metadata !"short", null, i32 0, i64 16, i64 16, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!7 = metadata !{i32 4, i32 22, metadata !8, null}
+!8 = metadata !{i32 589835, metadata !1, i32 3, i32 19, metadata !2, i32 0} ; [ DW_TAG_lexical_block ]
diff --git a/test/DebugInfo/array.ll b/test/DebugInfo/array.ll
new file mode 100644
index 0000000..9f592a1
--- /dev/null
+++ b/test/DebugInfo/array.ll
@@ -0,0 +1,34 @@
+; RUN: llc -O0 < %s | FileCheck %s
+; Do not emit AT_upper_bound for an unbounded array.
+; radar 9241695
+define i32 @main() nounwind ssp {
+entry:
+  %retval = alloca i32, align 4
+  %a = alloca [0 x i32], align 4
+  store i32 0, i32* %retval
+  call void @llvm.dbg.declare(metadata !{[0 x i32]* %a}, metadata !6), !dbg !11
+  ret i32 0, !dbg !12
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+!llvm.dbg.sp = !{!0}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata !"main", metadata !"main", metadata !"", metadata !1, i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, i32 ()* @main, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589865, metadata !"array.c", metadata !"/private/tmp", metadata !2} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 589841, i32 0, i32 12, metadata !"array.c", metadata !"/private/tmp", metadata !"clang version 3.0 (trunk 129138)", i1 true, i1 false, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 589860, metadata !2, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{i32 590080, metadata !7, metadata !"a", metadata !1, i32 4, metadata !8, i32 0} ; [ DW_TAG_auto_variable ]
+!7 = metadata !{i32 589835, metadata !0, i32 3, i32 12, metadata !1, i32 0} ; [ DW_TAG_lexical_block ]
+!8 = metadata !{i32 589825, metadata !2, metadata !"", metadata !2, i32 0, i64 0, i64 32, i32 0, i32 0, metadata !5, metadata !9, i32 0, i32 0} ; [ DW_TAG_array_type ]
+!9 = metadata !{metadata !10}
+;CHECK: DW_TAG_subrange_type
+;CHECK-NEXT: DW_AT_type
+;CHECK-NOT: DW_AT_lower_bound
+;CHECK-NOT: DW_AT_upper_bound
+;CHECK-NEXT: End Of Children Mark
+!10 = metadata !{i32 589857, i64 1, i64 0}        ; [ DW_TAG_subrange_type ]
+!11 = metadata !{i32 4, i32 7, metadata !7, null}
+!12 = metadata !{i32 5, i32 3, metadata !7, null}
diff --git a/test/ExecutionEngine/test-malloc.ll b/test/ExecutionEngine/test-malloc.ll
deleted file mode 100644
index b3400df..0000000
--- a/test/ExecutionEngine/test-malloc.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: lli %s > /dev/null
-
-define i32 @main() {
-	%X = malloc i32		; <i32*> [#uses=1]
-	%Y = malloc i32, i32 100		; <i32*> [#uses=1]
-	%u = add i32 1, 2		; <i32> [#uses=1]
-	%Z = malloc i32, i32 %u		; <i32*> [#uses=1]
-	free i32* %X
-	free i32* %Y
-	free i32* %Z
-	ret i32 0
-}
-
diff --git a/test/Feature/alignment.ll b/test/Feature/alignment.ll
index ef35a13..f6dbe33 100644
--- a/test/Feature/alignment.ll
+++ b/test/Feature/alignment.ll
@@ -10,15 +10,6 @@ define i32* @test() align 32 {
         %Z = alloca i32         ; <i32*> [#uses=0]
         ret i32* %X
 }
-
-define i32* @test2() {
-        %X = malloc i32, align 4                ; <i32*> [#uses=1]
-        %Y = malloc i32, i32 42, align 16               ; <i32*> [#uses=0]
-        %Z = malloc i32         ; <i32*> [#uses=0]
-        %T = malloc i32, align 256              ; <i32*> [#uses=0]
-        ret i32* %X
-}
-
 define void @test3() alignstack(16) {
         ret void
 }
diff --git a/test/Feature/noalias-ret.ll b/test/Feature/noalias-ret.ll
deleted file mode 100644
index d88452b..0000000
--- a/test/Feature/noalias-ret.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: llvm-as < %s
-
-define noalias i8* @_Znwj(i32 %x) nounwind {
-  %A = malloc i8, i32 %x
-  ret i8* %A
-}
diff --git a/test/Feature/paramattrs.ll b/test/Feature/paramattrs.ll
index 3bee617..91aa460 100644
--- a/test/Feature/paramattrs.ll
+++ b/test/Feature/paramattrs.ll
@@ -5,8 +5,8 @@
 %ZFunTy = type i32(i8 zeroext)
 %SFunTy = type i32(i8 signext)
 
-declare i16 @"test"(i16 signext %arg) signext 
-declare i8 @"test2" (i16 zeroext %a2) zeroext 
+declare signext i16 @"test"(i16 signext %arg)  
+declare zeroext i8 @"test2" (i16 zeroext %a2) 
 
 declare i32 @"test3"(i32* noalias %p)
 
@@ -14,9 +14,11 @@ declare void @exit(i32) noreturn nounwind
 
 define i32 @main(i32 inreg %argc, i8 ** inreg %argv) nounwind {
     %val = trunc i32 %argc to i16
-    %res1 = call i16 (i16 signext) signext *@test(i16 signext %val) signext
+    %res1 = call signext i16 (i16 signext) *@test(i16 signext %val) 
     %two = add i16 %res1, %res1
-    %res2 = call i8 @test2(i16 %two zeroext) zeroext 
+    %res2 = call zeroext i8 @test2(i16 zeroext %two )  
     %retVal = sext i16 %two to i32
     ret i32 %retVal
 }
+
+declare void @function_to_resolve_eagerly() nonlazybind
diff --git a/test/Feature/testmemory.ll b/test/Feature/testmemory.ll
deleted file mode 100644
index a9019f0..0000000
--- a/test/Feature/testmemory.ll
+++ /dev/null
@@ -1,36 +0,0 @@
-; RUN: llvm-as < %s | llvm-dis > %t1.ll
-; RUN: llvm-as %t1.ll -o - | llvm-dis > %t2.ll
-; RUN: diff %t1.ll %t2.ll
-
-
-        %complexty = type { i32, { [4 x i8*], float }, double }
-        %struct = type { i32, { float, { i8 } }, i64 }
-
-define i32 @main() {
-        call i32 @testfunction( i64 0, i64 1 )          ; <i32>:1 [#uses=0]
-        ret i32 0
-}
-
-define i32 @testfunction(i64 %i0, i64 %j0) {
-        %array0 = malloc [4 x i8]               ; <[4 x i8]*> [#uses=2]
-        %size = add i32 2, 2            ; <i32> [#uses=1]
-        %array1 = malloc i8, i32 4              ; <i8*> [#uses=1]
-        %array2 = malloc i8, i32 %size          ; <i8*> [#uses=1]
-        %idx = getelementptr [4 x i8]* %array0, i64 0, i64 2            ; <i8*> [#uses=1]
-        store i8 123, i8* %idx
-        free [4 x i8]* %array0
-        free i8* %array1
-        free i8* %array2
-        %aa = alloca %complexty, i32 5          ; <%complexty*> [#uses=1]
-        %idx2 = getelementptr %complexty* %aa, i64 %i0, i32 1, i32 0, i64 %j0           ; <i8**> [#uses=1]
-        store i8* null, i8** %idx2
-        %ptr = alloca i32               ; <i32*> [#uses=2]
-        store i32 3, i32* %ptr
-        %val = load i32* %ptr           ; <i32> [#uses=0]
-        %sptr = alloca %struct          ; <%struct*> [#uses=1]
-        %ubsptr = getelementptr %struct* %sptr, i64 0, i32 1, i32 1             ; <{ i8 }*> [#uses=1]
-        %idx3 = getelementptr { i8 }* %ubsptr, i64 0, i32 0             ; <i8*> [#uses=1]
-        store i8 4, i8* %idx3
-        ret i32 3
-}
-
diff --git a/test/FrontendC++/2006-11-06-StackTrace.cpp b/test/FrontendC++/2006-11-06-StackTrace.cpp
index b79c0bf..2813c36 100644
--- a/test/FrontendC++/2006-11-06-StackTrace.cpp
+++ b/test/FrontendC++/2006-11-06-StackTrace.cpp
@@ -1,7 +1,7 @@
 // This is a regression test on debug info to make sure that we can get a
 // meaningful stack trace from a C++ program.
 // RUN: %llvmgcc -S -O0 -g %s -o - | \
-// RUN:    llc --disable-fp-elim -o %t.s -O0 -relocation-model=pic
+// RUN:    llc --disable-cfi --disable-fp-elim -o %t.s -O0 -relocation-model=pic
 // RUN: %compile_c %t.s -o %t.o
 // RUN: %link %t.o -o %t.exe
 // RUN: echo {break DeepStack::deepest\nrun 17\nwhere\n} > %t.in 
diff --git a/test/FrontendC++/2006-11-30-Pubnames.cpp b/test/FrontendC++/2006-11-30-Pubnames.cpp
index 239d3f5..fc7beeb 100644
--- a/test/FrontendC++/2006-11-30-Pubnames.cpp
+++ b/test/FrontendC++/2006-11-30-Pubnames.cpp
@@ -1,7 +1,7 @@
 // This is a regression test on debug info to make sure that we can access 
 // qualified global names.
 // RUN: %llvmgcc -S -O0 -g %s -o - | \
-// RUN:   llc --disable-fp-elim -o %t.s -O0
+// RUN:   llc -disable-cfi --disable-fp-elim -o %t.s -O0
 // RUN: %compile_c %t.s -o %t.o
 // RUN: %link %t.o -o %t.exe
 // RUN: %llvmdsymutil %t.exe 
diff --git a/test/FrontendC++/2009-04-21-DtorNames-dbg.cpp b/test/FrontendC++/2009-04-21-DtorNames-dbg.cpp
index e3616da..da09c0b 100644
--- a/test/FrontendC++/2009-04-21-DtorNames-dbg.cpp
+++ b/test/FrontendC++/2009-04-21-DtorNames-dbg.cpp
@@ -1,4 +1,4 @@
-// RUN: %llvmgcc -S -g %s -o - | llc -O0 -o %t.s
+// RUN: %llvmgcc -S -g %s -o - | llc --disable-cfi -O0 -o %t.s
 // RUN: %compile_c %t.s -o %t.o
 // PR4025
 
diff --git a/test/FrontendC++/2009-07-15-LineNumbers.cpp b/test/FrontendC++/2009-07-15-LineNumbers.cpp
deleted file mode 100644
index e1cc81f..0000000
--- a/test/FrontendC++/2009-07-15-LineNumbers.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// This is a regression test on debug info to make sure that we can
-// print line numbers in asm.
-// RUN: %llvmgcc -S -O0 -g %s -o - | \
-// RUN:    llc --disable-fp-elim -O0 -relocation-model=pic | grep {2009-07-15-LineNumbers.cpp:25$}
-
-#include <stdlib.h>
-
-class DeepStack {
-  int seedVal;
-public:
-  DeepStack(int seed) : seedVal(seed) {}
-
-  int shallowest( int x ) { return shallower(x + 1); }
-  int shallower ( int x ) { return shallow(x + 2); }
-  int shallow   ( int x ) { return deep(x + 3); }
-  int deep      ( int x ) { return deeper(x + 4); }
-  int deeper    ( int x ) { return deepest(x + 6); }
-  int deepest   ( int x ) { return x + 7; }
-
-  int runit() { return shallowest(seedVal); }
-};
-
-int main ( int argc, char** argv) {
-
-  DeepStack DS9( (argc > 1 ? atoi(argv[1]) : 0) );
-  return DS9.runit();
-}
diff --git a/test/FrontendC++/2010-08-31-ByValArg.cpp b/test/FrontendC++/2010-08-31-ByValArg.cpp
index be0d354..4ccaabd 100644
--- a/test/FrontendC++/2010-08-31-ByValArg.cpp
+++ b/test/FrontendC++/2010-08-31-ByValArg.cpp
@@ -1,7 +1,7 @@
 // This regression test checks byval arguments' debug info.
 // Radar 8367011
 // RUN: %llvmgcc -S -O0 -g %s -o - | \
-// RUN:    llc --disable-fp-elim -o %t.s -O0 -relocation-model=pic
+// RUN:    llc --disable-cfi --disable-fp-elim -o %t.s -O0 -relocation-model=pic
 // RUN: %compile_c %t.s -o %t.o
 // RUN: %link %t.o -o %t.exe
 // RUN: echo {break get\nrun\np missing_arg.b} > %t.in 
diff --git a/test/FrontendC/2006-05-01-AppleAlignmentPragma.c b/test/FrontendC/2006-05-01-AppleAlignmentPragma.c
index c9050aa..233968b 100644
--- a/test/FrontendC/2006-05-01-AppleAlignmentPragma.c
+++ b/test/FrontendC/2006-05-01-AppleAlignmentPragma.c
@@ -1,7 +1,7 @@
 // RUN: %llvmgcc %s -S -o -
 
 #ifdef __APPLE__
-/* test that X is layed out correctly when this pragma is used. */
+/* test that X is laid out correctly when this pragma is used. */
 #pragma options align=mac68k
 #endif
 
diff --git a/test/FrontendC/2008-07-29-EHLabel.ll b/test/FrontendC/2008-07-29-EHLabel.ll
index 7577bc9..186eafa 100644
--- a/test/FrontendC/2008-07-29-EHLabel.ll
+++ b/test/FrontendC/2008-07-29-EHLabel.ll
@@ -1,4 +1,4 @@
-; RUN: llc %s -o - | %llvmgcc -xassembler -c -o /dev/null -
+; RUN: llc -disable-cfi %s -o - | %llvmgcc -xassembler -c -o /dev/null -
 ; PR2609
 	%struct..0._11 = type { i32 }
 	%struct..1__pthread_mutex_s = type { i32, i32, i32, i32, i32, %struct..0._11 }
diff --git a/test/FrontendC/2009-02-17-BitField-dbg.c b/test/FrontendC/2009-02-17-BitField-dbg.c
index 80ccc4a..88d2cbb 100644
--- a/test/FrontendC/2009-02-17-BitField-dbg.c
+++ b/test/FrontendC/2009-02-17-BitField-dbg.c
@@ -1,6 +1,6 @@
 // Check bitfields.
 // RUN: %llvmgcc -S -O0 -g %s -o - | \
-// RUN: llc --disable-fp-elim -o 2009-02-17-BitField-dbg.s
+// RUN: llc -disable-cfi --disable-fp-elim -o 2009-02-17-BitField-dbg.s
 // RUN: %compile_c 2009-02-17-BitField-dbg.s -o 2009-02-17-BitField-dbg.o
 // RUN: echo {ptype mystruct} > %t2
 // RUN: gdb -q -batch -n -x %t2 2009-02-17-BitField-dbg.o | \
diff --git a/test/FrontendC/2010-01-05-LinkageName.c b/test/FrontendC/2010-01-05-LinkageName.c
index 9c1a215..279df03 100644
--- a/test/FrontendC/2010-01-05-LinkageName.c
+++ b/test/FrontendC/2010-01-05-LinkageName.c
@@ -1,4 +1,4 @@
-// RUN: %llvmgcc -O2 -S -g %s -o - | llc -o 2010-01-05-LinkageName.s -O0 
+// RUN: %llvmgcc -O2 -S -g %s -o - | llc -disable-cfi -o 2010-01-05-LinkageName.s -O0 
 // RUN: %compile_c 2010-01-05-LinkageName.s -o 2010-01-05-LinkageName.s
 
 struct tm {};
diff --git a/test/FrontendC/2010-01-14-StaticVariable.c b/test/FrontendC/2010-01-14-StaticVariable.c
index 80dd4d4..0635900 100644
--- a/test/FrontendC/2010-01-14-StaticVariable.c
+++ b/test/FrontendC/2010-01-14-StaticVariable.c
@@ -1,7 +1,7 @@
 // This is a regression test on debug info to make sure that llvm emitted
 // debug info does not crash gdb.
 // RUN: %llvmgcc -S -O0 -g %s -o - | \
-// RUN:    llc --disable-fp-elim -o %t.s -O0 -relocation-model=pic
+// RUN:    llc -disable-cfi --disable-fp-elim -o %t.s -O0 -relocation-model=pic
 // RUN: %compile_c %t.s -o %t.o
 // RUN: echo {quit\n} > %t.in 
 // RUN: gdb -q -batch -n -x %t.in %t.o > /dev/null
diff --git a/test/FrontendC/2010-02-16-DbgVarScope.c b/test/FrontendC/2010-02-16-DbgVarScope.c
index 1d912d0..24910ad 100644
--- a/test/FrontendC/2010-02-16-DbgVarScope.c
+++ b/test/FrontendC/2010-02-16-DbgVarScope.c
@@ -1,5 +1,5 @@
 // RUN: %llvmgcc -S -O0 -g %s -o - | \
-// RUN:    llc --disable-fp-elim -o %t.s -O0 -relocation-model=pic
+// RUN:    llc -disable-cfi --disable-fp-elim -o %t.s -O0 -relocation-model=pic
 // RUN: %compile_c %t.s -o %t.o
 // RUN: %link %t.o -o %t.exe
 // RUN: echo {break 24\nrun\np loc\n} > %t.in 
diff --git a/test/FrontendC/2010-05-18-asmsched.c b/test/FrontendC/2010-05-18-asmsched.c
index 33b8770..ca7625f 100644
--- a/test/FrontendC/2010-05-18-asmsched.c
+++ b/test/FrontendC/2010-05-18-asmsched.c
@@ -3,8 +3,9 @@
 
 void foo(int x, int y) {
 // CHECK: bar
-// CHECK: movq  %r9, %r10
-// CHECK: movq  %rdi, %r9
+// CHECK-NOT: {{, %r9$}}
+// CHECK: movq  %r9,
+// CHECK: movq  {{.*}}, %r9
 // CHECK: bar
   register int lr9 asm("r9") = x;
   register int lr10 asm("r10") = y;
diff --git a/test/FrontendC/2010-07-27-MinNoFoldConst.c b/test/FrontendC/2010-07-27-MinNoFoldConst.c
index 7cd8b4c..ea711e5 100644
--- a/test/FrontendC/2010-07-27-MinNoFoldConst.c
+++ b/test/FrontendC/2010-07-27-MinNoFoldConst.c
@@ -10,7 +10,7 @@ static void bad(unsigned int v1, unsigned int v2) {
 //   MIN(1631381461u * v2 - 4047041419, 1631381461u * v1 - 4047041419)
 //
 // 1631381461u * 1273463329u = 2077504466193943669, but 32-bit overflow clips
-// this to 4047041419. This breaks the comparision implicit in the MIN().
+// this to 4047041419. This breaks the comparison implicit in the MIN().
 // Two multiply operations suggests the bad optimization is happening;
 // one multiplication, after the MIN(), is correct.
 // CHECK: mul
diff --git a/test/FrontendC/ARM/dg.exp b/test/FrontendC/ARM/dg.exp
new file mode 100644
index 0000000..df7d49e
--- /dev/null
+++ b/test/FrontendC/ARM/dg.exp
@@ -0,0 +1,5 @@
+load_lib llvm.exp
+
+if { [llvm_supports_target ARM] && [llvm_gcc_supports c] } {
+    RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll,c,cpp,s}]]
+}
diff --git a/test/FrontendC/ARM/inline-asm-multichar.c b/test/FrontendC/ARM/inline-asm-multichar.c
new file mode 100644
index 0000000..7e2eeef
--- /dev/null
+++ b/test/FrontendC/ARM/inline-asm-multichar.c
@@ -0,0 +1,11 @@
+// RUN: %llvmgcc -S -march=armv7a %s 
+
+// XFAIL: *
+// XTARGET: arm
+
+int t1() {
+  static float k = 1.0f;
+CHECK: call void asm sideeffect "flds s15, $0 \0A", "*^Uv,~{s15}"
+  __asm__ volatile ("flds s15, %[k] \n" :: [k] "Uv,m" (k) : "s15");
+  return 0;
+}
diff --git a/test/FrontendC/cstring-align.c b/test/FrontendC/cstring-align.c
deleted file mode 100644
index 544c9f3..0000000
--- a/test/FrontendC/cstring-align.c
+++ /dev/null
@@ -1,11 +0,0 @@
-// RUN: %llvmgcc %s -S -Os -o - | llc -march=x86 -mtriple=i386-apple-darwin10 | FileCheck %s
-
-extern void func(const char *, const char *);
-
-void long_function_name() {
-  func("%s: the function name", __func__);
-}
-
-// CHECK: .align 4
-// CHECK: ___func__.
-// CHECK: .asciz "long_function_name"
diff --git a/test/FrontendC/pr4349.c b/test/FrontendC/pr4349.c
index 24acd9c..49c89e2 100644
--- a/test/FrontendC/pr4349.c
+++ b/test/FrontendC/pr4349.c
@@ -16,22 +16,22 @@ struct svar
 {
     void *ptr;
 };
-// CHECK: @svars1 = unnamed_addr global [1 x %struct.svar] [%struct.svar { i8* bitcast (%struct.cpu* @cpu to i8*) }]
+// CHECK: @svars1 = global [1 x %struct.svar] [%struct.svar { i8* bitcast (%struct.cpu* @cpu to i8*) }]
 struct svar svars1[] =
 {
     { &((cpu.pc).w[0]) }
 };
-// CHECK: @svars2 = unnamed_addr global [1 x %struct.svar] [%struct.svar { i8* getelementptr ([2 x i8]* bitcast (%struct.cpu* @cpu to [2 x i8]*), i{{[0-9]+}} 0, i{{[0-9]+}} 1) }]
+// CHECK: @svars2 = global [1 x %struct.svar] [%struct.svar { i8* getelementptr ([2 x i8]* bitcast (%struct.cpu* @cpu to [2 x i8]*), i{{[0-9]+}} 0, i{{[0-9]+}} 1) }]
 struct svar svars2[] =
 {
     { &((cpu.pc).b[0][1]) }
 };
-// CHECK: @svars3 = unnamed_addr global [1 x %struct.svar] [%struct.svar { i8* bitcast (i16* getelementptr ([2 x i16]* bitcast (%struct.cpu* @cpu to [2 x i16]*), i{{[0-9]+}} 0, i{{[0-9]+}} 1) to i8*) }]
+// CHECK: @svars3 = global [1 x %struct.svar] [%struct.svar { i8* bitcast (i16* getelementptr ([2 x i16]* bitcast (%struct.cpu* @cpu to [2 x i16]*), i{{[0-9]+}} 0, i{{[0-9]+}} 1) to i8*) }]
 struct svar svars3[] =
 {
     { &((cpu.pc).w[1]) }
 };
-// CHECK: @svars4 = unnamed_addr global [1 x %struct.svar] [%struct.svar { i8* getelementptr ([2 x [2 x i8]]* bitcast (%struct.cpu* @cpu to [2 x [2 x i8]]*), i{{[0-9]+}} 0, i{{[0-9]+}} 1, i{{[0-9]+}} 1) }]
+// CHECK: @svars4 = global [1 x %struct.svar] [%struct.svar { i8* getelementptr ([2 x [2 x i8]]* bitcast (%struct.cpu* @cpu to [2 x [2 x i8]]*), i{{[0-9]+}} 0, i{{[0-9]+}} 1, i{{[0-9]+}} 1) }]
 struct svar svars4[] =
 {
     { &((cpu.pc).b[1][1]) }
diff --git a/test/FrontendC/struct-matching-constraint.c b/test/FrontendC/struct-matching-constraint.c
new file mode 100644
index 0000000..d002cdd
--- /dev/null
+++ b/test/FrontendC/struct-matching-constraint.c
@@ -0,0 +1,19 @@
+// RUN: %llvmgcc -S -march=armv7a %s 
+
+// XFAIL: *
+// XTARGET: arm
+
+typedef struct __simd128_uint16_t
+{
+  __neon_uint16x8_t val;
+} uint16x8_t;
+
+void b(uint16x8_t sat, uint16x8_t luma)
+{
+  __asm__("vmov.16 %1, %0   \n\t"
+                                           "vtrn.16 %0, %1   \n\t"
+   :"=w"(luma), "=w"(sat)
+   :"0"(luma)
+   );
+
+}
diff --git a/test/FrontendObjC/2009-08-17-DebugInfo.m b/test/FrontendObjC/2009-08-17-DebugInfo.m
index 8ed7c24..825bbd7 100644
--- a/test/FrontendObjC/2009-08-17-DebugInfo.m
+++ b/test/FrontendObjC/2009-08-17-DebugInfo.m
@@ -1,6 +1,6 @@
 // This is a regression test on debug info to make sure that we can set a
 // breakpoint on a objective message.
-// RUN: %llvmgcc -S -O0 -g %s -o - | llc -o %t.s -O0
+// RUN: %llvmgcc -S -O0 -g %s -o - | llc -disable-cfi -o %t.s -O0
 // RUN: %compile_c %t.s -o %t.o
 // RUN: %link %t.o -o %t.exe -framework Foundation
 // RUN: echo {break randomFunc\n} > %t.in 
diff --git a/test/Integer/BitMem.ll b/test/Integer/BitMem.ll
deleted file mode 100644
index 2c093bc..0000000
--- a/test/Integer/BitMem.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: llvm-as %s -o - | llvm-dis > %t1.ll
-; RUN: llvm-as %t1.ll -o - | llvm-dis > %t2.ll
-; RUN: diff %t1.ll %t2.ll
-
-declare void @"foo"()
-
-
-; foo test basic arith operations
-define void @"foo"() {
-	%t1 = malloc i31, i32 4
-        %t2 = malloc i31, i32 7, align 1024
-        %t3 = malloc [4 x i15]
-
-        %idx = getelementptr [4 x i15]* %t3, i64 0, i64 2
-        store i15 -123, i15* %idx
-
-        free [4 x i15]* %t3
-        free i31* %t2
-        free i31* %t1
-        
-        %t4 = alloca i12, i32 100
-        free i12* %t4
-
-        %t5 = alloca i31
-        store i31 -123, i31* %t5
-
-        free i31* %t5
-	ret void
-}
diff --git a/test/Integer/alignment_bt.ll b/test/Integer/alignment_bt.ll
deleted file mode 100644
index 3a9d051..0000000
--- a/test/Integer/alignment_bt.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; RUN: llvm-as %s -o - | llvm-dis > %t1.ll
-; RUN: llvm-as %t1.ll -o - | llvm-dis > %t2.ll
-; RUN: diff %t1.ll %t2.ll
-
-@X = global i19 4, align 16
-
-define i19 *@test() align 32 {
-	%X = alloca i19, align 4
-	%Y = alloca i51, i32 42, align 16
-	%Z = alloca i32, align 1
-	ret i19 *%X
-}
-
-define i19 *@test2() {
-	%X = malloc i19, align 4
-	%Y = malloc i51, i32 42, align 16
-	%Z = malloc i32, align 1
-	ret i19 *%X
-}
-
-
diff --git a/test/Integer/paramattrs_bt.ll b/test/Integer/paramattrs_bt.ll
index 47ef753..6db9a53 100644
--- a/test/Integer/paramattrs_bt.ll
+++ b/test/Integer/paramattrs_bt.ll
@@ -5,15 +5,15 @@
 %ZFunTy = type i33(i8 zeroext)
 %SFunTy = type i33(i8 signext)
 
-declare i16 @"test"(i16 signext %arg) signext 
-declare i8  @"test2" (i16 zeroext %a2) zeroext 
+declare signext i16 @"test"(i16 signext %arg)  
+declare zeroext i8  @"test2" (i16 zeroext %a2)  
 
 
 define i33 @main(i33 %argc, i8 **%argv) {
     %val = trunc i33 %argc to i16
-    %res = call i16 (i16 signext) signext *@test(i16 signext %val) signext
+    %res = call signext i16 (i16 signext) *@test(i16 signext %val) 
     %two = add i16 %res, %res
-    %res2 = call i8 @test2(i16 %two zeroext) zeroext 
+    %res2 = call zeroext i8 @test2(i16 zeroext %two )  
     %retVal = sext i16 %two to i33
     ret i33 %retVal
 }
diff --git a/test/Integer/testmemory_bt.ll b/test/Integer/testmemory_bt.ll
deleted file mode 100644
index e503c56..0000000
--- a/test/Integer/testmemory_bt.ll
+++ /dev/null
@@ -1,45 +0,0 @@
-; RUN: llvm-as %s -o - | llvm-dis > %t1.ll
-; RUN: llvm-as %t1.ll -o - | llvm-dis > %t2.ll
-; RUN: diff %t1.ll %t2.ll
-
-
-%struct = type { i31 , {float, {i9 } } , i64 }
-%complexty = type {i31, {[4 x i9 *], float}, double}
-
-
-define i31 @"main"()
-begin
-  call i31 @testfunction(i64 0, i64 1)
-  ret i31 0
-end
-
-define i31 @"testfunction"(i64 %i0, i64 %j0)
-begin
-    %array0 = malloc [4 x i9]            ; yields {[4 x i9]*}:array0
-    %size   = add i32 2, 2                 ; yields {i31}:size = i31 %4
-    %array1 = malloc i9, i32 4          ; yields {i9*}:array1
-    %array2 = malloc i9, i32 %size      ; yields {i9*}:array2
-
-    %idx = getelementptr [4 x i9]* %array0, i64 0, i64 2
-    store i9 123, i9* %idx
-    free [4x i9]* %array0
-    free i9* %array1
-    free i9* %array2
-
-
-    %aa = alloca %complexty, i32 5
-    %idx2 = getelementptr %complexty* %aa, i64 %i0, i32 1, i32 0, i64 %j0
-    store i9 *null, i9** %idx2
-    
-    %ptr = alloca i31                       ; yields {i31*}:ptr
-    store i31 3, i31* %ptr                  ; yields {void}
-    %val = load i31* %ptr                   ; yields {i31}:val = i31 %3
-
-    %sptr = alloca %struct                  ; yields {%struct*}:sptr
-    %ubsptr = getelementptr %struct * %sptr, i64 0, i32 1, i32 1  ; yields {{i9}*}:ubsptr
-    %idx3 = getelementptr {i9} * %ubsptr, i64 0, i32 0
-    store i9 4, i9* %idx3
-
-    ret i31 3
-end
-
diff --git a/test/MC/ARM/arm_instructions.s b/test/MC/ARM/arm_instructions.s
index 50a2b70..f789441 100644
--- a/test/MC/ARM/arm_instructions.s
+++ b/test/MC/ARM/arm_instructions.s
@@ -270,6 +270,9 @@
 @ CHECK: msr  cpsr_fc, r0 @ encoding: [0x00,0xf0,0x29,0xe1]
         msr  cpsr_fc, r0
 
+@ CHECK: msr  cpsr_fc, r0 @ encoding: [0x00,0xf0,0x29,0xe1]
+        msr  cpsr_all, r0
+
 @ CHECK: msr  cpsr_fsx, r0 @ encoding: [0x00,0xf0,0x2e,0xe1]
         msr  cpsr_fsx, r0
 
@@ -309,3 +312,6 @@
 @ CHECK: ldrexd  r0, r1, [r0] @ encoding: [0x9f,0x0f,0xb0,0xe1]
         ldrexd  r0, r1, [r0]
 
+@ CHECK: ssat16  r0, #7, r0 @ encoding: [0x30,0x0f,0xa6,0xe6]
+        ssat16  r0, #7, r0
+
diff --git a/test/MC/ARM/elf-movt.s b/test/MC/ARM/elf-movt.s
index 0fe7c50..18061f5 100644
--- a/test/MC/ARM/elf-movt.s
+++ b/test/MC/ARM/elf-movt.s
@@ -9,10 +9,10 @@
 barf:                                   @ @barf
 @ BB#0:                                 @ %entry
 	movw	r0, :lower16:GOT-(.LPC0_2+8)
-	movt	r0, :upper16:GOT-(.LPC0_2+16)
+	movt	r0, :upper16:GOT-(.LPC0_2+8)
 .LPC0_2:
 @ ASM:          movw    r0, :lower16:(GOT-(.LPC0_2+8))
-@ ASM-NEXT:     movt    r0, :upper16:(GOT-(.LPC0_2+16))
+@ ASM-NEXT:     movt    r0, :upper16:(GOT-(.LPC0_2+8))
 
 @@ make sure that the text section fixups are sane too
 @ OBJ:                 '.text'
@@ -25,7 +25,7 @@ barf:                                   @ @barf
 @ OBJ-NEXT:            'sh_info', 0x00000000
 @ OBJ-NEXT:            'sh_addralign', 0x00000004
 @ OBJ-NEXT:            'sh_entsize', 0x00000000
-@ OBJ-NEXT:            '_section_data', 'f00f0fe3 ff0f4fe3'
+@ OBJ-NEXT:            '_section_data', 'f00f0fe3 f40f4fe3'
 
 @ OBJ:              Relocation 0x00000000
 @ OBJ-NEXT:         'r_offset', 0x00000000
diff --git a/test/MC/ARM/elf-thumbfunc-reloc.ll b/test/MC/ARM/elf-thumbfunc-reloc.ll
new file mode 100644
index 0000000..6fce403
--- /dev/null
+++ b/test/MC/ARM/elf-thumbfunc-reloc.ll
@@ -0,0 +1,37 @@
+; RUN: llc %s -mtriple=thumbv7-linux-gnueabi -relocation-model=pic \
+; RUN: -filetype=obj -o - | elf-dump --dump-section-data | \
+; RUN: FileCheck %s
+
+; FIXME: This file needs to be in .s form!
+; We wanna test relocatable thumb function call,
+; but ARMAsmParser cannot handle "bl foo(PLT)" yet
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:32-n32"
+target triple = "thumbv7-none--gnueabi"
+
+define void @foo() nounwind {
+entry:
+  ret void
+}
+
+define void @bar() nounwind {
+entry:
+  call void @foo()
+  ret void
+}
+
+
+; make sure that bl 0 <foo> (fff7feff) is correctly encoded
+; CHECK: '_section_data', '70470000 2de90048 fff7feff bde80088'
+
+;  Offset     Info    Type            Sym.Value  Sym. Name
+; 00000008  0000070a R_ARM_THM_CALL    00000001   foo
+; CHECK:           Relocation 0x00000000 
+; CHECK-NEXT:      'r_offset', 0x00000008
+; CHECK-NEXT:      'r_sym', 0x00000007
+; CHECK-NEXT:      'r_type', 0x0000000a
+
+; make sure foo is thumb function: bit 0 = 1
+; CHECK:           Symbol 0x00000007
+; CHECK-NEXT:      'foo'
+; CHECK-NEXT:      'st_value', 0x00000001
diff --git a/test/MC/ARM/elf-thumbfunc.s b/test/MC/ARM/elf-thumbfunc.s
new file mode 100644
index 0000000..a1b3c31
--- /dev/null
+++ b/test/MC/ARM/elf-thumbfunc.s
@@ -0,0 +1,20 @@
+@@ test st_value bit 0 of thumb function
+@ RUN: llvm-mc %s -triple=thumbv7-linux-gnueabi -filetype=obj -o - | \
+@ RUN: elf-dump  | FileCheck %s
+	.syntax unified
+	.text
+	.globl	foo
+	.align	2
+	.type	foo,%function
+	.code	16
+	.thumb_func
+foo:
+	bx	lr
+
+@@ make sure foo is thumb function: bit 0 = 1 (st_value)
+@CHECK:           Symbol 0x00000004
+@CHECK-NEXT:      'st_name', 0x00000001
+@CHECK-NEXT:      'st_value', 0x00000001
+@CHECK-NEXT:      'st_size', 0x00000000
+@CHECK-NEXT:      'st_bind', 0x00000001
+@CHECK-NEXT:      'st_type', 0x00000002
diff --git a/test/MC/ARM/simple-encoding.ll b/test/MC/ARM/simple-encoding.ll
index 7b581b3..3322803 100644
--- a/test/MC/ARM/simple-encoding.ll
+++ b/test/MC/ARM/simple-encoding.ll
@@ -1,4 +1,4 @@
-;RUN: llc -mtriple=armv7-apple-darwin -show-mc-encoding -disable-cgp-branch-opts < %s | FileCheck %s
+;RUN: llc -mtriple=armv7-apple-darwin -show-mc-encoding -disable-cgp-branch-opts -join-physregs < %s | FileCheck %s
 
 
 ;FIXME: Once the ARM integrated assembler is up and going, these sorts of tests
diff --git a/test/MC/ARM/thumb.s b/test/MC/ARM/thumb.s
index 342a390..55d9789 100644
--- a/test/MC/ARM/thumb.s
+++ b/test/MC/ARM/thumb.s
@@ -12,6 +12,8 @@
 
 @ CHECK: blx	r9                   @ encoding: [0xc8,0x47]
 	blx	r9
+@ CHECK: blx	r10                     @ encoding: [0xd0,0x47]
+  blx r10
 
 @ CHECK: rev	r2, r3               @ encoding: [0x1a,0xba]
 @ CHECK: rev16	r3, r4               @ encoding: [0x63,0xba]
@@ -68,3 +70,6 @@
 
 @ CHECK: cpsie aif @ encoding: [0x67,0xb6]
         cpsie aif
+
+@ CHECK: mov  r0, pc @ encoding: [0x78,0x46]
+        mov  r0, pc
diff --git a/test/MC/ARM/thumb2.s b/test/MC/ARM/thumb2.s
index 5342b90..4e9d4e1 100644
--- a/test/MC/ARM/thumb2.s
+++ b/test/MC/ARM/thumb2.s
@@ -300,3 +300,5 @@
   ldrex  r0, [r0]
 @ CHECK: ldrexd  r0, r1, [r0] @ encoding: [0xd0,0xe8,0x7f,0x01]
   ldrexd  r0, r1, [r0]
+@ CHECK: ssat16  r0, #7, r0 @ encoding: [0x20,0xf3,0x06,0x00]
+  ssat16  r0, #7, r0
diff --git a/test/MC/ARM/xscale-attributes.ll b/test/MC/ARM/xscale-attributes.ll
new file mode 100644
index 0000000..e576278
--- /dev/null
+++ b/test/MC/ARM/xscale-attributes.ll
@@ -0,0 +1,31 @@
+; RUN: llc %s -mtriple=thumbv5-linux-gnueabi -mcpu=xscale -o - | \
+; RUN: FileCheck -check-prefix=ASM %s
+
+; RUN: llc %s -mtriple=thumbv5-linux-gnueabi -filetype=obj \
+; RUN: -mcpu=xscale -o - | elf-dump --dump-section-data | \
+; RUN: FileCheck -check-prefix=OBJ %s
+
+; FIXME: The OBJ test should be a .s to .o test and the ASM test should
+; be moved to test/CodeGen/ARM.
+
+define void @foo() nounwind {
+entry:
+  ret void
+}
+
+; ASM:           .eabi_attribute 6, 5
+; ASM-NEXT:      .eabi_attribute 8, 1
+; ASM-NEXT:      .eabi_attribute 9, 1
+
+; OBJ:           Section 0x00000004
+; OBJ-NEXT:      'sh_name', 0x0000000c
+; OBJ-NEXT:      'sh_type', 0x70000003
+; OBJ-NEXT:	   'sh_flags', 0x00000000
+; OBJ-NEXT:	   'sh_addr', 0x00000000
+; OBJ-NEXT:	   'sh_offset', 0x00000038
+; OBJ-NEXT:	   'sh_size', 0x00000020
+; OBJ-NEXT:	   'sh_link', 0x00000000
+; OBJ-NEXT:	   'sh_info', 0x00000000
+; OBJ-NEXT:	   'sh_addralign', 0x00000001
+; OBJ-NEXT:	   'sh_entsize', 0x00000000
+; OBJ-NEXT:      '_section_data', '411f0000 00616561 62690001 15000000 06050801 09011401 15011703 18011901'
diff --git a/test/MC/AsmParser/directive_seh.s b/test/MC/AsmParser/directive_seh.s
new file mode 100644
index 0000000..98fc606
--- /dev/null
+++ b/test/MC/AsmParser/directive_seh.s
@@ -0,0 +1,48 @@
+# RUN: llvm-mc -triple x86_64-pc-win32 %s | FileCheck %s
+
+# CHECK: .seh_proc func
+# CHECK: .seh_pushframe @code
+# CHECK: .seh_stackalloc 24
+# CHECK: .seh_savereg 6, 16
+# CHECK: .seh_savexmm 8, 0
+# CHECK: .seh_pushreg 3
+# CHECK: .seh_setframe 3, 0
+# CHECK: .seh_endprologue
+# CHECK: .seh_handler __C_specific_handler, @except
+# CHECK-NOT: .section{{.*}}.xdata
+# CHECK: .seh_handlerdata
+# CHECK: .text
+# CHECK: .seh_startchained
+# CHECK: .seh_endprologue
+# CHECK: .seh_endchained
+# CHECK: .seh_endproc
+
+    .text
+    .globl func
+    .def func; .scl 2; .type 32; .endef
+    .seh_proc func
+func:
+    .seh_pushframe @code
+    subq $24, %rsp
+    .seh_stackalloc 24
+    movq %rsi, 16(%rsp)
+    .seh_savereg %rsi, 16
+    movups %xmm8, (%rsp)
+    .seh_savexmm %xmm8, 0
+    pushq %rbx
+    .seh_pushreg 3
+    mov %rsp, %rbx
+    .seh_setframe 3, 0
+    .seh_endprologue
+    .seh_handler __C_specific_handler, @except
+    .seh_handlerdata
+    .long 0
+    .text
+    .seh_startchained
+    .seh_endprologue
+    .seh_endchained
+    lea (%rbx), %rsp
+    pop %rbx
+    addq $24, %rsp
+    ret
+    .seh_endproc
diff --git a/test/MC/AsmParser/directive_values.s b/test/MC/AsmParser/directive_values.s
index 98259bd..6c79c38 100644
--- a/test/MC/AsmParser/directive_values.s
+++ b/test/MC/AsmParser/directive_values.s
@@ -56,3 +56,10 @@ TEST6:
 # CHECK:        .byte   35
 # CHECK:        .byte   9
 # CHECK:        .byte   10
+
+TEST7:
+        .byte 1, 2, 3, 4
+# CHECK:        .byte   1
+# CHECK-NEXT:   .byte   2
+# CHECK-NEXT:   .byte   3
+# CHECK-NEXT:   .byte   4
diff --git a/test/MC/AsmParser/exprs-invalid.s b/test/MC/AsmParser/exprs-invalid.s
index 5358fc5..dc27d80 100644
--- a/test/MC/AsmParser/exprs-invalid.s
+++ b/test/MC/AsmParser/exprs-invalid.s
@@ -1,13 +1,8 @@
-// RUN: not llvm-mc -triple i386-unknown-unknown %s 2> %t
-// RUN: FileCheck -input-file %t %s
+// RUN: not llvm-mc -triple x86_64-apple-darwin10 %s 2> %t.err | FileCheck %s
+// RUN: FileCheck --check-prefix=CHECK-ERRORS %s < %t.err
+// CHECK: 	.section	__TEXT,__text,regular,pure_instructions
+// CHECK-ERRORS: error: invalid octal number
+.long 80+08
 
-// Currently XFAIL'ed, since the front-end isn't validating this. Figure out the
-// right resolution.
-//
-// XFAIL: *
-
-        .text
-a:
-        .data
-// CHECK: expected relocatable expression
-        .long -(0 + a)
+// CHECK-ERRORS: error: invalid hexadecimal number
+.long 80+0xzz
diff --git a/test/MC/AsmParser/macro-args.s b/test/MC/AsmParser/macro-args.s
new file mode 100644
index 0000000..808b6eb
--- /dev/null
+++ b/test/MC/AsmParser/macro-args.s
@@ -0,0 +1,10 @@
+// RUN: llvm-mc -triple x86_64-apple-darwin10 %s | FileCheck %s
+
+.macro GET   var,re2g
+    movl   \var@GOTOFF(%ebx),\re2g
+.endm
+
+
+GET    is_sse, %eax
+
+// CHECK: movl	is_sse@GOTOFF(%ebx), %eax
diff --git a/test/MC/AsmParser/rename.s b/test/MC/AsmParser/rename.s
index 64ca515..934cee8 100644
--- a/test/MC/AsmParser/rename.s
+++ b/test/MC/AsmParser/rename.s
@@ -1,10 +1,14 @@
 // RUN: llvm-mc -triple i386-unknown-unknown %s | FileCheck %s
 
         .size bar, . - bar
+.Ltmp01:
+       .size foo, .Ltmp01 - foo
 .Ltmp0:
-       .size foo, .Ltmp0 - foo
+       .size qux, .Ltmp0 - qux
 
 // CHECK: .Ltmp0:
 // CHECK: .size  bar, .Ltmp0-bar
 // CHECK: .Ltmp01
 // CHECK: .size foo, .Ltmp01-foo
+// CHECK: .Ltmp02
+// CHECK: .size qux, .Ltmp02-qux
diff --git a/test/MC/COFF/basic-coff.s b/test/MC/COFF/basic-coff.s
index 0c86582..23156b8 100644
--- a/test/MC/COFF/basic-coff.s
+++ b/test/MC/COFF/basic-coff.s
@@ -1,133 +1,133 @@
-// This test checks that the COFF object emitter works for the most basic
-// programs.
-
-// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s | coff-dump.py | FileCheck %s
-// I WOULD RUN, BUT THIS FAILS: llvm-mc -filetype=obj -triple x86_64-pc-win32 %s
-
-.def	 _main;
-	.scl	2;
-	.type	32;
-	.endef
-	.text
-	.globl	_main
-	.align	16, 0x90
-_main:                                  # @main
-# BB#0:                                 # %entry
-	subl	$4, %esp
-	movl	$L_.str, (%esp)
-	calll	_printf
-	xorl	%eax, %eax
-	addl	$4, %esp
-	ret
-
-	.data
-L_.str:                                 # @.str
-	.asciz	 "Hello World"
-
-// CHECK: {
-// CHECK:   MachineType              = IMAGE_FILE_MACHINE_I386 (0x14C)
-// CHECK:   NumberOfSections         = 2
-// CHECK:   TimeDateStamp            = {{[0-9]+}}
-// CHECK:   PointerToSymbolTable     = 0x{{[0-9A-F]+}}
-// CHECK:   NumberOfSymbols          = 6
-// CHECK:   SizeOfOptionalHeader     = 0
-// CHECK:   Characteristics          = 0x0
-// CHECK:   Sections                 = [
-// CHECK:     1 = {
-// CHECK:       Name                     = .text
-// CHECK:       VirtualSize              = 0
-// CHECK:       VirtualAddress           = 0
-// CHECK:       SizeOfRawData            = {{[0-9]+}}
-// CHECK:       PointerToRawData         = 0x{{[0-9A-F]+}}
-// CHECK:       PointerToRelocations     = 0x{{[0-9A-F]+}}
-// CHECK:       PointerToLineNumbers     = 0x0
-// CHECK:       NumberOfRelocations      = 2
-// CHECK:       NumberOfLineNumbers      = 0
-// CHECK:       Charateristics           = 0x60500020
-// CHECK:         IMAGE_SCN_CNT_CODE
-// CHECK:         IMAGE_SCN_ALIGN_16BYTES
-// CHECK:         IMAGE_SCN_MEM_EXECUTE
-// CHECK:         IMAGE_SCN_MEM_READ
-// CHECK:       SectionData              =
-// CHECK:       Relocations              = [
-// CHECK:         0 = {
-// CHECK:           VirtualAddress           = 0x{{[0-9A-F]+}}
-// CHECK:           SymbolTableIndex         = 2
-// CHECK:           Type                     = IMAGE_REL_I386_DIR32 (6)
-// CHECK:           SymbolName               = .data
-// CHECK:         }
-// CHECK:         1 = {
-// CHECK:           VirtualAddress           = 0x{{[0-9A-F]+}}
-// CHECK:           SymbolTableIndex         = 5
-// CHECK:           Type                     = IMAGE_REL_I386_REL32 (20)
-// CHECK:           SymbolName               = _printf
-// CHECK:         }
-// CHECK:       ]
-// CHECK:     }
-// CHECK:     2 = {
-// CHECK:       Name                     = .data
-// CHECK:       VirtualSize              = 0
-// CHECK:       VirtualAddress           = 0
-// CHECK:       SizeOfRawData            = {{[0-9]+}}
-// CHECK:       PointerToRawData         = 0x{{[0-9A-F]+}}
-// CHECK:       PointerToRelocations     = 0x0
-// CHECK:       PointerToLineNumbers     = 0x0
-// CHECK:       NumberOfRelocations      = 0
-// CHECK:       NumberOfLineNumbers      = 0
-// CHECK:       Charateristics           = 0xC0300040
-// CHECK:         IMAGE_SCN_CNT_INITIALIZED_DATA
-// CHECK:         IMAGE_SCN_ALIGN_4BYTES
-// CHECK:         IMAGE_SCN_MEM_READ
-// CHECK:         IMAGE_SCN_MEM_WRITE
-// CHECK:       SectionData              =
-// CHECK:         48 65 6C 6C 6F 20 57 6F - 72 6C 64 00             |Hello World.|
-// CHECK:       Relocations              = None
-// CHECK:     }
-// CHECK:   ]
-// CHECK:   Symbols                  = [
-// CHECK:     0 = {
-// CHECK:       Name                     = .text
-// CHECK:       Value                    = 0
-// CHECK:       SectionNumber            = 1
-// CHECK:       SimpleType               = IMAGE_SYM_TYPE_NULL (0)
-// CHECK:       ComplexType              = IMAGE_SYM_DTYPE_NULL (0)
-// CHECK:       StorageClass             = IMAGE_SYM_CLASS_STATIC (3)
-// CHECK:       NumberOfAuxSymbols       = 1
-// CHECK:       AuxillaryData            =
-// CHECK:         15 00 00 00 02 00 00 00 - 00 00 00 00 01 00 00 00 |................|
-// CHECK:         00 00                                             |..|
-// CHECK:     }
-// CHECK:     2 = {
-// CHECK:       Name                     = .data
-// CHECK:       Value                    = 0
-// CHECK:       SectionNumber            = 2
-// CHECK:       SimpleType               = IMAGE_SYM_TYPE_NULL (0)
-// CHECK:       ComplexType              = IMAGE_SYM_DTYPE_NULL (0)
-// CHECK:       StorageClass             = IMAGE_SYM_CLASS_STATIC (3)
-// CHECK:       NumberOfAuxSymbols       = 1
-// CHECK:       AuxillaryData            =
-// CHECK:         0C 00 00 00 00 00 00 00 - 00 00 00 00 02 00 00 00 |................|
-// CHECK:         00 00                                             |..|
-// CHECK:     }
-// CHECK:     4 = {
-// CHECK:       Name                     = _main
-// CHECK:       Value                    = 0
-// CHECK:       SectionNumber            = 1
-// CHECK:       SimpleType               = IMAGE_SYM_TYPE_NULL (0)
-// CHECK:       ComplexType              = IMAGE_SYM_DTYPE_FUNCTION (2)
-// CHECK:       StorageClass             = IMAGE_SYM_CLASS_EXTERNAL (2)
-// CHECK:       NumberOfAuxSymbols       = 0
-// CHECK:       AuxillaryData            =
-// CHECK:     }
-// CHECK:     5 = {
-// CHECK:       Name                     = _printf
-// CHECK:       Value                    = 0
-// CHECK:       SectionNumber            = 0
-// CHECK:       SimpleType               = IMAGE_SYM_TYPE_NULL (0)
-// CHECK:       ComplexType              = IMAGE_SYM_DTYPE_NULL (0)
-// CHECK:       StorageClass             = IMAGE_SYM_CLASS_EXTERNAL (2)
-// CHECK:       NumberOfAuxSymbols       = 0
-// CHECK:       AuxillaryData            =
-// CHECK:     }
-// CHECK:   ]
-// CHECK: }
+// This test checks that the COFF object emitter works for the most basic
+// programs.
+
+// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s | coff-dump.py | FileCheck %s
+// I WOULD RUN, BUT THIS FAILS: llvm-mc -filetype=obj -triple x86_64-pc-win32 %s
+
+.def	 _main;
+	.scl	2;
+	.type	32;
+	.endef
+	.text
+	.globl	_main
+	.align	16, 0x90
+_main:                                  # @main
+# BB#0:                                 # %entry
+	subl	$4, %esp
+	movl	$L_.str, (%esp)
+	calll	_printf
+	xorl	%eax, %eax
+	addl	$4, %esp
+	ret
+
+	.data
+L_.str:                                 # @.str
+	.asciz	 "Hello World"
+
+// CHECK: {
+// CHECK:   MachineType              = IMAGE_FILE_MACHINE_I386 (0x14C)
+// CHECK:   NumberOfSections         = 2
+// CHECK:   TimeDateStamp            = {{[0-9]+}}
+// CHECK:   PointerToSymbolTable     = 0x{{[0-9A-F]+}}
+// CHECK:   NumberOfSymbols          = 6
+// CHECK:   SizeOfOptionalHeader     = 0
+// CHECK:   Characteristics          = 0x0
+// CHECK:   Sections                 = [
+// CHECK:     1 = {
+// CHECK:       Name                     = .text
+// CHECK:       VirtualSize              = 0
+// CHECK:       VirtualAddress           = 0
+// CHECK:       SizeOfRawData            = {{[0-9]+}}
+// CHECK:       PointerToRawData         = 0x{{[0-9A-F]+}}
+// CHECK:       PointerToRelocations     = 0x{{[0-9A-F]+}}
+// CHECK:       PointerToLineNumbers     = 0x0
+// CHECK:       NumberOfRelocations      = 2
+// CHECK:       NumberOfLineNumbers      = 0
+// CHECK:       Charateristics           = 0x60500020
+// CHECK:         IMAGE_SCN_CNT_CODE
+// CHECK:         IMAGE_SCN_ALIGN_16BYTES
+// CHECK:         IMAGE_SCN_MEM_EXECUTE
+// CHECK:         IMAGE_SCN_MEM_READ
+// CHECK:       SectionData              =
+// CHECK:       Relocations              = [
+// CHECK:         0 = {
+// CHECK:           VirtualAddress           = 0x{{[0-9A-F]+}}
+// CHECK:           SymbolTableIndex         = 2
+// CHECK:           Type                     = IMAGE_REL_I386_DIR32 (6)
+// CHECK:           SymbolName               = .data
+// CHECK:         }
+// CHECK:         1 = {
+// CHECK:           VirtualAddress           = 0x{{[0-9A-F]+}}
+// CHECK:           SymbolTableIndex         = 5
+// CHECK:           Type                     = IMAGE_REL_I386_REL32 (20)
+// CHECK:           SymbolName               = _printf
+// CHECK:         }
+// CHECK:       ]
+// CHECK:     }
+// CHECK:     2 = {
+// CHECK:       Name                     = .data
+// CHECK:       VirtualSize              = 0
+// CHECK:       VirtualAddress           = 0
+// CHECK:       SizeOfRawData            = {{[0-9]+}}
+// CHECK:       PointerToRawData         = 0x{{[0-9A-F]+}}
+// CHECK:       PointerToRelocations     = 0x0
+// CHECK:       PointerToLineNumbers     = 0x0
+// CHECK:       NumberOfRelocations      = 0
+// CHECK:       NumberOfLineNumbers      = 0
+// CHECK:       Charateristics           = 0xC0300040
+// CHECK:         IMAGE_SCN_CNT_INITIALIZED_DATA
+// CHECK:         IMAGE_SCN_ALIGN_4BYTES
+// CHECK:         IMAGE_SCN_MEM_READ
+// CHECK:         IMAGE_SCN_MEM_WRITE
+// CHECK:       SectionData              =
+// CHECK:         48 65 6C 6C 6F 20 57 6F - 72 6C 64 00             |Hello World.|
+// CHECK:       Relocations              = None
+// CHECK:     }
+// CHECK:   ]
+// CHECK:   Symbols                  = [
+// CHECK:     0 = {
+// CHECK:       Name                     = .text
+// CHECK:       Value                    = 0
+// CHECK:       SectionNumber            = 1
+// CHECK:       SimpleType               = IMAGE_SYM_TYPE_NULL (0)
+// CHECK:       ComplexType              = IMAGE_SYM_DTYPE_NULL (0)
+// CHECK:       StorageClass             = IMAGE_SYM_CLASS_STATIC (3)
+// CHECK:       NumberOfAuxSymbols       = 1
+// CHECK:       AuxillaryData            =
+// CHECK:         15 00 00 00 02 00 00 00 - 00 00 00 00 01 00 00 00 |................|
+// CHECK:         00 00                                             |..|
+// CHECK:     }
+// CHECK:     2 = {
+// CHECK:       Name                     = .data
+// CHECK:       Value                    = 0
+// CHECK:       SectionNumber            = 2
+// CHECK:       SimpleType               = IMAGE_SYM_TYPE_NULL (0)
+// CHECK:       ComplexType              = IMAGE_SYM_DTYPE_NULL (0)
+// CHECK:       StorageClass             = IMAGE_SYM_CLASS_STATIC (3)
+// CHECK:       NumberOfAuxSymbols       = 1
+// CHECK:       AuxillaryData            =
+// CHECK:         0C 00 00 00 00 00 00 00 - 00 00 00 00 02 00 00 00 |................|
+// CHECK:         00 00                                             |..|
+// CHECK:     }
+// CHECK:     4 = {
+// CHECK:       Name                     = _main
+// CHECK:       Value                    = 0
+// CHECK:       SectionNumber            = 1
+// CHECK:       SimpleType               = IMAGE_SYM_TYPE_NULL (0)
+// CHECK:       ComplexType              = IMAGE_SYM_DTYPE_FUNCTION (2)
+// CHECK:       StorageClass             = IMAGE_SYM_CLASS_EXTERNAL (2)
+// CHECK:       NumberOfAuxSymbols       = 0
+// CHECK:       AuxillaryData            =
+// CHECK:     }
+// CHECK:     5 = {
+// CHECK:       Name                     = _printf
+// CHECK:       Value                    = 0
+// CHECK:       SectionNumber            = 0
+// CHECK:       SimpleType               = IMAGE_SYM_TYPE_NULL (0)
+// CHECK:       ComplexType              = IMAGE_SYM_DTYPE_NULL (0)
+// CHECK:       StorageClass             = IMAGE_SYM_CLASS_EXTERNAL (2)
+// CHECK:       NumberOfAuxSymbols       = 0
+// CHECK:       AuxillaryData            =
+// CHECK:     }
+// CHECK:   ]
+// CHECK: }
diff --git a/test/MC/COFF/bss.s b/test/MC/COFF/bss.s
index f44225b..3bed13d 100644
--- a/test/MC/COFF/bss.s
+++ b/test/MC/COFF/bss.s
@@ -1,15 +1,15 @@
-// The purpose of this test is to verify that bss sections are emited correctly.
-
-// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s | coff-dump.py | FileCheck %s
-// RUN: llvm-mc -filetype=obj -triple x86_64-pc-win32 %s | coff-dump.py | FileCheck %s
-
-    .bss
-    .globl _g0
-    .align 4
-_g0:
-    .long 0
-
-// CHECK:      Name           = .bss
-// CHECK-NEXT: VirtualSize    = 0
-// CHECK-NEXT: VirtualAddress = 0
-// CHECK-NEXT: SizeOfRawData  = 4
+// The purpose of this test is to verify that bss sections are emited correctly.
+
+// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s | coff-dump.py | FileCheck %s
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-win32 %s | coff-dump.py | FileCheck %s
+
+    .bss
+    .globl _g0
+    .align 4
+_g0:
+    .long 0
+
+// CHECK:      Name           = .bss
+// CHECK-NEXT: VirtualSize    = 0
+// CHECK-NEXT: VirtualAddress = 0
+// CHECK-NEXT: SizeOfRawData  = 4
diff --git a/test/MC/COFF/diff.s b/test/MC/COFF/diff.s
new file mode 100644
index 0000000..aa683f2
--- /dev/null
+++ b/test/MC/COFF/diff.s
@@ -0,0 +1,46 @@
+// RUN: llvm-mc -filetype=obj -triple i686-pc-mingw32 %s | coff-dump.py | FileCheck %s
+
+	.def	 _foobar;
+	.scl	2;
+	.type	32;
+	.endef
+	.text
+	.long   0
+	.globl	_foobar
+	.align	16, 0x90
+_foobar:                                # @foobar
+# BB#0:
+	ret
+
+	.data
+	.globl	_rust_crate             # @rust_crate
+	.align	4
+_rust_crate:
+	.long   0
+	.long   _foobar
+	.long	_foobar-_rust_crate
+	.long	_foobar-_rust_crate
+
+// CHECK:      Name                     = .data
+// CHECK:      SectionData              =
+// CHECK-NEXT:   00 00 00 00 00 00 00 00 - 1C 00 00 00 20 00 00 00 |............ ...|
+// CHECK:        Relocations              = [
+// CHECK-NEXT:   0 = {
+// CHECK-NEXT:     VirtualAddress           = 0x4
+// CHECK-NEXT:     SymbolTableIndex         =
+// CHECK-NEXT:     Type                     = IMAGE_REL_I386_DIR32 (6)
+// CHECK-NEXT:     SymbolName               = _foobar
+// CHECK-NEXT:   }
+// CHECK-NEXT:   1 = {
+// CHECK-NEXT:     VirtualAddress           = 0x8
+// CHECK-NEXT:     SymbolTableIndex         = 0
+// CHECK-NEXT:     Type                     = IMAGE_REL_I386_REL32 (20)
+// CHECK-NEXT:     SymbolName               = .text
+// CHECK-NEXT:   }
+// CHECK-NEXT:   2 = {
+// CHECK-NEXT:     VirtualAddress           = 0xC
+// CHECK-NEXT:     SymbolTableIndex         = 0
+// CHECK-NEXT:     Type                     = IMAGE_REL_I386_REL32 (20)
+// CHECK-NEXT:     SymbolName               = .text
+// CHECK-NEXT:   }
+// CHECK-NEXT: ]
diff --git a/test/MC/COFF/seh-section.s b/test/MC/COFF/seh-section.s
new file mode 100644
index 0000000..802cba5
--- /dev/null
+++ b/test/MC/COFF/seh-section.s
@@ -0,0 +1,37 @@
+// This test ensures that, if the section containing a function has a suffix
+// (e.g. .text$foo), its unwind info section also has a suffix (.xdata$foo).
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-win32 %s | coff-dump.py | FileCheck %s
+// XFAIL: *
+
+// CHECK:      Name                 = .xdata$foo
+// CHECK-NEXT: VirtualSize
+// CHECK-NEXT: VirtualAddress
+// CHECK-NEXT: SizeOfRawData        = 8
+// CHECK-NEXT: PointerToRawData
+// CHECK-NEXT: PointerToRelocations
+// CHECK-NEXT: PointerToLineNumbers
+// CHECK-NEXT: NumberOfRelocations  = 0
+// CHECK-NEXT: NumberOfLineNumbers  = 0
+// CHECK-NEXT: Charateristics
+// CHECK-NEXT:   IMAGE_SCN_CNT_INITIALIZED_DATA
+// CHECK-NEXT:   IMAGE_SCN_ALIGN_4BYTES
+// CHECK-NEXT:   IMAGE_SCN_MEM_READ
+// CHECK-NEXT:   IMAGE_SCN_MEM_WRITE
+// CHECK-NEXT: SectionData
+// CHECK-NEXT:   01 05 02 00 05 50 04 02
+
+    .section .text$foo,"x"
+    .globl foo
+    .def foo; .scl 2; .type 32; .endef
+    .seh_proc foo
+foo:
+    subq $8, %rsp
+    .seh_stackalloc 8
+    pushq %rbp
+    .seh_pushreg %rbp
+    .seh_endprologue
+    popq %rbp
+    addq $8, %rsp
+    ret
+    .seh_endproc
+
diff --git a/test/MC/COFF/seh.s b/test/MC/COFF/seh.s
new file mode 100644
index 0000000..8cafcb3
--- /dev/null
+++ b/test/MC/COFF/seh.s
@@ -0,0 +1,60 @@
+// This test checks that the SEH directives emit the correct unwind data.
+// RUN: llvm-mc -triple x86_64-pc-win32 -filetype=obj %s | coff-dump.py | FileCheck %s
+
+// CHECK:      Name                 = .xdata
+// CHECK-NEXT: VirtualSize
+// CHECK-NEXT: VirtualAddress
+// CHECK-NEXT: SizeOfRawData        = 52
+// CHECK-NEXT: PointerToRawData
+// CHECK-NEXT: PointerToRelocations
+// CHECK-NEXT: PointerToLineNumbers
+// CHECK-NEXT: NumberOfRelocations  = 4
+// CHECK-NEXT: NumberOfLineNumbers  = 0
+// CHECK-NEXT: Charateristics
+// CHECK-NEXT:   IMAGE_SCN_CNT_INITIALIZED_DATA
+// CHECK-NEXT:   IMAGE_SCN_ALIGN_4BYTES
+// CHECK-NEXT:   IMAGE_SCN_MEM_READ
+// CHECK-NEXT:   IMAGE_SCN_MEM_WRITE
+// CHECK-NEXT: SectionData
+// CHECK-NEXT:   09 12 08 03 00 03 0F 30 - 0E 88 00 00 09 64 02 00
+// CHECK-NEXT:   04 22 00 1A 00 00 00 00 - 00 00 00 00 21 00 00 00
+// CHECK-NEXT:   00 00 00 00 1B 00 00 00 - 00 00 00 00 01 00 00 00
+// CHECK-NEXT:   00 00 00 00
+
+    .text
+    .globl func
+    .def func; .scl 2; .type 32; .endef
+    .seh_proc func
+func:
+    .seh_pushframe @code
+    subq $24, %rsp
+    .seh_stackalloc 24
+    movq %rsi, 16(%rsp)
+    .seh_savereg %rsi, 16
+    movups %xmm8, (%rsp)
+    .seh_savexmm %xmm8, 0
+    pushq %rbx
+    .seh_pushreg 3
+    mov %rsp, %rbx
+    .seh_setframe 3, 0
+    .seh_endprologue
+    .seh_handler __C_specific_handler, @except
+    .seh_handlerdata
+    .long 0
+    .text
+    .seh_startchained
+    .seh_endprologue
+    .seh_endchained
+    lea (%rbx), %rsp
+    pop %rbx
+    addq $24, %rsp
+    ret
+    .seh_endproc
+
+// Test emission of small functions.
+    .globl smallFunc
+    .def smallFunc; .scl 2; .type 32; .endef
+    .seh_proc smallFunc
+smallFunc:
+    ret
+    .seh_endproc
diff --git a/test/MC/COFF/simple-fixups.s b/test/MC/COFF/simple-fixups.s
index f86f4a9..4c9b4d4 100644
--- a/test/MC/COFF/simple-fixups.s
+++ b/test/MC/COFF/simple-fixups.s
@@ -1,50 +1,50 @@
-// The purpose of this test is to verify that we do not produce unneeded
-// relocations when symbols are in the same section and we know their offset.
-
-// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s | coff-dump.py | FileCheck %s
-// I WOULD RUN, BUT THIS FAILS: llvm-mc -filetype=obj -triple x86_64-pc-win32 %s | coff-dump.py | FileCheck %s
-
-	.def	 _foo;
-	.scl	2;
-	.type	32;
-	.endef
-	.text
-	.globl	_foo
-	.align	16, 0x90
-_foo:                                   # @foo
-# BB#0:                                 # %e
-	.align	16, 0x90
-LBB0_1:                                 # %i
-                                        # =>This Inner Loop Header: Depth=1
-	jmp	LBB0_1
-
-	.def	 _bar;
-	.scl	2;
-	.type	32;
-	.endef
-	.globl	_bar
-	.align	16, 0x90
-_bar:                                   # @bar
-# BB#0:                                 # %e
-	.align	16, 0x90
-LBB1_1:                                 # %i
-                                        # =>This Inner Loop Header: Depth=1
-	jmp	LBB1_1
-
-	.def	 _baz;
-	.scl	2;
-	.type	32;
-	.endef
-	.globl	_baz
-	.align	16, 0x90
-_baz:                                   # @baz
-# BB#0:                                 # %e
-	subl	$4, %esp
-Ltmp0:
-	calll	_baz
-	addl	$4, %esp
-	ret
-
-// CHECK:     Sections = [
-// CHECK-NOT: NumberOfRelocations = {{[^0]}}
-// CHECK:     Symbols = [
+// The purpose of this test is to verify that we do not produce unneeded
+// relocations when symbols are in the same section and we know their offset.
+
+// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s | coff-dump.py | FileCheck %s
+// I WOULD RUN, BUT THIS FAILS: llvm-mc -filetype=obj -triple x86_64-pc-win32 %s | coff-dump.py | FileCheck %s
+
+	.def	 _foo;
+	.scl	2;
+	.type	32;
+	.endef
+	.text
+	.globl	_foo
+	.align	16, 0x90
+_foo:                                   # @foo
+# BB#0:                                 # %e
+	.align	16, 0x90
+LBB0_1:                                 # %i
+                                        # =>This Inner Loop Header: Depth=1
+	jmp	LBB0_1
+
+	.def	 _bar;
+	.scl	2;
+	.type	32;
+	.endef
+	.globl	_bar
+	.align	16, 0x90
+_bar:                                   # @bar
+# BB#0:                                 # %e
+	.align	16, 0x90
+LBB1_1:                                 # %i
+                                        # =>This Inner Loop Header: Depth=1
+	jmp	LBB1_1
+
+	.def	 _baz;
+	.scl	2;
+	.type	32;
+	.endef
+	.globl	_baz
+	.align	16, 0x90
+_baz:                                   # @baz
+# BB#0:                                 # %e
+	subl	$4, %esp
+Ltmp0:
+	calll	_baz
+	addl	$4, %esp
+	ret
+
+// CHECK:     Sections = [
+// CHECK-NOT: NumberOfRelocations = {{[^0]}}
+// CHECK:     Symbols = [
diff --git a/test/MC/COFF/symbol-alias.s b/test/MC/COFF/symbol-alias.s
index ede6b53..03f07b2 100644
--- a/test/MC/COFF/symbol-alias.s
+++ b/test/MC/COFF/symbol-alias.s
@@ -1,62 +1,62 @@
-// The purpose of this test is to verify that symbol aliases
-// (@foo = alias <type> @bar) generate the correct entries in the symbol table.
-// They should be identical except for the name.
-
-// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s | coff-dump.py | FileCheck %s
-// RUN: llvm-mc -filetype=obj -triple x86_64-pc-win32 %s | coff-dump.py | FileCheck %s
-
-	.def	 _foo;
-	.scl	2;
-	.type	32;
-	.endef
-	.text
-	.globl	_foo
-	.align	16, 0x90
-_foo:                                   # @foo
-# BB#0:                                 # %entry
-	ret
-
-	.data
-	.globl	_bar                    # @bar
-	.align	4
-_bar:
-	.long	0                       # 0x0
-
-
-	.globl	_foo_alias
-_foo_alias = _foo
-	.globl	_bar_alias
-_bar_alias = _bar
-
-// CHECK:      Name               = {{_?}}foo
-// CHECK-NEXT: Value              = [[FOO_VALUE:.*$]]
-// CHECK-NEXT: SectionNumber      = [[FOO_SECTION_NUMBER:.*$]]
-// CHECK-NEXT: SimpleType         = [[FOO_SIMPLE_TYPE:.*$]]
-// CHECK-NEXT: ComplexType        = [[FOO_COMPLEX_TYPE:.*$]]
-// CHECK-NEXT: StorageClass       = [[FOO_STORAGE_CLASS:.*$]]
-// CHECK-NEXT: NumberOfAuxSymbols = [[FOO_NUMBER_OF_AUX_SYMBOLS:.*$]]
-
-// CHECK:      Name               = {{_?}}bar
-// CHECK-NEXT: Value              = [[BAR_VALUE:.*$]]
-// CHECK-NEXT: SectionNumber      = [[BAR_SECTION_NUMBER:.*$]]
-// CHECK-NEXT: SimpleType         = [[BAR_SIMPLE_TYPE:.*$]]
-// CHECK-NEXT: ComplexType        = [[BAR_COMPLEX_TYPE:.*$]]
-// CHECK-NEXT: StorageClass       = [[BAR_STORAGE_CLASS:.*$]]
-// CHECK-NEXT: NumberOfAuxSymbols = [[BAR_NUMBER_OF_AUX_SYMBOLS:.*$]]
-
-// CHECK:      Name               = {{_?}}foo_alias
-// CHECK-NEXT: Value              = [[FOO_VALUE]]
-// CHECK-NEXT: SectionNumber      = [[FOO_SECTION_NUMBER]]
-// CHECK-NEXT: SimpleType         = [[FOO_SIMPLE_TYPE]]
-// CHECK-NEXT: ComplexType        = [[FOO_COMPLEX_TYPE]]
-// CHECK-NEXT: StorageClass       = [[FOO_STORAGE_CLASS]]
-// CHECK-NEXT: NumberOfAuxSymbols = [[FOO_NUMBER_OF_AUX_SYMBOLS]]
-
-// CHECK:      Name               = {{_?}}bar_alias
-// CHECK-NEXT: Value              = [[BAR_VALUE]]
-// CHECK-NEXT: SectionNumber      = [[BAR_SECTION_NUMBER]]
-// CHECK-NEXT: SimpleType         = [[BAR_SIMPLE_TYPE]]
-// CHECK-NEXT: ComplexType        = [[BAR_COMPLEX_TYPE]]
-// CHECK-NEXT: StorageClass       = [[BAR_STORAGE_CLASS]]
-// CHECK-NEXT: NumberOfAuxSymbols = [[BAR_NUMBER_OF_AUX_SYMBOLS]]
-
+// The purpose of this test is to verify that symbol aliases
+// (@foo = alias <type> @bar) generate the correct entries in the symbol table.
+// They should be identical except for the name.
+
+// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s | coff-dump.py | FileCheck %s
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-win32 %s | coff-dump.py | FileCheck %s
+
+	.def	 _foo;
+	.scl	2;
+	.type	32;
+	.endef
+	.text
+	.globl	_foo
+	.align	16, 0x90
+_foo:                                   # @foo
+# BB#0:                                 # %entry
+	ret
+
+	.data
+	.globl	_bar                    # @bar
+	.align	4
+_bar:
+	.long	0                       # 0x0
+
+
+	.globl	_foo_alias
+_foo_alias = _foo
+	.globl	_bar_alias
+_bar_alias = _bar
+
+// CHECK:      Name               = {{_?}}foo
+// CHECK-NEXT: Value              = [[FOO_VALUE:.*$]]
+// CHECK-NEXT: SectionNumber      = [[FOO_SECTION_NUMBER:.*$]]
+// CHECK-NEXT: SimpleType         = [[FOO_SIMPLE_TYPE:.*$]]
+// CHECK-NEXT: ComplexType        = [[FOO_COMPLEX_TYPE:.*$]]
+// CHECK-NEXT: StorageClass       = [[FOO_STORAGE_CLASS:.*$]]
+// CHECK-NEXT: NumberOfAuxSymbols = [[FOO_NUMBER_OF_AUX_SYMBOLS:.*$]]
+
+// CHECK:      Name               = {{_?}}bar
+// CHECK-NEXT: Value              = [[BAR_VALUE:.*$]]
+// CHECK-NEXT: SectionNumber      = [[BAR_SECTION_NUMBER:.*$]]
+// CHECK-NEXT: SimpleType         = [[BAR_SIMPLE_TYPE:.*$]]
+// CHECK-NEXT: ComplexType        = [[BAR_COMPLEX_TYPE:.*$]]
+// CHECK-NEXT: StorageClass       = [[BAR_STORAGE_CLASS:.*$]]
+// CHECK-NEXT: NumberOfAuxSymbols = [[BAR_NUMBER_OF_AUX_SYMBOLS:.*$]]
+
+// CHECK:      Name               = {{_?}}foo_alias
+// CHECK-NEXT: Value              = [[FOO_VALUE]]
+// CHECK-NEXT: SectionNumber      = [[FOO_SECTION_NUMBER]]
+// CHECK-NEXT: SimpleType         = [[FOO_SIMPLE_TYPE]]
+// CHECK-NEXT: ComplexType        = [[FOO_COMPLEX_TYPE]]
+// CHECK-NEXT: StorageClass       = [[FOO_STORAGE_CLASS]]
+// CHECK-NEXT: NumberOfAuxSymbols = [[FOO_NUMBER_OF_AUX_SYMBOLS]]
+
+// CHECK:      Name               = {{_?}}bar_alias
+// CHECK-NEXT: Value              = [[BAR_VALUE]]
+// CHECK-NEXT: SectionNumber      = [[BAR_SECTION_NUMBER]]
+// CHECK-NEXT: SimpleType         = [[BAR_SIMPLE_TYPE]]
+// CHECK-NEXT: ComplexType        = [[BAR_COMPLEX_TYPE]]
+// CHECK-NEXT: StorageClass       = [[BAR_STORAGE_CLASS]]
+// CHECK-NEXT: NumberOfAuxSymbols = [[BAR_NUMBER_OF_AUX_SYMBOLS]]
+
diff --git a/test/MC/COFF/symbol-fragment-offset.s b/test/MC/COFF/symbol-fragment-offset.s
index c314ac2..1df8baa 100644
--- a/test/MC/COFF/symbol-fragment-offset.s
+++ b/test/MC/COFF/symbol-fragment-offset.s
@@ -1,187 +1,187 @@
-// The purpose of this test is to see if the COFF object writer is emitting the
-// proper relocations for multiple pieces of data in a single data fragment.
-
-// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s | coff-dump.py | FileCheck %s
-// I WOULD RUN, BUT THIS FAILS: llvm-mc -filetype=obj -triple x86_64-pc-win32 %s
-
-.def	 _main;
-	.scl	2;
-	.type	32;
-	.endef
-	.text
-	.globl	_main
-	.align	16, 0x90
-_main:                                  # @main
-# BB#0:                                 # %entry
-	subl	$4, %esp
-	movl	$L_.str0, (%esp)
-	calll	_printf
-	movl	$L_.str1, (%esp)
-	calll	_puts
-	movl	$L_.str2, (%esp)
-	calll	_puts
-	xorl	%eax, %eax
-	addl	$4, %esp
-	ret
-
-	.data
-L_.str0:                                # @.str0
-	.asciz	 "Hello "
-
-L_.str1:                                # @.str1
-	.asciz	 "World!"
-
-	.align	16                      # @.str2
-L_.str2:
-	.asciz	 "I'm The Last Line."
-
-// CHECK: {
-// CHECK:   MachineType              = IMAGE_FILE_MACHINE_I386 (0x14C)
-// CHECK:   NumberOfSections         = 2
-// CHECK:   TimeDateStamp            = {{[0-9]+}}
-// CHECK:   PointerToSymbolTable     = 0x{{[0-9A-F]+}}
-// CHECK:   NumberOfSymbols          = 7
-// CHECK:   SizeOfOptionalHeader     = 0
-// CHECK:   Characteristics          = 0x0
-// CHECK:   Sections                 = [
-// CHECK:     1 = {
-// CHECK:       Name                     = .text
-// CHECK:       VirtualSize              = 0
-// CHECK:       VirtualAddress           = 0
-// CHECK:       SizeOfRawData            = {{[0-9]+}}
-// CHECK:       PointerToRawData         = 0x{{[0-9A-F]+}}
-// CHECK:       PointerToRelocations     = 0x{{[0-9A-F]+}}
-// CHECK:       PointerToLineNumbers     = 0x0
-// CHECK:       NumberOfRelocations      = 6
-// CHECK:       NumberOfLineNumbers      = 0
-// CHECK:       Charateristics           = 0x60500020
-// CHECK:         IMAGE_SCN_CNT_CODE
-// CHECK:         IMAGE_SCN_ALIGN_16BYTES
-// CHECK:         IMAGE_SCN_MEM_EXECUTE
-// CHECK:         IMAGE_SCN_MEM_READ
-// CHECK:       SectionData              =
-// CHECK:         83 EC 04 C7 04 24 00 00 - 00 00 E8 00 00 00 00 C7 |.....$..........|
-// CHECK:         04 24 07 00 00 00 E8 00 - 00 00 00 C7 04 24 10 00 |.$...........$..|
-// CHECK:         00 00 E8 00 00 00 00 31 - C0 83 C4 04 C3 |.......1.....|
-// CHECK:       Relocations              = [
-// CHECK:         0 = {
-// CHECK:           VirtualAddress           = 0x6
-// CHECK:           SymbolTableIndex         = 2
-// CHECK:           Type                     = IMAGE_REL_I386_DIR32 (6)
-// CHECK:           SymbolName               = .data
-// CHECK:         }
-// CHECK:         1 = {
-// CHECK:           VirtualAddress           = 0xB
-// CHECK:           SymbolTableIndex         = 5
-// CHECK:           Type                     = IMAGE_REL_I386_REL32 (20)
-// CHECK:           SymbolName               = _printf
-// CHECK:         }
-// CHECK:         2 = {
-// CHECK:           VirtualAddress           = 0x12
-// CHECK:           SymbolTableIndex         = 2
-// CHECK:           Type                     = IMAGE_REL_I386_DIR32 (6)
-// CHECK:           SymbolName               = .data
-// CHECK:         }
-// CHECK:         3 = {
-// CHECK:           VirtualAddress           = 0x17
-// CHECK:           SymbolTableIndex         = 6
-// CHECK:           Type                     = IMAGE_REL_I386_REL32 (20)
-// CHECK:           SymbolName               = _puts
-// CHECK:         }
-// CHECK:         4 = {
-// CHECK:           VirtualAddress           = 0x1E
-// CHECK:           SymbolTableIndex         = 2
-// CHECK:           Type                     = IMAGE_REL_I386_DIR32 (6)
-// CHECK:           SymbolName               = .data
-// CHECK:         }
-// CHECK:         5 = {
-// CHECK:           VirtualAddress           = 0x23
-// CHECK:           SymbolTableIndex         = 6
-// CHECK:           Type                     = IMAGE_REL_I386_REL32 (20)
-// CHECK:           SymbolName               = _puts
-// CHECK:         }
-// CHECK:       ]
-// CHECK:     }
-// CHECK:     2 = {
-// CHECK:       Name                     = .data
-// CHECK:       VirtualSize              = 0
-// CHECK:       VirtualAddress           = 0
-// CHECK:       SizeOfRawData            = {{[0-9]+}}
-// CHECK:       PointerToRawData         = 0x{{[0-9A-F]+}}
-// CHECK:       PointerToRelocations     = 0x0
-// CHECK:       PointerToLineNumbers     = 0x0
-// CHECK:       NumberOfRelocations      = 0
-// CHECK:       NumberOfLineNumbers      = 0
-// CHECK:       Charateristics           = 0xC0500040
-// CHECK:         IMAGE_SCN_CNT_INITIALIZED_DATA
-// CHECK:         IMAGE_SCN_ALIGN_16BYTES
-// CHECK:         IMAGE_SCN_MEM_READ
-// CHECK:         IMAGE_SCN_MEM_WRITE
-// CHECK:       SectionData              =
-// CHECK:         48 65 6C 6C 6F 20 00 57 - 6F 72 6C 64 21 00 00 00 |Hello .World!...|
-// CHECK:         49 27 6D 20 54 68 65 20 - 4C 61 73 74 20 4C 69 6E |I'm The Last Lin|
-// CHECK:         65 2E 00                                          |e..|
-// CHECK:       Relocations              = None
-// CHECK:     }
-// CHECK:   ]
-// CHECK:   Symbols                  = [
-// CHECK:     0 = {
-// CHECK:       Name                     = .text
-// CHECK:       Value                    = 0
-// CHECK:       SectionNumber            = 1
-// CHECK:       SimpleType               = IMAGE_SYM_TYPE_NULL (0)
-// CHECK:       ComplexType              = IMAGE_SYM_DTYPE_NULL (0)
-// CHECK:       StorageClass             = IMAGE_SYM_CLASS_STATIC (3)
-// CHECK:       NumberOfAuxSymbols       = 1
-// CHECK:       AuxillaryData            =
-// CHECK:         2D 00 00 00 06 00 00 00 - 00 00 00 00 01 00 00 00 |-...............|
-// CHECK:         00 00                                             |..|
-
-// CHECK:     }
-// CHECK:     2 = {
-// CHECK:       Name                     = .data
-// CHECK:       Value                    = 0
-// CHECK:       SectionNumber            = 2
-// CHECK:       SimpleType               = IMAGE_SYM_TYPE_NULL (0)
-// CHECK:       ComplexType              = IMAGE_SYM_DTYPE_NULL (0)
-// CHECK:       StorageClass             = IMAGE_SYM_CLASS_STATIC (3)
-// CHECK:       NumberOfAuxSymbols       = 1
-// CHECK:       AuxillaryData            =
-// CHECK:         23 00 00 00 00 00 00 00 - 00 00 00 00 02 00 00 00 |#...............|
-// CHECK:         00 00                                             |..|
-
-// CHECK:     }
-// CHECK:     4 = {
-// CHECK:       Name                     = _main
-// CHECK:       Value                    = 0
-// CHECK:       SectionNumber            = 1
-// CHECK:       SimpleType               = IMAGE_SYM_TYPE_NULL (0)
-// CHECK:       ComplexType              = IMAGE_SYM_DTYPE_FUNCTION (2)
-// CHECK:       StorageClass             = IMAGE_SYM_CLASS_EXTERNAL (2)
-// CHECK:       NumberOfAuxSymbols       = 0
-// CHECK:       AuxillaryData            =
-
-// CHECK:     5 = {
-// CHECK:       Name                     = _printf
-// CHECK:       Value                    = 0
-// CHECK:       SectionNumber            = 0
-// CHECK:       SimpleType               = IMAGE_SYM_TYPE_NULL (0)
-// CHECK:       ComplexType              = IMAGE_SYM_DTYPE_NULL (0)
-// CHECK:       StorageClass             = IMAGE_SYM_CLASS_EXTERNAL (2)
-// CHECK:       NumberOfAuxSymbols       = 0
-// CHECK:       AuxillaryData            =
-
-// CHECK:     }
-// CHECK:     6 = {
-// CHECK:       Name                     = _puts
-// CHECK:       Value                    = 0
-// CHECK:       SectionNumber            = 0
-// CHECK:       SimpleType               = IMAGE_SYM_TYPE_NULL (0)
-// CHECK:       ComplexType              = IMAGE_SYM_DTYPE_NULL (0)
-// CHECK:       StorageClass             = IMAGE_SYM_CLASS_EXTERNAL (2)
-// CHECK:       NumberOfAuxSymbols       = 0
-// CHECK:       AuxillaryData            =
-
-// CHECK:     }
-// CHECK:   ]
-// CHECK: }
+// The purpose of this test is to see if the COFF object writer is emitting the
+// proper relocations for multiple pieces of data in a single data fragment.
+
+// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s | coff-dump.py | FileCheck %s
+// I WOULD RUN, BUT THIS FAILS: llvm-mc -filetype=obj -triple x86_64-pc-win32 %s
+
+.def	 _main;
+	.scl	2;
+	.type	32;
+	.endef
+	.text
+	.globl	_main
+	.align	16, 0x90
+_main:                                  # @main
+# BB#0:                                 # %entry
+	subl	$4, %esp
+	movl	$L_.str0, (%esp)
+	calll	_printf
+	movl	$L_.str1, (%esp)
+	calll	_puts
+	movl	$L_.str2, (%esp)
+	calll	_puts
+	xorl	%eax, %eax
+	addl	$4, %esp
+	ret
+
+	.data
+L_.str0:                                # @.str0
+	.asciz	 "Hello "
+
+L_.str1:                                # @.str1
+	.asciz	 "World!"
+
+	.align	16                      # @.str2
+L_.str2:
+	.asciz	 "I'm The Last Line."
+
+// CHECK: {
+// CHECK:   MachineType              = IMAGE_FILE_MACHINE_I386 (0x14C)
+// CHECK:   NumberOfSections         = 2
+// CHECK:   TimeDateStamp            = {{[0-9]+}}
+// CHECK:   PointerToSymbolTable     = 0x{{[0-9A-F]+}}
+// CHECK:   NumberOfSymbols          = 7
+// CHECK:   SizeOfOptionalHeader     = 0
+// CHECK:   Characteristics          = 0x0
+// CHECK:   Sections                 = [
+// CHECK:     1 = {
+// CHECK:       Name                     = .text
+// CHECK:       VirtualSize              = 0
+// CHECK:       VirtualAddress           = 0
+// CHECK:       SizeOfRawData            = {{[0-9]+}}
+// CHECK:       PointerToRawData         = 0x{{[0-9A-F]+}}
+// CHECK:       PointerToRelocations     = 0x{{[0-9A-F]+}}
+// CHECK:       PointerToLineNumbers     = 0x0
+// CHECK:       NumberOfRelocations      = 6
+// CHECK:       NumberOfLineNumbers      = 0
+// CHECK:       Charateristics           = 0x60500020
+// CHECK:         IMAGE_SCN_CNT_CODE
+// CHECK:         IMAGE_SCN_ALIGN_16BYTES
+// CHECK:         IMAGE_SCN_MEM_EXECUTE
+// CHECK:         IMAGE_SCN_MEM_READ
+// CHECK:       SectionData              =
+// CHECK:         83 EC 04 C7 04 24 00 00 - 00 00 E8 00 00 00 00 C7 |.....$..........|
+// CHECK:         04 24 07 00 00 00 E8 00 - 00 00 00 C7 04 24 10 00 |.$...........$..|
+// CHECK:         00 00 E8 00 00 00 00 31 - C0 83 C4 04 C3 |.......1.....|
+// CHECK:       Relocations              = [
+// CHECK:         0 = {
+// CHECK:           VirtualAddress           = 0x6
+// CHECK:           SymbolTableIndex         = 2
+// CHECK:           Type                     = IMAGE_REL_I386_DIR32 (6)
+// CHECK:           SymbolName               = .data
+// CHECK:         }
+// CHECK:         1 = {
+// CHECK:           VirtualAddress           = 0xB
+// CHECK:           SymbolTableIndex         = 5
+// CHECK:           Type                     = IMAGE_REL_I386_REL32 (20)
+// CHECK:           SymbolName               = _printf
+// CHECK:         }
+// CHECK:         2 = {
+// CHECK:           VirtualAddress           = 0x12
+// CHECK:           SymbolTableIndex         = 2
+// CHECK:           Type                     = IMAGE_REL_I386_DIR32 (6)
+// CHECK:           SymbolName               = .data
+// CHECK:         }
+// CHECK:         3 = {
+// CHECK:           VirtualAddress           = 0x17
+// CHECK:           SymbolTableIndex         = 6
+// CHECK:           Type                     = IMAGE_REL_I386_REL32 (20)
+// CHECK:           SymbolName               = _puts
+// CHECK:         }
+// CHECK:         4 = {
+// CHECK:           VirtualAddress           = 0x1E
+// CHECK:           SymbolTableIndex         = 2
+// CHECK:           Type                     = IMAGE_REL_I386_DIR32 (6)
+// CHECK:           SymbolName               = .data
+// CHECK:         }
+// CHECK:         5 = {
+// CHECK:           VirtualAddress           = 0x23
+// CHECK:           SymbolTableIndex         = 6
+// CHECK:           Type                     = IMAGE_REL_I386_REL32 (20)
+// CHECK:           SymbolName               = _puts
+// CHECK:         }
+// CHECK:       ]
+// CHECK:     }
+// CHECK:     2 = {
+// CHECK:       Name                     = .data
+// CHECK:       VirtualSize              = 0
+// CHECK:       VirtualAddress           = 0
+// CHECK:       SizeOfRawData            = {{[0-9]+}}
+// CHECK:       PointerToRawData         = 0x{{[0-9A-F]+}}
+// CHECK:       PointerToRelocations     = 0x0
+// CHECK:       PointerToLineNumbers     = 0x0
+// CHECK:       NumberOfRelocations      = 0
+// CHECK:       NumberOfLineNumbers      = 0
+// CHECK:       Charateristics           = 0xC0500040
+// CHECK:         IMAGE_SCN_CNT_INITIALIZED_DATA
+// CHECK:         IMAGE_SCN_ALIGN_16BYTES
+// CHECK:         IMAGE_SCN_MEM_READ
+// CHECK:         IMAGE_SCN_MEM_WRITE
+// CHECK:       SectionData              =
+// CHECK:         48 65 6C 6C 6F 20 00 57 - 6F 72 6C 64 21 00 00 00 |Hello .World!...|
+// CHECK:         49 27 6D 20 54 68 65 20 - 4C 61 73 74 20 4C 69 6E |I'm The Last Lin|
+// CHECK:         65 2E 00                                          |e..|
+// CHECK:       Relocations              = None
+// CHECK:     }
+// CHECK:   ]
+// CHECK:   Symbols                  = [
+// CHECK:     0 = {
+// CHECK:       Name                     = .text
+// CHECK:       Value                    = 0
+// CHECK:       SectionNumber            = 1
+// CHECK:       SimpleType               = IMAGE_SYM_TYPE_NULL (0)
+// CHECK:       ComplexType              = IMAGE_SYM_DTYPE_NULL (0)
+// CHECK:       StorageClass             = IMAGE_SYM_CLASS_STATIC (3)
+// CHECK:       NumberOfAuxSymbols       = 1
+// CHECK:       AuxillaryData            =
+// CHECK:         2D 00 00 00 06 00 00 00 - 00 00 00 00 01 00 00 00 |-...............|
+// CHECK:         00 00                                             |..|
+
+// CHECK:     }
+// CHECK:     2 = {
+// CHECK:       Name                     = .data
+// CHECK:       Value                    = 0
+// CHECK:       SectionNumber            = 2
+// CHECK:       SimpleType               = IMAGE_SYM_TYPE_NULL (0)
+// CHECK:       ComplexType              = IMAGE_SYM_DTYPE_NULL (0)
+// CHECK:       StorageClass             = IMAGE_SYM_CLASS_STATIC (3)
+// CHECK:       NumberOfAuxSymbols       = 1
+// CHECK:       AuxillaryData            =
+// CHECK:         23 00 00 00 00 00 00 00 - 00 00 00 00 02 00 00 00 |#...............|
+// CHECK:         00 00                                             |..|
+
+// CHECK:     }
+// CHECK:     4 = {
+// CHECK:       Name                     = _main
+// CHECK:       Value                    = 0
+// CHECK:       SectionNumber            = 1
+// CHECK:       SimpleType               = IMAGE_SYM_TYPE_NULL (0)
+// CHECK:       ComplexType              = IMAGE_SYM_DTYPE_FUNCTION (2)
+// CHECK:       StorageClass             = IMAGE_SYM_CLASS_EXTERNAL (2)
+// CHECK:       NumberOfAuxSymbols       = 0
+// CHECK:       AuxillaryData            =
+
+// CHECK:     5 = {
+// CHECK:       Name                     = _printf
+// CHECK:       Value                    = 0
+// CHECK:       SectionNumber            = 0
+// CHECK:       SimpleType               = IMAGE_SYM_TYPE_NULL (0)
+// CHECK:       ComplexType              = IMAGE_SYM_DTYPE_NULL (0)
+// CHECK:       StorageClass             = IMAGE_SYM_CLASS_EXTERNAL (2)
+// CHECK:       NumberOfAuxSymbols       = 0
+// CHECK:       AuxillaryData            =
+
+// CHECK:     }
+// CHECK:     6 = {
+// CHECK:       Name                     = _puts
+// CHECK:       Value                    = 0
+// CHECK:       SectionNumber            = 0
+// CHECK:       SimpleType               = IMAGE_SYM_TYPE_NULL (0)
+// CHECK:       ComplexType              = IMAGE_SYM_DTYPE_NULL (0)
+// CHECK:       StorageClass             = IMAGE_SYM_CLASS_EXTERNAL (2)
+// CHECK:       NumberOfAuxSymbols       = 0
+// CHECK:       AuxillaryData            =
+
+// CHECK:     }
+// CHECK:   ]
+// CHECK: }
diff --git a/test/MC/COFF/weak.s b/test/MC/COFF/weak.s
index a240d71..0f99313 100644
--- a/test/MC/COFF/weak.s
+++ b/test/MC/COFF/weak.s
@@ -1,51 +1,51 @@
-// This tests that default-null weak symbols (a GNU extension) are created
-// properly via the .weak directive.
-
-// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 < %s | coff-dump.py | FileCheck %s
-
-    .def    _main;
-    .scl    2;
-    .type   32;
-    .endef
-    .text
-    .globl  _main
-    .align  16, 0x90
-_main:                                  # @main
-# BB#0:                                 # %entry
-    subl    $4, %esp
-    movl    $_test_weak, %eax
-    testl   %eax, %eax
-    je      LBB0_2
-# BB#1:                                 # %if.then
-    calll   _test_weak
-    movl    $1, %eax
-    addl    $4, %esp
-    ret
-LBB0_2:                                 # %return
-    xorl    %eax, %eax
-    addl    $4, %esp
-    ret
-
-    .weak   _test_weak
-
-// CHECK: Symbols = [
-
-// CHECK:      Name               = _test_weak
-// CHECK-NEXT: Value              = 0
-// CHECK-NEXT: SectionNumber      = 0
-// CHECK-NEXT: SimpleType         = IMAGE_SYM_TYPE_NULL (0)
-// CHECK-NEXT: ComplexType        = IMAGE_SYM_DTYPE_NULL (0)
-// CHECK-NEXT: StorageClass       = IMAGE_SYM_CLASS_WEAK_EXTERNAL (105)
-// CHECK-NEXT: NumberOfAuxSymbols = 1
-// CHECK-NEXT: AuxillaryData      =
-// CHECK-NEXT: 05 00 00 00 02 00 00 00 - 00 00 00 00 00 00 00 00 |................|
-// CHECK-NEXT: 00 00                                             |..|
-
-// CHECK:      Name               = .weak._test_weak.default
-// CHECK-NEXT: Value              = 0
-// CHECK-NEXT: SectionNumber      = 65535
-// CHECK-NEXT: SimpleType         = IMAGE_SYM_TYPE_NULL (0)
-// CHECK-NEXT: ComplexType        = IMAGE_SYM_DTYPE_NULL (0)
-// CHECK-NEXT: StorageClass       = IMAGE_SYM_CLASS_EXTERNAL (2)
-// CHECK-NEXT: NumberOfAuxSymbols = 0
-// CHECK-NEXT: AuxillaryData      =
+// This tests that default-null weak symbols (a GNU extension) are created
+// properly via the .weak directive.
+
+// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 < %s | coff-dump.py | FileCheck %s
+
+    .def    _main;
+    .scl    2;
+    .type   32;
+    .endef
+    .text
+    .globl  _main
+    .align  16, 0x90
+_main:                                  # @main
+# BB#0:                                 # %entry
+    subl    $4, %esp
+    movl    $_test_weak, %eax
+    testl   %eax, %eax
+    je      LBB0_2
+# BB#1:                                 # %if.then
+    calll   _test_weak
+    movl    $1, %eax
+    addl    $4, %esp
+    ret
+LBB0_2:                                 # %return
+    xorl    %eax, %eax
+    addl    $4, %esp
+    ret
+
+    .weak   _test_weak
+
+// CHECK: Symbols = [
+
+// CHECK:      Name               = _test_weak
+// CHECK-NEXT: Value              = 0
+// CHECK-NEXT: SectionNumber      = 0
+// CHECK-NEXT: SimpleType         = IMAGE_SYM_TYPE_NULL (0)
+// CHECK-NEXT: ComplexType        = IMAGE_SYM_DTYPE_NULL (0)
+// CHECK-NEXT: StorageClass       = IMAGE_SYM_CLASS_WEAK_EXTERNAL (105)
+// CHECK-NEXT: NumberOfAuxSymbols = 1
+// CHECK-NEXT: AuxillaryData      =
+// CHECK-NEXT: 05 00 00 00 02 00 00 00 - 00 00 00 00 00 00 00 00 |................|
+// CHECK-NEXT: 00 00                                             |..|
+
+// CHECK:      Name               = .weak._test_weak.default
+// CHECK-NEXT: Value              = 0
+// CHECK-NEXT: SectionNumber      = 65535
+// CHECK-NEXT: SimpleType         = IMAGE_SYM_TYPE_NULL (0)
+// CHECK-NEXT: ComplexType        = IMAGE_SYM_DTYPE_NULL (0)
+// CHECK-NEXT: StorageClass       = IMAGE_SYM_CLASS_EXTERNAL (2)
+// CHECK-NEXT: NumberOfAuxSymbols = 0
+// CHECK-NEXT: AuxillaryData      =
diff --git a/test/MC/Disassembler/ARM/arm-tests.txt b/test/MC/Disassembler/ARM/arm-tests.txt
index 86d4f34..ca072c7 100644
--- a/test/MC/Disassembler/ARM/arm-tests.txt
+++ b/test/MC/Disassembler/ARM/arm-tests.txt
@@ -21,6 +21,12 @@
 # CHECK:	mov	pc, lr
 0x0e 0xf0 0xa0 0xe1
 
+# CHECK:	mov	pc, #255, #2
+0xff 0xf1 0xa0 0xe3
+
+# CHECK:	movw	r7, #4096
+0x00 0x70 0x01 0xe3
+
 # CHECK:	cmn	r0, #1
 0x01 0x00 0x70 0xe3
 
@@ -158,6 +164,9 @@
 # CHECK: bx r12
 0x1c 0xff 0x2f 0xe1
 
+# CHECK: bxeq r5
+0x15 0xff 0x2f 0x01
+
 # CHECK:	uqadd16mi	r6, r11, r8
 0x18 0x60 0x6b 0x46
 
@@ -272,3 +281,9 @@
 
 # CHECK:	uqsax	r5, r6, r7
 0x57 0x5f 0x66 0xe6
+
+# CHECK:	smmlareq	r0, r0, r0, r0
+0x30 0x00 0x50 0x07
+
+# CHECK:	nop
+0x00 0xf0 0x20 0xe3
diff --git a/test/MC/Disassembler/ARM/invalid-Bcc-thumb.txt b/test/MC/Disassembler/ARM/invalid-Bcc-thumb.txt
new file mode 100644
index 0000000..66c43c2
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-Bcc-thumb.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+
+# Opcode=2249 Name=tBcc Format=ARM_FORMAT_THUMBFRM(25)
+#  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
+# -------------------------------------------------------------------------------------------------
+# | 0: 0: 0: 0| 0: 0: 0: 0| 0: 0: 0: 0| 0: 0: 0: 0| 1: 1: 0: 1| 1: 1: 1: 0| 0: 1: 1: 0| 1: 1: 1: 1|
+# -------------------------------------------------------------------------------------------------
+# 
+# if cond = '1110' then UNDEFINED
+0x6f 0xde
diff --git a/test/MC/Disassembler/ARM/invalid-DMB-thumb.txt b/test/MC/Disassembler/ARM/invalid-DMB-thumb.txt
new file mode 100644
index 0000000..0a4be68
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-DMB-thumb.txt
@@ -0,0 +1,16 @@
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+
+# Opcode=1908 Name=t2DMB Format=ARM_FORMAT_THUMBFRM(25)
+#  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
+# -------------------------------------------------------------------------------------------------
+# | 1: 1: 1: 1| 0: 0: 1: 1| 1: 0: 1: 1| 1: 1: 1: 1| 1: 0: 0: 0| 1: 1: 1: 1| 0: 1: 0: 1| 0: 0: 0: 1|
+# -------------------------------------------------------------------------------------------------
+# 
+# Inst{3-0} encodes the option: SY, ST, ISH, ISHST, NSH, NSHST, OSH, OSHST.
+# Reject invalid encodings.
+#
+# See also A8.6.42 DSB
+# All other encodings of option are reserved. It is IMPLEMENTATION DEFINED whether options
+# other than SY are implemented. All unsupported and reserved options must execute as a full
+# system DSB operation, but software must not rely on this behavior.
+0xbf 0xf3 0x51 0x8f
diff --git a/test/MC/Disassembler/ARM/invalid-DSB-arm.txt b/test/MC/Disassembler/ARM/invalid-DSB-arm.txt
new file mode 100644
index 0000000..afa2baf
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-DSB-arm.txt
@@ -0,0 +1,16 @@
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+
+# Opcode=102 Name=DSB Format=ARM_FORMAT_MISCFRM(26)
+#  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
+# -------------------------------------------------------------------------------------------------
+# | 1: 1: 1: 1| 0: 1: 0: 1| 0: 1: 1: 1| 1: 1: 1: 1| 1: 1: 1: 1| 0: 0: 0: 0| 0: 1: 0: 0| 0: 0: 0: 0|
+# -------------------------------------------------------------------------------------------------
+# 
+# Inst{3-0} encodes the option: SY, ST, ISH, ISHST, NSH, NSHST, OSH, OSHST.
+# Reject invalid encodings.
+#
+# See also A8.6.42 DSB
+# All other encodings of option are reserved. It is IMPLEMENTATION DEFINED whether options
+# other than SY are implemented. All unsupported and reserved options must execute as a full
+# system DSB operation, but software must not rely on this behavior.
+0x40 0xf0 0x7f 0xf5
diff --git a/test/MC/Disassembler/ARM/invalid-LDRB_POST-arm.txt b/test/MC/Disassembler/ARM/invalid-LDRB_POST-arm.txt
new file mode 100644
index 0000000..7a35c2d
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-LDRB_POST-arm.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+
+# Opcode=140 Name=LDRB_POST Format=ARM_FORMAT_LDFRM(6)
+#  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
+# -------------------------------------------------------------------------------------------------
+# | 1: 1: 1: 0| 0: 1: 1: 0| 1: 1: 0: 1| 0: 1: 1: 1| 0: 1: 1: 1| 0: 0: 0: 0| 0: 0: 0: 0| 0: 1: 0: 1|
+# -------------------------------------------------------------------------------------------------
+# 
+# if wback && (n == 15 || n == t) then UNPREDICTABLE
+0x05 0x70 0xd7 0xe6
diff --git a/test/MC/Disassembler/ARM/invalid-LDRD_PRE-thumb.txt b/test/MC/Disassembler/ARM/invalid-LDRD_PRE-thumb.txt
new file mode 100644
index 0000000..da2e6be
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-LDRD_PRE-thumb.txt
@@ -0,0 +1,13 @@
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+
+# Opcode=1930 Name=t2LDRD_PRE Format=ARM_FORMAT_THUMBFRM(25)
+#  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
+# -------------------------------------------------------------------------------------------------
+# | 1: 1: 1: 0| 1: 0: 0: 1| 1: 1: 1: 1| 1: 1: 1: 1| 1: 1: 1: 0| 1: 0: 1: 1| 0: 0: 0: 0| 0: 0: 0: 0|
+# -------------------------------------------------------------------------------------------------
+# 
+# A8.6.66 LDRD (immediate)
+# if Rn = '1111' then SEE LDRD (literal)
+# A8.6.67 LDRD (literal)
+# Inst{21} = 0
+0xff 0xe9 0x0 0xeb
diff --git a/test/MC/Disassembler/ARM/invalid-LDR_POST-arm.txt b/test/MC/Disassembler/ARM/invalid-LDR_POST-arm.txt
new file mode 100644
index 0000000..ad79986
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-LDR_POST-arm.txt
@@ -0,0 +1,4 @@
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+
+# LDR_PRE/POST has encoding Inst{4} = 0.
+0xde 0x69 0x18 0x46
diff --git a/test/MC/Disassembler/ARM/invalid-LDR_PRE-arm.txt b/test/MC/Disassembler/ARM/invalid-LDR_PRE-arm.txt
new file mode 100644
index 0000000..36c1124
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-LDR_PRE-arm.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+
+# Opcode=165 Name=LDR_PRE Format=ARM_FORMAT_LDFRM(6)
+#  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
+# -------------------------------------------------------------------------------------------------
+# | 1: 1: 1: 0| 0: 1: 1: 1| 1: 0: 1: 1| 0: 1: 1: 1| 0: 1: 1: 0| 0: 0: 0: 0| 1: 0: 0: 0| 1: 1: 1: 1|
+# -------------------------------------------------------------------------------------------------
+# 
+# if m == 15 then UNPREDICTABLE
+0x8f 0x60 0xb7 0xe7
diff --git a/test/MC/Disassembler/ARM/invalid-MOVTi16-arm.txt b/test/MC/Disassembler/ARM/invalid-MOVTi16-arm.txt
new file mode 100644
index 0000000..0b8a077
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-MOVTi16-arm.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+
+# Opcode=185 Name=MOVTi16 Format=ARM_FORMAT_DPFRM(4)
+#  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
+# -------------------------------------------------------------------------------------------------
+# | 1: 1: 1: 0| 0: 0: 1: 1| 0: 1: 0: 0| 0: 0: 0: 1| 1: 1: 1: 1| 0: 0: 0: 0| 0: 0: 0: 0| 0: 0: 0: 0|
+# -------------------------------------------------------------------------------------------------
+# 
+# if d == 15 then UNPREDICTABLE
+0x00 0xf0 0x41 0xe3
diff --git a/test/MC/Disassembler/ARM/invalid-STMIA_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-STMIA_UPD-thumb.txt
new file mode 100644
index 0000000..0000c60
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-STMIA_UPD-thumb.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+
+# Opcode=2313 Name=tSTMIA_UPD Format=ARM_FORMAT_THUMBFRM(25)
+#  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
+# -------------------------------------------------------------------------------------------------
+# | 0: 0: 0: 0| 0: 0: 0: 0| 0: 0: 0: 0| 0: 0: 0: 0| 1: 1: 0: 0| 0: 1: 1: 1| 0: 0: 0: 0| 0: 0: 0: 0|
+# -------------------------------------------------------------------------------------------------
+# 
+# if BitCount(registers) < 1 then UNPREDICTABLE
+0x00 0xc7
diff --git a/test/MC/Disassembler/ARM/invalid-STRBrs-arm.txt b/test/MC/Disassembler/ARM/invalid-STRBrs-arm.txt
new file mode 100644
index 0000000..5209323
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-STRBrs-arm.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+
+# Opcode=355 Name=STRBrs Format=ARM_FORMAT_STFRM(7)
+#  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
+# -------------------------------------------------------------------------------------------------
+# | 1: 1: 1: 0| 0: 1: 1: 1| 1: 1: 0: 0| 1: 1: 1: 1| 1: 1: 1: 1| 0: 0: 0: 0| 0: 0: 0: 0| 0: 0: 0: 0|
+# -------------------------------------------------------------------------------------------------
+# 
+# if t == 15 then UNPREDICTABLE
+0x00 0xf0 0xcf 0xe7
diff --git a/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt b/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt
new file mode 100644
index 0000000..56d9ad7
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc --disassemble %s -triple=arm-apple-darwin9 |& grep {invalid instruction encoding}
+
+# Opcode=737 Name=VLD1DUPq8_UPD Format=ARM_FORMAT_NLdSt(30)
+#  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
+# -------------------------------------------------------------------------------------------------
+# | 1: 1: 1: 1| 0: 1: 0: 0| 1: 0: 1: 0| 0: 0: 0: 0| 0: 0: 1: 1| 1: 1: 0: 0| 0: 0: 1: 1| 1: 1: 0: 1|
+# -------------------------------------------------------------------------------------------------
+# 
+# 'a' == 1 and data_size == 8 is invalid
+0x3d 0x3c 0xa0 0xf4
diff --git a/test/MC/Disassembler/ARM/invalid-VLD3DUPd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VLD3DUPd32_UPD-thumb.txt
new file mode 100644
index 0000000..5fd0251
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-VLD3DUPd32_UPD-thumb.txt
@@ -0,0 +1,11 @@
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+
+# Opcode=871 Name=VLD3DUPd32_UPD Format=ARM_FORMAT_NLdSt(30)
+#  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
+# -------------------------------------------------------------------------------------------------
+# | 1: 1: 1: 1| 0: 1: 0: 0| 1: 0: 1: 0| 0: 0: 1: 0| 0: 0: 1: 0| 1: 1: 1: 0| 1: 0: 0: 1| 0: 0: 1: 0|
+# -------------------------------------------------------------------------------------------------
+# 
+# A8.6.315 VLD3 (single 3-element structure to all lanes)
+# The a bit must be encoded as 0.
+0xa2 0xf9 0x92 0x2e
diff --git a/test/MC/Disassembler/ARM/invalid-t2Bcc-thumb.txt b/test/MC/Disassembler/ARM/invalid-t2Bcc-thumb.txt
new file mode 100644
index 0000000..d0bc51e
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-t2Bcc-thumb.txt
@@ -0,0 +1,11 @@
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+
+# Opcode=1894 Name=t2Bcc Format=ARM_FORMAT_THUMBFRM(25)
+#  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
+# -------------------------------------------------------------------------------------------------
+# | 1: 1: 1: 1| 0: 1: 1: 1| 1: 0: 1: 0| 1: 1: 1: 1| 1: 0: 0: 0| 1: 0: 1: 1| 0: 1: 0: 0| 0: 1: 0: 0|
+# -------------------------------------------------------------------------------------------------
+# 
+# A8.6.16 B
+# if cond<3:1> == '111' then SEE "Related Encodings"
+0xaf 0xf7 0x44 0x8b
diff --git a/test/MC/Disassembler/ARM/invalid-t2LDRBT-thumb.txt b/test/MC/Disassembler/ARM/invalid-t2LDRBT-thumb.txt
new file mode 100644
index 0000000..9befbd6
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-t2LDRBT-thumb.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+
+# Opcode=1922 Name=t2LDRBT Format=ARM_FORMAT_THUMBFRM(25)
+#  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
+# -------------------------------------------------------------------------------------------------
+# | 1: 1: 1: 1| 1: 0: 0: 0| 0: 0: 0: 1| 0: 0: 0: 0| 1: 1: 1: 1| 1: 1: 1: 0| 0: 0: 0: 0| 0: 0: 1: 1|
+# -------------------------------------------------------------------------------------------------
+# 
+# The unpriviledged Load/Store cannot have SP or PC as Rt.
+0x10 0xf8 0x3 0xfe
diff --git a/test/MC/Disassembler/ARM/invalid-t2LDREXD-thumb.txt b/test/MC/Disassembler/ARM/invalid-t2LDREXD-thumb.txt
new file mode 100644
index 0000000..598efd1
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-t2LDREXD-thumb.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+
+# Opcode=1934 Name=t2LDREXD Format=ARM_FORMAT_THUMBFRM(25)
+#  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
+# -------------------------------------------------------------------------------------------------
+# | 1: 1: 1: 0| 1: 0: 0: 0| 1: 1: 0: 1| 0: 0: 1: 0| 1: 0: 0: 0| 1: 0: 0: 0| 0: 1: 1: 1| 1: 1: 1: 1|
+# -------------------------------------------------------------------------------------------------
+# 
+# if t == t2 then UNPREDICTABLE
+0xd2 0xe8 0x7f 0x88
diff --git a/test/MC/Disassembler/ARM/invalid-t2LDRSHi12-thumb.txt b/test/MC/Disassembler/ARM/invalid-t2LDRSHi12-thumb.txt
new file mode 100644
index 0000000..a501eb9
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-t2LDRSHi12-thumb.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+
+# Opcode=1953 Name=t2LDRSHi12 Format=ARM_FORMAT_THUMBFRM(25)
+#  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
+# -------------------------------------------------------------------------------------------------
+# | 1: 1: 1: 1| 1: 0: 0: 1| 1: 0: 1: 1| 0: 0: 1: 1| 1: 1: 1: 1| 1: 0: 0: 0| 1: 1: 0: 1| 1: 1: 1: 1|
+# -------------------------------------------------------------------------------------------------
+# 
+# if Rt = '1111' then SEE "Unallocated memory hints"
+0xb3 0xf9 0xdf 0xf8
diff --git a/test/MC/Disassembler/ARM/invalid-t2LDRSHi8-thumb.txt b/test/MC/Disassembler/ARM/invalid-t2LDRSHi8-thumb.txt
new file mode 100644
index 0000000..f886a6f
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-t2LDRSHi8-thumb.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+
+# Opcode=1954 Name=t2LDRSHi8 Format=ARM_FORMAT_THUMBFRM(25)
+#  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
+# -------------------------------------------------------------------------------------------------
+# | 1: 1: 1: 1| 1: 0: 0: 1| 0: 0: 1: 1| 0: 1: 0: 1| 1: 1: 1: 1| 1: 1: 0: 0| 0: 0: 0: 0| 0: 0: 0: 0|
+# -------------------------------------------------------------------------------------------------
+# 
+# if Rt == '1111' and PUW == '100' then SEE "Unallocated memory hints"
+0x35 0xf9 0x00 0xfc
diff --git a/test/MC/Disassembler/ARM/invalid-t2STRD_PRE-thumb.txt b/test/MC/Disassembler/ARM/invalid-t2STRD_PRE-thumb.txt
new file mode 100644
index 0000000..c8f8ec2
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-t2STRD_PRE-thumb.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+
+# Opcode=2124 Name=t2STRD_PRE Format=ARM_FORMAT_THUMBFRM(25)
+#  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
+# -------------------------------------------------------------------------------------------------
+# | 1: 1: 1: 0| 1: 0: 0: 1| 1: 1: 1: 0| 0: 1: 0: 0| 0: 1: 0: 0| 0: 1: 1: 0| 0: 0: 0: 0| 0: 0: 1: 0|
+# -------------------------------------------------------------------------------------------------
+# 
+# if wback && (n == t || n == t2) then UNPREDICTABLE
+0xe4 0xe9 0x02 0x46
diff --git a/test/MC/Disassembler/ARM/invalid-t2STREXB-thumb.txt b/test/MC/Disassembler/ARM/invalid-t2STREXB-thumb.txt
new file mode 100644
index 0000000..35ea651
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-t2STREXB-thumb.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+
+# Opcode=2127 Name=t2STREXB Format=ARM_FORMAT_THUMBFRM(25)
+#  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
+# -------------------------------------------------------------------------------------------------
+# | 1: 1: 1: 0| 1: 0: 0: 0| 1: 1: 0: 0| 0: 0: 1: 0| 1: 0: 0: 0| 1: 1: 1: 1| 0: 1: 0: 0| 0: 0: 1: 0|
+# -------------------------------------------------------------------------------------------------
+# 
+# if d == n || d == t then UNPREDICTABLE
+0xc2 0xe8 0x42 0x8f
diff --git a/test/MC/Disassembler/ARM/invalid-t2STREXD-thumb.txt b/test/MC/Disassembler/ARM/invalid-t2STREXD-thumb.txt
new file mode 100644
index 0000000..9b0cf24
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-t2STREXD-thumb.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+
+# Opcode=2128 Name=t2STREXD Format=ARM_FORMAT_THUMBFRM(25)
+#  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
+# -------------------------------------------------------------------------------------------------
+# | 1: 1: 1: 0| 1: 0: 0: 0| 1: 1: 0: 0| 0: 0: 1: 0| 0: 1: 1: 1| 1: 0: 0: 0| 0: 1: 1: 1| 1: 0: 0: 0|
+# -------------------------------------------------------------------------------------------------
+# 
+# if d == n || d == t || d == t2 then UNPREDICTABLE
+mc-input.txt:1:1: warning: invalid instruction encoding
diff --git a/test/MC/Disassembler/ARM/invalid-t2STR_POST-thumb.txt b/test/MC/Disassembler/ARM/invalid-t2STR_POST-thumb.txt
new file mode 100644
index 0000000..129a270
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-t2STR_POST-thumb.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc --disassemble %s -triple=thumb-apple-darwin9 |& grep {invalid instruction encoding}
+
+# Opcode=2137 Name=t2STR_POST Format=ARM_FORMAT_THUMBFRM(25)
+#  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 
+# -------------------------------------------------------------------------------------------------
+# | 1: 1: 1: 1| 1: 0: 0: 0| 0: 1: 0: 0| 1: 1: 1: 1| 1: 1: 1: 0| 1: 0: 1: 1| 1: 1: 1: 1| 1: 1: 1: 1|
+# -------------------------------------------------------------------------------------------------
+# 
+# if Rn == '1111' then UNDEFINED
+0x4f 0xf8 0xff 0xeb
diff --git a/test/MC/Disassembler/ARM/neon-tests.txt b/test/MC/Disassembler/ARM/neon-tests.txt
index 964459f..cfb5949 100644
--- a/test/MC/Disassembler/ARM/neon-tests.txt
+++ b/test/MC/Disassembler/ARM/neon-tests.txt
@@ -21,6 +21,12 @@
 # CHECK:	vld4.8	{d4, d6, d8, d10}, [r2]
 0x0f 0x41 0x22 0xf4
 
+# CHECK:	vld1.32	{d3[], d4[]}, [r0, :32]!
+0xbd 0x3c 0xa0 0xf4
+
+# CHECK:	vld4.16	{d3[], d4[], d5[], d6[]}, [r0, :64]!
+0x7d 0x3f 0xa0 0xf4
+
 # CHECK:	vmov	d0, d15
 0x1f 0x01 0x2f 0xf2
 
diff --git a/test/MC/Disassembler/ARM/thumb-printf.txt b/test/MC/Disassembler/ARM/thumb-printf.txt
index 09f54ab..6c2c500 100644
--- a/test/MC/Disassembler/ARM/thumb-printf.txt
+++ b/test/MC/Disassembler/ARM/thumb-printf.txt
@@ -7,17 +7,17 @@
 # CHECK-NEXT:	add	r3, sp, #20
 # CHECK-NEXT:	ldr	r5, [r3], #4
 # CHECK-NEXT:	str	r3, [sp]
-# CHECK-NEXT:	ldr.n	r3, #52
+# CHECK-NEXT:	ldr	r3, #52
 # CHECK-NEXT:	add	r3, pc
 # CHECK-NEXT:	ldr	r0, [r3]
 # CHECK-NEXT:	ldr	r4, [r0]
-# CHECK-NEXT:	ldr.n	r0, #48
+# CHECK-NEXT:	ldr	r0, #48
 # CHECK-NEXT:	add	r0, pc
 # CHECK-NEXT:	ldr	r0, [r0]
 # CHECK-NEXT:	ldr	r0, [r0]
 # CHECK-NEXT:	blx	#191548
 # CHECK-NEXT:	cbnz	r0, #6
-# CHECK-NEXT:	ldr.n	r1, #40
+# CHECK-NEXT:	ldr	r1, #40
 # CHECK-NEXT:	add	r1, pc
 # CHECK-NEXT:	ldr	r1, [r1]
 # CHECK-NEXT:	b	#0
diff --git a/test/MC/Disassembler/ARM/thumb-tests.txt b/test/MC/Disassembler/ARM/thumb-tests.txt
index 358ecf0..895a5bb 100644
--- a/test/MC/Disassembler/ARM/thumb-tests.txt
+++ b/test/MC/Disassembler/ARM/thumb-tests.txt
@@ -9,8 +9,11 @@
 # CHECK:	b	#30
 0x0f 0xe0
 
-# CHECK:	b.w	#-16
-0xff 0xf7 0xf8 0xaf
+# CHECK:	bgt.w	#-16
+0x3f 0xf7 0xf8 0xaf
+
+# CHECK:	bfc	r0, #10, #10
+0x6f 0xf3 0x93 0x20
 
 # CHECK:	bfi	r2, r10, #0, #1
 0x6a 0xf3 0x00 0x02
@@ -27,6 +30,9 @@
 # CHECK:	ldmia	r0!, {r1}
 0x02 0xc8
 
+# CHECK:	ldr	r5, #432
+0x6c 0x4d
+
 # CHECK:	str	r0, [r3]
 0x18 0x60
 
@@ -158,6 +164,9 @@
 # CHECK:	ldrex	r8, [r2]
 0x52 0xe8 0x00 0x8f
 
+# CHECK:	ldrexd	r8, r9, [r2]
+0xd2 0xe8 0x7f 0x89
+
 # CHECK:	strexd	r1, r7, r8, [r2]
 0xc2 0xe8 0x71 0x78
 
@@ -171,7 +180,16 @@
 0x5d 0xf8 0x34 0x40
 
 # CHECK:	ldr.w	r5, [r6, #30]
-0x56 0xf8 0x1e 0x56
+0xd6 0xf8 0x1e 0x50
+
+# CHECK:	ldrh.w	r5, [r6, #30]
+0xb6 0xf8 0x1e 0x50
+
+# CHECK:	ldrt	r5, [r6, #30]
+0x56 0xf8 0x1e 0x5e
+
+# CHECK:	ldr	r5, [r6, #-30]
+0x56 0xf8 0x1e 0x5c
 
 # CHECK:	sel	r7, r3, r5
 0xa3 0xfa 0x85 0xf7
@@ -197,8 +215,53 @@
 # CHECK:	pld	[pc, #-16]
 0x1f 0xf8 0x10 0xf0
 
+# CHECK:	pld	[r5, #30]
+0x95 0xf8 0x1e 0xf0
+
 # CHECK:	stc2	p12, cr15, [r9], {137}
 0x89 0xfc 0x89 0xfc
 
 # CHECK:	vmov	r1, r0, d11
 0x50 0xec 0x1b 0x1b
+
+# CHECK:	dsb	nsh
+0xbf 0xf3 0x47 0x8f
+
+# CHECK:	isb
+0xbf 0xf3 0x6f 0x8f
+
+# CHECK:	asrs	r1, r0, #32
+0x1 0x10
+
+# CHECK:	lsr.w	r10, r0, #32
+0x4f 0xea 0x10 0x0a
+
+# CHECK:	blx	sp
+0xe8 0x47
+
+# CHECK:	bx	lr
+0x70 0x47
+
+# CHECK:	bx	pc
+0x78 0x47
+
+# CHECK:	svc	#230
+0xe6 0xdf
+
+# CHECK:	rfedb	lr
+0x1e 0xe8 0x00 0xc0
+
+# CHECK:	mov.w	r3, #4294967295
+0x4f 0xf0 0xff 0x33
+
+# CHECK:	mov	pc, sp
+0xef 0x46
+
+# CHECK:	nop
+0x00 0xbf
+
+# CHECK:	nop.w
+0xaf 0xf3 0x00 0x80
+
+# CHECK:	bne	#24
+0x0c 0xd1
diff --git a/test/MC/ELF/cfi-adjust-cfa-offset.s b/test/MC/ELF/cfi-adjust-cfa-offset.s
new file mode 100644
index 0000000..5c1a9f9
--- /dev/null
+++ b/test/MC/ELF/cfi-adjust-cfa-offset.s
@@ -0,0 +1,46 @@
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | elf-dump  --dump-section-data | FileCheck %s
+
+f:
+	.cfi_startproc
+	subq	$8, %rsp
+	.cfi_def_cfa_offset 16
+        nop
+        .cfi_adjust_cfa_offset 4
+	addq	$8, %rsp
+	.cfi_def_cfa_offset 8
+	ret
+	.cfi_endproc
+
+// CHECK:       # Section 0x00000004
+// CHECK-NEXT:  (('sh_name', 0x00000011) # '.eh_frame'
+// CHECK-NEXT:   ('sh_type', 0x00000001)
+// CHECK-NEXT:   ('sh_flags', 0x00000002)
+// CHECK-NEXT:   ('sh_addr', 0x00000000)
+// CHECK-NEXT:   ('sh_offset', 0x00000050)
+// CHECK-NEXT:   ('sh_size', 0x00000038)
+// CHECK-NEXT:   ('sh_link', 0x00000000)
+// CHECK-NEXT:   ('sh_info', 0x00000000)
+// CHECK-NEXT:   ('sh_addralign', 0x00000008)
+// CHECK-NEXT:   ('sh_entsize', 0x00000000)
+// CHECK-NEXT:   ('_section_data', '14000000 00000000 017a5200 01781001 1b0c0708 90010000 1c000000 1c000000 00000000 0a000000 00440e10 410e1444 0e080000 00000000')
+// CHECK-NEXT:  ),
+// CHECK-NEXT:  # Section 0x00000005
+// CHECK-NEXT:  (('sh_name', 0x0000000c) # '.rela.eh_frame'
+// CHECK-NEXT:   ('sh_type', 0x00000004)
+// CHECK-NEXT:   ('sh_flags', 0x00000000)
+// CHECK-NEXT:   ('sh_addr', 0x00000000)
+// CHECK-NEXT:   ('sh_offset', 0x000003a0)
+// CHECK-NEXT:   ('sh_size', 0x00000018)
+// CHECK-NEXT:   ('sh_link', 0x00000007)
+// CHECK-NEXT:   ('sh_info', 0x00000004)
+// CHECK-NEXT:   ('sh_addralign', 0x00000008)
+// CHECK-NEXT:   ('sh_entsize', 0x00000018)
+// CHECK-NEXT:   ('_relocations', [
+// CHECK-NEXT:    # Relocation 0x00000000
+// CHECK-NEXT:    (('r_offset', 0x00000020)
+// CHECK-NEXT:     ('r_sym', 0x00000002)
+// CHECK-NEXT:     ('r_type', 0x00000002)
+// CHECK-NEXT:     ('r_addend', 0x00000000)
+// CHECK-NEXT:    ),
+// CHECK-NEXT:   ])
+// CHECK-NEXT:  ),
diff --git a/test/MC/ELF/cfi-offset.s b/test/MC/ELF/cfi-offset.s
index 963a76c..f54dec0 100644
--- a/test/MC/ELF/cfi-offset.s
+++ b/test/MC/ELF/cfi-offset.s
@@ -3,7 +3,7 @@
 f:
 	.cfi_startproc
         nop
-	.cfi_offset %ebp, -16
+	.cfi_offset %rbp, -16
         nop
 	.cfi_endproc
 
diff --git a/test/MC/ELF/cfi-rel-offset.s b/test/MC/ELF/cfi-rel-offset.s
new file mode 100644
index 0000000..87c0cf3
--- /dev/null
+++ b/test/MC/ELF/cfi-rel-offset.s
@@ -0,0 +1,49 @@
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | elf-dump  --dump-section-data | FileCheck %s
+
+f:
+	.cfi_startproc
+        nop
+        .cfi_def_cfa_offset 8
+        nop
+        .cfi_def_cfa_register 6
+        nop
+        .cfi_rel_offset 6,16
+        nop
+        .cfi_def_cfa_offset 16
+        nop
+        .cfi_rel_offset 6,0
+	.cfi_endproc
+
+// CHECK:       # Section 0x00000004
+// CHECK-NEXT:  (('sh_name', 0x00000011) # '.eh_frame'
+// CHECK-NEXT:   ('sh_type', 0x00000001)
+// CHECK-NEXT:   ('sh_flags', 0x00000002)
+// CHECK-NEXT:   ('sh_addr', 0x00000000)
+// CHECK-NEXT:   ('sh_offset', 0x00000048)
+// CHECK-NEXT:   ('sh_size', 0x00000040)
+// CHECK-NEXT:   ('sh_link', 0x00000000)
+// CHECK-NEXT:   ('sh_info', 0x00000000)
+// CHECK-NEXT:   ('sh_addralign', 0x00000008)
+// CHECK-NEXT:   ('sh_entsize', 0x00000000)
+// CHECK-NEXT:   ('_section_data', '14000000 00000000 017a5200 01781001 1b0c0708 90010000 24000000 1c000000 00000000 05000000 00410e08 410d0641 11067f41 0e104186 02000000 00000000')
+// CHECK-NEXT:  ),
+// CHECK-NEXT:  # Section 0x00000005
+// CHECK-NEXT:  (('sh_name', 0x0000000c) # '.rela.eh_frame'
+// CHECK-NEXT:   ('sh_type', 0x00000004)
+// CHECK-NEXT:   ('sh_flags', 0x00000000)
+// CHECK-NEXT:   ('sh_addr', 0x00000000)
+// CHECK-NEXT:   ('sh_offset', 0x000003a0)
+// CHECK-NEXT:   ('sh_size', 0x00000018)
+// CHECK-NEXT:   ('sh_link', 0x00000007)
+// CHECK-NEXT:   ('sh_info', 0x00000004)
+// CHECK-NEXT:   ('sh_addralign', 0x00000008)
+// CHECK-NEXT:   ('sh_entsize', 0x00000018)
+// CHECK-NEXT:   ('_relocations', [
+// CHECK-NEXT:    # Relocation 0x00000000
+// CHECK-NEXT:    (('r_offset', 0x00000020)
+// CHECK-NEXT:     ('r_sym', 0x00000002)
+// CHECK-NEXT:     ('r_type', 0x00000002)
+// CHECK-NEXT:     ('r_addend', 0x00000000)
+// CHECK-NEXT:    ),
+// CHECK-NEXT:   ])
+// CHECK-NEXT:  ),
diff --git a/test/MC/ELF/cfi-rel-offset2.s b/test/MC/ELF/cfi-rel-offset2.s
new file mode 100644
index 0000000..f14beaf
--- /dev/null
+++ b/test/MC/ELF/cfi-rel-offset2.s
@@ -0,0 +1,41 @@
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | elf-dump  --dump-section-data | FileCheck %s
+
+f:
+	.cfi_startproc
+        nop
+        .cfi_rel_offset 6,16
+	.cfi_endproc
+
+// CHECK:       # Section 0x00000004
+// CHECK-NEXT:  (('sh_name', 0x00000011) # '.eh_frame'
+// CHECK-NEXT:   ('sh_type', 0x00000001)
+// CHECK-NEXT:   ('sh_flags', 0x00000002)
+// CHECK-NEXT:   ('sh_addr', 0x00000000)
+// CHECK-NEXT:   ('sh_offset', 0x00000048)
+// CHECK-NEXT:   ('sh_size', 0x00000030)
+// CHECK-NEXT:   ('sh_link', 0x00000000)
+// CHECK-NEXT:   ('sh_info', 0x00000000)
+// CHECK-NEXT:   ('sh_addralign', 0x00000008)
+// CHECK-NEXT:   ('sh_entsize', 0x00000000)
+// CHECK-NEXT:   ('_section_data', '14000000 00000000 017a5200 01781001 1b0c0708 90010000 14000000 1c000000 00000000 01000000 00411106 7f000000')
+// CHECK-NEXT:  ),
+// CHECK-NEXT:  # Section 0x00000005
+// CHECK-NEXT:  (('sh_name', 0x0000000c) # '.rela.eh_frame'
+// CHECK-NEXT:   ('sh_type', 0x00000004)
+// CHECK-NEXT:   ('sh_flags', 0x00000000)
+// CHECK-NEXT:   ('sh_addr', 0x00000000)
+// CHECK-NEXT:   ('sh_offset', 0x00000390)
+// CHECK-NEXT:   ('sh_size', 0x00000018)
+// CHECK-NEXT:   ('sh_link', 0x00000007)
+// CHECK-NEXT:   ('sh_info', 0x00000004)
+// CHECK-NEXT:   ('sh_addralign', 0x00000008)
+// CHECK-NEXT:   ('sh_entsize', 0x00000018)
+// CHECK-NEXT:   ('_relocations', [
+// CHECK-NEXT:    # Relocation 0x00000000
+// CHECK-NEXT:    (('r_offset', 0x00000020)
+// CHECK-NEXT:     ('r_sym', 0x00000002)
+// CHECK-NEXT:     ('r_type', 0x00000002)
+// CHECK-NEXT:     ('r_addend', 0x00000000)
+// CHECK-NEXT:    ),
+// CHECK-NEXT:   ])
+// CHECK-NEXT:  ),
diff --git a/test/MC/ELF/cfi-same-value.s b/test/MC/ELF/cfi-same-value.s
new file mode 100644
index 0000000..eab1ae4
--- /dev/null
+++ b/test/MC/ELF/cfi-same-value.s
@@ -0,0 +1,42 @@
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | elf-dump  --dump-section-data | FileCheck %s
+
+f:
+	.cfi_startproc
+        nop
+        .cfi_same_value 6
+        nop
+	.cfi_endproc
+
+// CHECK:       # Section 0x00000004
+// CHECK-NEXT:  (('sh_name', 0x00000011) # '.eh_frame'
+// CHECK-NEXT:   ('sh_type', 0x00000001)
+// CHECK-NEXT:   ('sh_flags', 0x00000002)
+// CHECK-NEXT:   ('sh_addr', 0x00000000)
+// CHECK-NEXT:   ('sh_offset', 0x00000048)
+// CHECK-NEXT:   ('sh_size', 0x00000030)
+// CHECK-NEXT:   ('sh_link', 0x00000000)
+// CHECK-NEXT:   ('sh_info', 0x00000000)
+// CHECK-NEXT:   ('sh_addralign', 0x00000008)
+// CHECK-NEXT:   ('sh_entsize', 0x00000000)
+// CHECK-NEXT:   ('_section_data', '14000000 00000000 017a5200 01781001 1b0c0708 90010000 14000000 1c000000 00000000 02000000 00410806 00000000')
+// CHECK-NEXT:  ),
+// CHECK-NEXT:  # Section 0x00000005
+// CHECK-NEXT:  (('sh_name', 0x0000000c) # '.rela.eh_frame'
+// CHECK-NEXT:   ('sh_type', 0x00000004)
+// CHECK-NEXT:   ('sh_flags', 0x00000000)
+// CHECK-NEXT:   ('sh_addr', 0x00000000)
+// CHECK-NEXT:   ('sh_offset', 0x00000390)
+// CHECK-NEXT:   ('sh_size', 0x00000018)
+// CHECK-NEXT:   ('sh_link', 0x00000007)
+// CHECK-NEXT:   ('sh_info', 0x00000004)
+// CHECK-NEXT:   ('sh_addralign', 0x00000008)
+// CHECK-NEXT:   ('sh_entsize', 0x00000018)
+// CHECK-NEXT:   ('_relocations', [
+// CHECK-NEXT:    # Relocation 0x00000000
+// CHECK-NEXT:    (('r_offset', 0x00000020)
+// CHECK-NEXT:     ('r_sym', 0x00000002)
+// CHECK-NEXT:     ('r_type', 0x00000002)
+// CHECK-NEXT:     ('r_addend', 0x00000000)
+// CHECK-NEXT:    ),
+// CHECK-NEXT:   ])
+// CHECK-NEXT:  ),
diff --git a/test/MC/ELF/cfi-sections.s b/test/MC/ELF/cfi-sections.s
new file mode 100644
index 0000000..a73f3a9
--- /dev/null
+++ b/test/MC/ELF/cfi-sections.s
@@ -0,0 +1,38 @@
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | elf-dump  --dump-section-data | FileCheck -check-prefix=ELF_64 %s
+// RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu %s -o - | elf-dump  --dump-section-data | FileCheck -check-prefix=ELF_32 %s
+
+.cfi_sections .debug_frame
+
+f1:
+        .cfi_startproc
+        nop
+        .cfi_endproc
+
+f2:
+        .cfi_startproc
+        nop
+        .cfi_endproc
+
+// ELF_64:      (('sh_name', 0x00000011) # '.debug_frame'
+// ELF_64-NEXT:  ('sh_type', 0x00000001)
+// ELF_64-NEXT:  ('sh_flags', 0x00000000)
+// ELF_64-NEXT:  ('sh_addr', 0x00000000)
+// ELF_64-NEXT:  ('sh_offset', 0x00000048)
+// ELF_64-NEXT:  ('sh_size', 0x00000048)
+// ELF_64-NEXT:  ('sh_link', 0x00000000)
+// ELF_64-NEXT:  ('sh_info', 0x00000000)
+// ELF_64-NEXT:  ('sh_addralign', 0x00000008)
+// ELF_64-NEXT:  ('sh_entsize', 0x00000000)
+// ELF_64-NEXT:  ('_section_data', '14000000 ffffffff 01000178 100c0708 90010000 00000000 14000000 00000000 00000000 00000000 01000000 00000000 14000000 00000000 00000000 00000000 01000000 00000000')
+
+// ELF_32:      (('sh_name', 0x00000010) # '.debug_frame'
+// ELF_32-NEXT:  ('sh_type', 0x00000001)
+// ELF_32-NEXT:  ('sh_flags', 0x00000000)
+// ELF_32-NEXT:  ('sh_addr', 0x00000000)
+// ELF_32-NEXT:  ('sh_offset', 0x00000038)
+// ELF_32-NEXT:  ('sh_size', 0x00000034)
+// ELF_32-NEXT:  ('sh_link', 0x00000000)
+// ELF_32-NEXT:  ('sh_info', 0x00000000)
+// ELF_32-NEXT:  ('sh_addralign', 0x00000004)
+// ELF_32-NEXT:  ('sh_entsize', 0x00000000)
+// ELF_32-NEXT:  ('_section_data', '10000000 ffffffff 0100017c 080c0404 88010000 0c000000 00000000 00000000 01000000 0c000000 00000000 01000000 01000000')
diff --git a/test/MC/ELF/relocation-386.s b/test/MC/ELF/relocation-386.s
index f7b20b5..25f3450 100644
--- a/test/MC/ELF/relocation-386.s
+++ b/test/MC/ELF/relocation-386.s
@@ -153,6 +153,13 @@
 // CHECK-NEXT:  ('r_sym',
 // CHECK-NEXT:  ('r_type', 0x00000001)
 // CHECK-NEXT: ),
+// Relocation 24 (foo@GOTTPOFF(%edx)) is of type R_386_TLS_IE_32 and uses the
+// symbol
+// CHECK-NEXT: Relocation 0x00000018
+// CHECK-NEXT: (('r_offset', 0x0000008e)
+// CHECK-NEXT:  ('r_sym', 0x0000000d)
+// CHECK-NEXT:  ('r_type', 0x00000021)
+// CHECK-NEXT: ),
 
 // Section 4 is bss
 // CHECK:      # Section 0x00000004
@@ -217,6 +224,7 @@ bar2:
         movl zed@TPOFF(%eax), %eax
         movl zed@DTPOFF(%eax), %eax
         pushl $bar
+        addl foo@GOTTPOFF(%edx), %eax
 
         .section        zedsec,"awT",@progbits
 zed:
diff --git a/test/MC/ELF/relocation.s b/test/MC/ELF/relocation.s
index 4df09e1..2760232 100644
--- a/test/MC/ELF/relocation.s
+++ b/test/MC/ELF/relocation.s
@@ -17,6 +17,7 @@ bar:
         pushq    $bar
         movq	foo(%rip), %rdx
         leaq    foo-bar(%r14),%r14
+        addq	$bar,%rax         # R_X86_64_32S
 
 
 // CHECK:  # Section 0x00000001
@@ -106,6 +107,12 @@ bar:
 // CHECK-NEXT:  ('r_type', 0x00000002)
 // CHECK-NEXT:  ('r_addend', 0x0000005c)
 
+// CHECK: # Relocation 0x0000000e
+// CHECK-NEXT: (('r_offset', 0x00000063)
+// CHECK-NEXT:  ('r_sym', 0x00000002)
+// CHECK-NEXT:  ('r_type', 0x0000000b)
+// CHECK-NEXT:  ('r_addend', 0x00000000)
+
 // CHECK:   # Symbol 0x00000002
 // CHECK: (('st_name', 0x00000000) # ''
 // CHECK:  ('st_bind', 0x00000000)
diff --git a/test/MC/MachO/darwin-x86_64-diff-relocs.s b/test/MC/MachO/darwin-x86_64-diff-relocs.s
index 449d2f5..f5d93ae 100644
--- a/test/MC/MachO/darwin-x86_64-diff-relocs.s
+++ b/test/MC/MachO/darwin-x86_64-diff-relocs.s
@@ -157,7 +157,7 @@ L3:
 // FIXME: Unfortunately, we do not get these relocations in exactly the same
 // order as Darwin 'as'. It turns out that 'as' *usually* ends up emitting
 // them in reverse address order, but sometimes it allocates some
-// additional relocations late so these end up preceed the other entries. I
+// additional relocations late so these end up precede the other entries. I
 // haven't figured out the exact criteria for this yet.
         
 // CHECK:     (('word-0', 0x56),
diff --git a/test/MC/MachO/debug_frame.s b/test/MC/MachO/debug_frame.s
new file mode 100644
index 0000000..47264ef
--- /dev/null
+++ b/test/MC/MachO/debug_frame.s
@@ -0,0 +1,38 @@
+// RUN: llvm-mc -triple i386-apple-darwin %s -filetype=obj -o - | macho-dump | FileCheck %s
+
+// Check that we don't produce a relocation for the CIE pointer and therefore
+// we have only one relocation in __debug_frame.
+
+	.section	__TEXT,__text,regular,pure_instructions
+	.globl	_f
+	.align	4, 0x90
+_f:                                     ## @f
+Ltmp0:
+	.cfi_startproc
+## BB#0:                                ## %entry
+	movl	$42, %eax
+	ret
+Ltmp1:
+	.cfi_endproc
+Leh_func_end0:
+
+	.cfi_sections .debug_frame
+Ltext_end:
+
+// CHECK:       (('section_name', '__debug_frame\x00\x00\x00')
+// CHECK-NEXT:   ('segment_name', '__DWARF\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK-NEXT:   ('address', 8)
+// CHECK-NEXT:   ('size', 36)
+// CHECK-NEXT:   ('offset', 332)
+// CHECK-NEXT:   ('alignment', 2)
+// CHECK-NEXT:   ('reloc_offset', 368)
+// CHECK-NEXT:   ('num_reloc', 1)
+// CHECK-NEXT:   ('flags', 0x2000000)
+// CHECK-NEXT:   ('reserved1', 0)
+// CHECK-NEXT:   ('reserved2', 0)
+// CHECK-NEXT:  ),
+// CHECK-NEXT: ('_relocations', [
+// CHECK-NEXT:   # Relocation 0
+// CHECK-NEXT:   (('word-0', 0x1c),
+// CHECK-NEXT:    ('word-1', 0x4000001)),
+// CHECK-NEXT: ])
diff --git a/test/MC/MachO/variable-errors.s b/test/MC/MachO/variable-errors.s
new file mode 100644
index 0000000..28308c6
--- /dev/null
+++ b/test/MC/MachO/variable-errors.s
@@ -0,0 +1,8 @@
+// RUN: not llvm-mc -triple x86_64-apple-darwin10 %s -filetype=obj -o %t.o 2> %t.err
+// RUN: FileCheck < %t.err %s
+
+        .data
+t0_a:
+t0_x = t0_a - t0_b
+// CHECK: unable to evaluate offset to undefined symbol 't0_b'
+	.long	t0_x
diff --git a/test/MC/MachO/variable-exprs.s b/test/MC/MachO/variable-exprs.s
new file mode 100644
index 0000000..8eeb82f
--- /dev/null
+++ b/test/MC/MachO/variable-exprs.s
@@ -0,0 +1,446 @@
+// RUN: llvm-mc -triple i386-apple-darwin10 %s -filetype=obj -o %t.o
+// RUN: macho-dump --dump-section-data < %t.o > %t.dump
+// RUN: FileCheck --check-prefix=CHECK-I386 < %t.dump %s
+
+// RUN: llvm-mc -triple x86_64-apple-darwin10 %s -filetype=obj -o %t.o
+// RUN: macho-dump --dump-section-data < %t.o > %t.dump
+// RUN: FileCheck --check-prefix=CHECK-X86_64 < %t.dump %s
+
+.data
+
+        .long 0
+a:
+        .long 0
+b = a
+
+c:      .long b
+
+d2 = d
+.globl d2
+d3 = d + 4
+.globl d3
+
+e = a + 4
+
+g:
+f = g
+        .long 0
+        
+        .long b
+        .long e
+        .long a + 4
+        .long d
+        .long d2
+        .long d3
+        .long f
+        .long g
+
+///
+        .text
+t0:
+Lt0_a:
+        ret
+
+	.data
+Lt0_b:
+Lt0_x = Lt0_a - Lt0_b
+	.quad	Lt0_x
+
+// CHECK-I386: ('cputype', 7)
+// CHECK-I386: ('cpusubtype', 3)
+// CHECK-I386: ('filetype', 1)
+// CHECK-I386: ('num_load_commands', 3)
+// CHECK-I386: ('load_commands_size', 296)
+// CHECK-I386: ('flag', 0)
+// CHECK-I386: ('load_commands', [
+// CHECK-I386:   # Load Command 0
+// CHECK-I386:  (('command', 1)
+// CHECK-I386:   ('size', 192)
+// CHECK-I386:   ('segment_name', '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK-I386:   ('vm_addr', 0)
+// CHECK-I386:   ('vm_size', 57)
+// CHECK-I386:   ('file_offset', 324)
+// CHECK-I386:   ('file_size', 57)
+// CHECK-I386:   ('maxprot', 7)
+// CHECK-I386:   ('initprot', 7)
+// CHECK-I386:   ('num_sections', 2)
+// CHECK-I386:   ('flags', 0)
+// CHECK-I386:   ('sections', [
+// CHECK-I386:     # Section 0
+// CHECK-I386:    (('section_name', '__text\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK-I386:     ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK-I386:     ('address', 0)
+// CHECK-I386:     ('size', 1)
+// CHECK-I386:     ('offset', 324)
+// CHECK-I386:     ('alignment', 0)
+// CHECK-I386:     ('reloc_offset', 0)
+// CHECK-I386:     ('num_reloc', 0)
+// CHECK-I386:     ('flags', 0x80000400)
+// CHECK-I386:     ('reserved1', 0)
+// CHECK-I386:     ('reserved2', 0)
+// CHECK-I386:    ),
+// CHECK-I386:   ('_relocations', [
+// CHECK-I386:   ])
+// CHECK-I386:   ('_section_data', 'c3')
+// CHECK-I386:     # Section 1
+// CHECK-I386:    (('section_name', '__data\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK-I386:     ('segment_name', '__DATA\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK-I386:     ('address', 1)
+// CHECK-I386:     ('size', 56)
+// CHECK-I386:     ('offset', 325)
+// CHECK-I386:     ('alignment', 0)
+// CHECK-I386:     ('reloc_offset', 384)
+// CHECK-I386:     ('num_reloc', 9)
+// CHECK-I386:     ('flags', 0x0)
+// CHECK-I386:     ('reserved1', 0)
+// CHECK-I386:     ('reserved2', 0)
+// CHECK-I386:    ),
+// CHECK-I386:   ('_relocations', [
+// CHECK-I386:     # Relocation 0
+// CHECK-I386:     (('word-0', 0x2c),
+// CHECK-I386:      ('word-1', 0x4000002)),
+// CHECK-I386:     # Relocation 1
+// CHECK-I386:     (('word-0', 0x28),
+// CHECK-I386:      ('word-1', 0x4000002)),
+// CHECK-I386:     # Relocation 2
+// CHECK-I386:     (('word-0', 0x24),
+// CHECK-I386:      ('word-1', 0xc000009)),
+// CHECK-I386:     # Relocation 3
+// CHECK-I386:     (('word-0', 0x20),
+// CHECK-I386:      ('word-1', 0xc000008)),
+// CHECK-I386:     # Relocation 4
+// CHECK-I386:     (('word-0', 0x1c),
+// CHECK-I386:      ('word-1', 0xc000007)),
+// CHECK-I386:     # Relocation 5
+// CHECK-I386:     (('word-0', 0xa0000018),
+// CHECK-I386:      ('word-1', 0x5)),
+// CHECK-I386:     # Relocation 6
+// CHECK-I386:     (('word-0', 0x14),
+// CHECK-I386:      ('word-1', 0x4000002)),
+// CHECK-I386:     # Relocation 7
+// CHECK-I386:     (('word-0', 0x10),
+// CHECK-I386:      ('word-1', 0x4000002)),
+// CHECK-I386:     # Relocation 8
+// CHECK-I386:     (('word-0', 0x8),
+// CHECK-I386:      ('word-1', 0x4000002)),
+// CHECK-I386:   ])
+// CHECK-I386:   ('_section_data', '00000000 00000000 05000000 00000000 05000000 09000000 09000000 00000000 00000000 00000000 0d000000 0d000000 cfffffff ffffffff')
+// CHECK-I386:   ])
+// CHECK-I386:  ),
+// CHECK-I386:   # Load Command 1
+// CHECK-I386:  (('command', 2)
+// CHECK-I386:   ('size', 24)
+// CHECK-I386:   ('symoff', 456)
+// CHECK-I386:   ('nsyms', 10)
+// CHECK-I386:   ('stroff', 576)
+// CHECK-I386:   ('strsize', 24)
+// CHECK-I386:   ('_string_data', '\x00d2\x00d\x00d3\x00a\x00b\x00c\x00e\x00g\x00f\x00t0\x00')
+// CHECK-I386:   ('_symbols', [
+// CHECK-I386:     # Symbol 0
+// CHECK-I386:    (('n_strx', 9)
+// CHECK-I386:     ('n_type', 0xe)
+// CHECK-I386:     ('n_sect', 2)
+// CHECK-I386:     ('n_desc', 0)
+// CHECK-I386:     ('n_value', 5)
+// CHECK-I386:     ('_string', 'a')
+// CHECK-I386:    ),
+// CHECK-I386:     # Symbol 1
+// CHECK-I386:    (('n_strx', 11)
+// CHECK-I386:     ('n_type', 0xe)
+// CHECK-I386:     ('n_sect', 2)
+// CHECK-I386:     ('n_desc', 0)
+// CHECK-I386:     ('n_value', 5)
+// CHECK-I386:     ('_string', 'b')
+// CHECK-I386:    ),
+// CHECK-I386:     # Symbol 2
+// CHECK-I386:    (('n_strx', 13)
+// CHECK-I386:     ('n_type', 0xe)
+// CHECK-I386:     ('n_sect', 2)
+// CHECK-I386:     ('n_desc', 0)
+// CHECK-I386:     ('n_value', 9)
+// CHECK-I386:     ('_string', 'c')
+// CHECK-I386:    ),
+// CHECK-I386:     # Symbol 3
+// CHECK-I386:    (('n_strx', 15)
+// CHECK-I386:     ('n_type', 0xe)
+// CHECK-I386:     ('n_sect', 2)
+// CHECK-I386:     ('n_desc', 0)
+// CHECK-I386:     ('n_value', 9)
+// CHECK-I386:     ('_string', 'e')
+// CHECK-I386:    ),
+// CHECK-I386:     # Symbol 4
+// CHECK-I386:    (('n_strx', 17)
+// CHECK-I386:     ('n_type', 0xe)
+// CHECK-I386:     ('n_sect', 2)
+// CHECK-I386:     ('n_desc', 0)
+// CHECK-I386:     ('n_value', 13)
+// CHECK-I386:     ('_string', 'g')
+// CHECK-I386:    ),
+// CHECK-I386:     # Symbol 5
+// CHECK-I386:    (('n_strx', 19)
+// CHECK-I386:     ('n_type', 0xe)
+// CHECK-I386:     ('n_sect', 2)
+// CHECK-I386:     ('n_desc', 0)
+// CHECK-I386:     ('n_value', 13)
+// CHECK-I386:     ('_string', 'f')
+// CHECK-I386:    ),
+// CHECK-I386:     # Symbol 6
+// CHECK-I386:    (('n_strx', 21)
+// CHECK-I386:     ('n_type', 0xe)
+// CHECK-I386:     ('n_sect', 1)
+// CHECK-I386:     ('n_desc', 0)
+// CHECK-I386:     ('n_value', 0)
+// CHECK-I386:     ('_string', 't0')
+// CHECK-I386:    ),
+// CHECK-I386:     # Symbol 7
+// CHECK-I386:    (('n_strx', 4)
+// CHECK-I386:     ('n_type', 0x1)
+// CHECK-I386:     ('n_sect', 0)
+// CHECK-I386:     ('n_desc', 0)
+// CHECK-I386:     ('n_value', 0)
+// CHECK-I386:     ('_string', 'd')
+// CHECK-I386:    ),
+// CHECK-I386:     # Symbol 8
+// CHECK-I386:    (('n_strx', 1)
+// CHECK-I386:     ('n_type', 0x1)
+// CHECK-I386:     ('n_sect', 0)
+// CHECK-I386:     ('n_desc', 0)
+// CHECK-I386:     ('n_value', 0)
+// CHECK-I386:     ('_string', 'd2')
+// CHECK-I386:    ),
+// CHECK-I386:     # Symbol 9
+// CHECK-I386:    (('n_strx', 6)
+// CHECK-I386:     ('n_type', 0x1)
+// CHECK-I386:     ('n_sect', 0)
+// CHECK-I386:     ('n_desc', 0)
+// CHECK-I386:     ('n_value', 0)
+// CHECK-I386:     ('_string', 'd3')
+// CHECK-I386:    ),
+// CHECK-I386:   ])
+// CHECK-I386:  ),
+// CHECK-I386:   # Load Command 2
+// CHECK-I386:  (('command', 11)
+// CHECK-I386:   ('size', 80)
+// CHECK-I386:   ('ilocalsym', 0)
+// CHECK-I386:   ('nlocalsym', 7)
+// CHECK-I386:   ('iextdefsym', 7)
+// CHECK-I386:   ('nextdefsym', 0)
+// CHECK-I386:   ('iundefsym', 7)
+// CHECK-I386:   ('nundefsym', 3)
+// CHECK-I386:   ('tocoff', 0)
+// CHECK-I386:   ('ntoc', 0)
+// CHECK-I386:   ('modtaboff', 0)
+// CHECK-I386:   ('nmodtab', 0)
+// CHECK-I386:   ('extrefsymoff', 0)
+// CHECK-I386:   ('nextrefsyms', 0)
+// CHECK-I386:   ('indirectsymoff', 0)
+// CHECK-I386:   ('nindirectsyms', 0)
+// CHECK-I386:   ('extreloff', 0)
+// CHECK-I386:   ('nextrel', 0)
+// CHECK-I386:   ('locreloff', 0)
+// CHECK-I386:   ('nlocrel', 0)
+// CHECK-I386:   ('_indirect_symbols', [
+// CHECK-I386:   ])
+// CHECK-I386:  ),
+// CHECK-I386: ])
+
+// CHECK-X86_64: ('cputype', 16777223)
+// CHECK-X86_64: ('cpusubtype', 3)
+// CHECK-X86_64: ('filetype', 1)
+// CHECK-X86_64: ('num_load_commands', 3)
+// CHECK-X86_64: ('load_commands_size', 336)
+// CHECK-X86_64: ('flag', 0)
+// CHECK-X86_64: ('reserved', 0)
+// CHECK-X86_64: ('load_commands', [
+// CHECK-X86_64:   # Load Command 0
+// CHECK-X86_64:  (('command', 25)
+// CHECK-X86_64:   ('size', 232)
+// CHECK-X86_64:   ('segment_name', '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK-X86_64:   ('vm_addr', 0)
+// CHECK-X86_64:   ('vm_size', 57)
+// CHECK-X86_64:   ('file_offset', 368)
+// CHECK-X86_64:   ('file_size', 57)
+// CHECK-X86_64:   ('maxprot', 7)
+// CHECK-X86_64:   ('initprot', 7)
+// CHECK-X86_64:   ('num_sections', 2)
+// CHECK-X86_64:   ('flags', 0)
+// CHECK-X86_64:   ('sections', [
+// CHECK-X86_64:     # Section 0
+// CHECK-X86_64:    (('section_name', '__text\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK-X86_64:     ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK-X86_64:     ('address', 0)
+// CHECK-X86_64:     ('size', 1)
+// CHECK-X86_64:     ('offset', 368)
+// CHECK-X86_64:     ('alignment', 0)
+// CHECK-X86_64:     ('reloc_offset', 0)
+// CHECK-X86_64:     ('num_reloc', 0)
+// CHECK-X86_64:     ('flags', 0x80000400)
+// CHECK-X86_64:     ('reserved1', 0)
+// CHECK-X86_64:     ('reserved2', 0)
+// CHECK-X86_64:     ('reserved3', 0)
+// CHECK-X86_64:    ),
+// CHECK-X86_64:   ('_relocations', [
+// CHECK-X86_64:   ])
+// CHECK-X86_64:   ('_section_data', 'c3')
+// CHECK-X86_64:     # Section 1
+// CHECK-X86_64:    (('section_name', '__data\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK-X86_64:     ('segment_name', '__DATA\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK-X86_64:     ('address', 1)
+// CHECK-X86_64:     ('size', 56)
+// CHECK-X86_64:     ('offset', 369)
+// CHECK-X86_64:     ('alignment', 0)
+// CHECK-X86_64:     ('reloc_offset', 428)
+// CHECK-X86_64:     ('num_reloc', 9)
+// CHECK-X86_64:     ('flags', 0x0)
+// CHECK-X86_64:     ('reserved1', 0)
+// CHECK-X86_64:     ('reserved2', 0)
+// CHECK-X86_64:     ('reserved3', 0)
+// CHECK-X86_64:    ),
+// CHECK-X86_64:   ('_relocations', [
+// CHECK-X86_64:     # Relocation 0
+// CHECK-X86_64:     (('word-0', 0x2c),
+// CHECK-X86_64:      ('word-1', 0xc000004)),
+// CHECK-X86_64:     # Relocation 1
+// CHECK-X86_64:     (('word-0', 0x28),
+// CHECK-X86_64:      ('word-1', 0xc000005)),
+// CHECK-X86_64:     # Relocation 2
+// CHECK-X86_64:     (('word-0', 0x24),
+// CHECK-X86_64:      ('word-1', 0xc000009)),
+// CHECK-X86_64:     # Relocation 3
+// CHECK-X86_64:     (('word-0', 0x20),
+// CHECK-X86_64:      ('word-1', 0xc000008)),
+// CHECK-X86_64:     # Relocation 4
+// CHECK-X86_64:     (('word-0', 0x1c),
+// CHECK-X86_64:      ('word-1', 0xc000007)),
+// CHECK-X86_64:     # Relocation 5
+// CHECK-X86_64:     (('word-0', 0x18),
+// CHECK-X86_64:      ('word-1', 0xc000000)),
+// CHECK-X86_64:     # Relocation 6
+// CHECK-X86_64:     (('word-0', 0x14),
+// CHECK-X86_64:      ('word-1', 0xc000003)),
+// CHECK-X86_64:     # Relocation 7
+// CHECK-X86_64:     (('word-0', 0x10),
+// CHECK-X86_64:      ('word-1', 0xc000001)),
+// CHECK-X86_64:     # Relocation 8
+// CHECK-X86_64:     (('word-0', 0x8),
+// CHECK-X86_64:      ('word-1', 0xc000001)),
+// CHECK-X86_64:   ])
+// CHECK-X86_64:   ('_section_data', '00000000 00000000 00000000 00000000 00000000 00000000 04000000 00000000 00000000 00000000 00000000 00000000 cfffffff ffffffff')
+// CHECK-X86_64:   ])
+// CHECK-X86_64:  ),
+// CHECK-X86_64:   # Load Command 1
+// CHECK-X86_64:  (('command', 2)
+// CHECK-X86_64:   ('size', 24)
+// CHECK-X86_64:   ('symoff', 500)
+// CHECK-X86_64:   ('nsyms', 10)
+// CHECK-X86_64:   ('stroff', 660)
+// CHECK-X86_64:   ('strsize', 24)
+// CHECK-X86_64:   ('_string_data', '\x00d2\x00d\x00d3\x00a\x00b\x00c\x00e\x00g\x00f\x00t0\x00')
+// CHECK-X86_64:   ('_symbols', [
+// CHECK-X86_64:     # Symbol 0
+// CHECK-X86_64:    (('n_strx', 9)
+// CHECK-X86_64:     ('n_type', 0xe)
+// CHECK-X86_64:     ('n_sect', 2)
+// CHECK-X86_64:     ('n_desc', 0)
+// CHECK-X86_64:     ('n_value', 5)
+// CHECK-X86_64:     ('_string', 'a')
+// CHECK-X86_64:    ),
+// CHECK-X86_64:     # Symbol 1
+// CHECK-X86_64:    (('n_strx', 11)
+// CHECK-X86_64:     ('n_type', 0xe)
+// CHECK-X86_64:     ('n_sect', 2)
+// CHECK-X86_64:     ('n_desc', 0)
+// CHECK-X86_64:     ('n_value', 5)
+// CHECK-X86_64:     ('_string', 'b')
+// CHECK-X86_64:    ),
+// CHECK-X86_64:     # Symbol 2
+// CHECK-X86_64:    (('n_strx', 13)
+// CHECK-X86_64:     ('n_type', 0xe)
+// CHECK-X86_64:     ('n_sect', 2)
+// CHECK-X86_64:     ('n_desc', 0)
+// CHECK-X86_64:     ('n_value', 9)
+// CHECK-X86_64:     ('_string', 'c')
+// CHECK-X86_64:    ),
+// CHECK-X86_64:     # Symbol 3
+// CHECK-X86_64:    (('n_strx', 15)
+// CHECK-X86_64:     ('n_type', 0xe)
+// CHECK-X86_64:     ('n_sect', 2)
+// CHECK-X86_64:     ('n_desc', 0)
+// CHECK-X86_64:     ('n_value', 9)
+// CHECK-X86_64:     ('_string', 'e')
+// CHECK-X86_64:    ),
+// CHECK-X86_64:     # Symbol 4
+// CHECK-X86_64:    (('n_strx', 17)
+// CHECK-X86_64:     ('n_type', 0xe)
+// CHECK-X86_64:     ('n_sect', 2)
+// CHECK-X86_64:     ('n_desc', 0)
+// CHECK-X86_64:     ('n_value', 13)
+// CHECK-X86_64:     ('_string', 'g')
+// CHECK-X86_64:    ),
+// CHECK-X86_64:     # Symbol 5
+// CHECK-X86_64:    (('n_strx', 19)
+// CHECK-X86_64:     ('n_type', 0xe)
+// CHECK-X86_64:     ('n_sect', 2)
+// CHECK-X86_64:     ('n_desc', 0)
+// CHECK-X86_64:     ('n_value', 13)
+// CHECK-X86_64:     ('_string', 'f')
+// CHECK-X86_64:    ),
+// CHECK-X86_64:     # Symbol 6
+// CHECK-X86_64:    (('n_strx', 21)
+// CHECK-X86_64:     ('n_type', 0xe)
+// CHECK-X86_64:     ('n_sect', 1)
+// CHECK-X86_64:     ('n_desc', 0)
+// CHECK-X86_64:     ('n_value', 0)
+// CHECK-X86_64:     ('_string', 't0')
+// CHECK-X86_64:    ),
+// CHECK-X86_64:     # Symbol 7
+// CHECK-X86_64:    (('n_strx', 4)
+// CHECK-X86_64:     ('n_type', 0x1)
+// CHECK-X86_64:     ('n_sect', 0)
+// CHECK-X86_64:     ('n_desc', 0)
+// CHECK-X86_64:     ('n_value', 0)
+// CHECK-X86_64:     ('_string', 'd')
+// CHECK-X86_64:    ),
+// CHECK-X86_64:     # Symbol 8
+// CHECK-X86_64:    (('n_strx', 1)
+// CHECK-X86_64:     ('n_type', 0x1)
+// CHECK-X86_64:     ('n_sect', 0)
+// CHECK-X86_64:     ('n_desc', 0)
+// CHECK-X86_64:     ('n_value', 0)
+// CHECK-X86_64:     ('_string', 'd2')
+// CHECK-X86_64:    ),
+// CHECK-X86_64:     # Symbol 9
+// CHECK-X86_64:    (('n_strx', 6)
+// CHECK-X86_64:     ('n_type', 0x1)
+// CHECK-X86_64:     ('n_sect', 0)
+// CHECK-X86_64:     ('n_desc', 0)
+// CHECK-X86_64:     ('n_value', 0)
+// CHECK-X86_64:     ('_string', 'd3')
+// CHECK-X86_64:    ),
+// CHECK-X86_64:   ])
+// CHECK-X86_64:  ),
+// CHECK-X86_64:   # Load Command 2
+// CHECK-X86_64:  (('command', 11)
+// CHECK-X86_64:   ('size', 80)
+// CHECK-X86_64:   ('ilocalsym', 0)
+// CHECK-X86_64:   ('nlocalsym', 7)
+// CHECK-X86_64:   ('iextdefsym', 7)
+// CHECK-X86_64:   ('nextdefsym', 0)
+// CHECK-X86_64:   ('iundefsym', 7)
+// CHECK-X86_64:   ('nundefsym', 3)
+// CHECK-X86_64:   ('tocoff', 0)
+// CHECK-X86_64:   ('ntoc', 0)
+// CHECK-X86_64:   ('modtaboff', 0)
+// CHECK-X86_64:   ('nmodtab', 0)
+// CHECK-X86_64:   ('extrefsymoff', 0)
+// CHECK-X86_64:   ('nextrefsyms', 0)
+// CHECK-X86_64:   ('indirectsymoff', 0)
+// CHECK-X86_64:   ('nindirectsyms', 0)
+// CHECK-X86_64:   ('extreloff', 0)
+// CHECK-X86_64:   ('nextrel', 0)
+// CHECK-X86_64:   ('locreloff', 0)
+// CHECK-X86_64:   ('nlocrel', 0)
+// CHECK-X86_64:   ('_indirect_symbols', [
+// CHECK-X86_64:   ])
+// CHECK-X86_64:  ),
+// CHECK-X86_64: ])
diff --git a/test/MC/X86/x86-32-coverage.s b/test/MC/X86/x86-32-coverage.s
index 4ec9fcd..d2dd78d 100644
--- a/test/MC/X86/x86-32-coverage.s
+++ b/test/MC/X86/x86-32-coverage.s
@@ -372,6 +372,14 @@
 // CHECK: 	nop
         	nop
 
+// CHECK: flds	(%edi)
+// CHECK:  encoding: [0xd9,0x07]
+        	flds	(%edi)
+
+// CHECK: filds	(%edi)
+// CHECK:  encoding: [0xdf,0x07]
+        	filds	(%edi)
+
 // CHECK: 	fldl	3735928559(%ebx,%ecx,8)
         	fldl	0xdeadbeef(%ebx,%ecx,8)
 
@@ -19562,3 +19570,8 @@
 
 // CHECK: 	aeskeygenassist	$125, (%edx,%eax,4), %xmm2
                 aeskeygenassist $125, (%edx,%eax,4), %xmm2
+
+// CHECK:   blendvps	(%rax), %xmm1   # encoding: [0x66,0x0f,0x38,0x14,0x08]
+            blendvps (%rax), %xmm1
+// CHECK:   blendvps	%xmm2, %xmm1    # encoding: [0x66,0x0f,0x38,0x14,0xca]
+            blendvps %xmm2, %xmm1
diff --git a/test/MC/X86/x86-32.s b/test/MC/X86/x86-32.s
index ad9aee5..6017880 100644
--- a/test/MC/X86/x86-32.s
+++ b/test/MC/X86/x86-32.s
@@ -613,11 +613,11 @@ pshufw $90, %mm4, %mm0
 // CHECK:  encoding: [0xd5,0x01]
         	aad	$1
 
-// CHECK: aad	$10
+// CHECK: aad
 // CHECK:  encoding: [0xd5,0x0a]
         	aad	$0xA
 
-// CHECK: aad	$10
+// CHECK: aad
 // CHECK:  encoding: [0xd5,0x0a]
         	aad
 
@@ -625,11 +625,11 @@ pshufw $90, %mm4, %mm0
 // CHECK:  encoding: [0xd4,0x02]
         	aam	$2
 
-// CHECK: aam	$10
+// CHECK: aam
 // CHECK:  encoding: [0xd4,0x0a]
         	aam	$0xA
 
-// CHECK: aam	$10
+// CHECK: aam
 // CHECK:  encoding: [0xd4,0x0a]
         	aam
 
@@ -725,7 +725,7 @@ pshufw $90, %mm4, %mm0
 // CHECK:  encoding: [0xdf,0xf2]
         	fcompi	%st(2)
 
-// CHECK: fcompi	%st(1)
+// CHECK: fcompi
 // CHECK:  encoding: [0xdf,0xf1]
         	fcompi
 
@@ -737,7 +737,7 @@ pshufw $90, %mm4, %mm0
 // CHECK:  encoding: [0xdf,0xea]
         	fucompi	%st(2)
 
-// CHECK: fucompi	%st(1)
+// CHECK: fucompi
 // CHECK:  encoding: [0xdf,0xe9]
         	fucompi
 
@@ -866,9 +866,9 @@ pshufw $90, %mm4, %mm0
 	movsw	%ds:(%esi), %es:(%edi)
 	movsw	(%esi), %es:(%edi)
 
-// CHECK: movsl # encoding: [0xa5]
-// CHECK: movsl
-// CHECK: movsl
+// CHECK: movsd # encoding: [0xa5]
+// CHECK: movsd
+// CHECK: movsd
 	movsl
 	movsl	%ds:(%esi), %es:(%edi)
 	movsl	(%esi), %es:(%edi)
@@ -934,3 +934,15 @@ pshufw $90, %mm4, %mm0
 // CHECK: strl
 // CHECK: encoding: [0x0f,0x00,0xc8]
 	str %eax
+
+
+// PR9378
+// CHECK: fsubp
+// CHECK: encoding: [0xde,0xe1]
+fsubp %st,%st(1)
+
+// PR9164
+// CHECK: fsubp	%st(2)
+// CHECK: encoding: [0xde,0xe2]
+fsubp   %st, %st(2)
+
diff --git a/test/MC/X86/x86-64.s b/test/MC/X86/x86-64.s
index 1d41d5b..a36ba25 100644
--- a/test/MC/X86/x86-64.s
+++ b/test/MC/X86/x86-64.s
@@ -112,12 +112,12 @@
 // rdar://8470918
 smovb // CHECK: movsb
 smovw // CHECK: movsw
-smovl // CHECK: movsl
+smovl // CHECK: movsd
 smovq // CHECK: movsq
 
 // rdar://8456361
 // CHECK: rep
-// CHECK: movsl
+// CHECK: movsd
         rep movsd
 
 // CHECK: rep
@@ -190,6 +190,10 @@ fadd %st(7)
 // CHECK: int3
 INT3
 
+// rdar://8735979 - int $3 -> int3
+// CHECK: int3
+int	$3
+
 
 // Allow scale factor without index register.
 // CHECK: movaps	%xmm3, (%esi)
@@ -228,10 +232,10 @@ cmovnzq %rbx, %rax
 
 // rdar://8407928
 // CHECK: inb	$127, %al
-// CHECK: inw	%dx, %ax
+// CHECK: inw	%dx
 // CHECK: outb	%al, $127
-// CHECK: outw	%ax, %dx
-// CHECK: inl	%dx, %eax
+// CHECK: outw	%dx
+// CHECK: inl	%dx
 inb	$0x7f
 inw	%dx
 outb	$0x7f
@@ -240,12 +244,12 @@ inl	%dx
 
 
 // PR8114
-// CHECK: outb	%al, %dx
-// CHECK: outb	%al, %dx
-// CHECK: outw	%ax, %dx
-// CHECK: outw	%ax, %dx
-// CHECK: outl	%eax, %dx
-// CHECK: outl	%eax, %dx
+// CHECK: outb	%dx
+// CHECK: outb	%dx
+// CHECK: outw	%dx
+// CHECK: outw	%dx
+// CHECK: outl	%dx
+// CHECK: outl	%dx
 
 out	%al, (%dx)
 outb	%al, (%dx)
@@ -254,12 +258,12 @@ outw	%ax, (%dx)
 out	%eax, (%dx)
 outl	%eax, (%dx)
 
-// CHECK: inb	%dx, %al
-// CHECK: inb	%dx, %al
-// CHECK: inw	%dx, %ax
-// CHECK: inw	%dx, %ax
-// CHECK: inl	%dx, %eax
-// CHECK: inl	%dx, %eax
+// CHECK: inb	%dx
+// CHECK: inb	%dx
+// CHECK: inw	%dx
+// CHECK: inw	%dx
+// CHECK: inl	%dx
+// CHECK: inl	%dx
 
 in	(%dx), %al
 inb	(%dx), %al
@@ -270,16 +274,16 @@ inl	(%dx), %eax
 
 // rdar://8431422
 
-// CHECK: fxch	%st(1)
-// CHECK: fucom	%st(1)
-// CHECK: fucomp	%st(1)
-// CHECK: faddp	%st(1)
+// CHECK: fxch
+// CHECK: fucom
+// CHECK: fucomp
+// CHECK: faddp
 // CHECK: faddp	%st(0)
-// CHECK: fsubp	%st(1)
-// CHECK: fsubrp	%st(1)
-// CHECK: fmulp	%st(1)
-// CHECK: fdivp	%st(1)
-// CHECK: fdivrp	%st(1)
+// CHECK: fsubp
+// CHECK: fsubrp
+// CHECK: fmulp
+// CHECK: fdivp
+// CHECK: fdivrp
 
 fxch
 fucom
@@ -292,11 +296,11 @@ fmulp
 fdivp
 fdivrp
 
-// CHECK: fcomi	%st(1)
+// CHECK: fcomi
 // CHECK: fcomi	%st(2)
-// CHECK: fucomi	%st(1)
-// CHECK: fucomi	%st(2)
-// CHECK: fucomi	%st(2)
+// CHECK: fucomi
+// CHECK: fucomi %st(2)
+// CHECK: fucomi %st(2)
 
 fcomi
 fcomi	%st(2)
@@ -304,10 +308,10 @@ fucomi
 fucomi	%st(2)
 fucomi	%st(2), %st
 
-// CHECK: fnstsw %ax
-// CHECK: fnstsw %ax
-// CHECK: fnstsw %ax
-// CHECK: fnstsw %ax
+// CHECK: fnstsw
+// CHECK: fnstsw
+// CHECK: fnstsw
+// CHECK: fnstsw
 
 fnstsw
 fnstsw %ax
@@ -453,7 +457,7 @@ cdq   // CHECK: cltd
 // rdar://8456378 and PR7557 - fstsw
 fstsw %ax
 // CHECK: wait
-// CHECK: fnstsw %ax
+// CHECK: fnstsw
 fstsw (%rax)
 // CHECK: wait
 // CHECK: fnstsw (%rax)
@@ -600,7 +604,7 @@ movsq
 // CHECK:   encoding: [0x48,0xa5]
 
 movsl
-// CHECK: movsl
+// CHECK: movsd
 // CHECK:   encoding: [0xa5]
 
 stosq
@@ -786,7 +790,7 @@ lock/incl 1(%rsp)
 rep movsl
 // CHECK: rep
 // CHECK: encoding: [0xf3]
-// CHECK: movsl
+// CHECK: movsd
 // CHECK: encoding: [0xa5]
 
 
@@ -1023,9 +1027,9 @@ xsetbv // CHECK: xsetbv # encoding: [0x0f,0x01,0xd1]
 	movsw	%ds:(%rsi), %es:(%rdi)
 	movsw	(%rsi), %es:(%rdi)
 
-// CHECK: movsl # encoding: [0xa5]
-// CHECK: movsl
-// CHECK: movsl
+// CHECK: movsd # encoding: [0xa5]
+// CHECK: movsd
+// CHECK: movsd
 	movsl
 	movsl	%ds:(%rsi), %es:(%rdi)
 	movsl	(%rsi), %es:(%rdi)
diff --git a/test/MC/X86/x86_64-encoding.s b/test/MC/X86/x86_64-encoding.s
index 756da4d..cfdf87f 100644
--- a/test/MC/X86/x86_64-encoding.s
+++ b/test/MC/X86/x86_64-encoding.s
@@ -155,3 +155,19 @@ pshufb	CPI1_0(%rip), %xmm1
 // CHECK: leave
 // CHECK:  encoding: [0xc9]
         	leaveq
+
+// CHECK: flds	(%edi)
+// CHECK:  encoding: [0x67,0xd9,0x07]
+        	flds	(%edi)
+
+// CHECK: filds	(%edi)
+// CHECK:  encoding: [0x67,0xdf,0x07]
+        	filds	(%edi)
+
+// CHECK: flds	(%rdi)
+// CHECK:  encoding: [0xd9,0x07]
+        	flds	(%rdi)
+
+// CHECK: filds	(%rdi)
+// CHECK:  encoding: [0xdf,0x07]
+        	filds	(%rdi)
diff --git a/test/Makefile b/test/Makefile
index 0d84186..e38226a 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -176,6 +176,7 @@ lit.site.cfg: site.exp
 	@$(ECHOPATH) s=@LLVM_SOURCE_DIR@=$(LLVM_SRC_ROOT)=g > lit.tmp
 	@$(ECHOPATH) s=@LLVM_BINARY_DIR@=$(LLVM_OBJ_ROOT)=g >> lit.tmp
 	@$(ECHOPATH) s=@LLVM_TOOLS_DIR@=$(ToolDir)=g >> lit.tmp
+	@$(ECHOPATH) s=@LLVM_BUILD_MODE@=$(BuildMode)=g >> lit.tmp
 	@$(ECHOPATH) s=@LLVMGCCDIR@=$(LLVMGCCDIR)=g >> lit.tmp
 	@$(ECHOPATH) s=@PYTHON_EXECUTABLE@=python=g >> lit.tmp
 	@$(ECHOPATH) s=@ENABLE_SHARED@=$(ENABLE_SHARED)=g >> lit.tmp
diff --git a/test/Other/X86/dg.exp b/test/Other/X86/dg.exp
new file mode 100644
index 0000000..7b7bd4e
--- /dev/null
+++ b/test/Other/X86/dg.exp
@@ -0,0 +1,5 @@
+load_lib llvm.exp
+
+if { [llvm_supports_target X86] } {
+  RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll}]]
+}
diff --git a/test/Other/inline-asm-newline-terminator.ll b/test/Other/X86/inline-asm-newline-terminator.ll
index af93cc0..af93cc0 100644
--- a/test/Other/inline-asm-newline-terminator.ll
+++ b/test/Other/X86/inline-asm-newline-terminator.ll
diff --git a/test/TableGen/SetTheory.td b/test/TableGen/SetTheory.td
new file mode 100644
index 0000000..e0abc63
--- /dev/null
+++ b/test/TableGen/SetTheory.td
@@ -0,0 +1,167 @@
+// Test evaluation of set operations in dags.
+// RUN: tblgen -print-sets %s | FileCheck %s
+// XFAIL: vg_leak
+//
+// The -print-sets driver configures a primitive SetTheory instance that
+// understands these sets:
+
+class Set<dag d> {
+  dag Elements = d;
+}
+
+// It prints all Set instances and their ordered set interpretation.
+
+// Define some elements.
+def a;
+def b;
+def c;
+def d;
+
+// The 'add' operator evaluates and concatenates its arguments.
+def add;
+def S0a : Set<(add)>;
+def S0b : Set<(add a)>;
+def S0c : Set<(add a, b)>;
+def S0d : Set<(add b, a)>;
+def S0e : Set<(add a, a)>;
+def S0f : Set<(add a, a, b, a, c, b, d, a)>;
+def S0g : Set<(add b, a, b)>;
+// CHECK: S0a = [ ]
+// CHECK: S0b = [ a ]
+// CHECK: S0c = [ a b ]
+// CHECK: S0d = [ b a ]
+// CHECK: S0e = [ a ]
+// CHECK: S0f = [ a b c d ]
+// CHECK: S0g = [ b a ]
+
+// Defs of Set class expand into their elements.
+// Mixed sets and elements are flattened.
+def S1a : Set<(add S0a)>;
+def S1b : Set<(add S0a, S0a)>;
+def S1c : Set<(add S0d, S0f)>;
+def S1d : Set<(add d, S0d, S0f)>;
+// CHECK: S1a = [ ]
+// CHECK: S1b = [ ]
+// CHECK: S1c = [ b a c d ]
+// CHECK: S1d = [ d b a c ]
+
+// The 'sub' operator returns the first argument with the following arguments
+// removed.
+def sub;
+def S2a : Set<(sub S1a, S1c)>;
+def S2b : Set<(sub S1c, S1d)>;
+def S2c : Set<(sub S1c, b)>;
+def S2d : Set<(sub S1c, S0c)>;
+def S2e : Set<(sub S1c, S2d)>;
+// CHECK: S2a = [ ]
+// CHECK: S2b = [ ]
+// CHECK: S2c = [ a c d ]
+// CHECK: S2d = [ c d ]
+// CHECK: S2e = [ b a ]
+
+// The 'and' operator intersects two sets. The result has the same order as the
+// first argument.
+def and;
+def S3a : Set<(and S2d, S2e)>;
+def S3b : Set<(and S2d, S1d)>;
+// CHECK: S3a = [ ]
+// CHECK: S3b = [ c d ]
+
+// The 'shl' operator removes the first N elements.
+def shl;
+def S4a : Set<(shl S0f, 0)>;
+def S4b : Set<(shl S0f, 1)>;
+def S4c : Set<(shl S0f, 3)>;
+def S4d : Set<(shl S0f, 4)>;
+def S4e : Set<(shl S0f, 5)>;
+// CHECK: S4a = [ a b c d ]
+// CHECK: S4b = [ b c d ]
+// CHECK: S4c = [ d ]
+// CHECK: S4d = [ ]
+// CHECK: S4e = [ ]
+
+// The 'trunc' operator truncates after the first N elements.
+def trunc;
+def S5a : Set<(trunc S0f, 0)>;
+def S5b : Set<(trunc S0f, 1)>;
+def S5c : Set<(trunc S0f, 3)>;
+def S5d : Set<(trunc S0f, 4)>;
+def S5e : Set<(trunc S0f, 5)>;
+// CHECK: S5a = [ ]
+// CHECK: S5b = [ a ]
+// CHECK: S5c = [ a b c ]
+// CHECK: S5d = [ a b c d ]
+// CHECK: S5e = [ a b c d ]
+
+// The 'rotl' operator rotates left, but also accepts a negative shift.
+def rotl;
+def S6a : Set<(rotl S0f, 0)>;
+def S6b : Set<(rotl S0f, 1)>;
+def S6c : Set<(rotl S0f, 3)>;
+def S6d : Set<(rotl S0f, 4)>;
+def S6e : Set<(rotl S0f, 5)>;
+def S6f : Set<(rotl S0f, -1)>;
+def S6g : Set<(rotl S0f, -4)>;
+def S6h : Set<(rotl S0f, -5)>;
+// CHECK: S6a = [ a b c d ]
+// CHECK: S6b = [ b c d a ]
+// CHECK: S6c = [ d a b c ]
+// CHECK: S6d = [ a b c d ]
+// CHECK: S6e = [ b c d a ]
+// CHECK: S6f = [ d a b c ]
+// CHECK: S6g = [ a b c d ]
+// CHECK: S6h = [ d a b c ]
+
+// The 'rotr' operator rotates right, but also accepts a negative shift.
+def rotr;
+def S7a : Set<(rotr S0f, 0)>;
+def S7b : Set<(rotr S0f, 1)>;
+def S7c : Set<(rotr S0f, 3)>;
+def S7d : Set<(rotr S0f, 4)>;
+def S7e : Set<(rotr S0f, 5)>;
+def S7f : Set<(rotr S0f, -1)>;
+def S7g : Set<(rotr S0f, -4)>;
+def S7h : Set<(rotr S0f, -5)>;
+// CHECK: S7a = [ a b c d ]
+// CHECK: S7b = [ d a b c ]
+// CHECK: S7c = [ b c d a ]
+// CHECK: S7d = [ a b c d ]
+// CHECK: S7e = [ d a b c ]
+// CHECK: S7f = [ b c d a ]
+// CHECK: S7g = [ a b c d ]
+// CHECK: S7h = [ b c d a ]
+
+// The 'decimate' operator picks every N'th element.
+def decimate;
+def e0;
+def e1;
+def e2;
+def e3;
+def e4;
+def e5;
+def e6;
+def e7;
+def e8;
+def e9;
+def E : Set<(add e0, e1, e2, e3, e4, e5, e6, e7, e8, e9)>;
+def S8a : Set<(decimate E, 3)>;
+def S8b : Set<(decimate E, 9)>;
+def S8c : Set<(decimate E, 10)>;
+def S8d : Set<(decimate (rotl E, 1), 2)>;
+def S8e : Set<(add (decimate E, 2), (decimate (rotl E, 1), 2))>;
+// CHECK: S8a = [ e0 e3 e6 e9 ]
+// CHECK: S8b = [ e0 e9 ]
+// CHECK: S8c = [ e0 ]
+// CHECK: S8d = [ e1 e3 e5 e7 e9 ]
+// CHECK: S8e = [ e0 e2 e4 e6 e8 e1 e3 e5 e7 e9 ]
+
+// The 'sequence' operator finds a sequence of records from their name.
+def sequence;
+def S9a : Set<(sequence "e%u", 3, 7)>;
+def S9b : Set<(sequence "e%u", 7, 3)>;
+def S9c : Set<(sequence "e%u", 0, 0)>;
+def S9d : Set<(sequence "S%ua", 7, 9)>;
+// CHECK: S9a = [ e3 e4 e5 e6 e7 ]
+// CHECK: S9b = [ e7 e6 e5 e4 e3 ]
+// CHECK: S9c = [ e0 ]
+// CHECK: S9d = [ a b c d e0 e3 e6 e9 e4 e5 e7 ]
diff --git a/test/TableGen/TargetInstrInfo.td b/test/TableGen/TargetInstrInfo.td
index 146ef6f..6c39d5c 100644
--- a/test/TableGen/TargetInstrInfo.td
+++ b/test/TableGen/TargetInstrInfo.td
@@ -110,7 +110,7 @@ def SHL32rCL : Inst<(ops R32:$dst, R32:$src),
                   [(set R32:$dst, (shl R32:$src, CL))]>;
 
 // The RTL list is a list, allowing complex instructions to be defined easily.
-// Temporary 'internal' registers can be used to break instructions appart.
+// Temporary 'internal' registers can be used to break instructions apart.
 let isTwoAddress = 1 in
 def XOR32mi : Inst<(ops addr:$addr, imm32:$imm),
                    "xor $dst, $src2", 0x81, MRM6m,
diff --git a/test/Transforms/CodeExtractor/2004-03-17-OutputMismatch.ll b/test/Transforms/CodeExtractor/2004-03-17-OutputMismatch.ll
deleted file mode 100644
index 0fbd330..0000000
--- a/test/Transforms/CodeExtractor/2004-03-17-OutputMismatch.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; RUN: opt < %s -loop-extract -disable-output
-
-%struct.node_t = type { double*, %struct.node_t*, %struct.node_t**, double**, double*, i32, i32 }
-%struct.table_t = type { [1 x %struct.node_t**], [1 x %struct.node_t**] }
-
-define void @make_tables() {
-entry:
-        %tmp.0.i = malloc %struct.node_t                ; <%struct.node_t*> [#uses=1]
-        br i1 false, label %no_exit.i, label %loopexit.i
-
-no_exit.i:              ; preds = %no_exit.i, %entry
-        %prev_node.0.i.1 = phi %struct.node_t* [ %tmp.16.i, %no_exit.i ], [ %tmp.0.i, %entry ]          ; <%struct.node_t*> [#uses=0]
-        %tmp.16.i = malloc %struct.node_t               ; <%struct.node_t*> [#uses=2]
-        br i1 false, label %no_exit.i, label %loopexit.i
-
-loopexit.i:             ; preds = %no_exit.i, %entry
-        %cur_node.0.i.0 = phi %struct.node_t* [ null, %entry ], [ %tmp.16.i, %no_exit.i ]               ; <%struct.node_t*> [#uses=0]
-        ret void
-}
-
diff --git a/test/Transforms/ConstProp/2002-05-03-NotOperator.ll b/test/Transforms/ConstProp/2002-05-03-NotOperator.ll
index d9cd674..b957220 100644
--- a/test/Transforms/ConstProp/2002-05-03-NotOperator.ll
+++ b/test/Transforms/ConstProp/2002-05-03-NotOperator.ll
@@ -1,4 +1,4 @@
-; This bug has to do with the fact that constant propogation was implemented in
+; This bug has to do with the fact that constant propagation was implemented in
 ; terms of _logical_ not (! in C) instead of _bitwise_ not (~ in C).  This was
 ; due to a spec change.
 
diff --git a/test/Transforms/ConstProp/basictest.ll b/test/Transforms/ConstProp/basictest.ll
index df57fb6..d0d0a5b 100644
--- a/test/Transforms/ConstProp/basictest.ll
+++ b/test/Transforms/ConstProp/basictest.ll
@@ -1,6 +1,6 @@
 ; RUN: opt < %s -constprop -die -S | FileCheck %s
 
-; This is a basic sanity check for constant propogation.  The add instruction 
+; This is a basic sanity check for constant propagation.  The add instruction
 ; should be eliminated.
 define i32 @test1(i1 %B) {
         br i1 %B, label %BB1, label %BB2
diff --git a/test/Transforms/ConstProp/calls.ll b/test/Transforms/ConstProp/calls.ll
index 82d7324..3b6010a 100644
--- a/test/Transforms/ConstProp/calls.ll
+++ b/test/Transforms/ConstProp/calls.ll
@@ -7,6 +7,7 @@ declare double @sin(double)
 declare double @tan(double)
 
 declare double @sqrt(double)
+declare double @exp2(double)
 
 define double @T() {
 ; CHECK: @T
@@ -19,7 +20,11 @@ define double @T() {
   %b = fadd double %a, %C
   %D = call double @sqrt(double 4.000000e+00)
   %c = fadd double %b, %D
-  ret double %c
+
+  ; PR9315
+  %E = call double @exp2(double 4.0)
+  %d = fadd double %c, %E 
+  ret double %d
 }
 
 define i1 @test_sse_cvt() nounwind readnone {
diff --git a/test/Transforms/ConstProp/logicaltest.ll b/test/Transforms/ConstProp/logicaltest.ll
index c74296a..abd3275 100644
--- a/test/Transforms/ConstProp/logicaltest.ll
+++ b/test/Transforms/ConstProp/logicaltest.ll
@@ -1,4 +1,4 @@
-; Ensure constant propogation of logical instructions is working correctly.
+; Ensure constant propagation of logical instructions is working correctly.
 
 ; RUN: opt < %s -constprop -die -S | FileCheck %s
 ; CHECK-NOT:     {{and|or|xor}}
diff --git a/test/Transforms/ConstProp/phi.ll b/test/Transforms/ConstProp/phi.ll
index 3d9e284..c65d34c 100644
--- a/test/Transforms/ConstProp/phi.ll
+++ b/test/Transforms/ConstProp/phi.ll
@@ -1,4 +1,4 @@
-; This is a basic sanity check for constant propogation.  The add instruction 
+; This is a basic sanity check for constant propagation.  The add instruction
 ; should be eliminated.
 
 ; RUN: opt < %s -constprop -die -S | not grep phi
diff --git a/test/Transforms/DeadArgElim/2007-12-20-ParamAttrs.ll b/test/Transforms/DeadArgElim/2007-12-20-ParamAttrs.ll
index 0e9c4f7..7c6c575 100644
--- a/test/Transforms/DeadArgElim/2007-12-20-ParamAttrs.ll
+++ b/test/Transforms/DeadArgElim/2007-12-20-ParamAttrs.ll
@@ -9,12 +9,12 @@
 
 @g = global i8 0
 
-define internal i8 @foo(i8* inreg %p, i8 signext %y, ... ) zeroext nounwind {
+define internal zeroext i8 @foo(i8* inreg %p, i8 signext %y, ... )  nounwind {
 	store i8 %y, i8* @g
 	ret i8 0
 }
 
 define i32 @bar() {
-	%A = call i8(i8*, i8, ...)* @foo(i8* inreg null, i8 signext 1, %struct* byval null ) zeroext nounwind
+	%A = call zeroext i8(i8*, i8, ...)* @foo(i8* inreg null, i8 signext 1, %struct* byval null ) nounwind
 	ret i32 0
 }
diff --git a/test/Transforms/DeadArgElim/2008-06-23-DeadAfterLive.ll b/test/Transforms/DeadArgElim/2008-06-23-DeadAfterLive.ll
index adfd019..858c935 100644
--- a/test/Transforms/DeadArgElim/2008-06-23-DeadAfterLive.ll
+++ b/test/Transforms/DeadArgElim/2008-06-23-DeadAfterLive.ll
@@ -2,7 +2,7 @@
 ; RUN: cat %t | grep 123
 
 ; This test tries to catch wrongful removal of return values for a specific case
-; that was break llvm-gcc builds.
+; that was breaking llvm-gcc builds.
 
 ; This function has a live return value, it is used by @alive.
 define internal i32 @test5() {
diff --git a/test/Transforms/DeadStoreElimination/free.ll b/test/Transforms/DeadStoreElimination/free.ll
index 3c980cc..aa3f0ab 100644
--- a/test/Transforms/DeadStoreElimination/free.ll
+++ b/test/Transforms/DeadStoreElimination/free.ll
@@ -9,7 +9,8 @@ target datalayout = "e-p:64:64:64"
 define void @test(i32* %Q, i32* %P) {
         %DEAD = load i32* %Q            ; <i32> [#uses=1]
         store i32 %DEAD, i32* %P
-        free i32* %P
+        %1 = bitcast i32* %P to i8*
+        tail call void @free(i8* %1)
         ret void
 }
 
@@ -20,7 +21,8 @@ define void @test(i32* %Q, i32* %P) {
 define void @test2({i32, i32}* %P) {
 	%Q = getelementptr {i32, i32} *%P, i32 0, i32 1
 	store i32 4, i32* %Q
-	free {i32,i32}* %P
+        %1 = bitcast {i32, i32}* %P to i8*
+        tail call void @free(i8* %1)
 	ret void
 }
 
diff --git a/test/Transforms/DeadStoreElimination/simple.ll b/test/Transforms/DeadStoreElimination/simple.ll
index a61eac9..23576da 100644
--- a/test/Transforms/DeadStoreElimination/simple.ll
+++ b/test/Transforms/DeadStoreElimination/simple.ll
@@ -236,3 +236,20 @@ define void @test18(i8* %P, i8* %Q, i8* %R) nounwind ssp {
 ; CHECK-NEXT: call void @llvm.memcpy
 ; CHECK-NEXT: ret
 }
+
+
+; The store here is not dead because the byval call reads it.
+declare void @test19f({i32}* byval align 4 %P)
+
+define void @test19({i32} * nocapture byval align 4 %arg5) nounwind ssp {
+bb:
+  %tmp7 = getelementptr inbounds {i32}* %arg5, i32 0, i32 0
+  store i32 912, i32* %tmp7
+  call void @test19f({i32}* byval align 4 %arg5)
+  ret void
+
+; CHECK: @test19(
+; CHECK: store i32 912
+; CHECK: call void @test19f
+}
+
diff --git a/test/Transforms/FunctionAttrs/2009-05-06-Malloc.ll b/test/Transforms/FunctionAttrs/2009-05-06-Malloc.ll
deleted file mode 100644
index 488e6a9..0000000
--- a/test/Transforms/FunctionAttrs/2009-05-06-Malloc.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: opt < %s -functionattrs -S | not grep read
-; PR3754
-
-define i8* @m(i32 %size) {
-	%tmp = malloc i8, i32 %size		; <i8*> [#uses=1]
-	ret i8* %tmp
-}
diff --git a/test/Transforms/GVN/2007-07-26-InterlockingLoops.ll b/test/Transforms/GVN/2007-07-26-InterlockingLoops.ll
index 14cb91b..a1cc008 100644
--- a/test/Transforms/GVN/2007-07-26-InterlockingLoops.ll
+++ b/test/Transforms/GVN/2007-07-26-InterlockingLoops.ll
@@ -4,8 +4,11 @@
 
 define i32 @NextRootMove(i32 %wtm) {
 entry:
+        %A = alloca i32*
 	%tmp17618 = load i32** getelementptr ([65 x i32*]* @last, i32 0, i32 1), align 4
+        store i32* %tmp17618, i32** %A
 ; CHECK: entry:
+; CHECK-NEXT: alloca i32
 ; CHECK-NEXT: %tmp17618 = load
 ; CHECK-NOT: load
 ; CHECK-NOT: phi
@@ -16,6 +19,7 @@ cond_true116:
 
 cond_true128:
 	%tmp17625 = load i32** getelementptr ([65 x i32*]* @last, i32 0, i32 1), align 4
+        store i32* %tmp17625, i32** %A
 	br i1 false, label %bb98.backedge, label %return.loopexit
 
 bb98.backedge:
@@ -23,6 +27,7 @@ bb98.backedge:
 
 cond_true145:
 	%tmp17631 = load i32** getelementptr ([65 x i32*]* @last, i32 0, i32 1), align 4
+        store i32* %tmp17631, i32** %A
 	br i1 false, label %bb98.backedge, label %return.loopexit
 
 return.loopexit:
diff --git a/test/Transforms/GVN/2008-07-02-Unreachable.ll b/test/Transforms/GVN/2008-07-02-Unreachable.ll
index be69cfc..407940b 100644
--- a/test/Transforms/GVN/2008-07-02-Unreachable.ll
+++ b/test/Transforms/GVN/2008-07-02-Unreachable.ll
@@ -5,6 +5,7 @@
 
 define i8 @func_1() nounwind  {
 entry:
+  %A = alloca i8
 	br i1 false, label %ifelse, label %ifthen
 
 ifthen:		; preds = %entry
@@ -12,6 +13,7 @@ ifthen:		; preds = %entry
 
 ifelse:		; preds = %entry
 	%tmp3 = load i8* @g_3		; <i8> [#uses=0]
+        store i8 %tmp3, i8* %A
 	br label %forcond.thread
 
 forcond.thread:		; preds = %ifelse
diff --git a/test/Transforms/GVN/2011-06-01-NonLocalMemdepMiscompile.ll b/test/Transforms/GVN/2011-06-01-NonLocalMemdepMiscompile.ll
new file mode 100644
index 0000000..f24e956
--- /dev/null
+++ b/test/Transforms/GVN/2011-06-01-NonLocalMemdepMiscompile.ll
@@ -0,0 +1,61 @@
+; RUN: opt < %s -basicaa -gvn -S | FileCheck %s
+; This test is checking that (a) this doesn't crash, and (b) we don't
+; conclude the value of %tmp17 is available in bb1.bb15_crit_edge.
+; rdar://9429882
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.7.0"
+
+define i1 @rb_intern() nounwind ssp {
+; CHECK: @rb_intern
+
+bb:
+  %tmp = alloca i8*, align 8
+  store i8* null, i8** %tmp, align 8
+  store i8 undef, i8* null, align 536870912
+  br label %bb1
+
+bb1:
+  br i1 undef, label %bb3, label %bb15
+
+; CHECK: bb1:
+; CHECK: %tmp16 = phi i8* [ getelementptr (i8* null, i64 undef), %bb10 ], [ null, %bb ]
+
+; CHECK: bb1.bb15_crit_edge:
+; CHECK: %tmp17.pre = load i8* %tmp16, align 1
+
+bb3:
+  call void @isalnum()
+  br i1 undef, label %bb10, label %bb5
+
+bb5:
+  br i1 undef, label %bb10, label %bb6
+
+bb6:
+  %tmp7 = load i8** %tmp, align 8
+  %tmp8 = load i8* %tmp7, align 1
+  %tmp9 = zext i8 %tmp8 to i64
+  br i1 undef, label %bb15, label %bb10
+
+bb10:
+  %tmp11 = load i8** %tmp, align 8
+  %tmp12 = load i8* %tmp11, align 1
+  %tmp13 = zext i8 %tmp12 to i64
+  %tmp14 = getelementptr inbounds i8* null, i64 undef
+  store i8* %tmp14, i8** %tmp, align 8
+  br label %bb1
+
+bb15:
+  %tmp16 = load i8** %tmp, align 8
+  %tmp17 = load i8* %tmp16, align 1
+  %tmp18 = icmp eq i8 %tmp17, 0
+  br label %bb19
+
+; CHECK: bb15:
+; CHECK: %tmp17 = phi i8 [ %tmp17.pre, %bb1.bb15_crit_edge ], [ %tmp8, %bb6 ]
+
+bb19:                                             ; preds = %bb15
+  ret i1 %tmp18
+}
+
+declare void @isalnum() nounwind inlinehint ssp
diff --git a/test/Transforms/GVN/crash.ll b/test/Transforms/GVN/crash.ll
index 4a3aa1c..31eae25 100644
--- a/test/Transforms/GVN/crash.ll
+++ b/test/Transforms/GVN/crash.ll
@@ -151,3 +151,15 @@ dead:
 dead2:
   ret i32 %A
 }
+
+
+; PR9841
+define fastcc i8 @test5(i8* %P) nounwind {
+entry:
+  %0 = load i8* %P, align 2
+
+  %Q = getelementptr i8* %P, i32 1
+  %1 = load i8* %Q, align 1
+  ret i8 %1
+}
+
diff --git a/test/Transforms/GVN/invariant-simple.ll b/test/Transforms/GVN/invariant-simple.ll
deleted file mode 100644
index 98ea48c..0000000
--- a/test/Transforms/GVN/invariant-simple.ll
+++ /dev/null
@@ -1,36 +0,0 @@
-; RUN: opt < %s -basicaa -gvn -S | FileCheck %s
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-target triple = "i386-apple-darwin7"
-
-define i8 @test(i8* %P) nounwind {
-; CHECK: @test
-; CHECK-NOT: load
-; CHECK: ret i8
-entry:
-  store i8 1, i8* %P
-  %0 = call {}* @llvm.invariant.start(i64 32, i8* %P)
-  %1 = tail call i32 @foo(i8* %P)
-  call void @llvm.invariant.end({}* %0, i64 32, i8* %P)
-  %2 = load i8* %P
-  ret i8 %2
-}
-
-define i8 @test2(i8* %P) nounwind {
-; CHECK: @test2
-; CHECK: store i8 1
-; CHECK: store i8 2
-; CHECK: ret i8 0
-entry:
-  store i8 1, i8* %P
-  %0 = call {}* @llvm.invariant.start(i64 32, i8* %P)
-  %1 = tail call i32 @bar(i8* %P)
-  call void @llvm.invariant.end({}* %0, i64 32, i8* %P)
-  store i8 2, i8* %P
-  ret i8 0
-}
-
-declare i32 @foo(i8*) nounwind 
-declare i32 @bar(i8*) nounwind readonly
-declare {}* @llvm.invariant.start(i64 %S, i8* nocapture %P) readonly
-declare void @llvm.invariant.end({}* %S, i64 %SS, i8* nocapture %P)
diff --git a/test/Transforms/GVN/mixed.ll b/test/Transforms/GVN/mixed.ll
deleted file mode 100644
index 6bfada2..0000000
--- a/test/Transforms/GVN/mixed.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: opt < %s -basicaa -gvn -S | not grep DEADLOAD
-; RUN: opt < %s -basicaa -gvn -S | not grep DEADGEP
-
-define i32 @main(i32** %p) {
-block1:
-	%z1 = load i32** %p
-	%z2 = getelementptr i32* %z1, i32 0
-	%z3 = load i32* %z2
-	%DEADLOAD = load i32** %p
-	%DEADGEP = getelementptr i32* %DEADLOAD, i32 0
-	%DEADLOAD2 = load i32* %DEADGEP
-	ret i32 %DEADLOAD2
-}
diff --git a/test/Transforms/GVN/phi-translate-partial-alias.ll b/test/Transforms/GVN/phi-translate-partial-alias.ll
new file mode 100644
index 0000000..47bec41
--- /dev/null
+++ b/test/Transforms/GVN/phi-translate-partial-alias.ll
@@ -0,0 +1,27 @@
+; RUN: opt -basicaa -gvn -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64"
+
+; GVN shouldn't PRE the load around the loop backedge because it's
+; not actually redundant around the loop backedge, despite appearances
+; if phi-translation is ignored.
+
+; CHECK: define void @test0(i8* %begin)
+; CHECK: loop:
+; CHECK:   %l0 = load i8* %phi
+; CHECK:   call void @bar(i8 %l0)
+; CHECK:   %l1 = load i8* %phi
+define void @test0(i8* %begin) {
+entry:
+  br label %loop
+
+loop:
+  %phi = phi i8* [ %begin, %entry ], [ %next, %loop ]
+  %l0 = load i8* %phi
+  call void @bar(i8 %l0)
+  %l1 = load i8* %phi
+  %next = getelementptr inbounds i8* %phi, i8 %l1
+  br label %loop
+}
+
+declare void @bar(i8)
diff --git a/test/Transforms/GVN/preserve-tbaa.ll b/test/Transforms/GVN/preserve-tbaa.ll
index 2fcfc47..a936755 100644
--- a/test/Transforms/GVN/preserve-tbaa.ll
+++ b/test/Transforms/GVN/preserve-tbaa.ll
@@ -5,9 +5,9 @@ target datalayout = "e-p:64:64:64"
 ; GVN should preserve the TBAA tag on loads when doing PRE.
 
 ; CHECK: @test
-; CHECK: %tmp33.pre = load i16* undef, align 2, !tbaa !0
+; CHECK: %tmp33.pre = load i16* %P, align 2, !tbaa !0
 ; CHECK: br label %for.body
-define void @test() nounwind {
+define void @test(i16 *%P, i16* %Q) nounwind {
 entry:
   br i1 undef, label %bb.nph, label %for.end
 
@@ -15,8 +15,10 @@ bb.nph:                                           ; preds = %entry
   br label %for.body
 
 for.body:                                         ; preds = %for.body, %bb.nph
-  %tmp33 = load i16* undef, align 2, !tbaa !0
-  store i16 undef, i16* undef, align 2, !tbaa !0
+  %tmp33 = load i16* %P, align 2, !tbaa !0
+  store i16 %tmp33, i16* %Q
+
+  store i16 0, i16* %P, align 2, !tbaa !0
   br i1 false, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body, %entry
diff --git a/test/Transforms/GVN/rle.ll b/test/Transforms/GVN/rle.ll
index 2e43321..28b1fc7 100644
--- a/test/Transforms/GVN/rle.ll
+++ b/test/Transforms/GVN/rle.ll
@@ -1,7 +1,7 @@
-; RUN: opt < %s -basicaa -gvn -S | FileCheck %s
+; RUN: opt < %s -basicaa -gvn -S -die | FileCheck %s
 
 ; 32-bit little endian target.
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
 
 ;; Trivial RLE test.
 define i32 @test0(i32 %V, i32* %P) {
@@ -360,8 +360,11 @@ Cont:
 
 define i32 @chained_load(i32** %p) {
 block1:
+  %A = alloca i32*
+
   %z = load i32** %p
-	br i1 true, label %block2, label %block3
+  store i32* %z, i32** %A
+  br i1 true, label %block2, label %block3
 
 block2:
  %a = load i32** %p
@@ -544,3 +547,99 @@ entry:
 ; CHECK: ret i32 0
 }
 
+
+;;===----------------------------------------------------------------------===;;
+;; Load -> Load forwarding in partial alias case.
+;;===----------------------------------------------------------------------===;;
+
+define i32 @load_load_partial_alias(i8* %P) nounwind ssp {
+entry:
+  %0 = bitcast i8* %P to i32*
+  %tmp2 = load i32* %0
+  %add.ptr = getelementptr inbounds i8* %P, i64 1
+  %tmp5 = load i8* %add.ptr
+  %conv = zext i8 %tmp5 to i32
+  %add = add nsw i32 %tmp2, %conv
+  ret i32 %add
+
+; TEMPORARILYDISABLED: @load_load_partial_alias
+; TEMPORARILYDISABLED: load i32*
+; TEMPORARILYDISABLED-NOT: load
+; TEMPORARILYDISABLED: lshr i32 {{.*}}, 8
+; TEMPORARILYDISABLED-NOT: load
+; TEMPORARILYDISABLED: trunc i32 {{.*}} to i8
+; TEMPORARILYDISABLED-NOT: load
+; TEMPORARILYDISABLED: ret i32
+}
+
+
+; Cross block partial alias case.
+define i32 @load_load_partial_alias_cross_block(i8* %P) nounwind ssp {
+entry:
+  %xx = bitcast i8* %P to i32*
+  %x1 = load i32* %xx, align 4
+  %cmp = icmp eq i32 %x1, 127
+  br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true:                                    ; preds = %entry
+  %arrayidx4 = getelementptr inbounds i8* %P, i64 1
+  %tmp5 = load i8* %arrayidx4, align 1
+  %conv6 = zext i8 %tmp5 to i32
+  ret i32 %conv6
+
+if.end:
+  ret i32 52
+; TEMPORARILY_DISABLED: @load_load_partial_alias_cross_block
+; TEMPORARILY_DISABLED: land.lhs.true:
+; TEMPORARILY_DISABLED-NOT: load i8
+; TEMPORARILY_DISABLED: ret i32 %conv6
+}
+
+
+;;===----------------------------------------------------------------------===;;
+;; Load Widening
+;;===----------------------------------------------------------------------===;;
+
+%widening1 = type { i32, i8, i8, i8, i8 }
+
+@f = global %widening1 zeroinitializer, align 4
+
+define i32 @test_widening1(i8* %P) nounwind ssp noredzone {
+entry:
+  %tmp = load i8* getelementptr inbounds (%widening1* @f, i64 0, i32 1), align 4
+  %conv = zext i8 %tmp to i32
+  %tmp1 = load i8* getelementptr inbounds (%widening1* @f, i64 0, i32 2), align 1
+  %conv2 = zext i8 %tmp1 to i32
+  %add = add nsw i32 %conv, %conv2
+  ret i32 %add
+; CHECK: @test_widening1
+; CHECK-NOT: load
+; CHECK: load i16*
+; CHECK-NOT: load
+; CHECK-ret i32
+}
+
+define i32 @test_widening2() nounwind ssp noredzone {
+entry:
+  %tmp = load i8* getelementptr inbounds (%widening1* @f, i64 0, i32 1), align 4
+  %conv = zext i8 %tmp to i32
+  %tmp1 = load i8* getelementptr inbounds (%widening1* @f, i64 0, i32 2), align 1
+  %conv2 = zext i8 %tmp1 to i32
+  %add = add nsw i32 %conv, %conv2
+
+  %tmp2 = load i8* getelementptr inbounds (%widening1* @f, i64 0, i32 3), align 2
+  %conv3 = zext i8 %tmp2 to i32
+  %add2 = add nsw i32 %add, %conv3
+
+  %tmp3 = load i8* getelementptr inbounds (%widening1* @f, i64 0, i32 4), align 1
+  %conv4 = zext i8 %tmp3 to i32
+  %add3 = add nsw i32 %add2, %conv3
+
+  ret i32 %add3
+; CHECK: @test_widening2
+; CHECK-NOT: load
+; CHECK: load i32*
+; CHECK-NOT: load
+; CHECK-ret i32
+}
+
diff --git a/test/Transforms/GlobalOpt/2007-04-05-Crash.ll b/test/Transforms/GlobalOpt/2007-04-05-Crash.ll
index d306d14..c7aca62 100644
--- a/test/Transforms/GlobalOpt/2007-04-05-Crash.ll
+++ b/test/Transforms/GlobalOpt/2007-04-05-Crash.ll
@@ -6,7 +6,7 @@ target triple = "thumb-apple-darwin8"
 @"L_OBJC_IMAGE_INFO" = internal global [2 x i32] zeroinitializer		; <[2 x i32]*> [#uses=1]
 @llvm.used = appending global [1 x i8*] [ i8* bitcast ([2 x i32]* @"L_OBJC_IMAGE_INFO" to i8*) ]		; <[1 x i8*]*> [#uses=0]
 
-define i16 @__NSCharToUnicharCFWrapper(i8 zeroext  %ch) zeroext  {
+define zeroext i16 @__NSCharToUnicharCFWrapper(i8 zeroext  %ch)   {
 entry:
 	%iftmp.0.0.in.in = select i1 false, i16* @replacementUnichar, i16* null		; <i16*> [#uses=1]
 	%iftmp.0.0.in = load i16* %iftmp.0.0.in.in		; <i16> [#uses=1]
diff --git a/test/Transforms/GlobalOpt/2008-12-16-HeapSRACrash-2.ll b/test/Transforms/GlobalOpt/2008-12-16-HeapSRACrash-2.ll
index 3242e1e..b74e4fc 100644
--- a/test/Transforms/GlobalOpt/2008-12-16-HeapSRACrash-2.ll
+++ b/test/Transforms/GlobalOpt/2008-12-16-HeapSRACrash-2.ll
@@ -6,12 +6,16 @@ target triple = "i386-apple-darwin7"
 
 define void @bar(i32 %Size) nounwind noinline {
 entry:
-	%tmp = malloc [1000000 x %struct.foo]		; <[1000000 x %struct.foo]*> [#uses=1]
+        %malloccall = tail call i8* @malloc(i32 trunc (i64 mul (i64 ptrtoint (i32* getelementptr (i32* null, i32 1) to i64), i64 2000000) to i32))
+        %tmp = bitcast i8* %malloccall to [1000000 x %struct.foo]*
 	%.sub = getelementptr [1000000 x %struct.foo]* %tmp, i32 0, i32 0		; <%struct.foo*> [#uses=1]
 	store %struct.foo* %.sub, %struct.foo** @X, align 4
 	ret void
 }
 
+declare noalias i8* @malloc(i32)
+
+
 define i32 @baz() nounwind readonly noinline {
 bb1.thread:
 	%tmpLD1 = load %struct.foo** @X, align 4		; <%struct.foo*> [#uses=2]
diff --git a/test/Transforms/GlobalOpt/2008-12-16-HeapSRACrash.ll b/test/Transforms/GlobalOpt/2008-12-16-HeapSRACrash.ll
index 51dcac1..613cb7b 100644
--- a/test/Transforms/GlobalOpt/2008-12-16-HeapSRACrash.ll
+++ b/test/Transforms/GlobalOpt/2008-12-16-HeapSRACrash.ll
@@ -6,12 +6,15 @@ target triple = "i386-apple-darwin7"
 
 define void @bar(i32 %Size) nounwind noinline {
 entry:
-	%tmp = malloc [1000000 x %struct.foo]		; <[1000000 x %struct.foo]*> [#uses=1]
+        %malloccall = tail call i8* @malloc(i32 trunc (i64 mul (i64 ptrtoint (i32* getelementptr (i32* null, i32 1) to i64), i64 2000000) to i32))
+        %tmp = bitcast i8* %malloccall to [1000000 x %struct.foo]*
 	%.sub = getelementptr [1000000 x %struct.foo]* %tmp, i32 0, i32 0		; <%struct.foo*> [#uses=1]
 	store %struct.foo* %.sub, %struct.foo** @X, align 4
 	ret void
 }
 
+declare noalias i8* @malloc(i32)
+
 define i32 @baz() nounwind readonly noinline {
 bb1.thread:
 	%tmpLD1 = load %struct.foo** @X, align 4		; <%struct.foo*> [#uses=3]
diff --git a/test/Transforms/GlobalOpt/2011-04-09-EmptyGlobalCtors.ll b/test/Transforms/GlobalOpt/2011-04-09-EmptyGlobalCtors.ll
new file mode 100644
index 0000000..321a487
--- /dev/null
+++ b/test/Transforms/GlobalOpt/2011-04-09-EmptyGlobalCtors.ll
@@ -0,0 +1,5 @@
+; RUN: opt < %s -globalopt -disable-output
+
+%0 = type { i32, void ()* }
+@llvm.global_ctors = appending global [0 x %0] zeroinitializer
+
diff --git a/test/Transforms/GlobalOpt/crash.ll b/test/Transforms/GlobalOpt/crash.ll
index 9da5a5e..366a874 100644
--- a/test/Transforms/GlobalOpt/crash.ll
+++ b/test/Transforms/GlobalOpt/crash.ll
@@ -64,3 +64,17 @@ define void @memset_with_strange_user() ssp {
   ret void
 }
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
+
+; PR9856
+@g_52 = internal global i32** null, align 8
+@g_90 = external global i32*, align 8
+
+define void @icmp_user_of_stored_once() nounwind ssp {
+entry:
+  %tmp4 = load i32*** @g_52, align 8
+  store i32** @g_90, i32*** @g_52
+  %cmp17 = icmp ne i32*** undef, @g_52
+  ret void
+}
+
diff --git a/test/Transforms/GlobalOpt/memset-null.ll b/test/Transforms/GlobalOpt/memset-null.ll
new file mode 100644
index 0000000..0153402
--- /dev/null
+++ b/test/Transforms/GlobalOpt/memset-null.ll
@@ -0,0 +1,29 @@
+; RUN: opt -globalopt %s -S -o - | FileCheck %s
+; PR10047
+
+%0 = type { i32, void ()* }
+%struct.A = type { [100 x i32] }
+
+; CHECK: @a
+@a = global %struct.A zeroinitializer, align 4
+@llvm.global_ctors = appending global [2 x %0] [%0 { i32 65535, void ()* @_GLOBAL__I_a }, %0 { i32 65535, void ()* @_GLOBAL__I_b }]
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
+; CHECK-NOT: GLOBAL__I_a
+define internal void @_GLOBAL__I_a() nounwind {
+entry:
+  tail call void @llvm.memset.p0i8.i64(i8* bitcast (%struct.A* @a to i8*), i8 0, i64 400, i32 4, i1 false) nounwind
+  ret void
+}
+
+%struct.X = type { i8 }
+@y = global i8* null, align 8
+@x = global %struct.X zeroinitializer, align 1
+
+define internal void @_GLOBAL__I_b() nounwind {
+entry:
+  %tmp.i.i.i = load i8** @y, align 8
+  tail call void @llvm.memset.p0i8.i64(i8* %tmp.i.i.i, i8 0, i64 10, i32 1, i1 false) nounwind
+  ret void
+}
diff --git a/test/Transforms/IndVarSimplify/2008-09-02-IVType.ll b/test/Transforms/IndVarSimplify/2008-09-02-IVType.ll
index 288431a..a004831 100644
--- a/test/Transforms/IndVarSimplify/2008-09-02-IVType.ll
+++ b/test/Transforms/IndVarSimplify/2008-09-02-IVType.ll
@@ -16,7 +16,7 @@
 	%struct.YUVGeneralParams = type { i16*, i8*, i8*, i8*, i8*, i8*, void (i8*, i16**, i32, %struct.YUVGeneralParams*)*, i16, i16, i16, [6 x i8], void (i8*, i16**, i32, %struct.YUVGeneralParams*)*, i16, i16 }
 @llvm.used = appending global [1 x i8*] [ i8* bitcast (i16 (%struct.JPEGGlobals*)* @ExtractBufferedBlocksIgnored to i8*) ], section "llvm.metadata"		; <[1 x i8*]*> [#uses=0]
 
-define i16 @ExtractBufferedBlocksIgnored(%struct.JPEGGlobals* %globp) signext nounwind {
+define signext i16 @ExtractBufferedBlocksIgnored(%struct.JPEGGlobals* %globp)  nounwind {
 entry:
 	%tmp4311 = getelementptr %struct.JPEGGlobals* %globp, i32 0, i32 70		; <i32*> [#uses=1]
 	%tmp4412 = load i32* %tmp4311, align 16		; <i32> [#uses=2]
diff --git a/test/Transforms/IndVarSimplify/2009-04-27-Floating.ll b/test/Transforms/IndVarSimplify/2009-04-27-Floating.ll
index 9fd2d2f..47164d8 100644
--- a/test/Transforms/IndVarSimplify/2009-04-27-Floating.ll
+++ b/test/Transforms/IndVarSimplify/2009-04-27-Floating.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -indvars -S | grep icmp | grep next
+; RUN: opt < %s -indvars -S | FileCheck %s
 ; PR4086
 declare void @foo()
 
@@ -6,13 +6,14 @@ define void @test() {
 entry:
         br label %loop_body
 
-loop_body:              
-        %i = phi float [ %nexti, %loop_body ], [ 0.0, %entry ]          
+loop_body:
+        %i = phi float [ %nexti, %loop_body ], [ 0.0, %entry ]
         tail call void @foo()
         %nexti = fadd float %i, 1.0
-        %less = fcmp olt float %nexti, 2.0              
+        ; CHECK: icmp ne i32 %{{[a-zA-Z$._0-9]+}}, 2
+        %less = fcmp olt float %nexti, 2.0
         br i1 %less, label %loop_body, label %done
 
-done:           
+done:
         ret void
 }
diff --git a/test/Transforms/IndVarSimplify/elim-extend.ll b/test/Transforms/IndVarSimplify/elim-extend.ll
new file mode 100644
index 0000000..0367e11
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/elim-extend.ll
@@ -0,0 +1,153 @@
+; RUN: opt < %s -indvars -disable-iv-rewrite -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+; IV with constant start, preinc and postinc sign extends, with and without NSW.
+; IV rewrite only removes one sext. WidenIVs removes all three.
+define void @postincConstIV(i8* %base, i32 %limit) nounwind {
+entry:
+  br label %loop
+; CHECK: loop:
+; CHECK-NOT: sext
+; CHECK: exit:
+loop:
+  %iv = phi i32 [ %postiv, %loop ], [ 0, %entry ]
+  %ivnsw = phi i32 [ %postivnsw, %loop ], [ 0, %entry ]
+  %preofs = sext i32 %iv to i64
+  %preadr = getelementptr i8* %base, i64 %preofs
+  store i8 0, i8* %preadr
+  %postiv = add i32 %iv, 1
+  %postofs = sext i32 %postiv to i64
+  %postadr = getelementptr i8* %base, i64 %postofs
+  store i8 0, i8* %postadr
+  %postivnsw = add nsw i32 %ivnsw, 1
+  %postofsnsw = sext i32 %postivnsw to i64
+  %postadrnsw = getelementptr i8* %base, i64 %postofsnsw
+  store i8 0, i8* %postadrnsw
+  %cond = icmp sgt i32 %limit, %iv
+  br i1 %cond, label %loop, label %exit
+exit:
+  br label %return
+return:
+  ret void
+}
+
+; IV with nonconstant start, preinc and postinc sign extends,
+; with and without NSW.
+; As with postincConstIV, WidenIVs removes all three sexts.
+define void @postincVarIV(i8* %base, i32 %init, i32 %limit) nounwind {
+entry:
+  %precond = icmp sgt i32 %limit, %init
+  br i1 %precond, label %loop, label %return
+; CHECK: loop:
+; CHECK-NOT: sext
+; CHECK: exit:
+loop:
+  %iv = phi i32 [ %postiv, %loop ], [ %init, %entry ]
+  %ivnsw = phi i32 [ %postivnsw, %loop ], [ %init, %entry ]
+  %preofs = sext i32 %iv to i64
+  %preadr = getelementptr i8* %base, i64 %preofs
+  store i8 0, i8* %preadr
+  %postiv = add i32 %iv, 1
+  %postofs = sext i32 %postiv to i64
+  %postadr = getelementptr i8* %base, i64 %postofs
+  store i8 0, i8* %postadr
+  %postivnsw = add nsw i32 %ivnsw, 1
+  %postofsnsw = sext i32 %postivnsw to i64
+  %postadrnsw = getelementptr i8* %base, i64 %postofsnsw
+  store i8 0, i8* %postadrnsw
+  %cond = icmp sgt i32 %limit, %postiv
+  br i1 %cond, label %loop, label %exit
+exit:
+  br label %return
+return:
+  ret void
+}
+
+; Test sign extend elimination in the inner and outer loop.
+; %outercount is straightforward to widen, besides being in an outer loop.
+; %innercount is currently blocked by lcssa, so is not widened.
+; %inneriv can be widened only after proving it has no signed-overflow
+;   based on the loop test.
+define void @nestedIV(i8* %address, i32 %limit) nounwind {
+entry:
+  %limitdec = add i32 %limit, -1
+  br label %outerloop
+
+; CHECK: outerloop:
+;
+; Eliminate %ofs1 after widening outercount.
+; CHECK-NOT: sext
+; CHECK: getelementptr
+;
+; IV rewriting hoists a gep into this block. We don't like that.
+; CHECK-NOT: getelementptr
+outerloop:
+  %outercount   = phi i32 [ %outerpostcount, %outermerge ], [ 0, %entry ]
+  %innercount = phi i32 [ %innercount.merge, %outermerge ], [ 0, %entry ]
+
+  %outercountdec = add i32 %outercount, -1
+  %ofs1 = sext i32 %outercountdec to i64
+  %adr1 = getelementptr i8* %address, i64 %ofs1
+  store i8 0, i8* %adr1
+
+  br label %innerpreheader
+
+innerpreheader:
+  %innerprecmp = icmp sgt i32 %limitdec, %innercount
+  br i1 %innerprecmp, label %innerloop, label %outermerge
+
+; CHECK: innerloop:
+;
+; Eliminate %ofs2 after widening inneriv.
+; Eliminate %ofs3 after normalizing sext(innerpostiv)
+; CHECK-NOT: sext
+; CHECK: getelementptr
+;
+; FIXME: We should check that indvars does not increase the number of
+; IVs in this loop. sext elimination plus LFTR currently results in 2 final
+; IVs. Waiting to remove LFTR.
+innerloop:
+  %inneriv = phi i32 [ %innerpostiv, %innerloop ], [ %innercount, %innerpreheader ]
+  %innerpostiv = add i32 %inneriv, 1
+
+  %ofs2 = sext i32 %inneriv to i64
+  %adr2 = getelementptr i8* %address, i64 %ofs2
+  store i8 0, i8* %adr2
+
+  %ofs3 = sext i32 %innerpostiv to i64
+  %adr3 = getelementptr i8* %address, i64 %ofs3
+  store i8 0, i8* %adr3
+
+  %innercmp = icmp sgt i32 %limitdec, %innerpostiv
+  br i1 %innercmp, label %innerloop, label %innerexit
+
+innerexit:
+  %innercount.lcssa = phi i32 [ %innerpostiv, %innerloop ]
+  br label %outermerge
+
+; CHECK: outermerge:
+;
+; Eliminate %ofs4 after widening outercount
+; CHECK-NOT: sext
+; CHECK: getelementptr
+;
+; TODO: Eliminate %ofs5 after removing lcssa
+outermerge:
+  %innercount.merge = phi i32 [ %innercount.lcssa, %innerexit ], [ %innercount, %innerpreheader ]
+
+  %ofs4 = sext i32 %outercount to i64
+  %adr4 = getelementptr i8* %address, i64 %ofs4
+  store i8 0, i8* %adr4
+
+  %ofs5 = sext i32 %innercount.merge to i64
+  %adr5 = getelementptr i8* %address, i64 %ofs5
+  store i8 0, i8* %adr5
+
+  %outerpostcount = add i32 %outercount, 1
+  %tmp47 = icmp slt i32 %outerpostcount, %limit
+  br i1 %tmp47, label %outerloop, label %return
+
+return:
+  ret void
+}
diff --git a/test/Transforms/IndVarSimplify/iv-sext.ll b/test/Transforms/IndVarSimplify/iv-sext.ll
index 3e90873..6c7a627 100644
--- a/test/Transforms/IndVarSimplify/iv-sext.ll
+++ b/test/Transforms/IndVarSimplify/iv-sext.ll
@@ -1,6 +1,4 @@
-; RUN: opt < %s -indvars -S > %t
-; RUN: grep {= sext} %t | count 4
-; RUN: grep {phi i64} %t | count 2
+; RUN: opt < %s -indvars -S | FileCheck %s
 
 ; Indvars should be able to promote the hiPart induction variable in the
 ; inner loop to i64.
@@ -18,6 +16,9 @@ bb.nph22:		; preds = %entry
 	%tmp3 = add i32 %bandEdgeIndex, -1		; <i32> [#uses=2]
 	br label %bb
 
+; CHECK: bb:
+; CHECK: phi i64
+; CHECK-NOT: phi i64
 bb:		; preds = %bb8, %bb.nph22
 	%distERBhi.121 = phi float [ %distERBhi.2.lcssa, %bb8 ], [ 0.000000e+00, %bb.nph22 ]		; <float> [#uses=2]
 	%distERBlo.120 = phi float [ %distERBlo.0.lcssa, %bb8 ], [ 0.000000e+00, %bb.nph22 ]		; <float> [#uses=2]
@@ -28,6 +29,7 @@ bb:		; preds = %bb8, %bb.nph22
 	%tmp4 = icmp sgt i32 %part.016, 0		; <i1> [#uses=1]
 	br i1 %tmp4, label %bb1, label %bb3.preheader
 
+; CHECK: bb1:
 bb1:		; preds = %bb
 	%tmp5 = add i32 %part.016, -1		; <i32> [#uses=1]
 	%tmp6 = sext i32 %tmp5 to i64		; <i64> [#uses=1]
@@ -86,7 +88,10 @@ bb5.preheader:		; preds = %bb3.bb5.preheader_crit_edge, %bb3.preheader
 
 bb.nph12:		; preds = %bb5.preheader
 	br label %bb4
-
+; CHECK: bb4:
+; CHECK: phi i64
+; CHECK-NOT: phi i64
+; CHECK-NOT: sext
 bb4:		; preds = %bb5, %bb.nph12
 	%distERBhi.29 = phi float [ %tmp30, %bb5 ], [ %distERBhi.0.ph, %bb.nph12 ]		; <float> [#uses=1]
 	%hiPart.08 = phi i32 [ %tmp31, %bb5 ], [ %hiPart.119, %bb.nph12 ]		; <i32> [#uses=2]
@@ -102,6 +107,7 @@ bb4:		; preds = %bb5, %bb.nph12
 	%tmp35 = fadd float %tmp34, %peakCount.27		; <float> [#uses=2]
 	br label %bb5
 
+; CHECK: bb5:
 bb5:		; preds = %bb4
 	%.not = fcmp olt float %tmp30, 2.500000e+00		; <i1> [#uses=1]
 	%tmp36 = icmp sgt i32 %tmp3, %tmp31		; <i1> [#uses=1]
diff --git a/test/Transforms/IndVarSimplify/iv-zext.ll b/test/Transforms/IndVarSimplify/iv-zext.ll
index 80a77b6..00018ec 100644
--- a/test/Transforms/IndVarSimplify/iv-zext.ll
+++ b/test/Transforms/IndVarSimplify/iv-zext.ll
@@ -1,6 +1,6 @@
-; RUN: opt < %s -indvars -S > %t
-; RUN: not grep and %t
-; RUN: not grep zext %t
+; RUN: opt < %s -indvars -S | FileCheck %s
+; CHECK-NOT: and
+; CHECK-NOT: zext
 
 target datalayout = "-p:64:64:64-n:32:64"
 
diff --git a/test/Transforms/IndVarSimplify/no-iv-rewrite.ll b/test/Transforms/IndVarSimplify/no-iv-rewrite.ll
new file mode 100644
index 0000000..c35feef
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/no-iv-rewrite.ll
@@ -0,0 +1,123 @@
+; RUN: opt < %s -indvars -disable-iv-rewrite -S | FileCheck %s
+;
+; Make sure that indvars isn't inserting canonical IVs.
+; This is kinda hard to do until linear function test replacement is removed.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+define i32 @sum(i32* %arr, i32 %n) nounwind {
+entry:
+  %precond = icmp slt i32 0, %n
+  br i1 %precond, label %ph, label %return
+
+ph:
+  br label %loop
+
+; CHECK: loop:
+;
+; We should only have 2 IVs.
+; CHECK: phi
+; CHECK: phi
+; CHECK-NOT: phi
+;
+; sext should be eliminated while preserving gep inboundsness.
+; CHECK-NOT: sext
+; CHECK: getelementptr inbounds
+loop:
+  %i.02 = phi i32 [ 0, %ph ], [ %iinc, %loop ]
+  %s.01 = phi i32 [ 0, %ph ], [ %sinc, %loop ]
+  %ofs = sext i32 %i.02 to i64
+  %adr = getelementptr inbounds i32* %arr, i64 %ofs
+  %val = load i32* %adr
+  %sinc = add nsw i32 %s.01, %val
+  %iinc = add nsw i32 %i.02, 1
+  %cond = icmp slt i32 %iinc, %n
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  %s.lcssa = phi i32 [ %sinc, %loop ]
+  br label %return
+
+return:
+  %s.0.lcssa = phi i32 [ %s.lcssa, %exit ], [ 0, %entry ]
+  ret i32 %s.0.lcssa
+}
+
+define i64 @suml(i32* %arr, i32 %n) nounwind {
+entry:
+  %precond = icmp slt i32 0, %n
+  br i1 %precond, label %ph, label %return
+
+ph:
+  br label %loop
+
+; CHECK: loop:
+;
+; We should only have 2 IVs.
+; CHECK: phi
+; CHECK: phi
+; CHECK-NOT: phi
+;
+; %ofs sext should be eliminated while preserving gep inboundsness.
+; CHECK-NOT: sext
+; CHECK: getelementptr inbounds
+; %vall sext should obviously not be eliminated
+; CHECK: sext
+loop:
+  %i.02 = phi i32 [ 0, %ph ], [ %iinc, %loop ]
+  %s.01 = phi i64 [ 0, %ph ], [ %sinc, %loop ]
+  %ofs = sext i32 %i.02 to i64
+  %adr = getelementptr inbounds i32* %arr, i64 %ofs
+  %val = load i32* %adr
+  %vall = sext i32 %val to i64
+  %sinc = add nsw i64 %s.01, %vall
+  %iinc = add nsw i32 %i.02, 1
+  %cond = icmp slt i32 %iinc, %n
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  %s.lcssa = phi i64 [ %sinc, %loop ]
+  br label %return
+
+return:
+  %s.0.lcssa = phi i64 [ %s.lcssa, %exit ], [ 0, %entry ]
+  ret i64 %s.0.lcssa
+}
+
+define void @outofbounds(i32* %first, i32* %last, i32 %idx) nounwind {
+  %precond = icmp ne i32* %first, %last
+  br i1 %precond, label %ph, label %return
+
+; CHECK: ph:
+; It's not indvars' job to perform LICM on %ofs
+; CHECK-NOT: sext
+ph:
+  br label %loop
+
+; CHECK: loop:
+;
+; Preserve exactly one pointer type IV.
+; CHECK: phi i32*
+; CHECK-NOT: phi
+;
+; Don't create any extra adds.
+; CHECK-NOT: add
+;
+; Preserve gep inboundsness, and don't factor it.
+; CHECK: getelementptr inbounds i32* %ptriv, i32 1
+; CHECK-NOT: add
+loop:
+  %ptriv = phi i32* [ %first, %ph ], [ %ptrpost, %loop ]
+  %ofs = sext i32 %idx to i64
+  %adr = getelementptr inbounds i32* %ptriv, i64 %ofs
+  store i32 3, i32* %adr
+  %ptrpost = getelementptr inbounds i32* %ptriv, i32 1
+  %cond = icmp ne i32* %ptrpost, %last
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  br label %return
+
+return:
+  ret void
+}
diff --git a/test/Transforms/IndVarSimplify/phi-uses-value-multiple-times.ll b/test/Transforms/IndVarSimplify/phi-uses-value-multiple-times.ll
index 34d432b..52c9e5c 100644
--- a/test/Transforms/IndVarSimplify/phi-uses-value-multiple-times.ll
+++ b/test/Transforms/IndVarSimplify/phi-uses-value-multiple-times.ll
@@ -1,4 +1,8 @@
-; RUN: opt < %s -indvars
+; RUN: opt < %s -indvars -disable-output -stats -info-output-file - | FileCheck %s
+; Check that IndVarSimplify is not creating unnecessary canonical IVs
+; that will never be used.
+; CHECK-NOT: indvars
+
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
 @ue = external global i64
diff --git a/test/Transforms/Inline/array_merge.ll b/test/Transforms/Inline/array_merge.ll
index 0d176b8..b2eafeb 100644
--- a/test/Transforms/Inline/array_merge.ll
+++ b/test/Transforms/Inline/array_merge.ll
@@ -19,7 +19,7 @@ entry:
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: %A.i = alloca
 ; CHECK-NEXT: %B.i = alloca
-; CHECK-NEXT: call void
+; CHECK-NOT: alloca
   call void @foo() nounwind
   call void @foo() nounwind
   ret void
diff --git a/test/Transforms/Inline/inline_invoke.ll b/test/Transforms/Inline/inline_invoke.ll
new file mode 100644
index 0000000..2a1b883
--- /dev/null
+++ b/test/Transforms/Inline/inline_invoke.ll
@@ -0,0 +1,336 @@
+; RUN: opt < %s -inline -S | FileCheck %s
+
+; Test that the inliner correctly handles inlining into invoke sites
+; by appending selectors and forwarding _Unwind_Resume directly to the
+; enclosing landing pad.
+
+;; Test 0 - basic functionality.
+
+%struct.A = type { i8 }
+
+@_ZTIi = external constant i8*
+
+declare void @_ZN1AC1Ev(%struct.A*)
+
+declare void @_ZN1AD1Ev(%struct.A*)
+
+declare void @use(i32) nounwind
+
+declare void @opaque()
+
+declare i8* @llvm.eh.exception() nounwind readonly
+
+declare i32 @llvm.eh.selector(i8*, i8*, ...) nounwind
+
+declare i32 @llvm.eh.typeid.for(i8*) nounwind
+
+declare void @llvm.eh.resume(i8*, i32)
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+declare void @_ZSt9terminatev()
+
+define internal void @test0_in() alwaysinline uwtable ssp {
+entry:
+  %a = alloca %struct.A, align 1
+  %b = alloca %struct.A, align 1
+  call void @_ZN1AC1Ev(%struct.A* %a)
+  invoke void @_ZN1AC1Ev(%struct.A* %b)
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+  invoke void @_ZN1AD1Ev(%struct.A* %b)
+          to label %invoke.cont1 unwind label %lpad
+
+invoke.cont1:
+  call void @_ZN1AD1Ev(%struct.A* %a)
+  ret void
+
+lpad:
+  %exn = call i8* @llvm.eh.exception() nounwind
+  %eh.selector = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i32 0) nounwind
+  invoke void @_ZN1AD1Ev(%struct.A* %a)
+          to label %invoke.cont2 unwind label %terminate.lpad
+
+invoke.cont2:
+  call void @llvm.eh.resume(i8* %exn, i32 %eh.selector) noreturn
+  unreachable
+
+terminate.lpad:
+  %exn3 = call i8* @llvm.eh.exception() nounwind
+  %eh.selector4 = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn3, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* null) nounwind
+  call void @_ZSt9terminatev() noreturn nounwind
+  unreachable
+}
+
+define void @test0_out() uwtable ssp {
+entry:
+  invoke void @test0_in()
+          to label %ret unwind label %lpad
+
+ret:
+  ret void
+
+lpad:                                             ; preds = %entry
+  %exn = call i8* @llvm.eh.exception() nounwind
+  %eh.selector = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* bitcast (i8** @_ZTIi to i8*)) nounwind
+  %0 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) nounwind
+  %1 = icmp eq i32 %eh.selector, %0
+  br i1 %1, label %catch, label %eh.resume
+
+catch:
+  %ignored = call i8* @__cxa_begin_catch(i8* %exn) nounwind
+  call void @__cxa_end_catch() nounwind
+  br label %ret
+
+eh.resume:
+  call void @llvm.eh.resume(i8* %exn, i32 %eh.selector) noreturn
+  unreachable
+}
+
+; CHECK:    define void @test0_out()
+; CHECK:      [[A:%.*]] = alloca %struct.A,
+; CHECK:      [[B:%.*]] = alloca %struct.A,
+; CHECK:      invoke void @_ZN1AC1Ev(%struct.A* [[A]])
+; CHECK:      invoke void @_ZN1AC1Ev(%struct.A* [[B]])
+; CHECK:      invoke void @_ZN1AD1Ev(%struct.A* [[B]])
+; CHECK:      invoke void @_ZN1AD1Ev(%struct.A* [[A]])
+; CHECK:      call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* {{%.*}}, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i32 0, i8* bitcast (i8** @_ZTIi to i8*))
+; CHECK-NEXT: invoke void @_ZN1AD1Ev(%struct.A* [[A]])
+; CHECK-NEXT:   to label %[[LBL:[^\s]+]] unwind
+; CHECK: [[LBL]]:
+; CHECK-NEXT: br label %[[LPAD:[^\s]+]]
+; CHECK:      ret void
+; CHECK:      call i8* @llvm.eh.exception()
+; CHECK-NEXT: call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* {{%.*}}, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* bitcast (i8** @_ZTIi to i8*))
+; CHECK-NEXT: br label %[[LPAD]]
+; CHECK: [[LPAD]]:
+; CHECK-NEXT: phi i8* [
+; CHECK-NEXT: phi i32 [
+; CHECK-NEXT: call i32 @llvm.eh.typeid.for(
+
+
+;; Test 1 - Correctly handle phis in outer landing pads.
+
+define void @test1_out() uwtable ssp {
+entry:
+  invoke void @test0_in()
+          to label %cont unwind label %lpad
+
+cont:
+  invoke void @test0_in()
+          to label %ret unwind label %lpad
+
+ret:
+  ret void
+
+lpad:
+  %x = phi i32 [ 0, %entry ], [ 1, %cont ]
+  %y = phi i32 [ 1, %entry ], [ 4, %cont ]
+  %exn = call i8* @llvm.eh.exception() nounwind
+  %eh.selector = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* bitcast (i8** @_ZTIi to i8*)) nounwind
+  %0 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) nounwind
+  %1 = icmp eq i32 %eh.selector, %0
+  br i1 %1, label %catch, label %eh.resume
+
+catch:
+  %ignored = call i8* @__cxa_begin_catch(i8* %exn) nounwind
+  call void @use(i32 %x)
+  call void @use(i32 %y)
+  call void @__cxa_end_catch() nounwind
+  br label %ret
+
+eh.resume:
+  call void @llvm.eh.resume(i8* %exn, i32 %eh.selector) noreturn
+  unreachable
+}
+
+; CHECK:    define void @test1_out()
+; CHECK:      [[A2:%.*]] = alloca %struct.A,
+; CHECK:      [[B2:%.*]] = alloca %struct.A,
+; CHECK:      [[A1:%.*]] = alloca %struct.A,
+; CHECK:      [[B1:%.*]] = alloca %struct.A,
+; CHECK:      invoke void @_ZN1AC1Ev(%struct.A* [[A1]])
+; CHECK-NEXT:   unwind label %[[LPAD:[^\s]+]]
+; CHECK:      invoke void @_ZN1AC1Ev(%struct.A* [[B1]])
+; CHECK-NEXT:   unwind label %[[LPAD1:[^\s]+]]
+; CHECK:      invoke void @_ZN1AD1Ev(%struct.A* [[B1]])
+; CHECK-NEXT:   unwind label %[[LPAD1]]
+; CHECK:      invoke void @_ZN1AD1Ev(%struct.A* [[A1]])
+; CHECK-NEXT:   unwind label %[[LPAD]]
+
+; Inner landing pad from first inlining.
+; CHECK:    [[LPAD1]]:
+; CHECK-NEXT: [[EXN1:%.*]] = call i8* @llvm.eh.exception()
+; CHECK-NEXT: [[SEL1:%.*]] = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* [[EXN1]], i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i32 0, i8* bitcast (i8** @_ZTIi to i8*))
+; CHECK-NEXT: invoke void @_ZN1AD1Ev(%struct.A* [[A1]])
+; CHECK-NEXT:   to label %[[RESUME1:[^\s]+]] unwind
+; CHECK: [[RESUME1]]:
+; CHECK-NEXT: br label %[[LPAD_JOIN1:[^\s]+]]
+
+; CHECK:      invoke void @_ZN1AC1Ev(%struct.A* [[A2]])
+; CHECK-NEXT:   unwind label %[[LPAD]]
+; CHECK:      invoke void @_ZN1AC1Ev(%struct.A* [[B2]])
+; CHECK-NEXT:   unwind label %[[LPAD2:[^\s]+]]
+; CHECK:      invoke void @_ZN1AD1Ev(%struct.A* [[B2]])
+; CHECK-NEXT:   unwind label %[[LPAD2]]
+; CHECK:      invoke void @_ZN1AD1Ev(%struct.A* [[A2]])
+; CHECK-NEXT:   unwind label %[[LPAD]]
+
+; Inner landing pad from second inlining.
+; CHECK:    [[LPAD2]]:
+; CHECK-NEXT: [[EXN2:%.*]] = call i8* @llvm.eh.exception()
+; CHECK-NEXT: [[SEL2:%.*]] = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* [[EXN2]], i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i32 0, i8* bitcast (i8** @_ZTIi to i8*))
+; CHECK-NEXT: invoke void @_ZN1AD1Ev(%struct.A* [[A2]])
+; CHECK-NEXT:   to label %[[RESUME2:[^\s]+]] unwind
+; CHECK: [[RESUME2]]:
+; CHECK-NEXT: br label %[[LPAD_JOIN2:[^\s]+]]
+
+; CHECK:      ret void
+
+; CHECK:    [[LPAD]]:
+; CHECK-NEXT: [[X:%.*]] = phi i32 [ 0, %entry ], [ 0, {{%.*}} ], [ 1, %cont ], [ 1, {{%.*}} ]
+; CHECK-NEXT: [[Y:%.*]] = phi i32 [ 1, %entry ], [ 1, {{%.*}} ], [ 4, %cont ], [ 4, {{%.*}} ]
+; CHECK-NEXT: [[EXN:%.*]] = call i8* @llvm.eh.exception()
+; CHECK-NEXT: [[SEL:%.*]] = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* [[EXN]], i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* bitcast (i8** @_ZTIi to i8*))
+; CHECK-NEXT: br label %[[LPAD_JOIN2]]
+
+; CHECK: [[LPAD_JOIN2]]:
+; CHECK-NEXT: [[XJ2:%.*]] = phi i32 [ [[X]], %[[LPAD]] ], [ 1, %[[RESUME2]] ]
+; CHECK-NEXT: [[YJ2:%.*]] = phi i32 [ [[Y]], %[[LPAD]] ], [ 4, %[[RESUME2]] ]
+; CHECK-NEXT: [[EXNJ2:%.*]] = phi i8* [ [[EXN]], %[[LPAD]] ], [ [[EXN2]], %[[RESUME2]] ]
+; CHECK-NEXT: [[SELJ2:%.*]] = phi i32 [ [[SEL]], %[[LPAD]] ], [ [[SEL2]], %[[RESUME2]] ]
+; CHECK-NEXT: br label %[[LPAD_JOIN1]]
+
+; CHECK: [[LPAD_JOIN1]]:
+; CHECK-NEXT: [[XJ1:%.*]] = phi i32 [ [[XJ2]], %[[LPAD_JOIN2]] ], [ 0, %[[RESUME1]] ]
+; CHECK-NEXT: [[YJ1:%.*]] = phi i32 [ [[YJ2]], %[[LPAD_JOIN2]] ], [ 1, %[[RESUME1]] ]
+; CHECK-NEXT: [[EXNJ1:%.*]] = phi i8* [ [[EXNJ2]], %[[LPAD_JOIN2]] ], [ [[EXN1]], %[[RESUME1]] ]
+; CHECK-NEXT: [[SELJ1:%.*]] = phi i32 [ [[SELJ2]], %[[LPAD_JOIN2]] ], [ [[SEL1]], %[[RESUME1]] ]
+; CHECK-NEXT: [[T:%.*]] = call i32 @llvm.eh.typeid.for(
+; CHECK-NEXT: icmp eq i32 [[SELJ1]], [[T]]
+
+; CHECK:      call void @use(i32 [[XJ1]])
+; CHECK:      call void @use(i32 [[YJ1]])
+
+; CHECK:      call void @llvm.eh.resume(i8* [[EXNJ1]], i32 [[SELJ1]])
+
+
+;; Test 2 - Don't make invalid IR for inlines into landing pads without eh.exception calls
+define void @test2_out() uwtable ssp {
+entry:
+  invoke void @test0_in()
+          to label %ret unwind label %lpad
+
+ret:
+  ret void
+
+lpad:
+  call void @_ZSt9terminatev()
+  unreachable
+}
+
+; CHECK: define void @test2_out()
+; CHECK:      [[A:%.*]] = alloca %struct.A,
+; CHECK:      [[B:%.*]] = alloca %struct.A,
+; CHECK:      invoke void @_ZN1AC1Ev(%struct.A* [[A]])
+; CHECK-NEXT:   unwind label %[[LPAD:[^\s]+]]
+; CHECK:      invoke void @_ZN1AC1Ev(%struct.A* [[B]])
+; CHECK-NEXT:   unwind label %[[LPAD2:[^\s]+]]
+; CHECK:      invoke void @_ZN1AD1Ev(%struct.A* [[B]])
+; CHECK-NEXT:   unwind label %[[LPAD2]]
+; CHECK:      invoke void @_ZN1AD1Ev(%struct.A* [[A]])
+; CHECK-NEXT:   unwind label %[[LPAD]]
+
+
+;; Test 3 - Deal correctly with split unwind edges.
+define void @test3_out() uwtable ssp {
+entry:
+  invoke void @test0_in()
+          to label %ret unwind label %lpad
+
+ret:
+  ret void
+
+lpad:
+  br label %lpad.cont
+
+lpad.cont:
+  %exn = call i8* @llvm.eh.exception() nounwind
+  %eh.selector = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* bitcast (i8** @_ZTIi to i8*)) nounwind
+  call void @_ZSt9terminatev()
+  unreachable
+}
+
+; CHECK: define void @test3_out()
+; CHECK:      call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* {{%.*}}, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i32 0, i8* bitcast (i8** @_ZTIi to i8*))
+; CHECK-NEXT: invoke void @_ZN1AD1Ev(
+; CHECK-NEXT:   to label %[[L:[^\s]+]] unwind
+; CHECK:    [[L]]:
+; CHECK-NEXT: br label %[[JOIN:[^\s]+]]
+; CHECK:    [[JOIN]]:
+; CHECK-NEXT: phi
+; CHECK-NEXT: phi
+; CHECK-NEXT: br label %lpad.cont
+; CHECK:    lpad.cont:
+; CHECK-NEXT: call void @_ZSt9terminatev()
+
+
+;; Test 4 - Split unwind edges with a dominance problem
+define void @test4_out() uwtable ssp {
+entry:
+  invoke void @test0_in()
+          to label %cont unwind label %lpad.crit
+
+cont:
+  invoke void @opaque()
+          to label %ret unwind label %lpad
+
+ret:
+  ret void
+
+lpad.crit:
+  call void @opaque() nounwind
+  br label %lpad
+
+lpad:
+  %phi = phi i32 [ 0, %lpad.crit ], [ 1, %cont ]
+  %exn = call i8* @llvm.eh.exception() nounwind
+  %eh.selector = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* bitcast (i8** @_ZTIi to i8*)) nounwind
+  call void @use(i32 %phi)
+  call void @_ZSt9terminatev()
+  unreachable
+}
+
+; CHECK: define void @test4_out()
+; CHECK:      call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* {{%.*}}, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i32 0, i8* bitcast (i8** @_ZTIi to i8*))
+; CHECK-NEXT: invoke void @_ZN1AD1Ev(
+; CHECK-NEXT:   to label %[[L:[^\s]+]] unwind
+; CHECK:    [[L]]:
+; CHECK-NEXT: br label %[[JOIN:[^\s]+]]
+; CHECK:      invoke void @opaque()
+; CHECK-NEXT:                  unwind label %lpad
+; CHECK:    lpad.crit:
+; CHECK-NEXT: call i8* @llvm.eh.exception()
+; CHECK-NEXT: call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %4, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* bitcast (i8** @_ZTIi to i8*))
+; CHECK-NEXT: br label %[[JOIN]]
+; CHECK:    [[JOIN]]:
+; CHECK-NEXT: phi i8*
+; CHECK-NEXT: phi i32
+; CHECK-NEXT: call void @opaque() nounwind
+; CHECK-NEXT: br label %[[FIX:[^\s]+]]
+; CHECK:    lpad:
+; CHECK-NEXT: [[T0:%.*]] = phi i32 [ 1, %cont ]
+; CHECK-NEXT: call i8* @llvm.eh.exception() nounwind
+; CHECK-NEXT: call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* bitcast (i8** @_ZTIi to i8*))
+; CHECK-NEXT: br label %[[FIX]]
+; CHECK:    [[FIX]]:
+; CHECK-NEXT: [[T1:%.*]] = phi i32 [ [[T0]], %lpad ], [ 0, %[[JOIN]] ]
+; CHECK-NEXT: phi i8*
+; CHECK-NEXT: phi i32
+; CHECK-NEXT: call void @use(i32 [[T1]])
+; CHECK-NEXT: call void @_ZSt9terminatev()
diff --git a/test/Transforms/Inline/lifetime.ll b/test/Transforms/Inline/lifetime.ll
new file mode 100644
index 0000000..a95c836
--- /dev/null
+++ b/test/Transforms/Inline/lifetime.ll
@@ -0,0 +1,78 @@
+; RUN: opt -inline %s -S -o - | FileCheck %s
+
+declare void @llvm.lifetime.start(i64, i8*)
+declare void @llvm.lifetime.end(i64, i8*)
+
+define void @helper_both_markers() {
+  %a = alloca i8
+  call void @llvm.lifetime.start(i64 1, i8* %a)
+  call void @llvm.lifetime.end(i64 1, i8* %a)
+  ret void
+}
+
+define void @test_both_markers() {
+; CHECK: @test_both_markers
+; CHECK: llvm.lifetime.start(i64 1
+; CHECK-NEXT: llvm.lifetime.end(i64 1
+  call void @helper_both_markers()
+; CHECK-NEXT: llvm.lifetime.start(i64 1
+; CHECK-NEXT: llvm.lifetime.end(i64 1
+  call void @helper_both_markers()
+; CHECK-NEXT: ret void
+  ret void
+}
+
+;; Without this, the inliner will simplify out @test_no_marker before adding
+;; any lifetime markers.
+declare void @use(i8* %a)
+
+define void @helper_no_markers() {
+  %a = alloca i8
+  call void @use(i8* %a)
+  ret void
+}
+
+;; We can't use CHECK-NEXT because there's an extra call void @use in between.
+;; Instead, we use CHECK-NOT to verify that there are no other lifetime calls.
+define void @test_no_marker() {
+; CHECK: @test_no_marker
+; CHECK-NOT: lifetime
+; CHECK: llvm.lifetime.start(i64 -1
+; CHECK-NOT: lifetime
+; CHECK: llvm.lifetime.end(i64 -1
+  call void @helper_no_markers()
+; CHECK-NOT: lifetime
+; CHECK: llvm.lifetime.start(i64 -1
+; CHECK-NOT: lifetime
+; CHECK: llvm.lifetime.end(i64 -1
+  call void @helper_no_markers()
+; CHECK-NOT: lifetime
+; CHECK: ret void
+  ret void
+}
+
+define void @helper_two_casts() {
+  %a = alloca i32
+  %b = bitcast i32* %a to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %b)
+  %c = bitcast i32* %a to i8*
+  call void @llvm.lifetime.end(i64 4, i8* %c)
+  ret void
+}
+
+define void @test_two_casts() {
+; CHECK: @test_two_casts
+; CHECK-NOT: lifetime
+; CHECK: llvm.lifetime.start(i64 4
+; CHECK-NOT: lifetime
+; CHECK: llvm.lifetime.end(i64 4
+  call void @helper_two_casts()
+; CHECK-NOT: lifetime
+; CHECK: llvm.lifetime.start(i64 4
+; CHECK-NOT: lifetime
+; CHECK: llvm.lifetime.end(i64 4
+  call void @helper_two_casts()
+; CHECK-NOT: lifetime
+; CHECK: ret void
+  ret void
+}
diff --git a/test/Transforms/InstCombine/2007-04-04-BadFoldBitcastIntoMalloc.ll b/test/Transforms/InstCombine/2007-04-04-BadFoldBitcastIntoMalloc.ll
deleted file mode 100644
index b59d3c8..0000000
--- a/test/Transforms/InstCombine/2007-04-04-BadFoldBitcastIntoMalloc.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; In the presence of a negative offset (the -8 below), a fold of a bitcast into
-; a malloc messes up the element count, causing an extra 4GB to be allocated on
-; 64-bit targets.
-;
-; RUN: opt < %s -instcombine -S | not grep {= add }
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
-target triple = "x86_64-unknown-freebsd6.2"
-
-define i1 @test(i32 %tmp141, double** %tmp145)
-{
-  %tmp133 = add i32 %tmp141, 1
-  %tmp134 = shl i32 %tmp133, 3
-  %tmp135 = add i32 %tmp134, -8
-  %tmp136 = malloc i8, i32 %tmp135
-  %tmp137 = bitcast i8* %tmp136 to double*
-  store double* %tmp137, double** %tmp145
-  ret i1 false
-}
diff --git a/test/Transforms/InstCombine/2007-05-18-CastFoldBug.ll b/test/Transforms/InstCombine/2007-05-18-CastFoldBug.ll
index 40818d4..1c24df3 100644
--- a/test/Transforms/InstCombine/2007-05-18-CastFoldBug.ll
+++ b/test/Transforms/InstCombine/2007-05-18-CastFoldBug.ll
@@ -3,7 +3,7 @@
 
 define void @blah(i16* %tmp10) {
 entry:
-	call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend_stret to void (i16* sret )*)( i16* %tmp10 sret  )
+	call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend_stret to void (i16* sret )*)( i16*  sret %tmp10  )
 	ret void
 }
 
diff --git a/test/Transforms/InstCombine/2007-11-25-CompatibleAttributes.ll b/test/Transforms/InstCombine/2007-11-25-CompatibleAttributes.ll
index 24394c6..2109d34 100644
--- a/test/Transforms/InstCombine/2007-11-25-CompatibleAttributes.ll
+++ b/test/Transforms/InstCombine/2007-11-25-CompatibleAttributes.ll
@@ -5,7 +5,7 @@
 
 define i32 @main(i32 %argc, i8** %argv) {
 entry:
-	%tmp32 = tail call i32 (i8* noalias , ...) nounwind * bitcast (i32 (i8*, ...) nounwind * @printf to i32 (i8* noalias , ...) nounwind *)( i8* getelementptr ([4 x i8]* @.str, i32 0, i32 0) noalias , i32 0 ) nounwind 		; <i32> [#uses=0]
+	%tmp32 = tail call i32 (i8* noalias , ...) * bitcast (i32 (i8*, ...) nounwind * @printf to i32 (i8* noalias , ...) nounwind *)( i8* getelementptr ([4 x i8]* @.str, i32 0, i32 0)  , i32 0 ) nounwind 		; <i32> [#uses=0]
 	ret i32 undef
 }
 
diff --git a/test/Transforms/InstCombine/2008-01-06-BitCastAttributes.ll b/test/Transforms/InstCombine/2008-01-06-BitCastAttributes.ll
index 5f4fa47..23b6067 100644
--- a/test/Transforms/InstCombine/2008-01-06-BitCastAttributes.ll
+++ b/test/Transforms/InstCombine/2008-01-06-BitCastAttributes.ll
@@ -5,7 +5,7 @@ define void @a() {
 	ret void
 }
 
-define i32 @b(i32* inreg  %x) signext  {
+define signext i32 @b(i32* inreg  %x)   {
 	ret i32 0
 }
 
diff --git a/test/Transforms/InstCombine/2008-01-13-NoBitCastAttributes.ll b/test/Transforms/InstCombine/2008-01-13-NoBitCastAttributes.ll
index 7b3281f..510a68c 100644
--- a/test/Transforms/InstCombine/2008-01-13-NoBitCastAttributes.ll
+++ b/test/Transforms/InstCombine/2008-01-13-NoBitCastAttributes.ll
@@ -1,6 +1,6 @@
 ; RUN: opt < %s -instcombine -S | grep bitcast | count 2
 
-define i32 @b(i32* inreg  %x) signext  {
+define signext i32 @b(i32* inreg  %x)   {
 	ret i32 0
 }
 
diff --git a/test/Transforms/InstCombine/2011-05-02-VectorBoolean.ll b/test/Transforms/InstCombine/2011-05-02-VectorBoolean.ll
new file mode 100644
index 0000000..02b64e3
--- /dev/null
+++ b/test/Transforms/InstCombine/2011-05-02-VectorBoolean.ll
@@ -0,0 +1,15 @@
+; RUN: opt < %s -instcombine
+; PR9579
+
+define <2 x i16> @entry(<2 x i16> %a) nounwind {
+entry:
+  %a.addr = alloca <2 x i16>, align 4
+  %.compoundliteral = alloca <2 x i16>, align 4
+  store <2 x i16> %a, <2 x i16>* %a.addr, align 4
+  %tmp = load <2 x i16>* %a.addr, align 4
+  store <2 x i16> zeroinitializer, <2 x i16>* %.compoundliteral
+  %tmp1 = load <2 x i16>* %.compoundliteral
+  %cmp = icmp uge <2 x i16> %tmp, %tmp1
+  %sext = sext <2 x i1> %cmp to <2 x i16>
+  ret <2 x i16> %sext
+}
diff --git a/test/Transforms/InstCombine/2011-05-13-InBoundsGEP.ll b/test/Transforms/InstCombine/2011-05-13-InBoundsGEP.ll
new file mode 100644
index 0000000..fba7239
--- /dev/null
+++ b/test/Transforms/InstCombine/2011-05-13-InBoundsGEP.ll
@@ -0,0 +1,21 @@
+; RUN: opt < %s -S -instcombine | FileCheck %s
+; rdar://problem/9267970
+; ideally this test will run on a 32-bit host
+; must not discard GEPs that might overflow at runtime (aren't inbounds)
+
+define i32 @main(i32 %argc) {
+entry:
+    %tmp1 = add i32 %argc, -2
+    %tmp2 = add i32 %argc, 1879048192
+    %p = alloca i8
+; CHECK: getelementptr
+    %p1 = getelementptr i8* %p, i32 %tmp1
+; CHECK: getelementptr
+    %p2 = getelementptr i8* %p, i32 %tmp2
+    %cmp = icmp ult i8* %p1, %p2
+    br i1 %cmp, label %bbtrue, label %bbfalse
+bbtrue:          ; preds = %entry
+    ret i32 -1
+bbfalse:         ; preds = %entry
+    ret i32 0
+}
diff --git a/test/Transforms/InstCombine/2011-05-28-swapmulsub.ll b/test/Transforms/InstCombine/2011-05-28-swapmulsub.ll
new file mode 100644
index 0000000..b096d1f
--- /dev/null
+++ b/test/Transforms/InstCombine/2011-05-28-swapmulsub.ll
@@ -0,0 +1,57 @@
+; ModuleID = 'test1.c'
+; RUN: opt -S -instcombine < %s | FileCheck %s
+target triple = "x86_64-apple-macosx10.6.6"
+
+define zeroext i16 @foo1(i32 %on_off) nounwind uwtable ssp {
+entry:
+  %on_off.addr = alloca i32, align 4
+  %a = alloca i32, align 4
+  store i32 %on_off, i32* %on_off.addr, align 4
+  %tmp = load i32* %on_off.addr, align 4
+  %sub = sub i32 1, %tmp
+; CHECK-NOT: mul i32
+  %mul = mul i32 %sub, -2
+; CHECK: shl
+; CHECK-NEXT: add
+  store i32 %mul, i32* %a, align 4
+  %tmp1 = load i32* %a, align 4
+  %conv = trunc i32 %tmp1 to i16
+  ret i16 %conv
+}
+
+define zeroext i16 @foo2(i32 %on_off, i32 %q) nounwind uwtable ssp {
+entry:
+  %on_off.addr = alloca i32, align 4
+  %q.addr = alloca i32, align 4
+  %a = alloca i32, align 4
+  store i32 %on_off, i32* %on_off.addr, align 4
+  store i32 %q, i32* %q.addr, align 4
+  %tmp = load i32* %q.addr, align 4
+  %tmp1 = load i32* %on_off.addr, align 4
+  %sub = sub i32 %tmp, %tmp1
+; CHECK-NOT: mul i32
+  %mul = mul i32 %sub, -4
+; CHECK: sub i32
+; CHECK-NEXT: shl
+  store i32 %mul, i32* %a, align 4
+  %tmp2 = load i32* %a, align 4
+  %conv = trunc i32 %tmp2 to i16
+  ret i16 %conv
+}
+
+define zeroext i16 @foo3(i32 %on_off) nounwind uwtable ssp {
+entry:
+  %on_off.addr = alloca i32, align 4
+  %a = alloca i32, align 4
+  store i32 %on_off, i32* %on_off.addr, align 4
+  %tmp = load i32* %on_off.addr, align 4
+  %sub = sub i32 7, %tmp
+; CHECK-NOT: mul i32
+  %mul = mul i32 %sub, -4
+; CHECK: shl
+; CHECK-NEXT: add
+  store i32 %mul, i32* %a, align 4
+  %tmp1 = load i32* %a, align 4
+  %conv = trunc i32 %tmp1 to i16
+  ret i16 %conv
+}
diff --git a/test/Transforms/InstCombine/2011-06-13-nsw-alloca.ll b/test/Transforms/InstCombine/2011-06-13-nsw-alloca.ll
new file mode 100644
index 0000000..2f72b73
--- /dev/null
+++ b/test/Transforms/InstCombine/2011-06-13-nsw-alloca.ll
@@ -0,0 +1,60 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+target triple = "i386-apple-darwin10.0.0"
+
+define void @fu1(i32 %parm) nounwind ssp {
+  %1 = alloca i32, align 4
+  %ptr = alloca double*, align 4
+  store i32 %parm, i32* %1, align 4
+  store double* null, double** %ptr, align 4
+  %2 = load i32* %1, align 4
+  %3 = icmp ne i32 %2, 0
+  br i1 %3, label %4, label %10
+
+; <label>:4                                       ; preds = %0
+  %5 = load i32* %1, align 4
+  %6 = mul nsw i32 %5, 8
+; With "nsw", the alloca and its bitcast can be fused:
+  %7 = add nsw i32 %6, 2048
+; CHECK: alloca double*
+  %8 = alloca i8, i32 %7
+  %9 = bitcast i8* %8 to double*
+  store double* %9, double** %ptr, align 4
+  br label %10
+
+; <label>:10                                      ; preds = %4, %0
+  %11 = load double** %ptr, align 4
+  call void @bar(double* %11)
+; CHECK: ret
+  ret void
+}
+
+declare void @bar(double*)
+
+define void @fu2(i32 %parm) nounwind ssp {
+  %1 = alloca i32, align 4
+  %ptr = alloca double*, align 4
+  store i32 %parm, i32* %1, align 4
+  store double* null, double** %ptr, align 4
+  %2 = load i32* %1, align 4
+  %3 = icmp ne i32 %2, 0
+  br i1 %3, label %4, label %10
+
+; <label>:4                                       ; preds = %0
+  %5 = load i32* %1, align 4
+  %6 = mul nsw i32 %5, 8
+; Without "nsw", the alloca and its bitcast cannot be fused:
+  %7 = add  i32 %6, 2048
+; CHECK: alloca i8
+  %8 = alloca i8, i32 %7
+; CHECK-NEXT: bitcast i8*
+  %9 = bitcast i8* %8 to double*
+  store double* %9, double** %ptr, align 4
+  br label %10
+
+; <label>:10                                      ; preds = %4, %0
+  %11 = load double** %ptr, align 4
+  call void @bar(double* %11)
+  ret void
+}
+
diff --git a/test/Transforms/InstCombine/and-or-not.ll b/test/Transforms/InstCombine/and-or-not.ll
index 37ec3bc..bd878b0 100644
--- a/test/Transforms/InstCombine/and-or-not.ll
+++ b/test/Transforms/InstCombine/and-or-not.ll
@@ -4,7 +4,7 @@
 
 ; PR1510
 
-; These are all equivelent to A^B
+; These are all equivalent to A^B
 
 define i32 @test1(i32 %a, i32 %b) {
 entry:
diff --git a/test/Transforms/InstCombine/call.ll b/test/Transforms/InstCombine/call.ll
index 2ef8dc0..d084873 100644
--- a/test/Transforms/InstCombine/call.ll
+++ b/test/Transforms/InstCombine/call.ll
@@ -53,8 +53,8 @@ define i8 @test4a() {
 define i32 @test4() {
         %X = call i32 bitcast (i8 ()* @test4a to i32 ()*)( )            ; <i32> [#uses=1]
         ret i32 %X
-; CHECK: %X1 = call i8 @test4a()
-; CHECK: %tmp = zext i8 %X1 to i32
+; CHECK: %X = call i8 @test4a()
+; CHECK: %tmp = zext i8 %X to i32
 ; CHECK: ret i32 %tmp
 }
 
@@ -77,8 +77,8 @@ declare i32 @test6a(i32)
 define i32 @test6() {
         %X = call i32 bitcast (i32 (i32)* @test6a to i32 ()*)( )
         ret i32 %X
-; CHECK: %X1 = call i32 @test6a(i32 0)
-; CHECK: ret i32 %X1
+; CHECK: %X = call i32 @test6a(i32 0)
+; CHECK: ret i32 %X
 }
 
 
diff --git a/test/Transforms/InstCombine/cast.ll b/test/Transforms/InstCombine/cast.ll
index bc5e365..f85636f 100644
--- a/test/Transforms/InstCombine/cast.ll
+++ b/test/Transforms/InstCombine/cast.ll
@@ -99,14 +99,6 @@ define void @test11(i32* %P) {
 ; CHECK: ret void
 }
 
-define i32* @test12() {
-        %p = malloc [4 x i8]            ; <[4 x i8]*> [#uses=1]
-        %c = bitcast [4 x i8]* %p to i32*               ; <i32*> [#uses=1]
-        ret i32* %c
-; CHECK: %malloccall = tail call i8* @malloc(i32 4)
-; CHECK: ret i32* %c
-}
-
 define i8* @test13(i64 %A) {
         %c = getelementptr [0 x i8]* bitcast ([32832 x i8]* @inbuf to [0 x i8]*), i64 0, i64 %A             ; <i8*> [#uses=1]
         ret i8* %c
@@ -265,22 +257,11 @@ define i1 @test31(i64 %A) {
         %C = and i32 %B, 42             ; <i32> [#uses=1]
         %D = icmp eq i32 %C, 10         ; <i1> [#uses=1]
         ret i1 %D
-; CHECK: %C1 = and i64 %A, 42
-; CHECK: %D = icmp eq i64 %C1, 10
+; CHECK: %C = and i64 %A, 42
+; CHECK: %D = icmp eq i64 %C, 10
 ; CHECK: ret i1 %D
 }
 
-define void @test32(double** %tmp) {
-        %tmp8 = malloc [16 x i8]                ; <[16 x i8]*> [#uses=1]
-        %tmp8.upgrd.1 = bitcast [16 x i8]* %tmp8 to double*             ; <double*> [#uses=1]
-        store double* %tmp8.upgrd.1, double** %tmp
-        ret void
-; CHECK: %malloccall = tail call i8* @malloc(i32 16)
-; CHECK: %tmp8.upgrd.1 = bitcast i8* %malloccall to double*
-; CHECK: store double* %tmp8.upgrd.1, double** %tmp
-; CHECK: ret void
-}
-
 define i32 @test33(i32 %c1) {
         %x = bitcast i32 %c1 to float           ; <float> [#uses=1]
         %y = bitcast float %x to i32            ; <i32> [#uses=1]
diff --git a/test/Transforms/InstCombine/div.ll b/test/Transforms/InstCombine/div.ll
index 0d13980..8a0897b 100644
--- a/test/Transforms/InstCombine/div.ll
+++ b/test/Transforms/InstCombine/div.ll
@@ -1,34 +1,44 @@
 ; This test makes sure that div instructions are properly eliminated.
 
-; RUN: opt < %s -instcombine -S | not grep div
+; RUN: opt < %s -instcombine -S | FileCheck %s
 
 define i32 @test1(i32 %A) {
         %B = sdiv i32 %A, 1             ; <i32> [#uses=1]
         ret i32 %B
+; CHECK: @test1
+; CHECK-NEXT: ret i32 %A
 }
 
 define i32 @test2(i32 %A) {
         ; => Shift
         %B = udiv i32 %A, 8             ; <i32> [#uses=1]
         ret i32 %B
+; CHECK: @test2
+; CHECK-NEXT: lshr i32 %A, 3
 }
 
 define i32 @test3(i32 %A) {
         ; => 0, don't need to keep traps
         %B = sdiv i32 0, %A             ; <i32> [#uses=1]
         ret i32 %B
+; CHECK: @test3
+; CHECK-NEXT: ret i32 0
 }
 
 define i32 @test4(i32 %A) {
         ; 0-A
         %B = sdiv i32 %A, -1            ; <i32> [#uses=1]
         ret i32 %B
+; CHECK: @test4
+; CHECK-NEXT: sub i32 0, %A
 }
 
 define i32 @test5(i32 %A) {
         %B = udiv i32 %A, -16           ; <i32> [#uses=1]
         %C = udiv i32 %B, -4            ; <i32> [#uses=1]
         ret i32 %C
+; CHECK: @test5
+; CHECK-NEXT: ret i32 0
 }
 
 define i1 @test6(i32 %A) {
@@ -36,6 +46,8 @@ define i1 @test6(i32 %A) {
         ; A < 123
         %C = icmp eq i32 %B, 0          ; <i1> [#uses=1]
         ret i1 %C
+; CHECK: @test6
+; CHECK-NEXT: icmp ult i32 %A, 123
 }
 
 define i1 @test7(i32 %A) {
@@ -43,6 +55,9 @@ define i1 @test7(i32 %A) {
         ; A >= 20 && A < 30
         %C = icmp eq i32 %B, 2          ; <i1> [#uses=1]
         ret i1 %C
+; CHECK: @test7
+; CHECK-NEXT: add i32 %A, -20
+; CHECK-NEXT: icmp ult i32
 }
 
 define i1 @test8(i8 %A) {
@@ -50,6 +65,8 @@ define i1 @test8(i8 %A) {
         ; A >= 246
         %C = icmp eq i8 %B, 2           ; <i1> [#uses=1]
         ret i1 %C
+; CHECK: @test8
+; CHECK-NEXT: icmp ugt i8 %A, -11
 }
 
 define i1 @test9(i8 %A) {
@@ -57,28 +74,61 @@ define i1 @test9(i8 %A) {
         ; A < 246
         %C = icmp ne i8 %B, 2           ; <i1> [#uses=1]
         ret i1 %C
+; CHECK: @test9
+; CHECK-NEXT: icmp ult i8 %A, -10
 }
 
 define i32 @test10(i32 %X, i1 %C) {
         %V = select i1 %C, i32 64, i32 8                ; <i32> [#uses=1]
         %R = udiv i32 %X, %V            ; <i32> [#uses=1]
         ret i32 %R
+; CHECK: @test10
+; CHECK-NEXT: select i1 %C, i32 6, i32 3
+; CHECK-NEXT: lshr i32 %X
 }
 
 define i32 @test11(i32 %X, i1 %C) {
         %A = select i1 %C, i32 1024, i32 32             ; <i32> [#uses=1]
         %B = udiv i32 %X, %A            ; <i32> [#uses=1]
         ret i32 %B
+; CHECK: @test11
+; CHECK-NEXT: select i1 %C, i32 10, i32 5
+; CHECK-NEXT: lshr i32 %X
 }
 
 ; PR2328
 define i32 @test12(i32 %x) nounwind  {
 	%tmp3 = udiv i32 %x, %x		; 1
 	ret i32 %tmp3
+; CHECK: @test12
+; CHECK-NEXT: ret i32 1
 }
 
 define i32 @test13(i32 %x) nounwind  {
 	%tmp3 = sdiv i32 %x, %x		; 1
 	ret i32 %tmp3
+; CHECK: @test13
+; CHECK-NEXT: ret i32 1
 }
 
+define i32 @test14(i8 %x) nounwind {
+	%zext = zext i8 %x to i32
+	%div = udiv i32 %zext, 257	; 0
+	ret i32 %div
+; CHECK: @test14
+; CHECK-NEXT: ret i32 0
+}
+
+; PR9814
+define i32 @test15(i32 %a, i32 %b) nounwind {
+  %shl = shl i32 1, %b
+  %div = lshr i32 %shl, 2
+  %div2 = udiv i32 %a, %div
+  ret i32 %div2
+; CHECK: @test15
+; CHECK-NEXT: add i32 %b, -2
+; CHECK-NEXT: lshr i32 %a, 
+; CHECK-NEXT: ret i32
+}
+
+
diff --git a/test/Transforms/InstCombine/exact.ll b/test/Transforms/InstCombine/exact.ll
index 58f8b5d..14741e3 100644
--- a/test/Transforms/InstCombine/exact.ll
+++ b/test/Transforms/InstCombine/exact.ll
@@ -96,6 +96,22 @@ define i1 @ashr_icmp2(i64 %X) nounwind {
  ret i1 %Z
 }
 
+; PR9998
+; Make sure we don't transform the ashr here into an sdiv
+; CHECK: @pr9998
+; CHECK: = and i32 %V, 1
+; CHECK: %Z = icmp ne
+; CHECK: ret i1 %Z
+define i1 @pr9998(i32 %V) nounwind {
+entry:
+  %W = shl i32 %V, 31
+  %X = ashr exact i32 %W, 31
+  %Y = sext i32 %X to i64
+  %Z = icmp ugt i64 %Y, 7297771788697658747
+  ret i1 %Z
+}
+
+
 ; CHECK: @udiv_icmp1
 ; CHECK: icmp ne i64 %X, 0
 define i1 @udiv_icmp1(i64 %X) nounwind {
diff --git a/test/Transforms/InstCombine/getelementptr.ll b/test/Transforms/InstCombine/getelementptr.ll
index 9e8547b..b869392 100644
--- a/test/Transforms/InstCombine/getelementptr.ll
+++ b/test/Transforms/InstCombine/getelementptr.ll
@@ -52,14 +52,6 @@ define void @test5(i8 %B) {
 ; CHECK: store i8 %B, i8* getelementptr inbounds ([10 x i8]* @Global, i64 0, i64 4)
 }
 
-define i32* @test6() {
-        %M = malloc [4 x i32] 
-        %A = getelementptr [4 x i32]* %M, i64 0, i64 0
-        %B = getelementptr i32* %A, i64 2             
-        ret i32* %B
-; CHECK: @test6
-; CHECK: getelementptr i8* %malloccall, i64 8
-}
 
 define i32* @test7(i32* %I, i64 %C, i64 %D) {
         %A = getelementptr i32* %I, i64 %C 
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index 7ba4368..c8f7f81 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -494,3 +494,56 @@ define i1 @test51(i32 %X, i32 %Y) {
   %C = icmp sgt i32 %B, -1
   ret i1 %C
 }
+
+; CHECK: @test52
+; CHECK-NEXT: and i32 %x1, 16711935
+; CHECK-NEXT: icmp eq i32 {{.*}}, 4980863
+; CHECK-NEXT: ret i1
+define i1 @test52(i32 %x1) nounwind {
+  %conv = and i32 %x1, 255
+  %cmp = icmp eq i32 %conv, 127
+  %tmp2 = lshr i32 %x1, 16
+  %tmp3 = trunc i32 %tmp2 to i8
+  %cmp15 = icmp eq i8 %tmp3, 76
+
+  %A = and i1 %cmp, %cmp15
+  ret i1 %A
+}
+
+; PR9838
+; CHECK: @test53
+; CHECK-NEXT: ashr exact
+; CHECK-NEXT: ashr
+; CHECK-NEXT: icmp
+define i1 @test53(i32 %a, i32 %b) nounwind {
+ %x = ashr exact i32 %a, 30
+ %y = ashr i32 %b, 30
+ %z = icmp eq i32 %x, %y
+ ret i1 %z
+}
+
+; CHECK: @test54
+; CHECK-NEXT: %and = and i8 %a, -64
+; CHECK-NEXT icmp eq i8 %and, -128
+define i1 @test54(i8 %a) nounwind {
+  %ext = zext i8 %a to i32
+  %and = and i32 %ext, 192
+  %ret = icmp eq i32 %and, 128
+  ret i1 %ret
+}
+
+; CHECK: @test55
+; CHECK-NEXT: icmp eq i32 %a, -123
+define i1 @test55(i32 %a) {
+  %sub = sub i32 0, %a
+  %cmp = icmp eq i32 %sub, 123
+  ret i1 %cmp
+}
+
+; CHECK: @test56
+; CHECK-NEXT: icmp eq i32 %a, -113
+define i1 @test56(i32 %a) {
+  %sub = sub i32 10, %a
+  %cmp = icmp eq i32 %sub, 123
+  ret i1 %cmp
+}
diff --git a/test/Transforms/InstCombine/intrinsics.ll b/test/Transforms/InstCombine/intrinsics.ll
index 332cd46..107f313 100644
--- a/test/Transforms/InstCombine/intrinsics.ll
+++ b/test/Transforms/InstCombine/intrinsics.ll
@@ -30,9 +30,9 @@ define i8 @uaddtest2(i8 %A, i8 %B, i1* %overflowPtr) {
 ; CHECK: @uaddtest2
 ; CHECK-NEXT: %and.A = and i8 %A, 127
 ; CHECK-NEXT: %and.B = and i8 %B, 127
-; CHECK-NEXT: %1 = add nuw i8 %and.A, %and.B
+; CHECK-NEXT: %x = add nuw i8 %and.A, %and.B
 ; CHECK-NEXT: store i1 false, i1* %overflowPtr
-; CHECK-NEXT: ret i8 %1
+; CHECK-NEXT: ret i8 %x
 }
 
 define i8 @uaddtest3(i8 %A, i8 %B, i1* %overflowPtr) {
@@ -46,9 +46,9 @@ define i8 @uaddtest3(i8 %A, i8 %B, i1* %overflowPtr) {
 ; CHECK: @uaddtest3
 ; CHECK-NEXT: %or.A = or i8 %A, -128
 ; CHECK-NEXT: %or.B = or i8 %B, -128
-; CHECK-NEXT: %1 = add i8 %or.A, %or.B
+; CHECK-NEXT: %x = add i8 %or.A, %or.B
 ; CHECK-NEXT: store i1 true, i1* %overflowPtr
-; CHECK-NEXT: ret i8 %1
+; CHECK-NEXT: ret i8 %x
 }
 
 define i8 @uaddtest4(i8 %A, i1* %overflowPtr) {
diff --git a/test/Transforms/InstCombine/malloc-free-delete.ll b/test/Transforms/InstCombine/malloc-free-delete.ll
index 317786f..8455300 100644
--- a/test/Transforms/InstCombine/malloc-free-delete.ll
+++ b/test/Transforms/InstCombine/malloc-free-delete.ll
@@ -1,14 +1,14 @@
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 ; PR1201
 define i32 @main(i32 %argc, i8** %argv) {
-        %c_19 = alloca i8*
-        %malloc_206 = malloc i8, i32 10
+    %c_19 = alloca i8*
+    %malloc_206 = tail call i8* @malloc(i32 mul (i32 ptrtoint (i8* getelementptr (i8* null, i32 1) to i32), i32 10))
+    store i8* %malloc_206, i8** %c_19
+    %tmp_207 = load i8** %c_19
+    tail call void @free(i8* %tmp_207)
+    ret i32 0
 ; CHECK-NOT: malloc
-        store i8* %malloc_206, i8** %c_19
-        %tmp_207 = load i8** %c_19
-        free i8* %tmp_207
 ; CHECK-NOT: free
-        ret i32 0
 ; CHECK: ret i32 0
 }
 
diff --git a/test/Transforms/InstCombine/malloc.ll b/test/Transforms/InstCombine/malloc.ll
deleted file mode 100644
index b6ebbea..0000000
--- a/test/Transforms/InstCombine/malloc.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; test that malloc's with a constant argument are promoted to array allocations
-; RUN: opt < %s -instcombine -S | grep getelementptr
-
-define i32* @test() {
-	%X = malloc i32, i32 4
-	ret i32* %X
-}
diff --git a/test/Transforms/InstCombine/malloc2.ll b/test/Transforms/InstCombine/malloc2.ll
deleted file mode 100644
index 8462dac..0000000
--- a/test/Transforms/InstCombine/malloc2.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
-; PR1313
-
-define i32 @test1(i32 %argc, i8* %argv, i8* %envp) {
-        %tmp15.i.i.i23 = malloc [2564 x i32]            ; <[2564 x i32]*> [#uses=1]
-; CHECK-NOT: call i8* @malloc
-        %c = icmp eq [2564 x i32]* %tmp15.i.i.i23, null              ; <i1>:0 [#uses=1]
-        %retval = zext i1 %c to i32             ; <i32> [#uses=1]
-        ret i32 %retval
-; CHECK: ret i32 0
-}
-
-define i32 @test2(i32 %argc, i8* %argv, i8* %envp) {
-        %tmp15.i.i.i23 = malloc [2564 x i32]            ; <[2564 x i32]*> [#uses=1]
-; CHECK-NOT: call i8* @malloc
-        %X = bitcast [2564 x i32]* %tmp15.i.i.i23 to i32*
-        %c = icmp ne i32* %X, null
-        %retval = zext i1 %c to i32             ; <i32> [#uses=1]
-        ret i32 %retval
-; CHECK: ret i32 1
-}
-
diff --git a/test/Transforms/InstCombine/malloc3.ll b/test/Transforms/InstCombine/malloc3.ll
deleted file mode 100644
index f1c0cae..0000000
--- a/test/Transforms/InstCombine/malloc3.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: opt < %s -instcombine -S | not grep load
-; PR1728
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-target triple = "i686-apple-darwin8"
-        %struct.foo = type { %struct.foo*, [10 x i32] }
-@.str = internal constant [21 x i8] c"tmp = %p, next = %p\0A\00"                ; <[21 x i8]*> [#uses=1]
-
-define i32 @main() {
-entry:
-        %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-        %tmp1 = malloc i8, i32 44               ; <i8*> [#uses=1]
-        %tmp12 = bitcast i8* %tmp1 to %struct.foo*              ; <%struct.foo*> [#uses=3]
-        %tmp3 = malloc i8, i32 44               ; <i8*> [#uses=1]
-        %tmp34 = bitcast i8* %tmp3 to %struct.foo*              ; <%struct.foo*> [#uses=1]
-        %tmp6 = getelementptr %struct.foo* %tmp12, i32 0, i32 0         ; <%struct.foo**> [#uses=1]
-        store %struct.foo* %tmp34, %struct.foo** %tmp6, align 4
-        %tmp8 = getelementptr %struct.foo* %tmp12, i32 0, i32 0         ; <%struct.foo**> [#uses=1]
-        %tmp9 = load %struct.foo** %tmp8, align 4               ; <%struct.foo*> [#uses=1]
-        %tmp10 = getelementptr [21 x i8]* @.str, i32 0, i32 0           ; <i8*> [#uses=1]
-        %tmp13 = call i32 (i8*, ...)* @printf( i8* %tmp10, %struct.foo* %tmp12, %struct.foo* %tmp9 )            ; <i32> [#uses=0]
-        ret i32 undef
-}
-
-declare i32 @printf(i8*, ...)
-
diff --git a/test/Transforms/InstCombine/merge-icmp.ll b/test/Transforms/InstCombine/merge-icmp.ll
new file mode 100644
index 0000000..00020b1
--- /dev/null
+++ b/test/Transforms/InstCombine/merge-icmp.ll
@@ -0,0 +1,29 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+define i1 @test1(i16* %x) {
+  %load = load i16* %x, align 4
+  %trunc = trunc i16 %load to i8
+  %cmp1 = icmp eq i8 %trunc, 127
+  %and = and i16 %load, -256
+  %cmp2 = icmp eq i16 %and, 17664
+  %or = and i1 %cmp1, %cmp2
+  ret i1 %or
+; CHECK: @test1
+; CHECK-NEXT: load i16
+; CHECK-NEXT: icmp eq i16 %load, 17791
+; CHECK-NEXT: ret i1
+}
+
+define i1 @test2(i16* %x) {
+  %load = load i16* %x, align 4
+  %and = and i16 %load, -256
+  %cmp1 = icmp eq i16 %and, 32512
+  %trunc = trunc i16 %load to i8
+  %cmp2 = icmp eq i8 %trunc, 69
+  %or = and i1 %cmp1, %cmp2
+  ret i1 %or
+; CHECK: @test2
+; CHECK-NEXT: load i16
+; CHECK-NEXT: icmp eq i16 %load, 32581
+; CHECK-NEXT: ret i1
+}
diff --git a/test/Transforms/InstCombine/not.ll b/test/Transforms/InstCombine/not.ll
index c58ce11..4a8825b 100644
--- a/test/Transforms/InstCombine/not.ll
+++ b/test/Transforms/InstCombine/not.ll
@@ -43,7 +43,7 @@ define i32 @test5(i32 %A, i32 %B) {
 }
 
 ; PR2298
-define i8 @test6(i32 %a, i32 %b) zeroext nounwind  {
+define zeroext i8 @test6(i32 %a, i32 %b)  nounwind  {
 entry:
 	%tmp1not = xor i32 %a, -1		; <i32> [#uses=1]
 	%tmp2not = xor i32 %b, -1		; <i32> [#uses=1]
diff --git a/test/Transforms/InstCombine/or.ll b/test/Transforms/InstCombine/or.ll
index f82f9fa..c0bb28d 100644
--- a/test/Transforms/InstCombine/or.ll
+++ b/test/Transforms/InstCombine/or.ll
@@ -332,8 +332,8 @@ define i64 @test31(i64 %A) nounwind readnone ssp noredzone {
   %F = or i64 %D, %E
   ret i64 %F
 ; CHECK: @test31
-; CHECK-NEXT: %E1 = and i64 %A, 4294908984
-; CHECK-NEXT: %F = or i64 %E1, 32962
+; CHECK-NEXT: %E = and i64 %A, 4294908984
+; CHECK-NEXT: %F = or i64 %E, 32962
 ; CHECK-NEXT: ret i64 %F
 }
 
@@ -390,3 +390,22 @@ define i1 @test36(i32 %x) {
 ; CHECK-NEXT: ret i1
 }
 
+define i32 @test37(i32* %xp, i32 %y) {
+; CHECK: @test37
+; CHECK: select i1 %tobool, i32 -1, i32 %x
+  %tobool = icmp ne i32 %y, 0
+  %sext = sext i1 %tobool to i32
+  %x = load i32* %xp
+  %or = or i32 %sext, %x
+  ret i32 %or
+}
+
+define i32 @test38(i32* %xp, i32 %y) {
+; CHECK: @test38
+; CHECK: select i1 %tobool, i32 -1, i32 %x
+  %tobool = icmp ne i32 %y, 0
+  %sext = sext i1 %tobool to i32
+  %x = load i32* %xp
+  %or = or i32 %x, %sext
+  ret i32 %or
+}
diff --git a/test/Transforms/InstCombine/phi.ll b/test/Transforms/InstCombine/phi.ll
index 62c6a63..cd865ae 100644
--- a/test/Transforms/InstCombine/phi.ll
+++ b/test/Transforms/InstCombine/phi.ll
@@ -197,25 +197,25 @@ declare i1 @test11a()
 define i1 @test11() {
 entry:
   %a = alloca i32
-  %i = ptrtoint i32* %a to i32
+  %i = ptrtoint i32* %a to i64
   %b = call i1 @test11a()
   br i1 %b, label %one, label %two
 
 one:
-  %x = phi i32 [%i, %entry], [%y, %two]
+  %x = phi i64 [%i, %entry], [%y, %two]
   %c = call i1 @test11a()
   br i1 %c, label %two, label %end
 
 two:
-  %y = phi i32 [%i, %entry], [%x, %one]
+  %y = phi i64 [%i, %entry], [%x, %one]
   %d = call i1 @test11a()
   br i1 %d, label %one, label %end
 
 end:
-  %f = phi i32 [ %x, %one], [%y, %two]
+  %f = phi i64 [ %x, %one], [%y, %two]
   ; Change the %f to %i, and the optimizer suddenly becomes a lot smarter
   ; even though %f must equal %i at this point
-  %g = inttoptr i32 %f to i32*
+  %g = inttoptr i64 %f to i32*
   store i32 10, i32* %g
   %z = call i1 @test11a()
   ret i1 %z
@@ -544,3 +544,79 @@ BB2:
 ; CHECK-NEXT: %C = add nuw i32 %A, 1
 ; CHECK-NEXT: ret i32 %C
 }
+
+; Same as test11, but used to be missed due to a bug.
+declare i1 @test25a()
+
+define i1 @test25() {
+entry:
+  %a = alloca i32
+  %i = ptrtoint i32* %a to i64
+  %b = call i1 @test25a()
+  br i1 %b, label %one, label %two
+
+one:
+  %x = phi i64 [%y, %two], [%i, %entry]
+  %c = call i1 @test25a()
+  br i1 %c, label %two, label %end
+
+two:
+  %y = phi i64 [%x, %one], [%i, %entry]
+  %d = call i1 @test25a()
+  br i1 %d, label %one, label %end
+
+end:
+  %f = phi i64 [ %x, %one], [%y, %two]
+  ; Change the %f to %i, and the optimizer suddenly becomes a lot smarter
+  ; even though %f must equal %i at this point
+  %g = inttoptr i64 %f to i32*
+  store i32 10, i32* %g
+  %z = call i1 @test25a()
+  ret i1 %z
+; CHECK: @test25
+; CHECK-NOT: phi i32
+; CHECK: ret i1 %z
+}
+
+declare i1 @test26a()
+
+define i1 @test26(i32 %n) {
+entry:
+  %a = alloca i32
+  %i = ptrtoint i32* %a to i64
+  %b = call i1 @test26a()
+  br label %one
+
+one:
+  %x = phi i64 [%y, %two], [%w, %three], [%i, %entry]
+  %c = call i1 @test26a()
+  switch i32 %n, label %end [
+          i32 2, label %two
+          i32 3, label %three
+  ]
+
+two:
+  %y = phi i64 [%x, %one], [%w, %three]
+  %d = call i1 @test26a()
+  switch i32 %n, label %end [
+          i32 10, label %one
+          i32 30, label %three
+  ]
+
+three:
+  %w = phi i64 [%y, %two], [%x, %one]
+  %e = call i1 @test26a()
+  br i1 %e, label %one, label %two
+
+end:
+  %f = phi i64 [ %x, %one], [%y, %two]
+  ; Change the %f to %i, and the optimizer suddenly becomes a lot smarter
+  ; even though %f must equal %i at this point
+  %g = inttoptr i64 %f to i32*
+  store i32 10, i32* %g
+  %z = call i1 @test26a()
+  ret i1 %z
+; CHECK: @test26
+; CHECK-NOT: phi i32
+; CHECK: ret i1 %z
+}
diff --git a/test/Transforms/InstCombine/select.ll b/test/Transforms/InstCombine/select.ll
index 3925907..4ca9bd2 100644
--- a/test/Transforms/InstCombine/select.ll
+++ b/test/Transforms/InstCombine/select.ll
@@ -749,3 +749,53 @@ define i1 @test55(i1 %X, i32 %Y, i32 %Z) {
 ; CHECK: icmp eq
 ; CHECK: ret i1
 }
+
+define i32 @test56(i16 %x) nounwind {
+  %tobool = icmp eq i16 %x, 0
+  %conv = zext i16 %x to i32
+  %cond = select i1 %tobool, i32 0, i32 %conv
+  ret i32 %cond
+; CHECK: @test56
+; CHECK-NEXT: zext
+; CHECK-NEXT: ret
+}
+
+define i32 @test57(i32 %x, i32 %y) nounwind {
+  %and = and i32 %x, %y
+  %tobool = icmp eq i32 %x, 0
+  %.and = select i1 %tobool, i32 0, i32 %and
+  ret i32 %.and
+; CHECK: @test57
+; CHECK-NEXT: and i32 %x, %y
+; CHECK-NEXT: ret
+}
+
+define i32 @test58(i16 %x) nounwind {
+  %tobool = icmp ne i16 %x, 1
+  %conv = zext i16 %x to i32
+  %cond = select i1 %tobool, i32 %conv, i32 1
+  ret i32 %cond
+; CHECK: @test58
+; CHECK-NEXT: zext
+; CHECK-NEXT: ret
+}
+
+define i32 @test59(i32 %x, i32 %y) nounwind {
+  %and = and i32 %x, %y
+  %tobool = icmp ne i32 %x, %y
+  %.and = select i1 %tobool, i32 %and, i32 %y
+  ret i32 %.and
+; CHECK: @test59
+; CHECK-NEXT: and i32 %x, %y
+; CHECK-NEXT: ret
+}
+
+define i1 @test60(i32 %x, i1* %y) nounwind {
+  %cmp = icmp eq i32 %x, 0
+  %load = load i1* %y, align 1
+  %cmp1 = icmp slt i32 %x, 1
+  %sel = select i1 %cmp, i1 %load, i1 %cmp1
+  ret i1 %sel
+; CHECK: @test60
+; CHECK: select
+}
diff --git a/test/Transforms/InstCombine/shift.ll b/test/Transforms/InstCombine/shift.ll
index 7fab1d2..d9ac9cb 100644
--- a/test/Transforms/InstCombine/shift.ll
+++ b/test/Transforms/InstCombine/shift.ll
@@ -485,3 +485,45 @@ entry:
 ; CHECK: ret i8 %tmp551
   ret i8 %tmp55
 }
+
+; PR9809
+define i32 @test40(i32 %a, i32 %b) nounwind {
+  %shl1 = shl i32 1, %b
+  %shl2 = shl i32 %shl1, 2
+  %div = udiv i32 %a, %shl2
+  ret i32 %div
+; CHECK: @test40
+; CHECK-NEXT: add i32 %b, 2
+; CHECK-NEXT: lshr i32 %a
+; CHECK-NEXT: ret i32
+}
+
+define i32 @test41(i32 %a, i32 %b) nounwind {
+  %1 = shl i32 1, %b
+  %2 = shl i32 %1, 3
+  ret i32 %2
+; CHECK: @test41
+; CHECK-NEXT: shl i32 8, %b
+; CHECK-NEXT: ret i32
+}
+
+define i32 @test42(i32 %a, i32 %b) nounwind {
+  %div = lshr i32 4096, %b    ; must be exact otherwise we'd divide by zero
+  %div2 = udiv i32 %a, %div
+  ret i32 %div2
+; CHECK: @test42
+; CHECK-NEXT: lshr exact i32 4096, %b
+}
+
+define i32 @test43(i32 %a, i32 %b) nounwind {
+  %div = shl i32 4096, %b    ; must be exact otherwise we'd divide by zero
+  %div2 = udiv i32 %a, %div
+  ret i32 %div2
+; CHECK: @test43
+; CHECK-NEXT: add i32 %b, 12
+; CHECK-NEXT: lshr
+; CHECK-NEXT: ret
+}
+
+
+
diff --git a/test/Transforms/InstCombine/sub.ll b/test/Transforms/InstCombine/sub.ll
index 9656a7e..37de328 100644
--- a/test/Transforms/InstCombine/sub.ll
+++ b/test/Transforms/InstCombine/sub.ll
@@ -203,7 +203,7 @@ define i1 @test21(i32 %g, i32 %h) {
 }
 
 ; PR2298
-define i1 @test22(i32 %a, i32 %b) zeroext nounwind  {
+define zeroext i1 @test22(i32 %a, i32 %b)  nounwind  {
 	%tmp2 = sub i32 0, %a	
 	%tmp4 = sub i32 0, %b	
 	%tmp5 = icmp eq i32 %tmp2, %tmp4	
diff --git a/test/Transforms/InstCombine/udivrem-change-width.ll b/test/Transforms/InstCombine/udivrem-change-width.ll
index 9983944..b388a3b 100644
--- a/test/Transforms/InstCombine/udivrem-change-width.ll
+++ b/test/Transforms/InstCombine/udivrem-change-width.ll
@@ -1,14 +1,16 @@
-; RUN: opt < %s -instcombine -S | not grep zext
-; PR4548
+; RUN: opt < %s -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
+; PR4548
 define i8 @udiv_i8(i8 %a, i8 %b) nounwind {
   %conv = zext i8 %a to i32       
   %conv2 = zext i8 %b to i32      
   %div = udiv i32 %conv, %conv2   
   %conv3 = trunc i32 %div to i8   
   ret i8 %conv3
+; CHECK: @udiv_i8
+; CHECK: udiv i8 %a, %b
 }
 
 define i8 @urem_i8(i8 %a, i8 %b) nounwind {
@@ -17,5 +19,44 @@ define i8 @urem_i8(i8 %a, i8 %b) nounwind {
   %div = urem i32 %conv, %conv2   
   %conv3 = trunc i32 %div to i8   
   ret i8 %conv3
+; CHECK: @urem_i8
+; CHECK: urem i8 %a, %b
 }
 
+define i32 @udiv_i32(i8 %a, i8 %b) nounwind {
+  %conv = zext i8 %a to i32
+  %conv2 = zext i8 %b to i32
+  %div = udiv i32 %conv, %conv2
+  ret i32 %div
+; CHECK: @udiv_i32
+; CHECK: udiv i8 %a, %b
+; CHECK: zext
+}
+
+define i32 @urem_i32(i8 %a, i8 %b) nounwind {
+  %conv = zext i8 %a to i32
+  %conv2 = zext i8 %b to i32
+  %div = urem i32 %conv, %conv2
+  ret i32 %div
+; CHECK: @urem_i32
+; CHECK: urem i8 %a, %b
+; CHECK: zext
+}
+
+define i32 @udiv_i32_c(i8 %a) nounwind {
+  %conv = zext i8 %a to i32
+  %div = udiv i32 %conv, 10
+  ret i32 %div
+; CHECK: @udiv_i32_c
+; CHECK: udiv i8 %a, 10
+; CHECK: zext
+}
+
+define i32 @urem_i32_c(i8 %a) nounwind {
+  %conv = zext i8 %a to i32
+  %div = urem i32 %conv, 10
+  ret i32 %div
+; CHECK: @urem_i32_c
+; CHECK: urem i8 %a, 10
+; CHECK: zext
+}
diff --git a/test/Transforms/InstCombine/vec_demanded_elts.ll b/test/Transforms/InstCombine/vec_demanded_elts.ll
index 9f308aa..e0188fe 100644
--- a/test/Transforms/InstCombine/vec_demanded_elts.ll
+++ b/test/Transforms/InstCombine/vec_demanded_elts.ll
@@ -136,3 +136,19 @@ declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>)
 declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>)
 declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>)
 declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>)
+
+; <rdar://problem/6945110>
+define <4 x i32> @kernel3_vertical(<4 x i16> * %src, <8 x i16> * %foo) nounwind {
+entry:
+	%tmp = load <4 x i16>* %src
+	%tmp1 = load <8 x i16>* %foo
+; CHECK: %tmp2 = shufflevector
+	%tmp2 = shufflevector <4 x i16> %tmp, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; pmovzxwd ignores the upper 64-bits of its input; -instcombine should remove this shuffle:
+; CHECK-NOT: shufflevector
+	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: pmovzxwd
+	%0 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp3)
+	ret <4 x i32> %0
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
diff --git a/test/Transforms/InstCombine/x86-crc32-demanded.ll b/test/Transforms/InstCombine/x86-crc32-demanded.ll
new file mode 100644
index 0000000..878b97d
--- /dev/null
+++ b/test/Transforms/InstCombine/x86-crc32-demanded.ll
@@ -0,0 +1,17 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; crc32 with 64-bit destination zeros high 32-bit.
+; rdar://9467055
+
+define i64 @test() nounwind {
+entry:
+; CHECK: test
+; CHECK: tail call i64 @llvm.x86.sse42.crc32.64.64
+; CHECK-NOT: and
+; CHECK: ret
+  %0 = tail call i64 @llvm.x86.sse42.crc32.64.64(i64 0, i64 4) nounwind
+  %1 = and i64 %0, 4294967295
+  ret i64 %1
+}
+
+declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind readnone
diff --git a/test/Transforms/InstCombine/zext-or-icmp.ll b/test/Transforms/InstCombine/zext-or-icmp.ll
index 969c301..ddc6083 100644
--- a/test/Transforms/InstCombine/zext-or-icmp.ll
+++ b/test/Transforms/InstCombine/zext-or-icmp.ll
@@ -4,7 +4,7 @@
 	%struct.Rock = type { i16, i16 }
 @some_idx = internal constant [4 x i8] c"\0A\0B\0E\0F"		; <[4 x i8]*> [#uses=1]
 
-define i8 @t(%struct.FooBar* %up, i8 zeroext  %intra_flag, i32 %blk_i) zeroext nounwind  {
+define zeroext  i8 @t(%struct.FooBar* %up, i8 zeroext  %intra_flag, i32 %blk_i) nounwind  {
 entry:
 	%tmp2 = lshr i32 %blk_i, 1		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp2, 2		; <i32> [#uses=1]
diff --git a/test/Transforms/InstSimplify/maxmin.ll b/test/Transforms/InstSimplify/maxmin.ll
new file mode 100644
index 0000000..e921214
--- /dev/null
+++ b/test/Transforms/InstSimplify/maxmin.ll
@@ -0,0 +1,269 @@
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+define i1 @max1(i32 %x, i32 %y) {
+; CHECK: @max1
+  %c = icmp sgt i32 %x, %y
+  %m = select i1 %c, i32 %x, i32 %y
+  %r = icmp slt i32 %m, %x
+  ret i1 %r
+; CHECK: ret i1 false
+}
+
+define i1 @max2(i32 %x, i32 %y) {
+; CHECK: @max2
+  %c = icmp sge i32 %x, %y
+  %m = select i1 %c, i32 %x, i32 %y
+  %r = icmp sge i32 %m, %x
+  ret i1 %r
+; CHECK: ret i1 true
+}
+
+define i1 @max3(i32 %x, i32 %y) {
+; CHECK: @max3
+  %c = icmp ugt i32 %x, %y
+  %m = select i1 %c, i32 %x, i32 %y
+  %r = icmp ult i32 %m, %x
+  ret i1 %r
+; CHECK: ret i1 false
+}
+
+define i1 @max4(i32 %x, i32 %y) {
+; CHECK: @max4
+  %c = icmp uge i32 %x, %y
+  %m = select i1 %c, i32 %x, i32 %y
+  %r = icmp uge i32 %m, %x
+  ret i1 %r
+; CHECK: ret i1 true
+}
+
+define i1 @max5(i32 %x, i32 %y) {
+; CHECK: @max5
+  %c = icmp sgt i32 %x, %y
+  %m = select i1 %c, i32 %x, i32 %y
+  %r = icmp sgt i32 %x, %m
+  ret i1 %r
+; CHECK: ret i1 false
+}
+
+define i1 @max6(i32 %x, i32 %y) {
+; CHECK: @max6
+  %c = icmp sge i32 %x, %y
+  %m = select i1 %c, i32 %x, i32 %y
+  %r = icmp sle i32 %x, %m
+  ret i1 %r
+; CHECK: ret i1 true
+}
+
+define i1 @max7(i32 %x, i32 %y) {
+; CHECK: @max7
+  %c = icmp ugt i32 %x, %y
+  %m = select i1 %c, i32 %x, i32 %y
+  %r = icmp ugt i32 %x, %m
+  ret i1 %r
+; CHECK: ret i1 false
+}
+
+define i1 @max8(i32 %x, i32 %y) {
+; CHECK: @max8
+  %c = icmp uge i32 %x, %y
+  %m = select i1 %c, i32 %x, i32 %y
+  %r = icmp ule i32 %x, %m
+  ret i1 %r
+; CHECK: ret i1 true
+}
+
+define i1 @min1(i32 %x, i32 %y) {
+; CHECK: @min1
+  %c = icmp sgt i32 %x, %y
+  %m = select i1 %c, i32 %y, i32 %x
+  %r = icmp sgt i32 %m, %x
+  ret i1 %r
+; CHECK: ret i1 false
+}
+
+define i1 @min2(i32 %x, i32 %y) {
+; CHECK: @min2
+  %c = icmp sge i32 %x, %y
+  %m = select i1 %c, i32 %y, i32 %x
+  %r = icmp sle i32 %m, %x
+  ret i1 %r
+; CHECK: ret i1 true
+}
+
+define i1 @min3(i32 %x, i32 %y) {
+; CHECK: @min3
+  %c = icmp ugt i32 %x, %y
+  %m = select i1 %c, i32 %y, i32 %x
+  %r = icmp ugt i32 %m, %x
+  ret i1 %r
+; CHECK: ret i1 false
+}
+
+define i1 @min4(i32 %x, i32 %y) {
+; CHECK: @min4
+  %c = icmp uge i32 %x, %y
+  %m = select i1 %c, i32 %y, i32 %x
+  %r = icmp ule i32 %m, %x
+  ret i1 %r
+; CHECK: ret i1 true
+}
+
+define i1 @min5(i32 %x, i32 %y) {
+; CHECK: @min5
+  %c = icmp sgt i32 %x, %y
+  %m = select i1 %c, i32 %y, i32 %x
+  %r = icmp slt i32 %x, %m
+  ret i1 %r
+; CHECK: ret i1 false
+}
+
+define i1 @min6(i32 %x, i32 %y) {
+; CHECK: @min6
+  %c = icmp sge i32 %x, %y
+  %m = select i1 %c, i32 %y, i32 %x
+  %r = icmp sge i32 %x, %m
+  ret i1 %r
+; CHECK: ret i1 true
+}
+
+define i1 @min7(i32 %x, i32 %y) {
+; CHECK: @min7
+  %c = icmp ugt i32 %x, %y
+  %m = select i1 %c, i32 %y, i32 %x
+  %r = icmp ult i32 %x, %m
+  ret i1 %r
+; CHECK: ret i1 false
+}
+
+define i1 @min8(i32 %x, i32 %y) {
+; CHECK: @min8
+  %c = icmp uge i32 %x, %y
+  %m = select i1 %c, i32 %y, i32 %x
+  %r = icmp uge i32 %x, %m
+  ret i1 %r
+; CHECK: ret i1 true
+}
+
+define i1 @maxmin1(i32 %x, i32 %y, i32 %z) {
+; CHECK: @maxmin1
+  %c1 = icmp sge i32 %x, %y
+  %max = select i1 %c1, i32 %x, i32 %y
+  %c2 = icmp sge i32 %x, %z
+  %min = select i1 %c2, i32 %z, i32 %x
+  %c = icmp sge i32 %max, %min
+  ret i1 %c
+; CHECK: ret i1 true
+}
+
+define i1 @maxmin2(i32 %x, i32 %y, i32 %z) {
+; CHECK: @maxmin2
+  %c1 = icmp sge i32 %x, %y
+  %max = select i1 %c1, i32 %x, i32 %y
+  %c2 = icmp sge i32 %x, %z
+  %min = select i1 %c2, i32 %z, i32 %x
+  %c = icmp sgt i32 %min, %max
+  ret i1 %c
+; CHECK: ret i1 false
+}
+
+define i1 @maxmin3(i32 %x, i32 %y, i32 %z) {
+; CHECK: @maxmin3
+  %c1 = icmp sge i32 %x, %y
+  %max = select i1 %c1, i32 %x, i32 %y
+  %c2 = icmp sge i32 %x, %z
+  %min = select i1 %c2, i32 %z, i32 %x
+  %c = icmp sle i32 %min, %max
+  ret i1 %c
+; CHECK: ret i1 true
+}
+
+define i1 @maxmin4(i32 %x, i32 %y, i32 %z) {
+; CHECK: @maxmin4
+  %c1 = icmp sge i32 %x, %y
+  %max = select i1 %c1, i32 %x, i32 %y
+  %c2 = icmp sge i32 %x, %z
+  %min = select i1 %c2, i32 %z, i32 %x
+  %c = icmp slt i32 %max, %min
+  ret i1 %c
+; CHECK: ret i1 false
+}
+
+define i1 @maxmin5(i32 %x, i32 %y, i32 %z) {
+; CHECK: @maxmin5
+  %c1 = icmp uge i32 %x, %y
+  %max = select i1 %c1, i32 %x, i32 %y
+  %c2 = icmp uge i32 %x, %z
+  %min = select i1 %c2, i32 %z, i32 %x
+  %c = icmp uge i32 %max, %min
+  ret i1 %c
+; CHECK: ret i1 true
+}
+
+define i1 @maxmin6(i32 %x, i32 %y, i32 %z) {
+; CHECK: @maxmin6
+  %c1 = icmp uge i32 %x, %y
+  %max = select i1 %c1, i32 %x, i32 %y
+  %c2 = icmp uge i32 %x, %z
+  %min = select i1 %c2, i32 %z, i32 %x
+  %c = icmp ugt i32 %min, %max
+  ret i1 %c
+; CHECK: ret i1 false
+}
+
+define i1 @maxmin7(i32 %x, i32 %y, i32 %z) {
+; CHECK: @maxmin7
+  %c1 = icmp uge i32 %x, %y
+  %max = select i1 %c1, i32 %x, i32 %y
+  %c2 = icmp uge i32 %x, %z
+  %min = select i1 %c2, i32 %z, i32 %x
+  %c = icmp ule i32 %min, %max
+  ret i1 %c
+; CHECK: ret i1 true
+}
+
+define i1 @maxmin8(i32 %x, i32 %y, i32 %z) {
+; CHECK: @maxmin8
+  %c1 = icmp uge i32 %x, %y
+  %max = select i1 %c1, i32 %x, i32 %y
+  %c2 = icmp uge i32 %x, %z
+  %min = select i1 %c2, i32 %z, i32 %x
+  %c = icmp ult i32 %max, %min
+  ret i1 %c
+; CHECK: ret i1 false
+}
+
+define i1 @eqcmp1(i32 %x, i32 %y) {
+; CHECK: @eqcmp1
+  %c = icmp sge i32 %x, %y
+  %max = select i1 %c, i32 %x, i32 %y
+  %r = icmp eq i32 %max, %x
+  ret i1 %r
+; CHECK: ret i1 %c
+}
+
+define i1 @eqcmp2(i32 %x, i32 %y) {
+; CHECK: @eqcmp2
+  %c = icmp sge i32 %x, %y
+  %max = select i1 %c, i32 %x, i32 %y
+  %r = icmp eq i32 %x, %max
+  ret i1 %r
+; CHECK: ret i1 %c
+}
+
+define i1 @eqcmp3(i32 %x, i32 %y) {
+; CHECK: @eqcmp3
+  %c = icmp uge i32 %x, %y
+  %max = select i1 %c, i32 %x, i32 %y
+  %r = icmp eq i32 %max, %x
+  ret i1 %r
+; CHECK: ret i1 %c
+}
+
+define i1 @eqcmp4(i32 %x, i32 %y) {
+; CHECK: @eqcmp4
+  %c = icmp uge i32 %x, %y
+  %max = select i1 %c, i32 %x, i32 %y
+  %r = icmp eq i32 %x, %max
+  ret i1 %r
+; CHECK: ret i1 %c
+}
diff --git a/test/Transforms/InstSimplify/rem.ll b/test/Transforms/InstSimplify/rem.ll
new file mode 100644
index 0000000..4c8f87c
--- /dev/null
+++ b/test/Transforms/InstSimplify/rem.ll
@@ -0,0 +1,17 @@
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+define i32 @select1(i32 %x, i1 %b) {
+; CHECK: @select1
+  %rhs = select i1 %b, i32 %x, i32 1
+  %rem = srem i32 %x, %rhs
+  ret i32 %rem
+; CHECK: ret i32 0
+}
+
+define i32 @select2(i32 %x, i1 %b) {
+; CHECK: @select2
+  %rhs = select i1 %b, i32 %x, i32 1
+  %rem = urem i32 %x, %rhs
+  ret i32 %rem
+; CHECK: ret i32 0
+}
diff --git a/test/Transforms/JumpThreading/2011-04-14-InfLoop.ll b/test/Transforms/JumpThreading/2011-04-14-InfLoop.ll
new file mode 100644
index 0000000..46aaa00
--- /dev/null
+++ b/test/Transforms/JumpThreading/2011-04-14-InfLoop.ll
@@ -0,0 +1,31 @@
+; RUN: opt -jump-threading < %s
+; <rdar://problem/9284786>
+
+%0 = type <{ i64, i16, i64, i8, i8 }>
+
+@g_338 = external global %0, align 8
+
+define void @func_1() nounwind ssp {
+entry:
+  ret void
+
+for.cond1177:
+  %inc1187 = add nsw i32 0, 1
+  %cmp1179 = icmp slt i32 %inc1187, 5
+  br i1 %cmp1179, label %for.cond1177, label %land.rhs1320
+
+land.rhs1320:
+  %tmp1324 = volatile load i64* getelementptr inbounds (%0* @g_338, i64 0, i32 2), align 1, !tbaa !0
+  br label %if.end.i
+
+if.end.i:
+  %tobool.pr.i = phi i1 [ false, %if.end.i ], [ false, %land.rhs1320 ]
+  br i1 %tobool.pr.i, label %return, label %if.end.i
+
+return:
+  ret void
+}
+
+!0 = metadata !{metadata !"long long", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/Transforms/LICM/2005-03-24-LICM-Aggregate-Crash.ll b/test/Transforms/LICM/2005-03-24-LICM-Aggregate-Crash.ll
deleted file mode 100644
index 91740cf..0000000
--- a/test/Transforms/LICM/2005-03-24-LICM-Aggregate-Crash.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; RUN: opt < %s -licm -disable-output
-
-define void @test({ i32 }* %P) {
-	br label %Loop
-Loop:		; preds = %Loop, %0
-	free { i32 }* %P
-	br label %Loop
-}
-
diff --git a/test/Transforms/LICM/2011-04-06-HoistMissedASTUpdate.ll b/test/Transforms/LICM/2011-04-06-HoistMissedASTUpdate.ll
new file mode 100644
index 0000000..5774f58
--- /dev/null
+++ b/test/Transforms/LICM/2011-04-06-HoistMissedASTUpdate.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -basicaa -licm -S | FileCheck %s
+; PR9630
+
+@g_39 = external global i16, align 2
+
+declare i32* @func_84(i32** nocapture) nounwind readonly
+
+declare i32** @func_108(i32*** nocapture) nounwind readonly
+
+define void @func() nounwind {
+entry:
+  br label %for.body4.lr.ph
+
+for.body4.lr.ph:
+  br label %for.body4
+
+; CHECK: for.body4:
+; CHECK: volatile load i16* @g_39
+
+for.body4:
+  %l_612.11 = phi i32* [ undef, %for.body4.lr.ph ], [ %call19, %for.body4 ]
+  %tmp7 = volatile load i16* @g_39, align 2
+  %call = call i32** @func_108(i32*** undef)
+  %call19 = call i32* @func_84(i32** %call)
+  br i1 false, label %for.body4, label %for.cond.loopexit
+
+for.cond.loopexit:
+  br i1 false, label %for.body4.lr.ph, label %for.end26
+
+for.end26:
+  ret void
+}
diff --git a/test/Transforms/LICM/2011-04-09-RAUW-AST.ll b/test/Transforms/LICM/2011-04-09-RAUW-AST.ll
new file mode 100644
index 0000000..4285bd1
--- /dev/null
+++ b/test/Transforms/LICM/2011-04-09-RAUW-AST.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s -loop-rotate -licm -S | FileCheck %s
+; PR9604
+
+@g_3 = global i32 0, align 4
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00"
+
+define i32 @main() nounwind {
+entry:
+  %tmp = load i32* @g_3, align 4
+  %tobool = icmp eq i32 %tmp, 0
+  br i1 %tobool, label %for.cond, label %if.then
+
+if.then:                                          ; preds = %entry
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc10, %if.then, %entry
+  %g.0 = phi i32* [ %g.0, %for.inc10 ], [ @g_3, %entry ], [ null, %if.then ]
+  %x.0 = phi i32 [ %inc12, %for.inc10 ], [ 0, %entry ], [ 0, %if.then ]
+  %cmp = icmp slt i32 %x.0, 5
+  br i1 %cmp, label %for.cond4, label %for.end13
+
+for.cond4:                                        ; preds = %for.body7, %for.cond
+  %y.0 = phi i32 [ %inc, %for.body7 ], [ 0, %for.cond ]
+  %cmp6 = icmp slt i32 %y.0, 5
+  br i1 %cmp6, label %for.body7, label %for.inc10
+
+; CHECK: for.body7:
+; CHECK-NEXT: phi
+; CHECK-NEXT: store i32 0
+; CHECK-NEXT: store i32 1
+
+for.body7:                                        ; preds = %for.cond4
+  store i32 0, i32* @g_3, align 4
+  store i32 1, i32* %g.0, align 4
+  %inc = add nsw i32 %y.0, 1
+  br label %for.cond4
+
+for.inc10:                                        ; preds = %for.cond4
+  %inc12 = add nsw i32 %x.0, 1
+  br label %for.cond
+
+for.end13:                                        ; preds = %for.cond
+  %tmp14 = load i32* @g_3, align 4
+  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %tmp14) nounwind
+  ret i32 0
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+
diff --git a/test/Transforms/LoopIdiom/basic.ll b/test/Transforms/LoopIdiom/basic.ll
index 485114c..9695418 100644
--- a/test/Transforms/LoopIdiom/basic.ll
+++ b/test/Transforms/LoopIdiom/basic.ll
@@ -347,3 +347,40 @@ for.end:                                          ; preds = %for.body
 ; CHECK-NOT: store
 ; CHECK: ret void
 }
+
+
+
+; PR9815 - This is a partial overlap case that cannot be safely transformed
+; into a memcpy.
+@g_50 = global [7 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0], align 16
+
+define i32 @test14() nounwind {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %for.body.lr.ph
+  %tmp5 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %add = add nsw i32 %tmp5, 4
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds [7 x i32]* @g_50, i32 0, i64 %idxprom
+  %tmp2 = load i32* %arrayidx, align 4
+  %add4 = add nsw i32 %tmp5, 5
+  %idxprom5 = sext i32 %add4 to i64
+  %arrayidx6 = getelementptr inbounds [7 x i32]* @g_50, i32 0, i64 %idxprom5
+  store i32 %tmp2, i32* %arrayidx6, align 4
+  %inc = add nsw i32 %tmp5, 1
+  %cmp = icmp slt i32 %inc, 2
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.inc
+  %tmp8 = load i32* getelementptr inbounds ([7 x i32]* @g_50, i32 0, i64 6), align 4
+  ret i32 %tmp8
+; CHECK: @test14
+; CHECK: for.body:
+; CHECK: load i32
+; CHECK: store i32
+; CHECK: br i1 %cmp
+
+}
+
+
diff --git a/test/Transforms/LoopRotate/crash.ll b/test/Transforms/LoopRotate/crash.ll
index 9dc9862..16a6868 100644
--- a/test/Transforms/LoopRotate/crash.ll
+++ b/test/Transforms/LoopRotate/crash.ll
@@ -137,3 +137,19 @@ bb17:		; preds = %bb15
 }
 
 
+
+
+; PR9523 - Non-canonical loop.
+define void @test7(i8* %P) nounwind {
+entry:
+  indirectbr i8* %P, [label %"3", label %"5"]
+
+"3":                                              ; preds = %"4", %entry
+  br i1 undef, label %"5", label %"4"
+
+"4":                                              ; preds = %"3"
+  br label %"3"
+
+"5":                                              ; preds = %"3", %entry
+  ret void
+}
diff --git a/test/Transforms/LoopSimplify/2007-10-28-InvokeCrash.ll b/test/Transforms/LoopSimplify/2007-10-28-InvokeCrash.ll
index e73fff1..e7d0f84 100644
--- a/test/Transforms/LoopSimplify/2007-10-28-InvokeCrash.ll
+++ b/test/Transforms/LoopSimplify/2007-10-28-InvokeCrash.ll
@@ -278,7 +278,7 @@ invcont:		; preds = %bb_main
 	br label %bb_main
 
 invcont.fragment:		; preds = %bb_main
-	invoke void @_ZN9Fibonacci10get_numberEj( %struct.BigInt* null sret , %struct.Fibonacci* %this_this, i32 %n_i_n_i )
+	invoke void @_ZN9Fibonacci10get_numberEj( %struct.BigInt* sret null  , %struct.Fibonacci* %this_this, i32 %n_i_n_i )
 			to label %invcont14 unwind label %meshBB37
 
 invcont.unwind10_crit_edge:		; preds = %bb_main
@@ -304,7 +304,7 @@ invcont14:		; preds = %invcont.fragment, %bb_main
 	br label %bb_main
 
 invcont14.normaldest:		; No predecessors!
-	invoke %"struct.__gnu_cxx::__normal_iterator<BigInt*,std::vector<BigInt, std::allocator<BigInt> > >"* @___ZN9__gnu_cxx17__normal_iteratorIP6BigIntSt6vectorIS1_SaIS1_EEEppEv___ZNSt6vectorImSaImEED1Ev___ZN6BigIntD1Ev___ZN9__gnu_cxx13new_allocatorI6BigIntE7destroyEPS1____ZSt8_DestroyIP6BigIntSaIS0_EEvT_S3_T0_( i32 14, %"struct.__gnu_cxx::__normal_iterator<BigInt*,std::vector<BigInt, std::allocator<BigInt> > >"* null, %"struct.std::vector<ulong,std::allocator<ulong> >"* null, %struct.BigInt* null, %struct.__false_type* null, %struct.BigInt* null, %struct.__false_type* null noalias  )
+	invoke %"struct.__gnu_cxx::__normal_iterator<BigInt*,std::vector<BigInt, std::allocator<BigInt> > >"* @___ZN9__gnu_cxx17__normal_iteratorIP6BigIntSt6vectorIS1_SaIS1_EEEppEv___ZNSt6vectorImSaImEED1Ev___ZN6BigIntD1Ev___ZN9__gnu_cxx13new_allocatorI6BigIntE7destroyEPS1____ZSt8_DestroyIP6BigIntSaIS0_EEvT_S3_T0_( i32 14, %"struct.__gnu_cxx::__normal_iterator<BigInt*,std::vector<BigInt, std::allocator<BigInt> > >"* null, %"struct.std::vector<ulong,std::allocator<ulong> >"* null, %struct.BigInt* null, %struct.__false_type* null, %struct.BigInt* null, %struct.__false_type* noalias null   )
 			to label %invcont15 unwind label %meshBB345		; <%"struct.__gnu_cxx::__normal_iterator<BigInt*,std::vector<BigInt, std::allocator<BigInt> > >"*>:0 [#uses=0]
 
 invcont14.unwind10_crit_edge:		; preds = %bb_main
@@ -372,7 +372,7 @@ invcont.cond_next_crit_edge:		; preds = %bb_main
 	br label %bb_main
 
 cond_true:		; preds = %bb_main
-	invoke void @_ZN9Fibonacci10get_numberEj( %struct.BigInt* null sret , %struct.Fibonacci* %this_this, i32 %n_i_n_i )
+	invoke void @_ZN9Fibonacci10get_numberEj( %struct.BigInt* sret null , %struct.Fibonacci* %this_this, i32 %n_i_n_i )
 			to label %meshBB323 unwind label %cond_true.unwind_crit_edge
 
 cond_true.unwind_crit_edge:		; preds = %cond_true, %bb_main
@@ -385,7 +385,7 @@ invcont12:		; preds = %bb_main
 	br label %bb_main
 
 invcont12.fragment:		; preds = %bb_main
-	invoke %"struct.__gnu_cxx::__normal_iterator<BigInt*,std::vector<BigInt, std::allocator<BigInt> > >"* @___ZN9__gnu_cxx17__normal_iteratorIP6BigIntSt6vectorIS1_SaIS1_EEEppEv___ZNSt6vectorImSaImEED1Ev___ZN6BigIntD1Ev___ZN9__gnu_cxx13new_allocatorI6BigIntE7destroyEPS1____ZSt8_DestroyIP6BigIntSaIS0_EEvT_S3_T0_( i32 14, %"struct.__gnu_cxx::__normal_iterator<BigInt*,std::vector<BigInt, std::allocator<BigInt> > >"* null, %"struct.std::vector<ulong,std::allocator<ulong> >"* null, %struct.BigInt* null, %struct.__false_type* null, %struct.BigInt* null, %struct.__false_type* null noalias  )
+	invoke %"struct.__gnu_cxx::__normal_iterator<BigInt*,std::vector<BigInt, std::allocator<BigInt> > >"* @___ZN9__gnu_cxx17__normal_iteratorIP6BigIntSt6vectorIS1_SaIS1_EEEppEv___ZNSt6vectorImSaImEED1Ev___ZN6BigIntD1Ev___ZN9__gnu_cxx13new_allocatorI6BigIntE7destroyEPS1____ZSt8_DestroyIP6BigIntSaIS0_EEvT_S3_T0_( i32 14, %"struct.__gnu_cxx::__normal_iterator<BigInt*,std::vector<BigInt, std::allocator<BigInt> > >"* null, %"struct.std::vector<ulong,std::allocator<ulong> >"* null, %struct.BigInt* null, %struct.__false_type* null, %struct.BigInt* null, %struct.__false_type* noalias null   )
 			to label %meshBB30 unwind label %meshBB337		; <%"struct.__gnu_cxx::__normal_iterator<BigInt*,std::vector<BigInt, std::allocator<BigInt> > >"*>:1 [#uses=0]
 
 invcont12.unwind_crit_edge:		; preds = %bb_main
@@ -467,7 +467,7 @@ invcont30.unwind_crit_edge.unwinddest:		; No predecessors!
 	br label %bb_main
 
 invcont33:		; preds = %bb_main
-	invoke void @_ZNKSt19basic_ostringstreamIcSt11char_traitsIcESaIcEE3strEv( %"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* null sret , %"struct.std::ostringstream"* null )
+	invoke void @_ZNKSt19basic_ostringstreamIcSt11char_traitsIcESaIcEE3strEv( %"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* sret null  , %"struct.std::ostringstream"* null )
 			to label %invcont36 unwind label %invcont33.unwind_crit_edge
 
 invcont33.unwind_crit_edge:		; preds = %invcont33, %bb_main
diff --git a/test/Transforms/LoopStrengthReduce/2009-11-10-LSRCrash.ll b/test/Transforms/LoopStrengthReduce/X86/2009-11-10-LSRCrash.ll
index 4032a59..4032a59 100644
--- a/test/Transforms/LoopStrengthReduce/2009-11-10-LSRCrash.ll
+++ b/test/Transforms/LoopStrengthReduce/X86/2009-11-10-LSRCrash.ll
diff --git a/test/Transforms/LoopStrengthReduce/X86/dg.exp b/test/Transforms/LoopStrengthReduce/X86/dg.exp
new file mode 100644
index 0000000..7b7bd4e
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/X86/dg.exp
@@ -0,0 +1,5 @@
+load_lib llvm.exp
+
+if { [llvm_supports_target X86] } {
+  RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll}]]
+}
diff --git a/test/Transforms/LoopStrengthReduce/post-inc-icmpzero.ll b/test/Transforms/LoopStrengthReduce/post-inc-icmpzero.ll
new file mode 100644
index 0000000..294c090
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/post-inc-icmpzero.ll
@@ -0,0 +1,91 @@
+; RUN: opt -loop-reduce -S < %s | FileCheck %s
+; PR9939
+
+; LSR should property handle the post-inc offset when folding the
+; non-IV operand of an icmp into the IV.
+
+; CHECK:   %tmp2 = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+; CHECK:   %tmp3 = lshr i64 %tmp2, 1
+; CHECK:   %tmp4 = mul i64 %tmp3, 2
+; CHECK:   br label %for.body
+; CHECK: for.body:
+; CHECK:   %lsr.iv5 = phi i64 [ %lsr.iv.next, %for.body ], [ %tmp4, %for.body.lr.ph ]
+; CHECK:   %lsr.iv.next = add i64 %lsr.iv5, -2
+; CHECK:   %lsr.iv.next6 = inttoptr i64 %lsr.iv.next to i16*
+; CHECK:   %cmp27 = icmp eq i16* %lsr.iv.next6, null
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.Vector2 = type { i16*, [64 x i16], i32 }
+
+@.str = private unnamed_addr constant [37 x i8] c"0123456789abcdefghijklmnopqrstuvwxyz\00"
+
+define void @_Z15IntegerToStringjjR7Vector2(i32 %i, i32 %radix, %struct.Vector2* nocapture %result) nounwind noinline {
+entry:
+  %buffer = alloca [33 x i16], align 16
+  %add.ptr = getelementptr inbounds [33 x i16]* %buffer, i64 0, i64 33
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %0 = phi i64 [ %indvar.next44, %do.body ], [ 0, %entry ]
+  %i.addr.0 = phi i32 [ %div, %do.body ], [ %i, %entry ]
+  %tmp51 = sub i64 32, %0
+  %incdec.ptr = getelementptr [33 x i16]* %buffer, i64 0, i64 %tmp51
+  %rem = urem i32 %i.addr.0, 10
+  %div = udiv i32 %i.addr.0, 10
+  %idxprom = zext i32 %rem to i64
+  %arrayidx = getelementptr inbounds [37 x i8]* @.str, i64 0, i64 %idxprom
+  %tmp5 = load i8* %arrayidx, align 1
+  %conv = sext i8 %tmp5 to i16
+  store i16 %conv, i16* %incdec.ptr, align 2
+  %1 = icmp ugt i32 %i.addr.0, 9
+  %indvar.next44 = add i64 %0, 1
+  br i1 %1, label %do.body, label %do.end
+
+do.end:                                           ; preds = %do.body
+  %xap.0 = inttoptr i64 %0 to i1*
+  %cap.0 = ptrtoint i1* %xap.0 to i64
+  %sub.ptr.lhs.cast = ptrtoint i16* %add.ptr to i64
+  %sub.ptr.rhs.cast = ptrtoint i16* %incdec.ptr to i64
+  %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+  %sub.ptr.div39 = lshr exact i64 %sub.ptr.sub, 1
+  %conv11 = trunc i64 %sub.ptr.div39 to i32
+  %mLength = getelementptr inbounds %struct.Vector2* %result, i64 0, i32 2
+  %idx.ext21 = bitcast i64 %sub.ptr.div39 to i64
+  %incdec.ptr.sum = add i64 %idx.ext21, -1
+  %cp.0.sum = sub i64 %incdec.ptr.sum, %0
+  %add.ptr22 = getelementptr [33 x i16]* %buffer, i64 1, i64 %cp.0.sum
+  %cmp2740 = icmp eq i64 %idx.ext21, 0
+  br i1 %cmp2740, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %do.end
+  %tmp16 = load i32* %mLength, align 4
+  %mBegin = getelementptr inbounds %struct.Vector2* %result, i64 0, i32 0
+  %tmp14 = load i16** %mBegin, align 8
+  %tmp48 = zext i32 %tmp16 to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvar = phi i64 [ 0, %for.body.lr.ph ], [ %indvar.next, %for.body ]
+  %tmp46 = add i64 %tmp51, %indvar
+  %p.042 = getelementptr [33 x i16]* %buffer, i64 0, i64 %tmp46
+  %tmp47 = sub i64 %indvar, %0
+  %incdec.ptr32 = getelementptr [33 x i16]* %buffer, i64 1, i64 %tmp47
+  %tmp49 = add i64 %tmp48, %indvar
+  %dst.041 = getelementptr i16* %tmp14, i64 %tmp49
+  %tmp29 = load i16* %p.042, align 2
+  store i16 %tmp29, i16* %dst.041, align 2
+  %cmp27 = icmp eq i16* %incdec.ptr32, %add.ptr22
+  %indvar.next = add i64 %indvar, 1
+  br i1 %cmp27, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %do.end
+  %tmp38 = load i32* %mLength, align 4
+  %add = add i32 %tmp38, %conv11
+  store i32 %add, i32* %mLength, align 4
+  ret void
+}
diff --git a/test/Transforms/LoopUnswitch/2011-06-02-CritSwitch.ll b/test/Transforms/LoopUnswitch/2011-06-02-CritSwitch.ll
new file mode 100644
index 0000000..61c54dd
--- /dev/null
+++ b/test/Transforms/LoopUnswitch/2011-06-02-CritSwitch.ll
@@ -0,0 +1,28 @@
+; RUN: opt -loop-unswitch -disable-output
+; PR10031
+
+define i32 @test(i32 %command) {
+entry:
+  br label %tailrecurse
+
+tailrecurse:                                      ; preds = %if.then14, %tailrecurse, %entry
+  br i1 undef, label %if.then, label %tailrecurse
+
+if.then:                                          ; preds = %tailrecurse
+  switch i32 %command, label %sw.bb [
+    i32 2, label %land.lhs.true
+    i32 0, label %land.lhs.true
+  ]
+
+land.lhs.true:                                    ; preds = %if.then, %if.then
+  br i1 undef, label %sw.bb, label %if.then14
+
+if.then14:                                        ; preds = %land.lhs.true
+  switch i32 %command, label %tailrecurse [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb
+  ]
+
+sw.bb:                                            ; preds = %if.then14
+  unreachable
+}
diff --git a/test/Transforms/MemCpyOpt/2011-06-02-CallSlotOverwritten.ll b/test/Transforms/MemCpyOpt/2011-06-02-CallSlotOverwritten.ll
new file mode 100644
index 0000000..132966e
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/2011-06-02-CallSlotOverwritten.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -basicaa -memcpyopt -S | FileCheck %s
+; PR10067
+; Make sure the call+copy isn't optimized in such a way that
+; %ret ends up with the wrong value.
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+target triple = "i386-apple-darwin10"
+
+%struct1 = type { i32, i32 }
+%struct2 = type { %struct1, i8* }
+
+declare void @bar(%struct1* nocapture sret %agg.result) nounwind
+
+define i32 @foo() nounwind {
+  %x = alloca %struct1, align 8
+  %y = alloca %struct2, align 8
+  call void @bar(%struct1* sret %x) nounwind
+; CHECK: call void @bar(%struct1* sret %x)
+
+  %gepn1 = getelementptr inbounds %struct2* %y, i32 0, i32 0, i32 0
+  store i32 0, i32* %gepn1, align 8
+  %gepn2 = getelementptr inbounds %struct2* %y, i32 0, i32 0, i32 1
+  store i32 0, i32* %gepn2, align 4
+
+  %bit1 = bitcast %struct1* %x to i64*
+  %bit2 = bitcast %struct2* %y to i64*
+  %load = load i64* %bit1, align 8
+  store i64 %load, i64* %bit2, align 8
+
+; CHECK: %load = load i64* %bit1, align 8
+; CHECK: store i64 %load, i64* %bit2, align 8
+
+  %gep1 = getelementptr %struct2* %y, i32 0, i32 0, i32 0
+  %ret = load i32* %gep1
+  ret i32 %ret
+}
diff --git a/test/Transforms/MemCpyOpt/memcpy.ll b/test/Transforms/MemCpyOpt/memcpy.ll
index b387d32..5c6a94c 100644
--- a/test/Transforms/MemCpyOpt/memcpy.ll
+++ b/test/Transforms/MemCpyOpt/memcpy.ll
@@ -109,3 +109,23 @@ define void @test6(i8 *%P) {
 ; CHECK-NEXT: ret void
 }
 
+
+; PR9794 - Should forward memcpy into byval argument even though the memcpy
+; isn't itself 8 byte aligned.
+%struct.p = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
+
+define i32 @test7(%struct.p* nocapture byval align 8 %q) nounwind ssp {
+entry:
+  %agg.tmp = alloca %struct.p, align 4
+  %tmp = bitcast %struct.p* %agg.tmp to i8*
+  %tmp1 = bitcast %struct.p* %q to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp, i8* %tmp1, i64 48, i32 4, i1 false)
+  %call = call i32 @g(%struct.p* byval align 8 %agg.tmp) nounwind
+  ret i32 %call
+; CHECK: @test7
+; CHECK: call i32 @g(%struct.p* byval align 8 %q) nounwind
+}
+
+declare i32 @g(%struct.p* byval align 8)
+
+
diff --git a/test/Transforms/MemCpyOpt/memmove.ll b/test/Transforms/MemCpyOpt/memmove.ll
index 8babb04..8d3fbd2 100644
--- a/test/Transforms/MemCpyOpt/memmove.ll
+++ b/test/Transforms/MemCpyOpt/memmove.ll
@@ -11,11 +11,14 @@ entry:
 ; CHECK: @test1
 ; CHECK: call void @llvm.memcpy
 
-  %call3 = malloc [13 x i8]                       ; <[13 x i8]*> [#uses=1]
+  %malloccall = tail call i8* @malloc(i32 trunc (i64 mul nuw (i64 ptrtoint (i8* getelementptr (i8* null, i32 1) to i64), i64 13) to i32))
+  %call3 = bitcast i8* %malloccall to [13 x i8]*
   %call3.sub = getelementptr inbounds [13 x i8]* %call3, i64 0, i64 0 ; <i8*> [#uses=2]
   tail call void @llvm.memmove.i64(i8* %call3.sub, i8* %src, i64 13, i32 1)
   ret i8* %call3.sub
 }
+declare noalias i8* @malloc(i32)
+
 
 define void @test2(i8* %P) nounwind {
 entry:
diff --git a/test/Transforms/ObjCARC/basic.ll b/test/Transforms/ObjCARC/basic.ll
new file mode 100644
index 0000000..a6bbf86
--- /dev/null
+++ b/test/Transforms/ObjCARC/basic.ll
@@ -0,0 +1,1898 @@
+; RUN: opt -objc-arc -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64"
+
+declare i8* @objc_retain(i8*)
+declare void @objc_release(i8*)
+declare i8* @objc_autorelease(i8*)
+declare void @objc_autoreleasePoolPop(i8*)
+declare void @objc_autoreleasePoolPush()
+declare i8* @objc_retainBlock(i8*)
+
+declare i8* @objc_retainedObject(i8*)
+declare i8* @objc_unretainedObject(i8*)
+declare i8* @objc_unretainedPointer(i8*)
+
+declare void @use_pointer(i8*)
+declare void @callee()
+declare void @callee_fnptr(void ()*)
+declare void @invokee()
+declare i8* @returner()
+
+declare void @llvm.dbg.value(metadata, i64, metadata)
+
+declare i8* @objc_msgSend(i8*, i8*, ...)
+
+; Simple retain+release pair deletion, with some intervening control
+; flow and harmless instructions.
+
+; CHECK: define void @test0(
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test0(i32* %x, i1 %p) nounwind {
+entry:
+  %a = bitcast i32* %x to i8*
+  %0 = call i8* @objc_retain(i8* %a) nounwind
+  br i1 %p, label %t, label %f
+
+t:
+  store i8 3, i8* %a
+  %b = bitcast i32* %x to float*
+  store float 2.0, float* %b
+  br label %return
+
+f:
+  store i32 7, i32* %x
+  br label %return
+
+return:
+  %c = bitcast i32* %x to i8*
+  call void @objc_release(i8* %c) nounwind
+  ret void
+}
+
+; Like test0 but the release isn't always executed when the retain is,
+; so the optimization is not safe.
+
+; TODO: Make the objc_release's argument be %0.
+
+; CHECK: define void @test1(
+; CHECK: @objc_retain(i8* %a)
+; CHECK: @objc_release
+; CHECK: }
+define void @test1(i32* %x, i1 %p, i1 %q) nounwind {
+entry:
+  %a = bitcast i32* %x to i8*
+  %0 = call i8* @objc_retain(i8* %a) nounwind
+  br i1 %p, label %t, label %f
+
+t:
+  store i8 3, i8* %a
+  %b = bitcast i32* %x to float*
+  store float 2.0, float* %b
+  br label %return
+
+f:
+  store i32 7, i32* %x
+  call void @callee()
+  br i1 %q, label %return, label %alt_return
+
+return:
+  %c = bitcast i32* %x to i8*
+  call void @objc_release(i8* %c) nounwind
+  ret void
+
+alt_return:
+  ret void
+}
+
+; Like test0 but the pointer is passed to an intervening call,
+; so the optimization is not safe.
+
+; CHECK: define void @test2(
+; CHECK: @objc_retain(i8* %a)
+; CHECK: @objc_release
+; CHECK: }
+define void @test2(i32* %x, i1 %p) nounwind {
+entry:
+  %a = bitcast i32* %x to i8*
+  %0 = call i8* @objc_retain(i8* %a) nounwind
+  br i1 %p, label %t, label %f
+
+t:
+  store i8 3, i8* %a
+  %b = bitcast i32* %x to float*
+  store float 2.0, float* %b
+  br label %return
+
+f:
+  store i32 7, i32* %x
+  call void @use_pointer(i8* %0)
+  %d = bitcast i32* %x to float*
+  store float 3.0, float* %d
+  br label %return
+
+return:
+  %c = bitcast i32* %x to i8*
+  call void @objc_release(i8* %c) nounwind
+  ret void
+}
+
+; Like test0 but the release is in a loop,
+; so the optimization is not safe.
+
+; TODO: For now, assume this can't happen.
+
+; CHECK: define void @test3(
+; TODO: @objc_retain(i8* %a)
+; TODO: @objc_release
+; CHECK: }
+define void @test3(i32* %x, i1* %q) nounwind {
+entry:
+  %a = bitcast i32* %x to i8*
+  %0 = call i8* @objc_retain(i8* %a) nounwind
+  br label %loop
+
+loop:
+  %c = bitcast i32* %x to i8*
+  call void @objc_release(i8* %c) nounwind
+  %j = volatile load i1* %q
+  br i1 %j, label %loop, label %return
+
+return:
+  ret void
+}
+
+; TODO: For now, assume this can't happen.
+
+; Like test0 but the retain is in a loop,
+; so the optimization is not safe.
+
+; CHECK: define void @test4(
+; TODO: @objc_retain(i8* %a)
+; TODO: @objc_release
+; CHECK: }
+define void @test4(i32* %x, i1* %q) nounwind {
+entry:
+  br label %loop
+
+loop:
+  %a = bitcast i32* %x to i8*
+  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %j = volatile load i1* %q
+  br i1 %j, label %loop, label %return
+
+return:
+  %c = bitcast i32* %x to i8*
+  call void @objc_release(i8* %c) nounwind
+  ret void
+}
+
+; Like test0 but the pointer is conditionally passed to an intervening call,
+; so the optimization is not safe.
+
+; CHECK: define void @test5(
+; CHECK: @objc_retain(i8*
+; CHECK: @objc_release
+; CHECK: }
+define void @test5(i32* %x, i1 %q, i8* %y) nounwind {
+entry:
+  %a = bitcast i32* %x to i8*
+  %0 = call i8* @objc_retain(i8* %a) nounwind
+  %s = select i1 %q, i8* %y, i8* %0
+  call void @use_pointer(i8* %s)
+  store i32 7, i32* %x
+  %c = bitcast i32* %x to i8*
+  call void @objc_release(i8* %c) nounwind
+  ret void
+}
+
+; retain+release pair deletion, where the release happens on two different
+; flow paths.
+
+; CHECK: define void @test6(
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test6(i32* %x, i1 %p) nounwind {
+entry:
+  %a = bitcast i32* %x to i8*
+  %0 = call i8* @objc_retain(i8* %a) nounwind
+  br i1 %p, label %t, label %f
+
+t:
+  store i8 3, i8* %a
+  %b = bitcast i32* %x to float*
+  store float 2.0, float* %b
+  %ct = bitcast i32* %x to i8*
+  call void @objc_release(i8* %ct) nounwind
+  br label %return
+
+f:
+  store i32 7, i32* %x
+  call void @callee()
+  %cf = bitcast i32* %x to i8*
+  call void @objc_release(i8* %cf) nounwind
+  br label %return
+
+return:
+  ret void
+}
+
+; retain+release pair deletion, where the retain happens on two different
+; flow paths.
+
+; CHECK: define void @test7(
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test7(i32* %x, i1 %p) nounwind {
+entry:
+  %a = bitcast i32* %x to i8*
+  br i1 %p, label %t, label %f
+
+t:
+  %0 = call i8* @objc_retain(i8* %a) nounwind
+  store i8 3, i8* %a
+  %b = bitcast i32* %x to float*
+  store float 2.0, float* %b
+  br label %return
+
+f:
+  %1 = call i8* @objc_retain(i8* %a) nounwind
+  store i32 7, i32* %x
+  call void @callee()
+  br label %return
+
+return:
+  %c = bitcast i32* %x to i8*
+  call void @objc_release(i8* %c) nounwind
+  ret void
+}
+
+; Like test7, but there's a retain/retainBlock mismatch. Don't delete!
+
+; CHECK: define void @test7b
+; CHECK: t:
+; CHECK: call i8* @objc_retainBlock
+; CHECK: f:
+; CHECK: call i8* @objc_retain
+; CHECK: return:
+; CHECK: call void @objc_release
+; CHECK: }
+define void @test7b(i32* %x, i1 %p) nounwind {
+entry:
+  %a = bitcast i32* %x to i8*
+  br i1 %p, label %t, label %f
+
+t:
+  %0 = call i8* @objc_retainBlock(i8* %a) nounwind
+  store i8 3, i8* %a
+  %b = bitcast i32* %x to float*
+  store float 2.0, float* %b
+  br label %return
+
+f:
+  %1 = call i8* @objc_retain(i8* %a) nounwind
+  store i32 7, i32* %x
+  call void @callee()
+  br label %return
+
+return:
+  %c = bitcast i32* %x to i8*
+  call void @objc_release(i8* %c) nounwind
+  ret void
+}
+
+; retain+release pair deletion, where the retain and release both happen on
+; different flow paths. Wild!
+
+; CHECK: define void @test8(
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test8(i32* %x, i1 %p, i1 %q) nounwind {
+entry:
+  %a = bitcast i32* %x to i8*
+  br i1 %p, label %t, label %f
+
+t:
+  %0 = call i8* @objc_retain(i8* %a) nounwind
+  store i8 3, i8* %a
+  %b = bitcast i32* %x to float*
+  store float 2.0, float* %b
+  br label %mid
+
+f:
+  %1 = call i8* @objc_retain(i8* %a) nounwind
+  store i32 7, i32* %x
+  br label %mid
+
+mid:
+  br i1 %q, label %u, label %g
+
+u:
+  call void @callee()
+  %cu = bitcast i32* %x to i8*
+  call void @objc_release(i8* %cu) nounwind
+  br label %return
+
+g:
+  %cg = bitcast i32* %x to i8*
+  call void @objc_release(i8* %cg) nounwind
+  br label %return
+
+return:
+  ret void
+}
+
+; Trivial retain+release pair deletion.
+
+; CHECK: define void @test9(
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test9(i8* %x) nounwind {
+entry:
+  %0 = call i8* @objc_retain(i8* %x) nounwind
+  call void @objc_release(i8* %0) nounwind
+  ret void
+}
+
+; Retain+release pair, but on an unknown pointer relationship. Don't delete!
+
+; CHECK: define void @test9b
+; CHECK: @objc_retain(i8* %x)
+; CHECK: @objc_release(i8* %s)
+; CHECK: }
+define void @test9b(i8* %x, i1 %j, i8* %p) nounwind {
+entry:
+  %0 = call i8* @objc_retain(i8* %x) nounwind
+  %s = select i1 %j, i8* %x, i8* %p
+  call void @objc_release(i8* %s) nounwind
+  ret void
+}
+
+; Trivial retain+release pair with intervening calls - don't delete!
+
+; CHECK: define void @test10(
+; CHECK: @objc_retain(i8* %x)
+; CHECK: @use_pointer
+; CHECK: @objc_release
+; CHECK: }
+define void @test10(i8* %x) nounwind {
+entry:
+  %0 = call i8* @objc_retain(i8* %x) nounwind
+  call void @use_pointer(i8* %x)
+  call void @use_pointer(i8* %x)
+  call void @objc_release(i8* %0) nounwind
+  ret void
+}
+
+; Trivial retain+autoreleaserelease pair. Don't delete!
+; Also, add a tail keyword, since objc_retain can never be passed
+; a stack argument.
+
+; CHECK: define void @test11(
+; CHECK: tail call i8* @objc_retain(i8* %x) nounwind
+; CHECK: tail call i8* @objc_autorelease(i8* %0) nounwind
+; CHECK: }
+define void @test11(i8* %x) nounwind {
+entry:
+  %0 = call i8* @objc_retain(i8* %x) nounwind
+  call i8* @objc_autorelease(i8* %0) nounwind
+  call void @use_pointer(i8* %x)
+  ret void
+}
+
+; Same as test11 but with no use_pointer call. Delete the pair!
+
+; CHECK: define void @test11a(
+; CHECK: entry:
+; CHECK-NEXT: ret void
+; CHECK: }
+define void @test11a(i8* %x) nounwind {
+entry:
+  %0 = call i8* @objc_retain(i8* %x) nounwind
+  call i8* @objc_autorelease(i8* %0) nounwind
+  ret void
+}
+
+; Same as test11 but the value is returned. Do an RV optimization.
+
+; CHECK: define i8* @test11b(
+; CHECK: tail call i8* @objc_retain(i8* %x) nounwind
+; CHECK: tail call i8* @objc_autoreleaseReturnValue(i8* %0) nounwind
+; CHECK: }
+define i8* @test11b(i8* %x) nounwind {
+entry:
+  %0 = call i8* @objc_retain(i8* %x) nounwind
+  call i8* @objc_autorelease(i8* %0) nounwind
+  ret i8* %x
+}
+
+; Trivial retain,release pair with intervening call, but it's dominated
+; by another retain - delete!
+
+; CHECK: define void @test12(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: @objc_retain(i8* %x)
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test12(i8* %x, i64 %n) {
+entry:
+  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @objc_retain(i8* %x) nounwind
+  call void @use_pointer(i8* %x)
+  call void @use_pointer(i8* %x)
+  call void @objc_release(i8* %x) nounwind
+  ret void
+}
+
+; Trivial retain,autorelease pair. Don't delete!
+
+; CHECK: define void @test13(
+; CHECK: tail call i8* @objc_retain(i8* %x) nounwind
+; CHECK: tail call i8* @objc_retain(i8* %x) nounwind
+; CHECK: @use_pointer(i8* %x)
+; CHECK: tail call i8* @objc_autorelease(i8* %x) nounwind
+; CHECK: }
+define void @test13(i8* %x, i64 %n) {
+entry:
+  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @objc_retain(i8* %x) nounwind
+  call void @use_pointer(i8* %x)
+  call i8* @objc_autorelease(i8* %x) nounwind
+  ret void
+}
+
+; Delete the retain+release pair.
+
+; CHECK: define void @test13b
+; CHECK-NEXT: entry:
+; CHECK-NEXT: @objc_retain(i8* %x)
+; CHECK-NEXT: @use_pointer
+; CHECK-NEXT: @use_pointer
+; CHECK-NEXT: ret void
+define void @test13b(i8* %x, i64 %n) {
+entry:
+  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @objc_retain(i8* %x) nounwind
+  call void @use_pointer(i8* %x)
+  call void @use_pointer(i8* %x)
+  call void @objc_release(i8* %x) nounwind
+  ret void
+}
+
+; Don't delete the retain+release pair because there's an
+; autoreleasePoolPop in the way.
+
+; CHECK: define void @test13c
+; CHECK: @objc_retain(i8* %x)
+; CHECK: @objc_autoreleasePoolPop
+; CHECK: @objc_retain(i8* %x)
+; CHECK: @use_pointer
+; CHECK: @objc_release
+; CHECK: }
+define void @test13c(i8* %x, i64 %n) {
+entry:
+  call i8* @objc_retain(i8* %x) nounwind
+  call void @objc_autoreleasePoolPop(i8* undef)
+  call i8* @objc_retain(i8* %x) nounwind
+  call void @use_pointer(i8* %x)
+  call void @use_pointer(i8* %x)
+  call void @objc_release(i8* %x) nounwind
+  ret void
+}
+
+; Like test13c, but there's an autoreleasePoolPush in the way, but that
+; doesn't matter.
+
+; CHECK: define void @test13d
+; CHECK-NEXT: entry:
+; CHECK-NEXT: @objc_retain(i8* %x)
+; CHECK-NEXT: @objc_autoreleasePoolPush
+; CHECK-NEXT: @use_pointer
+; CHECK-NEXT: @use_pointer
+; CHECK-NEXT: ret void
+define void @test13d(i8* %x, i64 %n) {
+entry:
+  call i8* @objc_retain(i8* %x) nounwind
+  call void @objc_autoreleasePoolPush()
+  call i8* @objc_retain(i8* %x) nounwind
+  call void @use_pointer(i8* %x)
+  call void @use_pointer(i8* %x)
+  call void @objc_release(i8* %x) nounwind
+  ret void
+}
+
+; Trivial retain,release pair with intervening call, but it's post-dominated
+; by another release - delete!
+
+; CHECK: define void @test14(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: @use_pointer
+; CHECK-NEXT: @use_pointer
+; CHECK-NEXT: @objc_release
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+define void @test14(i8* %x, i64 %n) {
+entry:
+  call i8* @objc_retain(i8* %x) nounwind
+  call void @use_pointer(i8* %x)
+  call void @use_pointer(i8* %x)
+  call void @objc_release(i8* %x) nounwind
+  call void @objc_release(i8* %x) nounwind
+  ret void
+}
+
+; Trivial retain,autorelease pair with intervening call, but it's post-dominated
+; by another release. Don't delete anything.
+
+; CHECK: define void @test15(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: @objc_retain(i8* %x)
+; CHECK-NEXT: @use_pointer
+; CHECK-NEXT: @objc_autorelease(i8* %x)
+; CHECK-NEXT: @objc_release
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+define void @test15(i8* %x, i64 %n) {
+entry:
+  call i8* @objc_retain(i8* %x) nounwind
+  call void @use_pointer(i8* %x)
+  call i8* @objc_autorelease(i8* %x) nounwind
+  call void @objc_release(i8* %x) nounwind
+  ret void
+}
+
+; Trivial retain,autorelease pair, post-dominated
+; by another release. Delete the retain and release.
+
+; CHECK: define void @test15b
+; CHECK-NEXT: entry:
+; CHECK-NEXT: @objc_autorelease
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+define void @test15b(i8* %x, i64 %n) {
+entry:
+  call i8* @objc_retain(i8* %x) nounwind
+  call i8* @objc_autorelease(i8* %x) nounwind
+  call void @objc_release(i8* %x) nounwind
+  ret void
+}
+
+; Retain+release pairs in diamonds, all dominated by a retain.
+
+; CHECK: define void @test16(
+; CHECK: @objc_retain(i8* %x)
+; CHECK-NOT: @objc
+; CHECK: }
+define void @test16(i1 %a, i1 %b, i8* %x) {
+entry:
+  call i8* @objc_retain(i8* %x) nounwind
+  br i1 %a, label %red, label %orange
+
+red:
+  call i8* @objc_retain(i8* %x) nounwind
+  br label %yellow
+
+orange:
+  call i8* @objc_retain(i8* %x) nounwind
+  br label %yellow
+
+yellow:
+  call void @use_pointer(i8* %x)
+  call void @use_pointer(i8* %x)
+  br i1 %b, label %green, label %blue
+
+green:
+  call void @objc_release(i8* %x) nounwind
+  br label %purple
+
+blue:
+  call void @objc_release(i8* %x) nounwind
+  br label %purple
+
+purple:
+  ret void
+}
+
+; Retain+release pairs in diamonds, all post-dominated by a release.
+
+; CHECK: define void @test17(
+; CHECK-NOT: @objc_
+; CHECK: purple:
+; CHECK: @objc_release
+; CHECK: }
+define void @test17(i1 %a, i1 %b, i8* %x) {
+entry:
+  br i1 %a, label %red, label %orange
+
+red:
+  call i8* @objc_retain(i8* %x) nounwind
+  br label %yellow
+
+orange:
+  call i8* @objc_retain(i8* %x) nounwind
+  br label %yellow
+
+yellow:
+  call void @use_pointer(i8* %x)
+  call void @use_pointer(i8* %x)
+  br i1 %b, label %green, label %blue
+
+green:
+  call void @objc_release(i8* %x) nounwind
+  br label %purple
+
+blue:
+  call void @objc_release(i8* %x) nounwind
+  br label %purple
+
+purple:
+  call void @objc_release(i8* %x) nounwind
+  ret void
+}
+
+; Delete no-ops.
+
+; CHECK: define void @test18(
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test18() {
+  call i8* @objc_retain(i8* null)
+  call void @objc_release(i8* null)
+  call i8* @objc_autorelease(i8* null)
+  ret void
+}
+
+; Delete no-ops where undef can be assumed to be null.
+
+; CHECK: define void @test18b
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test18b() {
+  call i8* @objc_retain(i8* undef)
+  call void @objc_release(i8* undef)
+  call i8* @objc_autorelease(i8* undef)
+  ret void
+}
+
+; Replace uses of arguments with uses of return values, to reduce
+; register pressure.
+
+; CHECK: define void @test19(i32* %y) {
+; CHECK:   %z = bitcast i32* %y to i8*
+; CHECK:   %0 = bitcast i32* %y to i8*
+; CHECK:   %1 = tail call i8* @objc_retain(i8* %0)
+; CHECK:   call void @use_pointer(i8* %z)
+; CHECK:   call void @use_pointer(i8* %z)
+; CHECK:   %2 = bitcast i32* %y to i8*
+; CHECK:   call void @objc_release(i8* %2)
+; CHECK:   ret void
+; CHECK: }
+define void @test19(i32* %y) {
+entry:
+  %x = bitcast i32* %y to i8*
+  %0 = call i8* @objc_retain(i8* %x) nounwind
+  %z = bitcast i32* %y to i8*
+  call void @use_pointer(i8* %z)
+  call void @use_pointer(i8* %z)
+  call void @objc_release(i8* %x)
+  ret void
+}
+
+; Bitcast insertion
+
+; CHECK: define void @test20(
+; CHECK: %tmp1 = tail call i8* @objc_retain(i8* %tmp) nounwind
+; CHECK-NEXT: invoke
+define void @test20(double* %self) {
+if.then12:
+  %tmp = bitcast double* %self to i8*
+  %tmp1 = call i8* @objc_retain(i8* %tmp) nounwind
+  invoke void @invokee()
+          to label %invoke.cont23 unwind label %lpad20
+
+invoke.cont23:                                    ; preds = %if.then12
+  invoke void @invokee()
+          to label %if.end unwind label %lpad20
+
+lpad20:                                           ; preds = %invoke.cont23, %if.then12
+  %tmp502 = phi double* [ undef, %invoke.cont23 ], [ %self, %if.then12 ]
+  unreachable
+
+if.end:                                           ; preds = %invoke.cont23
+  ret void
+}
+
+; Delete a redundant retain,autorelease when forwaring a call result
+; directly to a return value.
+
+; CHECK: define i8* @test21(
+; CHECK: call i8* @returner()
+; CHECK-NEXT: ret i8* %call
+define i8* @test21() {
+entry:
+  %call = call i8* @returner()
+  %0 = call i8* @objc_retain(i8* %call) nounwind
+  %1 = call i8* @objc_autorelease(i8* %0) nounwind
+  ret i8* %1
+}
+
+; Move an objc call up through a phi that has null operands.
+
+; CHECK: define void @test22(
+; CHECK: B:
+; CHECK:   %1 = bitcast double* %p to i8*
+; CHECK:   call void @objc_release(i8* %1)
+; CHECK:   br label %C
+; CHECK: C:                                                ; preds = %B, %A
+; CHECK-NOT: @objc_release
+; CHECK: }
+define void @test22(double* %p, i1 %a) {
+  br i1 %a, label %A, label %B
+A:
+  br label %C
+B:
+  br label %C
+C:
+  %h = phi double* [ null, %A ], [ %p, %B ]
+  %c = bitcast double* %h to i8*
+  call void @objc_release(i8* %c)
+  ret void
+}
+
+; Optimize objc_retainBlock.
+
+; CHECK: define void @test23(
+; CHECK-NOT: @objc_
+; CHECK: }
+%block0 = type { i64, i64, i8*, i8* }
+%block1 = type { i8**, i32, i32, i32 (%struct.__block_literal_1*)*, %block0* }
+%struct.__block_descriptor = type { i64, i64 }
+%struct.__block_literal_1 = type { i8**, i32, i32, i8**, %struct.__block_descriptor* }
+@__block_holder_tmp_1 = external constant %block1
+define void @test23() {
+entry:
+  %0 = call i8* @objc_retainBlock(i8* bitcast (%block1* @__block_holder_tmp_1 to i8*)) nounwind
+  call void @bar(i32 ()* bitcast (%block1* @__block_holder_tmp_1 to i32 ()*))
+  call void @bar(i32 ()* bitcast (%block1* @__block_holder_tmp_1 to i32 ()*))
+  call void @objc_release(i8* bitcast (%block1* @__block_holder_tmp_1 to i8*)) nounwind
+  ret void
+}
+
+; Don't optimize objc_retainBlock.
+
+; CHECK: define void @test23b
+; CHECK: @objc_retainBlock
+; CHECK: @objc_release
+; CHECK: }
+define void @test23b(i8* %p) {
+entry:
+  %0 = call i8* @objc_retainBlock(i8* %p) nounwind
+  call void @use_pointer(i8* %p)
+  call void @use_pointer(i8* %p)
+  call void @objc_release(i8* %p) nounwind
+  ret void
+}
+
+; Any call can decrement a retain count.
+
+; CHECK: define void @test24(
+; CHECK: @objc_retain(i8* %a)
+; CHECK: @objc_release
+; CHECK: }
+define void @test24(i8* %r, i8* %a) {
+  call i8* @objc_retain(i8* %a)
+  call void @use_pointer(i8* %r)
+  %q = load i8* %a
+  call void @objc_release(i8* %a)
+  ret void
+}
+
+; Don't move a retain/release pair if the release can be moved
+; but the retain can't be moved to balance it.
+
+; CHECK: define void @test25(
+; CHECK: entry:
+; CHECK:   call i8* @objc_retain(i8* %p)
+; CHECK: true:
+; CHECK: done:
+; CHECK:   call void @objc_release(i8* %p)
+; CHECK: }
+define void @test25(i8* %p, i1 %x) {
+entry:
+  %f0 = call i8* @objc_retain(i8* %p)
+  call void @callee()
+  br i1 %x, label %true, label %done
+
+true:
+  store i8 0, i8* %p
+  br label %done
+
+done:
+  call void @objc_release(i8* %p)
+  ret void
+}
+
+; Don't move a retain/release pair if the retain can be moved
+; but the release can't be moved to balance it.
+
+; CHECK: define void @test26(
+; CHECK: entry:
+; CHECK:   call i8* @objc_retain(i8* %p)
+; CHECK: true:
+; CHECK: done:
+; CHECK:   call void @objc_release(i8* %p)
+; CHECK: }
+define void @test26(i8* %p, i1 %x) {
+entry:
+  %f0 = call i8* @objc_retain(i8* %p)
+  br i1 %x, label %true, label %done
+
+true:
+  call void @callee()
+  br label %done
+
+done:
+  store i8 0, i8* %p
+  call void @objc_release(i8* %p)
+  ret void
+}
+
+; Don't sink the retain,release into the loop.
+
+; CHECK: define void @test27(
+; CHECK: entry:
+; CHECK: call i8* @objc_retain(i8* %p)
+; CHECK: loop:
+; CHECK-NOT: @objc_
+; CHECK: done:
+; CHECK: call void @objc_release
+; CHECK: }
+define void @test27(i8* %p, i1 %x, i1 %y) {
+entry: 
+  %f0 = call i8* @objc_retain(i8* %p)
+  br i1 %x, label %loop, label %done
+
+loop:
+  call void @callee()
+  store i8 0, i8* %p
+  br i1 %y, label %done, label %loop
+  
+done: 
+  call void @objc_release(i8* %p)
+  ret void
+}
+
+; Trivial code motion case: Triangle.
+
+; CHECK: define void @test28(
+; CHECK-NOT: @objc_
+; CHECK: true:
+; CHECK: call i8* @objc_retain(
+; CHECK: call void @callee()
+; CHECK: store
+; CHECK: call void @objc_release
+; CHECK: done:
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test28(i8* %p, i1 %x) {
+entry:
+  %f0 = call i8* @objc_retain(i8* %p)
+  br i1 %x, label %true, label %done
+
+true:
+  call void @callee()
+  store i8 0, i8* %p
+  br label %done
+
+done:
+  call void @objc_release(i8* %p), !clang.imprecise_release !0
+  ret void
+}
+
+; Trivial code motion case: Triangle, but no metadata. Don't move past
+; unrelated memory references!
+
+; CHECK: define void @test28b
+; CHECK: call i8* @objc_retain(
+; CHECK: true:
+; CHECK-NOT: @objc_
+; CHECK: call void @callee()
+; CHECK-NOT: @objc_
+; CHECK: store
+; CHECK-NOT: @objc_
+; CHECK: done:
+; CHECK: @objc_release
+; CHECK: }
+define void @test28b(i8* %p, i1 %x, i8* noalias %t) {
+entry:
+  %f0 = call i8* @objc_retain(i8* %p)
+  br i1 %x, label %true, label %done
+
+true:
+  call void @callee()
+  store i8 0, i8* %p
+  br label %done
+
+done:
+  store i8 0, i8* %t
+  call void @objc_release(i8* %p)
+  ret void
+}
+
+; Trivial code motion case: Triangle, with metadata. Do move past
+; unrelated memory references! And preserve the metadata.
+
+; CHECK: define void @test28c
+; CHECK-NOT: @objc_
+; CHECK: true:
+; CHECK: call i8* @objc_retain(
+; CHECK: call void @callee()
+; CHECK: store
+; CHECK: call void @objc_release(i8* %p) nounwind, !clang.imprecise_release
+; CHECK: done:
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test28c(i8* %p, i1 %x, i8* noalias %t) {
+entry:
+  %f0 = call i8* @objc_retain(i8* %p)
+  br i1 %x, label %true, label %done
+
+true:
+  call void @callee()
+  store i8 0, i8* %p
+  br label %done
+
+done:
+  store i8 0, i8* %t
+  call void @objc_release(i8* %p), !clang.imprecise_release !0
+  ret void
+}
+
+; Like test28. but with two releases.
+
+; CHECK: define void @test29(
+; CHECK-NOT: @objc_
+; CHECK: true:
+; CHECK: call i8* @objc_retain(
+; CHECK: call void @callee()
+; CHECK: store
+; CHECK: call void @objc_release
+; CHECK-NOT: @objc_release
+; CHECK: done:
+; CHECK-NOT: @objc_
+; CHECK: ohno:
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test29(i8* %p, i1 %x, i1 %y) {
+entry:
+  %f0 = call i8* @objc_retain(i8* %p)
+  br i1 %x, label %true, label %done
+
+true:
+  call void @callee()
+  store i8 0, i8* %p
+  br i1 %y, label %done, label %ohno
+
+done:
+  call void @objc_release(i8* %p)
+  ret void
+
+ohno:
+  call void @objc_release(i8* %p)
+  ret void
+}
+
+; Basic case with the use and call in a diamond
+; with an extra release.
+
+; CHECK: define void @test30(
+; CHECK-NOT: @objc_
+; CHECK: true:
+; CHECK: call i8* @objc_retain(
+; CHECK: call void @callee()
+; CHECK: store
+; CHECK: call void @objc_release
+; CHECK-NOT: @objc_release
+; CHECK: false:
+; CHECK-NOT: @objc_
+; CHECK: done:
+; CHECK-NOT: @objc_
+; CHECK: ohno:
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test30(i8* %p, i1 %x, i1 %y, i1 %z) {
+entry:
+  %f0 = call i8* @objc_retain(i8* %p)
+  br i1 %x, label %true, label %false
+
+true:
+  call void @callee()
+  store i8 0, i8* %p
+  br i1 %y, label %done, label %ohno
+
+false:
+  br i1 %z, label %done, label %ohno
+
+done:
+  call void @objc_release(i8* %p)
+  ret void
+
+ohno:
+  call void @objc_release(i8* %p)
+  ret void
+}
+
+; Basic case with a mergeable release.
+
+; CHECK: define void @test31(
+; CHECK: call i8* @objc_retain(i8* %p)
+; CHECK: call void @callee()
+; CHECK: store
+; CHECK: call void @objc_release
+; CHECK-NOT: @objc_release
+; CHECK: true:
+; CHECK-NOT: @objc_release
+; CHECK: false:
+; CHECK-NOT: @objc_release
+; CHECK: ret void
+; CHECK-NOT: @objc_release
+; CHECK: }
+define void @test31(i8* %p, i1 %x) {
+entry:
+  %f0 = call i8* @objc_retain(i8* %p)
+  call void @callee()
+  store i8 0, i8* %p
+  br i1 %x, label %true, label %false
+true:
+  call void @objc_release(i8* %p)
+  ret void
+false:
+  call void @objc_release(i8* %p)
+  ret void
+}
+
+; Don't consider bitcasts or getelementptrs direct uses.
+
+; CHECK: define void @test32(
+; CHECK-NOT: @objc_
+; CHECK: true:
+; CHECK: call i8* @objc_retain(
+; CHECK: call void @callee()
+; CHECK: store
+; CHECK: call void @objc_release
+; CHECK: done:
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test32(i8* %p, i1 %x) {
+entry:
+  %f0 = call i8* @objc_retain(i8* %p)
+  br i1 %x, label %true, label %done
+
+true:
+  call void @callee()
+  store i8 0, i8* %p
+  br label %done
+
+done:
+  %g = bitcast i8* %p to i8*
+  %h = getelementptr i8* %g, i64 0
+  call void @objc_release(i8* %g)
+  ret void
+}
+
+; Do consider icmps to be direct uses.
+
+; CHECK: define void @test33(
+; CHECK-NOT: @objc_
+; CHECK: true:
+; CHECK: call i8* @objc_retain(
+; CHECK: call void @callee()
+; CHECK: icmp
+; CHECK: call void @objc_release
+; CHECK: done:
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test33(i8* %p, i1 %x, i8* %y) {
+entry:
+  %f0 = call i8* @objc_retain(i8* %p)
+  br i1 %x, label %true, label %done
+
+true:
+  call void @callee()
+  %v = icmp eq i8* %p, %y
+  br label %done
+
+done:
+  %g = bitcast i8* %p to i8*
+  %h = getelementptr i8* %g, i64 0
+  call void @objc_release(i8* %g)
+  ret void
+}
+
+; Delete retain,release if there's just a possible dec.
+
+; CHECK: define void @test34(
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test34(i8* %p, i1 %x, i8* %y) {
+entry:
+  %f0 = call i8* @objc_retain(i8* %p)
+  br i1 %x, label %true, label %done
+
+true:
+  call void @callee()
+  br label %done
+
+done:
+  %g = bitcast i8* %p to i8*
+  %h = getelementptr i8* %g, i64 0
+  call void @objc_release(i8* %g)
+  ret void
+}
+
+; Delete retain,release if there's just a use.
+
+; CHECK: define void @test35(
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test35(i8* %p, i1 %x, i8* %y) {
+entry:
+  %f0 = call i8* @objc_retain(i8* %p)
+  br i1 %x, label %true, label %done
+
+true:
+  %v = icmp eq i8* %p, %y
+  br label %done
+
+done:
+  %g = bitcast i8* %p to i8*
+  %h = getelementptr i8* %g, i64 0
+  call void @objc_release(i8* %g)
+  ret void
+}
+
+; Delete a retain,release if there's no actual use.
+
+; CHECK: define void @test36(
+; CHECK-NOT: @objc_
+; CHECK: call void @callee()
+; CHECK-NOT: @objc_
+; CHECK: call void @callee()
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test36(i8* %p) {
+entry:
+  call i8* @objc_retain(i8* %p)
+  call void @callee()
+  call void @callee()
+  call void @objc_release(i8* %p)
+  ret void
+}
+
+; Like test36, but with metadata.
+
+; CHECK: define void @test37(
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test37(i8* %p) {
+entry:
+  call i8* @objc_retain(i8* %p)
+  call void @callee()
+  call void @callee()
+  call void @objc_release(i8* %p), !clang.imprecise_release !0
+  ret void
+}
+
+; Be aggressive about analyzing phis to eliminate possible uses.
+
+; CHECK: define void @test38(
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test38(i8* %p, i1 %u, i1 %m, i8* %z, i8* %y, i8* %x, i8* %w) {
+entry:
+  call i8* @objc_retain(i8* %p)
+  br i1 %u, label %true, label %false
+true:
+  br i1 %m, label %a, label %b
+false:
+  br i1 %m, label %c, label %d
+a:
+  br label %e
+b:
+  br label %e
+c:
+  br label %f
+d:
+  br label %f
+e:
+  %j = phi i8* [ %z, %a ], [ %y, %b ]
+  br label %g
+f:
+  %k = phi i8* [ %w, %c ], [ %x, %d ]
+  br label %g
+g:
+  %h = phi i8* [ %j, %e ], [ %k, %f ]
+  call void @use_pointer(i8* %h)
+  call void @objc_release(i8* %p), !clang.imprecise_release !0
+  ret void
+}
+
+; Delete retain,release pairs around loops.
+
+; CHECK: define void @test39(
+; CHECK_NOT: @objc_
+; CHECK: }
+define void @test39(i8* %p) {
+entry:
+  %0 = call i8* @objc_retain(i8* %p)
+  br label %loop
+
+loop:                                             ; preds = %loop, %entry
+  br i1 undef, label %loop, label %exit
+
+exit:                                             ; preds = %loop
+  call void @objc_release(i8* %0), !clang.imprecise_release !0
+  ret void
+}
+
+; Delete retain,release pairs around loops containing uses.
+
+; CHECK: define void @test39b(
+; CHECK_NOT: @objc_
+; CHECK: }
+define void @test39b(i8* %p) {
+entry:
+  %0 = call i8* @objc_retain(i8* %p)
+  br label %loop
+
+loop:                                             ; preds = %loop, %entry
+  store i8 0, i8* %0
+  br i1 undef, label %loop, label %exit
+
+exit:                                             ; preds = %loop
+  call void @objc_release(i8* %0), !clang.imprecise_release !0
+  ret void
+}
+
+; Delete retain,release pairs around loops containing potential decrements.
+
+; CHECK: define void @test39c(
+; CHECK_NOT: @objc_
+; CHECK: }
+define void @test39c(i8* %p) {
+entry:
+  %0 = call i8* @objc_retain(i8* %p)
+  br label %loop
+
+loop:                                             ; preds = %loop, %entry
+  call void @use_pointer(i8* %0)
+  br i1 undef, label %loop, label %exit
+
+exit:                                             ; preds = %loop
+  call void @objc_release(i8* %0), !clang.imprecise_release !0
+  ret void
+}
+
+; Delete retain,release pairs around loops even if
+; the successors are in a different order.
+
+; CHECK: define void @test40(
+; CHECK_NOT: @objc_
+; CHECK: }
+define void @test40(i8* %p) {
+entry:
+  %0 = call i8* @objc_retain(i8* %p)
+  br label %loop
+
+loop:                                             ; preds = %loop, %entry
+  call void @use_pointer(i8* %0)
+  br i1 undef, label %exit, label %loop
+
+exit:                                             ; preds = %loop
+  call void @objc_release(i8* %0), !clang.imprecise_release !0
+  ret void
+}
+
+; Do the known-incremented retain+release elimination even if the pointer
+; is also autoreleased.
+
+; CHECK: define void @test42(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: call i8* @objc_retain(i8* %p)
+; CHECK-NEXT: call i8* @objc_autorelease(i8* %p)
+; CHECK-NEXT: call void @use_pointer(i8* %p)
+; CHECK-NEXT: call void @use_pointer(i8* %p)
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+define void @test42(i8* %p) {
+entry:
+  call i8* @objc_retain(i8* %p)
+  call i8* @objc_autorelease(i8* %p)
+  call i8* @objc_retain(i8* %p)
+  call void @use_pointer(i8* %p)
+  call void @use_pointer(i8* %p)
+  call void @objc_release(i8* %p)
+  ret void
+}
+
+; Don't the known-incremented retain+release elimination if the pointer is
+; autoreleased and there's an autoreleasePoolPop.
+
+; CHECK: define void @test43(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: call i8* @objc_retain(i8* %p)
+; CHECK-NEXT: call i8* @objc_autorelease(i8* %p)
+; CHECK-NEXT: call i8* @objc_retain
+; CHECK-NEXT: call void @use_pointer(i8* %p)
+; CHECK-NEXT: call void @use_pointer(i8* %p)
+; CHECK-NEXT: call void @objc_autoreleasePoolPop(i8* undef)
+; CHECK-NEXT: call void @objc_release
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+define void @test43(i8* %p) {
+entry:
+  call i8* @objc_retain(i8* %p)
+  call i8* @objc_autorelease(i8* %p)
+  call i8* @objc_retain(i8* %p)
+  call void @use_pointer(i8* %p)
+  call void @use_pointer(i8* %p)
+  call void @objc_autoreleasePoolPop(i8* undef)
+  call void @objc_release(i8* %p)
+  ret void
+}
+
+; Do the known-incremented retain+release elimination if the pointer is
+; autoreleased and there's an autoreleasePoolPush.
+
+; CHECK: define void @test43b
+; CHECK-NEXT: entry:
+; CHECK-NEXT: call i8* @objc_retain(i8* %p)
+; CHECK-NEXT: call i8* @objc_autorelease(i8* %p)
+; CHECK-NEXT: call void @use_pointer(i8* %p)
+; CHECK-NEXT: call void @use_pointer(i8* %p)
+; CHECK-NEXT: call void @objc_autoreleasePoolPush()
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+define void @test43b(i8* %p) {
+entry:
+  call i8* @objc_retain(i8* %p)
+  call i8* @objc_autorelease(i8* %p)
+  call i8* @objc_retain(i8* %p)
+  call void @use_pointer(i8* %p)
+  call void @use_pointer(i8* %p)
+  call void @objc_autoreleasePoolPush()
+  call void @objc_release(i8* %p)
+  ret void
+}
+
+; Do retain+release elimination for non-provenance pointers.
+
+; CHECK: define void @test44(
+; CHECK-NOT: objc_
+; CHECK: }
+define void @test44(i8** %pp) {
+  %p = load i8** %pp
+  %q = call i8* @objc_retain(i8* %p)
+  call void @objc_release(i8* %q)
+  ret void
+}
+
+; Don't delete retain+release with an unknown-provenance
+; may-alias objc_release between them.
+
+; CHECK: define void @test45(
+; CHECK: call i8* @objc_retain(i8* %p)
+; CHECK: call void @objc_release(i8* %q)
+; CHECK: call void @use_pointer(i8* %p)
+; CHECK: call void @objc_release(i8* %p)
+define void @test45(i8** %pp, i8** %qq) {
+  %p = load i8** %pp
+  %q = load i8** %qq
+  call i8* @objc_retain(i8* %p)
+  call void @objc_release(i8* %q)
+  call void @use_pointer(i8* %p)
+  call void @objc_release(i8* %p)
+  ret void
+}
+
+; Don't delete retain and autorelease here.
+
+; CHECK: define void @test46(
+; CHECK: tail call i8* @objc_retain(i8* %p) nounwind
+; CHECK: true:
+; CHECK: tail call i8* @objc_autorelease(i8* %p) nounwind
+define void @test46(i8* %p, i1 %a) {
+entry:
+  call i8* @objc_retain(i8* %p)
+  br i1 %a, label %true, label %false
+
+true:
+  call i8* @objc_autorelease(i8* %p)
+  call void @use_pointer(i8* %p)
+  ret void
+
+false:
+  ret void
+}
+
+; Delete no-op cast calls.
+
+; CHECK: define i8* @test47(
+; CHECK-NOT: call
+; CHECK: ret i8* %p
+define i8* @test47(i8* %p) nounwind {
+  %x = call i8* @objc_retainedObject(i8* %p)
+  ret i8* %x
+}
+
+; Delete no-op cast calls.
+
+; CHECK: define i8* @test48(
+; CHECK-NOT: call
+; CHECK: ret i8* %p
+define i8* @test48(i8* %p) nounwind {
+  %x = call i8* @objc_unretainedObject(i8* %p)
+  ret i8* %x
+}
+
+; Delete no-op cast calls.
+
+; CHECK: define i8* @test49(
+; CHECK-NOT: call
+; CHECK: ret i8* %p
+define i8* @test49(i8* %p) nounwind {
+  %x = call i8* @objc_unretainedPointer(i8* %p)
+  ret i8* %x
+}
+
+; Do delete retain+release with intervening stores of the
+; address value;
+
+; CHECK: define void @test50(
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test50(i8* %p, i8** %pp) {
+  call i8* @objc_retain(i8* %p)
+  call void @callee()
+  store i8* %p, i8** %pp
+  call void @objc_release(i8* %p)
+  ret void
+}
+
+; Don't delete retain+release with intervening stores through the
+; address value.
+
+; CHECK: define void @test51(
+; CHECK: call i8* @objc_retain(i8* %p)
+; CHECK: call void @objc_release(i8* %p)
+define void @test51(i8* %p) {
+  call i8* @objc_retain(i8* %p)
+  call void @callee()
+  store i8 0, i8* %p
+  call void @objc_release(i8* %p)
+  ret void
+}
+
+; Don't delete retain+release with intervening use of a pointer of
+; unknown provenance.
+
+; CHECK: define void @test52(
+; CHECK: call i8* @objc_retain
+; CHECK: call void @callee()
+; CHECK: call void @use_pointer(i8* %z)
+; CHECK: call void @objc_release
+define void @test52(i8** %zz, i8** %pp) {
+  %p = load i8** %pp
+  %1 = call i8* @objc_retain(i8* %p)
+  call void @callee()
+  %z = load i8** %zz
+  call void @use_pointer(i8* %z)
+  call void @objc_release(i8* %p)
+  ret void
+}
+
+; Like test52, but the pointer has function type, so it's assumed to
+; be not reference counted.
+
+; CHECK: define void @test53(
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test53(void ()** %zz, i8** %pp) {
+  %p = load i8** %pp
+  %1 = call i8* @objc_retain(i8* %p)
+  call void @callee()
+  %z = load void ()** %zz
+  call void @callee_fnptr(void ()* %z)
+  call void @objc_release(i8* %p)
+  ret void
+}
+
+; Convert autorelease to release if the value is unused.
+
+; CHECK: define void @test54(
+; CHECK: call i8* @returner()
+; CHECK-NEXT: call void @objc_release(i8* %t) nounwind, !clang.imprecise_release !0
+; CHECK-NEXT: ret void
+define void @test54() {
+  %t = call i8* @returner()
+  call i8* @objc_autorelease(i8* %t)
+  ret void
+}
+
+; Nested retain+release pairs. Delete them both.
+
+; CHECK: define void @test55(
+; CHECK-NOT: @objc
+; CHECK: }
+define void @test55(i8* %x) { 
+entry: 
+  %0 = call i8* @objc_retain(i8* %x) nounwind 
+  %1 = call i8* @objc_retain(i8* %x) nounwind 
+  call void @objc_release(i8* %x) nounwind 
+  call void @objc_release(i8* %x) nounwind 
+  ret void 
+}
+
+; Nested retain+release pairs where the inner pair depends
+; on the outer pair to be removed, and then the outer pair
+; can be partially eliminated. Plus an extra outer pair to
+; eliminate, for fun.
+
+; CHECK: define void @test56(
+; CHECK-NOT: @objc
+; CHECK: if.then:
+; CHECK-NEXT: %0 = tail call i8* @objc_retain(i8* %x) nounwind
+; CHECK-NEXT: tail call void @use_pointer(i8* %x)
+; CHECK-NEXT: tail call void @use_pointer(i8* %x)
+; CHECK-NEXT: tail call void @objc_release(i8* %x) nounwind, !clang.imprecise_release !0
+; CHECK-NEXT: br label %if.end
+; CHECK-NOT: @objc
+; CHECK: }
+define void @test56(i8* %x, i32 %n) {
+entry:
+  %0 = tail call i8* @objc_retain(i8* %x) nounwind
+  %1 = tail call i8* @objc_retain(i8* %0) nounwind
+  %tobool = icmp eq i32 %n, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %2 = tail call i8* @objc_retain(i8* %1) nounwind
+  tail call void @use_pointer(i8* %2)
+  tail call void @use_pointer(i8* %2)
+  tail call void @objc_release(i8* %2) nounwind, !clang.imprecise_release !0
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  tail call void @objc_release(i8* %1) nounwind, !clang.imprecise_release !0
+  tail call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
+  ret void
+}
+
+declare void @bar(i32 ()*)
+
+; A few real-world testcases.
+
+@.str4 = private unnamed_addr constant [33 x i8] c"-[A z] = { %f, %f, { %f, %f } }\0A\00"
+@"OBJC_IVAR_$_A.myZ" = global i64 20, section "__DATA, __objc_const", align 8
+declare i32 @printf(i8* nocapture, ...) nounwind
+declare i32 @puts(i8* nocapture) nounwind
+@str = internal constant [16 x i8] c"-[ Top0 _getX ]\00"
+
+; CHECK: @"\01-[A z]"
+; CHECK-NOT: @objc_
+; CHECK: }
+
+define {<2 x float>, <2 x float>} @"\01-[A z]"({}* %self, i8* nocapture %_cmd) nounwind {
+invoke.cont:
+  %0 = bitcast {}* %self to i8*
+  %1 = tail call i8* @objc_retain(i8* %0) nounwind
+  tail call void @llvm.dbg.value(metadata !{{}* %self}, i64 0, metadata !0)
+  tail call void @llvm.dbg.value(metadata !{{}* %self}, i64 0, metadata !0)
+  %ivar = load i64* @"OBJC_IVAR_$_A.myZ", align 8
+  %add.ptr = getelementptr i8* %0, i64 %ivar
+  %tmp1 = bitcast i8* %add.ptr to float*
+  %tmp2 = load float* %tmp1, align 4
+  %conv = fpext float %tmp2 to double
+  %add.ptr.sum = add i64 %ivar, 4
+  %tmp6 = getelementptr inbounds i8* %0, i64 %add.ptr.sum
+  %2 = bitcast i8* %tmp6 to float*
+  %tmp7 = load float* %2, align 4
+  %conv8 = fpext float %tmp7 to double
+  %add.ptr.sum36 = add i64 %ivar, 8
+  %tmp12 = getelementptr inbounds i8* %0, i64 %add.ptr.sum36
+  %arrayidx = bitcast i8* %tmp12 to float*
+  %tmp13 = load float* %arrayidx, align 4
+  %conv14 = fpext float %tmp13 to double
+  %tmp12.sum = add i64 %ivar, 12
+  %arrayidx19 = getelementptr inbounds i8* %0, i64 %tmp12.sum
+  %3 = bitcast i8* %arrayidx19 to float*
+  %tmp20 = load float* %3, align 4
+  %conv21 = fpext float %tmp20 to double
+  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([33 x i8]* @.str4, i64 0, i64 0), double %conv, double %conv8, double %conv14, double %conv21)
+  %ivar23 = load i64* @"OBJC_IVAR_$_A.myZ", align 8
+  %add.ptr24 = getelementptr i8* %0, i64 %ivar23
+  %4 = bitcast i8* %add.ptr24 to i128*
+  %srcval = load i128* %4, align 4
+  tail call void @objc_release(i8* %0) nounwind
+  %tmp29 = trunc i128 %srcval to i64
+  %tmp30 = bitcast i64 %tmp29 to <2 x float>
+  %tmp31 = insertvalue {<2 x float>, <2 x float>} undef, <2 x float> %tmp30, 0
+  %tmp32 = lshr i128 %srcval, 64
+  %tmp33 = trunc i128 %tmp32 to i64
+  %tmp34 = bitcast i64 %tmp33 to <2 x float>
+  %tmp35 = insertvalue {<2 x float>, <2 x float>} %tmp31, <2 x float> %tmp34, 1
+  ret {<2 x float>, <2 x float>} %tmp35
+}
+
+; CHECK: @"\01-[Top0 _getX]"
+; CHECK-NOT: @objc_
+; CHECK: }
+
+define i32 @"\01-[Top0 _getX]"({}* %self, i8* nocapture %_cmd) nounwind {
+invoke.cont:
+  %0 = bitcast {}* %self to i8*
+  %1 = tail call i8* @objc_retain(i8* %0) nounwind
+  %puts = tail call i32 @puts(i8* getelementptr inbounds ([16 x i8]* @str, i64 0, i64 0))
+  tail call void @objc_release(i8* %0) nounwind
+  ret i32 0
+}
+
+@"\01L_OBJC_METH_VAR_NAME_" = internal global [5 x i8] c"frob\00", section "__TEXT,__cstring,cstring_literals", align 1@"\01L_OBJC_SELECTOR_REFERENCES_" = internal global i8* getelementptr inbounds ([5 x i8]* @"\01L_OBJC_METH_VAR_NAME_", i64 0, i64 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_IMAGE_INFO" = internal constant [2 x i32] [i32 0, i32 16], section "__DATA, __objc_imageinfo, regular, no_dead_strip"
+@llvm.used = appending global [3 x i8*] [i8* getelementptr inbounds ([5 x i8]* @"\01L_OBJC_METH_VAR_NAME_", i32 0, i32 0), i8* bitcast (i8** @"\01L_OBJC_SELECTOR_REFERENCES_" to i8*), i8* bitcast ([2 x i32]* @"\01L_OBJC_IMAGE_INFO" to i8*)], section "llvm.metadata"
+
+; A simple loop. Eliminate the retain and release inside of it!
+
+; CHECK: define void @loop
+; CHECK: for.body:
+; CHECK-NOT: @objc_
+; CHECK: @objc_msgSend
+; CHECK-NOT: @objc_
+; CHECK: for.end:
+define void @loop(i8* %x, i64 %n) {
+entry:
+  %0 = tail call i8* @objc_retain(i8* %x) nounwind
+  %cmp9 = icmp sgt i64 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.010 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %1 = tail call i8* @objc_retain(i8* %x) nounwind
+  %tmp5 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
+  %call = tail call i8* (i8*, i8*, ...)* @objc_msgSend(i8* %1, i8* %tmp5)
+  tail call void @objc_release(i8* %1) nounwind, !clang.imprecise_release !0
+  %inc = add nsw i64 %i.010, 1
+  %exitcond = icmp eq i64 %inc, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  tail call void @objc_release(i8* %x) nounwind, !clang.imprecise_release !0
+  ret void
+}
+
+; ObjCARCOpt can delete the retain,release on self.
+
+; CHECK: define void @TextEditTest
+; CHECK-NOT: call i8* @objc_retain(i8* %tmp7)
+; CHECK: }
+
+%0 = type { i8* (i8*, %struct._message_ref_t*, ...)*, i8* }
+%1 = type opaque
+%2 = type opaque
+%3 = type opaque
+%4 = type opaque
+%5 = type opaque
+%struct.NSConstantString = type { i32*, i32, i8*, i64 }
+%struct._NSRange = type { i64, i64 }
+%struct.__CFString = type opaque
+%struct.__method_list_t = type { i32, i32, [0 x %struct._objc_method] }
+%struct._class_ro_t = type { i32, i32, i32, i8*, i8*, %struct.__method_list_t*, %struct._objc_protocol_list*, %struct._ivar_list_t*, i8*, %struct._prop_list_t* }
+%struct._class_t = type { %struct._class_t*, %struct._class_t*, %struct._objc_cache*, i8* (i8*, i8*)**, %struct._class_ro_t* }
+%struct._ivar_list_t = type { i32, i32, [0 x %struct._ivar_t] }
+%struct._ivar_t = type { i64*, i8*, i8*, i32, i32 }
+%struct._message_ref_t = type { i8*, i8* }
+%struct._objc_cache = type opaque
+%struct._objc_method = type { i8*, i8*, i8* }
+%struct._objc_protocol_list = type { i64, [0 x %struct._protocol_t*] }
+%struct._prop_list_t = type { i32, i32, [0 x %struct._message_ref_t] }
+%struct._protocol_t = type { i8*, i8*, %struct._objc_protocol_list*, %struct.__method_list_t*, %struct.__method_list_t*, %struct.__method_list_t*, %struct.__method_list_t*, %struct._prop_list_t*, i32, i32 }
+
+@"\01L_OBJC_CLASSLIST_REFERENCES_$_17" = external hidden global %struct._class_t*, section "__DATA, __objc_classrefs, regular, no_dead_strip", align 8
+@kUTTypePlainText = external constant %struct.__CFString*
+@"\01L_OBJC_SELECTOR_REFERENCES_19" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_21" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_23" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_25" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_CLASSLIST_REFERENCES_$_26" = external hidden global %struct._class_t*, section "__DATA, __objc_classrefs, regular, no_dead_strip", align 8
+@"\01L_OBJC_SELECTOR_REFERENCES_28" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_CLASSLIST_REFERENCES_$_29" = external hidden global %struct._class_t*, section "__DATA, __objc_classrefs, regular, no_dead_strip", align 8
+@"\01L_OBJC_SELECTOR_REFERENCES_31" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_33" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_35" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_37" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_CLASSLIST_REFERENCES_$_38" = external hidden global %struct._class_t*, section "__DATA, __objc_classrefs, regular, no_dead_strip", align 8
+@"\01L_OBJC_SELECTOR_REFERENCES_40" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_42" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@_unnamed_cfstring_44 = external hidden constant %struct.NSConstantString, section "__DATA,__cfstring"
+@"\01L_OBJC_SELECTOR_REFERENCES_46" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_48" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01l_objc_msgSend_fixup_isEqual_" = external hidden global %0, section "__DATA, __objc_msgrefs, coalesced", align 16
+@"\01L_OBJC_CLASSLIST_REFERENCES_$_50" = external hidden global %struct._class_t*, section "__DATA, __objc_classrefs, regular, no_dead_strip", align 8
+@NSCocoaErrorDomain = external constant %1*
+@"\01L_OBJC_CLASSLIST_REFERENCES_$_51" = external hidden global %struct._class_t*, section "__DATA, __objc_classrefs, regular, no_dead_strip", align 8
+@NSFilePathErrorKey = external constant %1*
+@"\01L_OBJC_SELECTOR_REFERENCES_53" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_55" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_CLASSLIST_REFERENCES_$_56" = external hidden global %struct._class_t*, section "__DATA, __objc_classrefs, regular, no_dead_strip", align 8
+@"\01L_OBJC_SELECTOR_REFERENCES_58" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_60" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+
+declare %1* @truncatedString(%1*, i64)
+define void @TextEditTest(%2* %self, %3* %pboard) {
+entry:
+  %err = alloca %4*, align 8
+  %tmp7 = bitcast %2* %self to i8*
+  %tmp8 = call i8* @objc_retain(i8* %tmp7) nounwind
+  store %4* null, %4** %err, align 8
+  %tmp1 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_17", align 8
+  %tmp2 = load %struct.__CFString** @kUTTypePlainText, align 8
+  %tmp3 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_19", align 8
+  %tmp4 = bitcast %struct._class_t* %tmp1 to i8*
+  %call5 = call i8* (i8*, i8*, ...)* @objc_msgSend(i8* %tmp4, i8* %tmp3, %struct.__CFString* %tmp2)
+  %tmp5 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_21", align 8
+  %tmp6 = bitcast %3* %pboard to i8*
+  %call76 = call i8* (i8*, i8*, ...)* @objc_msgSend(i8* %tmp6, i8* %tmp5, i8* %call5)
+  %tmp9 = call i8* @objc_retain(i8* %call76) nounwind
+  %tobool = icmp eq i8* %tmp9, null
+  br i1 %tobool, label %end, label %land.lhs.true
+
+land.lhs.true:                                    ; preds = %entry
+  %tmp11 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_23", align 8
+  %call137 = call i8* (i8*, i8*, ...)* @objc_msgSend(i8* %tmp6, i8* %tmp11, i8* %tmp9)
+  %tmp = bitcast i8* %call137 to %1*
+  %tmp10 = call i8* @objc_retain(i8* %call137) nounwind
+  call void @objc_release(i8* null) nounwind
+  %tmp12 = call i8* @objc_retain(i8* %call137) nounwind
+  call void @objc_release(i8* null) nounwind
+  %tobool16 = icmp eq i8* %call137, null
+  br i1 %tobool16, label %end, label %if.then
+
+if.then:                                          ; preds = %land.lhs.true
+  %tmp19 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_25", align 8
+  %call21 = call signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*)*)(i8* %call137, i8* %tmp19)
+  %tobool22 = icmp eq i8 %call21, 0
+  br i1 %tobool22, label %if.then44, label %land.lhs.true23
+
+land.lhs.true23:                                  ; preds = %if.then
+  %tmp24 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_26", align 8
+  %tmp26 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_28", align 8
+  %tmp27 = bitcast %struct._class_t* %tmp24 to i8*
+  %call2822 = call i8* (i8*, i8*, ...)* @objc_msgSend(i8* %tmp27, i8* %tmp26, i8* %call137)
+  %tmp13 = bitcast i8* %call2822 to %5*
+  %tmp14 = call i8* @objc_retain(i8* %call2822) nounwind
+  call void @objc_release(i8* null) nounwind
+  %tobool30 = icmp eq i8* %call2822, null
+  br i1 %tobool30, label %if.then44, label %if.end
+
+if.end:                                           ; preds = %land.lhs.true23
+  %tmp32 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_29", align 8
+  %tmp33 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_31", align 8
+  %tmp34 = bitcast %struct._class_t* %tmp32 to i8*
+  %call35 = call i8* (i8*, i8*, ...)* @objc_msgSend(i8* %tmp34, i8* %tmp33)
+  %tmp37 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_33", align 8
+  %call3923 = call i8* (i8*, i8*, ...)* @objc_msgSend(i8* %call35, i8* %tmp37, i8* %call2822, i32 signext 1, %4** %err)
+  %cmp = icmp eq i8* %call3923, null
+  br i1 %cmp, label %if.then44, label %end
+
+if.then44:                                        ; preds = %if.end, %land.lhs.true23, %if.then
+  %url.025 = phi %5* [ %tmp13, %if.end ], [ %tmp13, %land.lhs.true23 ], [ null, %if.then ]
+  %tmp49 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_35", align 8
+  %call51 = call %struct._NSRange bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %struct._NSRange (i8*, i8*, i64, i64)*)(i8* %call137, i8* %tmp49, i64 0, i64 0)
+  %call513 = extractvalue %struct._NSRange %call51, 0
+  %call514 = extractvalue %struct._NSRange %call51, 1
+  %tmp52 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_37", align 8
+  %call548 = call i8* (i8*, i8*, ...)* @objc_msgSend(i8* %call137, i8* %tmp52, i64 %call513, i64 %call514)
+  %tmp55 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_38", align 8
+  %tmp56 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_40", align 8
+  %tmp57 = bitcast %struct._class_t* %tmp55 to i8*
+  %call58 = call i8* (i8*, i8*, ...)* @objc_msgSend(i8* %tmp57, i8* %tmp56)
+  %tmp59 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_42", align 8
+  %call6110 = call i8* (i8*, i8*, ...)* @objc_msgSend(i8* %call548, i8* %tmp59, i8* %call58)
+  %tmp15 = call i8* @objc_retain(i8* %call6110) nounwind
+  call void @objc_release(i8* %call137) nounwind
+  %tmp64 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_46", align 8
+  %call66 = call signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*, %1*)*)(i8* %call6110, i8* %tmp64, %1* bitcast (%struct.NSConstantString* @_unnamed_cfstring_44 to %1*))
+  %tobool67 = icmp eq i8 %call66, 0
+  br i1 %tobool67, label %if.end74, label %if.then68
+
+if.then68:                                        ; preds = %if.then44
+  %tmp70 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_48", align 8
+  %call7220 = call i8* (i8*, i8*, ...)* @objc_msgSend(i8* %call6110, i8* %tmp70)
+  %tmp16 = call i8* @objc_retain(i8* %call7220) nounwind
+  call void @objc_release(i8* %call6110) nounwind
+  br label %if.end74
+
+if.end74:                                         ; preds = %if.then68, %if.then44
+  %filename.0.in = phi i8* [ %call7220, %if.then68 ], [ %call6110, %if.then44 ]
+  %filename.0 = bitcast i8* %filename.0.in to %1*
+  %tmp17 = load i8** bitcast (%0* @"\01l_objc_msgSend_fixup_isEqual_" to i8**), align 16
+  %tmp18 = bitcast i8* %tmp17 to i8 (i8*, %struct._message_ref_t*, i8*, ...)*
+  %call78 = call signext i8 (i8*, %struct._message_ref_t*, i8*, ...)* %tmp18(i8* %call137, %struct._message_ref_t* bitcast (%0* @"\01l_objc_msgSend_fixup_isEqual_" to %struct._message_ref_t*), i8* %filename.0.in)
+  %tobool79 = icmp eq i8 %call78, 0
+  br i1 %tobool79, label %land.lhs.true80, label %if.then109
+
+land.lhs.true80:                                  ; preds = %if.end74
+  %tmp82 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_25", align 8
+  %call84 = call signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*)*)(i8* %filename.0.in, i8* %tmp82)
+  %tobool86 = icmp eq i8 %call84, 0
+  br i1 %tobool86, label %if.then109, label %if.end106
+
+if.end106:                                        ; preds = %land.lhs.true80
+  %tmp88 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_26", align 8
+  %tmp90 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_28", align 8
+  %tmp91 = bitcast %struct._class_t* %tmp88 to i8*
+  %call9218 = call i8* (i8*, i8*, ...)* @objc_msgSend(i8* %tmp91, i8* %tmp90, i8* %filename.0.in)
+  %tmp20 = bitcast i8* %call9218 to %5*
+  %tmp21 = call i8* @objc_retain(i8* %call9218) nounwind
+  %tmp22 = bitcast %5* %url.025 to i8*
+  call void @objc_release(i8* %tmp22) nounwind
+  %tmp94 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_29", align 8
+  %tmp95 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_31", align 8
+  %tmp96 = bitcast %struct._class_t* %tmp94 to i8*
+  %call97 = call i8* (i8*, i8*, ...)* @objc_msgSend(i8* %tmp96, i8* %tmp95)
+  %tmp99 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_33", align 8
+  %call10119 = call i8* (i8*, i8*, ...)* @objc_msgSend(i8* %call97, i8* %tmp99, i8* %call9218, i32 signext 1, %4** %err)
+  %phitmp = icmp eq i8* %call10119, null
+  br i1 %phitmp, label %if.then109, label %end
+
+if.then109:                                       ; preds = %if.end106, %land.lhs.true80, %if.end74
+  %url.129 = phi %5* [ %tmp20, %if.end106 ], [ %url.025, %if.end74 ], [ %url.025, %land.lhs.true80 ]
+  %tmp110 = load %4** %err, align 8
+  %tobool111 = icmp eq %4* %tmp110, null
+  br i1 %tobool111, label %if.then112, label %if.end125
+
+if.then112:                                       ; preds = %if.then109
+  %tmp113 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_50", align 8
+  %tmp114 = load %1** @NSCocoaErrorDomain, align 8
+  %tmp115 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_51", align 8
+  %call117 = call %1* @truncatedString(%1* %filename.0, i64 1034)
+  %tmp118 = load %1** @NSFilePathErrorKey, align 8
+  %tmp119 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_53", align 8
+  %tmp120 = bitcast %struct._class_t* %tmp115 to i8*
+  %call12113 = call i8* (i8*, i8*, ...)* @objc_msgSend(i8* %tmp120, i8* %tmp119, %1* %call117, %1* %tmp118, i8* null)
+  %tmp122 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_55", align 8
+  %tmp123 = bitcast %struct._class_t* %tmp113 to i8*
+  %call12414 = call i8* (i8*, i8*, ...)* @objc_msgSend(i8* %tmp123, i8* %tmp122, %1* %tmp114, i64 258, i8* %call12113)
+  %tmp23 = call i8* @objc_retain(i8* %call12414) nounwind
+  %tmp25 = call i8* @objc_autorelease(i8* %tmp23) nounwind
+  %tmp28 = bitcast i8* %tmp25 to %4*
+  store %4* %tmp28, %4** %err, align 8
+  br label %if.end125
+
+if.end125:                                        ; preds = %if.then112, %if.then109
+  %tmp127 = phi %4* [ %tmp110, %if.then109 ], [ %tmp28, %if.then112 ]
+  %tmp126 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_56", align 8
+  %tmp128 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_58", align 8
+  %tmp129 = bitcast %struct._class_t* %tmp126 to i8*
+  %call13015 = call i8* (i8*, i8*, ...)* @objc_msgSend(i8* %tmp129, i8* %tmp128, %4* %tmp127)
+  %tmp131 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_60", align 8
+  %call13317 = call i8* (i8*, i8*, ...)* @objc_msgSend(i8* %call13015, i8* %tmp131)
+  br label %end
+
+end:                                              ; preds = %if.end125, %if.end106, %if.end, %land.lhs.true, %entry
+  %filename.2 = phi %1* [ %filename.0, %if.end106 ], [ %filename.0, %if.end125 ], [ %tmp, %land.lhs.true ], [ null, %entry ], [ %tmp, %if.end ]
+  %origFilename.0 = phi %1* [ %tmp, %if.end106 ], [ %tmp, %if.end125 ], [ %tmp, %land.lhs.true ], [ null, %entry ], [ %tmp, %if.end ]
+  %url.2 = phi %5* [ %tmp20, %if.end106 ], [ %url.129, %if.end125 ], [ null, %land.lhs.true ], [ null, %entry ], [ %tmp13, %if.end ]
+  call void @objc_release(i8* %tmp9) nounwind, !clang.imprecise_release !0
+  %tmp29 = bitcast %5* %url.2 to i8*
+  call void @objc_release(i8* %tmp29) nounwind, !clang.imprecise_release !0
+  %tmp30 = bitcast %1* %origFilename.0 to i8*
+  call void @objc_release(i8* %tmp30) nounwind, !clang.imprecise_release !0
+  %tmp31 = bitcast %1* %filename.2 to i8*
+  call void @objc_release(i8* %tmp31) nounwind, !clang.imprecise_release !0
+  call void @objc_release(i8* %tmp7) nounwind, !clang.imprecise_release !0
+  ret void
+}
+
+!0 = metadata !{}
diff --git a/test/Transforms/ObjCARC/cfg-hazards.ll b/test/Transforms/ObjCARC/cfg-hazards.ll
new file mode 100644
index 0000000..e3624df
--- /dev/null
+++ b/test/Transforms/ObjCARC/cfg-hazards.ll
@@ -0,0 +1,86 @@
+; RUN: opt -S -objc-arc < %s | FileCheck %s
+; rdar://9503416
+
+; Detect loop boundaries and don't move retains and releases
+; across them.
+
+declare void @use_pointer(i8*)
+declare i8* @objc_retain(i8*)
+declare void @objc_release(i8*)
+
+; CHECK: define void @test0(
+; CHECK:   call i8* @objc_retain(
+; CHECK: for.body:
+; CHECK-NOT: @objc
+; CHECK: for.end:
+; CHECK:   call void @objc_release(
+; CHECK: }
+define void @test0(i8* %digits) {
+entry:
+  %tmp1 = call i8* @objc_retain(i8* %digits) nounwind
+  call void @use_pointer(i8* %tmp1)
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %upcDigitIndex.01 = phi i64 [ 2, %entry ], [ %inc, %for.body ]
+  call void @use_pointer(i8* %tmp1)
+  %inc = add i64 %upcDigitIndex.01, 1
+  %cmp = icmp ult i64 %inc, 12
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  call void @objc_release(i8* %tmp1) nounwind, !clang.imprecise_release !0
+  ret void
+}
+
+; CHECK: define void @test1(
+; CHECK:   call i8* @objc_retain(
+; CHECK: for.body:
+; CHECK-NOT: @objc
+; CHECK: for.end:
+; CHECK:   void @objc_release(
+; CHECK: }
+define void @test1(i8* %digits) {
+entry:
+  %tmp1 = call i8* @objc_retain(i8* %digits) nounwind
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %upcDigitIndex.01 = phi i64 [ 2, %entry ], [ %inc, %for.body ]
+  call void @use_pointer(i8* %tmp1)
+  call void @use_pointer(i8* %tmp1)
+  %inc = add i64 %upcDigitIndex.01, 1
+  %cmp = icmp ult i64 %inc, 12
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  call void @objc_release(i8* %tmp1) nounwind, !clang.imprecise_release !0
+  ret void
+}
+
+; CHECK: define void @test2(
+; CHECK:   call i8* @objc_retain(
+; CHECK: for.body:
+; CHECK-NOT: @objc
+; CHECK: for.end:
+; CHECK:   void @objc_release(
+; CHECK: }
+define void @test2(i8* %digits) {
+entry:
+  %tmp1 = call i8* @objc_retain(i8* %digits) nounwind
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %upcDigitIndex.01 = phi i64 [ 2, %entry ], [ %inc, %for.body ]
+  call void @use_pointer(i8* %tmp1)
+  %inc = add i64 %upcDigitIndex.01, 1
+  %cmp = icmp ult i64 %inc, 12
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  call void @use_pointer(i8* %tmp1)
+  call void @objc_release(i8* %tmp1) nounwind, !clang.imprecise_release !0
+  ret void
+}
+
+!0 = metadata !{}
diff --git a/test/Transforms/ObjCARC/contract-marker.ll b/test/Transforms/ObjCARC/contract-marker.ll
new file mode 100644
index 0000000..01d978a
--- /dev/null
+++ b/test/Transforms/ObjCARC/contract-marker.ll
@@ -0,0 +1,23 @@
+; RUN: opt -S -objc-arc-contract < %s | FileCheck %s
+
+; CHECK:      %call = tail call i32* @qux()
+; CHECK-NEXT: %tcall = bitcast i32* %call to i8*
+; CHECK-NEXT: call void asm sideeffect "mov\09r7, r7\09\09@ marker for objc_retainAutoreleaseReturnValue", ""()
+; CHECK-NEXT: %0 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %tcall) nounwind
+
+define void @foo() {
+entry:
+  %call = tail call i32* @qux()
+  %tcall = bitcast i32* %call to i8*
+  %0 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %tcall) nounwind
+  tail call void @bar(i8* %0)
+  ret void
+}
+
+declare i32* @qux()
+declare i8* @objc_retainAutoreleasedReturnValue(i8*)
+declare void @bar(i8*)
+
+!clang.arc.retainAutoreleasedReturnValueMarker = !{!0}
+
+!0 = metadata !{metadata !"mov\09r7, r7\09\09@ marker for objc_retainAutoreleaseReturnValue"}
diff --git a/test/Transforms/ObjCARC/contract-storestrong-ivar.ll b/test/Transforms/ObjCARC/contract-storestrong-ivar.ll
new file mode 100644
index 0000000..4ad78e7
--- /dev/null
+++ b/test/Transforms/ObjCARC/contract-storestrong-ivar.ll
@@ -0,0 +1,31 @@
+; RUN: opt -objc-arc-contract -S < %s | FileCheck %s
+
+; CHECK: call void @objc_storeStrong(i8**
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin11.0.0"
+
+%0 = type opaque
+%1 = type opaque
+
+@"OBJC_IVAR_$_Controller.preferencesController" = external global i64, section "__DATA, __objc_const", align 8
+
+declare i8* @objc_retain(i8*)
+
+declare void @objc_release(i8*)
+
+define hidden void @y(%0* nocapture %self, %1* %preferencesController) nounwind {
+entry:
+  %ivar = load i64* @"OBJC_IVAR_$_Controller.preferencesController", align 8
+  %tmp = bitcast %0* %self to i8*
+  %add.ptr = getelementptr inbounds i8* %tmp, i64 %ivar
+  %tmp1 = bitcast i8* %add.ptr to %1**
+  %tmp2 = load %1** %tmp1, align 8
+  %tmp3 = bitcast %1* %preferencesController to i8*
+  %tmp4 = tail call i8* @objc_retain(i8* %tmp3) nounwind
+  %tmp5 = bitcast %1* %tmp2 to i8*
+  tail call void @objc_release(i8* %tmp5) nounwind
+  %tmp6 = bitcast i8* %tmp4 to %1*
+  store %1* %tmp6, %1** %tmp1, align 8
+  ret void
+}
diff --git a/test/Transforms/ObjCARC/contract-storestrong.ll b/test/Transforms/ObjCARC/contract-storestrong.ll
new file mode 100644
index 0000000..50ed260
--- /dev/null
+++ b/test/Transforms/ObjCARC/contract-storestrong.ll
@@ -0,0 +1,59 @@
+; RUN: opt -objc-arc-contract -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64"
+
+declare i8* @objc_retain(i8*)
+declare void @objc_release(i8*)
+
+@x = external global i8*
+
+; CHECK: define void @test0(
+; CHECK: entry:
+; CHECK-NEXT: call void @objc_storeStrong(i8** @x, i8* %p) nounwind
+; CHECK-NEXT: ret void
+define void @test0(i8* %p) {
+entry:
+  %0 = tail call i8* @objc_retain(i8* %p) nounwind
+  %tmp = load i8** @x, align 8
+  store i8* %0, i8** @x, align 8
+  tail call void @objc_release(i8* %tmp) nounwind
+  ret void
+}
+
+; Don't do this if the load is volatile.
+
+;      CHECK: define void @test1(i8* %p) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   %0 = tail call i8* @objc_retain(i8* %p) nounwind
+; CHECK-NEXT:   %tmp = volatile load i8** @x, align 8
+; CHECK-NEXT:   store i8* %0, i8** @x, align 8
+; CHECK-NEXT:   tail call void @objc_release(i8* %tmp) nounwind
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }
+define void @test1(i8* %p) {
+entry:
+  %0 = tail call i8* @objc_retain(i8* %p) nounwind
+  %tmp = volatile load i8** @x, align 8
+  store i8* %0, i8** @x, align 8
+  tail call void @objc_release(i8* %tmp) nounwind
+  ret void
+}
+
+; Don't do this if the store is volatile.
+
+;      CHECK: define void @test2(i8* %p) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   %0 = tail call i8* @objc_retain(i8* %p) nounwind
+; CHECK-NEXT:   %tmp = load i8** @x, align 8
+; CHECK-NEXT:   volatile store i8* %0, i8** @x, align 8
+; CHECK-NEXT:   tail call void @objc_release(i8* %tmp) nounwind
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }
+define void @test2(i8* %p) {
+entry:
+  %0 = tail call i8* @objc_retain(i8* %p) nounwind
+  %tmp = load i8** @x, align 8
+  volatile store i8* %0, i8** @x, align 8
+  tail call void @objc_release(i8* %tmp) nounwind
+  ret void
+}
diff --git a/test/Transforms/ObjCARC/contract-testcases.ll b/test/Transforms/ObjCARC/contract-testcases.ll
new file mode 100644
index 0000000..69fa837
--- /dev/null
+++ b/test/Transforms/ObjCARC/contract-testcases.ll
@@ -0,0 +1,63 @@
+; RUN: opt -objc-arc-contract -S < %s | FileCheck %s
+; rdar://9511608
+
+%0 = type opaque
+%1 = type opaque
+%2 = type { i64, i64 }
+%3 = type { i8*, i8* }
+%4 = type opaque
+
+declare %0* @"\01-[NSAttributedString(Terminal) pathAtIndex:effectiveRange:]"(%1*, i8* nocapture, i64, %2*) optsize
+declare i8* @objc_retainAutoreleasedReturnValue(i8*)
+declare i8* @objc_msgSend_fixup(i8*, %3*, ...)
+declare void @objc_release(i8*)
+declare %2 @NSUnionRange(i64, i64, i64, i64) optsize
+declare i8* @objc_autoreleaseReturnValue(i8*)
+declare i8* @objc_autorelease(i8*)
+declare i8* @objc_msgSend() nonlazybind
+
+; Don't get in trouble on bugpointed code.
+
+; CHECK: define void @test0(
+define void @test0() {
+bb:
+  %tmp = bitcast %4* undef to i8*
+  %tmp1 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %tmp) nounwind
+  br label %bb3
+
+bb3:                                              ; preds = %bb2
+  br i1 undef, label %bb6, label %bb4
+
+bb4:                                              ; preds = %bb3
+  switch i64 undef, label %bb5 [
+    i64 9223372036854775807, label %bb6
+    i64 0, label %bb6
+  ]
+
+bb5:                                              ; preds = %bb4
+  br label %bb6
+
+bb6:                                              ; preds = %bb5, %bb4, %bb4, %bb3
+  %tmp7 = phi %4* [ undef, %bb5 ], [ undef, %bb4 ], [ undef, %bb3 ], [ undef, %bb4 ]
+  unreachable
+}
+
+; When rewriting operands for a phi which has multiple operands
+; for the same block, use the exactly same value in each block.
+
+; CHECK: define void @test1(
+; CHECK: %0 = bitcast i8* %tmp3 to %0* 
+; CHECK: br i1 undef, label %bb7, label %bb7
+; CHECK: bb7:
+; CHECK: %tmp8 = phi %0* [ %0, %bb ], [ %0, %bb ]
+define void @test1() {
+bb:
+  %tmp = tail call %0* bitcast (i8* ()* @objc_msgSend to %0* ()*)()
+  %tmp2 = bitcast %0* %tmp to i8*
+  %tmp3 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %tmp2) nounwind
+  br i1 undef, label %bb7, label %bb7
+
+bb7:                                              ; preds = %bb6, %bb6, %bb5
+  %tmp8 = phi %0* [ %tmp, %bb ], [ %tmp, %bb ]
+  unreachable
+}
diff --git a/test/Transforms/ObjCARC/contract.ll b/test/Transforms/ObjCARC/contract.ll
new file mode 100644
index 0000000..04ae3ca
--- /dev/null
+++ b/test/Transforms/ObjCARC/contract.ll
@@ -0,0 +1,145 @@
+; RUN: opt -objc-arc-contract -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64"
+
+declare i8* @objc_retain(i8*)
+declare void @objc_release(i8*)
+declare i8* @objc_autorelease(i8*)
+declare i8* @objc_autoreleaseReturnValue(i8*)
+declare i8* @objc_retainAutoreleasedReturnValue(i8*)
+
+declare void @use_pointer(i8*)
+declare i8* @returner()
+
+; CHECK: define void @test0
+; CHECK: call void @use_pointer(i8* %0)
+; CHECK: }
+define void @test0(i8* %x) nounwind {
+entry:
+  %0 = call i8* @objc_retain(i8* %x) nounwind
+  call void @use_pointer(i8* %x)
+  ret void
+}
+
+; CHECK: define void @test1
+; CHECK: call void @use_pointer(i8* %0)
+; CHECK: }
+define void @test1(i8* %x) nounwind {
+entry:
+  %0 = call i8* @objc_autorelease(i8* %x) nounwind
+  call void @use_pointer(i8* %x)
+  ret void
+}
+
+; Merge objc_retain and objc_autorelease into objc_retainAutorelease.
+
+; CHECK: define void @test2(
+; CHECK: tail call i8* @objc_retainAutorelease(i8* %x) nounwind
+; CHECK: }
+define void @test2(i8* %x) nounwind {
+entry:
+  %0 = tail call i8* @objc_retain(i8* %x) nounwind
+  tail call i8* @objc_autorelease(i8* %0) nounwind
+  call void @use_pointer(i8* %x)
+  ret void
+}
+
+; Same as test2 but the value is returned. Do an RV optimization.
+
+; CHECK: define i8* @test2b(
+; CHECK: tail call i8* @objc_retainAutoreleaseReturnValue(i8* %x) nounwind
+; CHECK: }
+define i8* @test2b(i8* %x) nounwind {
+entry:
+  %0 = tail call i8* @objc_retain(i8* %x) nounwind
+  tail call i8* @objc_autoreleaseReturnValue(i8* %0) nounwind
+  ret i8* %x
+}
+
+; Merge a retain,autorelease pair around a call.
+
+; CHECK: define void @test3(
+; CHECK: tail call i8* @objc_retainAutorelease(i8* %x) nounwind
+; CHECK: @use_pointer(i8* %0)
+; CHECK: }
+define void @test3(i8* %x, i64 %n) {
+entry:
+  tail call i8* @objc_retain(i8* %x) nounwind
+  call void @use_pointer(i8* %x)
+  tail call i8* @objc_autorelease(i8* %x) nounwind
+  ret void
+}
+
+; Trivial retain,autorelease pair with intervening call, but it's post-dominated
+; by another release. The retain and autorelease can be merged.
+
+; CHECK: define void @test4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: @objc_retainAutorelease(i8* %x) nounwind
+; CHECK-NEXT: @use_pointer
+; CHECK-NEXT: @objc_release
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+define void @test4(i8* %x, i64 %n) {
+entry:
+  tail call i8* @objc_retain(i8* %x) nounwind
+  call void @use_pointer(i8* %x)
+  tail call i8* @objc_autorelease(i8* %x) nounwind
+  tail call void @objc_release(i8* %x) nounwind
+  ret void
+}
+
+; Don't merge retain and autorelease if they're not control-equivalent.
+
+; CHECK: define void @test5(
+; CHECK: tail call i8* @objc_retain(i8* %p) nounwind
+; CHECK: true:
+; CHECK: tail call i8* @objc_autorelease(i8* %0) nounwind
+; CHECK: }
+define void @test5(i8* %p, i1 %a) {
+entry:
+  tail call i8* @objc_retain(i8* %p) nounwind
+  br i1 %a, label %true, label %false
+
+true:
+  tail call i8* @objc_autorelease(i8* %p) nounwind
+  call void @use_pointer(i8* %p)
+  ret void
+
+false:
+  ret void
+}
+
+; Don't eliminate objc_retainAutoreleasedReturnValue by merging it into
+; an objc_autorelease.
+; TODO? Merge objc_retainAutoreleasedReturnValue and objc_autorelease into
+; objc_retainAutoreleasedReturnValueAutorelease and merge
+; objc_retainAutoreleasedReturnValue and objc_autoreleaseReturnValue
+; into objc_retainAutoreleasedReturnValueAutoreleaseReturnValue?
+; Those entrypoints don't exist yet though.
+
+; CHECK: define i8* @test6(
+; CHECK: call i8* @objc_retainAutoreleasedReturnValue(i8* %p) nounwind
+; CHECK: %t = tail call i8* @objc_autoreleaseReturnValue(i8* %1) nounwind
+; CHECK: }
+define i8* @test6() {
+  %p = call i8* @returner()
+  tail call i8* @objc_retainAutoreleasedReturnValue(i8* %p) nounwind
+  %t = tail call i8* @objc_autoreleaseReturnValue(i8* %p) nounwind
+  call void @use_pointer(i8* %t)
+  ret i8* %t
+}
+
+; Don't spoil the RV optimization.
+
+; CHECK: define i8* @test7(i8* %p)
+; CHECK: tail call i8* @objc_retain(i8* %p)
+; CHECK: call void @use_pointer(i8* %1)
+; CHECK: tail call i8* @objc_autoreleaseReturnValue(i8* %1)
+; CHECK: ret i8* %2
+define i8* @test7(i8* %p) {
+  %1 = tail call i8* @objc_retain(i8* %p)
+  call void @use_pointer(i8* %p)
+  %2 = tail call i8* @objc_autoreleaseReturnValue(i8* %p)
+  ret i8* %p
+}
diff --git a/test/Transforms/SRETPromotion/dg.exp b/test/Transforms/ObjCARC/dg.exp
index f200589..f200589 100644
--- a/test/Transforms/SRETPromotion/dg.exp
+++ b/test/Transforms/ObjCARC/dg.exp
diff --git a/test/Transforms/ObjCARC/expand.ll b/test/Transforms/ObjCARC/expand.ll
new file mode 100644
index 0000000..5388673
--- /dev/null
+++ b/test/Transforms/ObjCARC/expand.ll
@@ -0,0 +1,28 @@
+; RUN: opt -objc-arc-expand -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64"
+
+declare i8* @objc_retain(i8*)
+declare i8* @objc_autorelease(i8*)
+
+declare void @use_pointer(i8*)
+
+; CHECK: define void @test0
+; CHECK: call void @use_pointer(i8* %x)
+; CHECK: }
+define void @test0(i8* %x) nounwind {
+entry:
+  %0 = call i8* @objc_retain(i8* %x) nounwind
+  call void @use_pointer(i8* %0)
+  ret void
+}
+
+; CHECK: define void @test1
+; CHECK: call void @use_pointer(i8* %x)
+; CHECK: }
+define void @test1(i8* %x) nounwind {
+entry:
+  %0 = call i8* @objc_autorelease(i8* %x) nounwind
+  call void @use_pointer(i8* %x)
+  ret void
+}
diff --git a/test/Transforms/ObjCARC/gvn.ll b/test/Transforms/ObjCARC/gvn.ll
new file mode 100644
index 0000000..6917b02
--- /dev/null
+++ b/test/Transforms/ObjCARC/gvn.ll
@@ -0,0 +1,21 @@
+; RUN: opt -S -basicaa -objc-arc -gvn < %s | FileCheck %s
+
+@x = common global i8* null, align 8
+
+declare i8* @objc_retain(i8*)
+
+; GVN should be able to eliminate this redundant load, with ARC-specific
+; alias analysis.
+
+; CHECK: @foo
+; CHECK-NEXT: entry:
+; CHECK-NEXT: %s = load i8** @x
+; CHECK-NOT: load
+; CHECK: ret i8* %s
+define i8* @foo(i32 %n) nounwind {
+entry:
+  %s = load i8** @x
+  %0 = tail call i8* @objc_retain(i8* %s) nounwind
+  %t = load i8** @x
+  ret i8* %s
+}
diff --git a/test/Transforms/ObjCARC/invoke.ll b/test/Transforms/ObjCARC/invoke.ll
new file mode 100644
index 0000000..a1b87d2
--- /dev/null
+++ b/test/Transforms/ObjCARC/invoke.ll
@@ -0,0 +1,67 @@
+; RUN: opt -S -objc-arc < %s | FileCheck %s
+
+declare i8* @objc_retain(i8*)
+declare void @objc_release(i8*)
+declare i8* @objc_msgSend(i8*, i8*, ...)
+declare void @use_pointer(i8*)
+declare void @callee()
+
+; ARCOpt shouldn't try to move the releases to the block containing the invoke.
+
+; CHECK: define void @test0(
+; CHECK: invoke.cont:
+; CHECK:   call void @objc_release(i8* %zipFile) nounwind, !clang.imprecise_release !0
+; CHECK:   ret void
+; CHECK: lpad:
+; CHECK:   call void @objc_release(i8* %zipFile) nounwind, !clang.imprecise_release !0
+; CHECK:   ret void
+define void @test0(i8* %zipFile) {
+entry:
+  call i8* @objc_retain(i8* %zipFile) nounwind
+  call void @use_pointer(i8* %zipFile)
+  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*)*)(i8* %zipFile) 
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %entry
+  call void @objc_release(i8* %zipFile) nounwind, !clang.imprecise_release !0
+  ret void
+
+lpad:                                             ; preds = %entry
+  call void @objc_release(i8* %zipFile) nounwind, !clang.imprecise_release !0
+  ret void
+}
+
+; ARCOpt should move the release before the callee calls.
+
+; CHECK: define void @test1(
+; CHECK: invoke.cont:
+; CHECK:   call void @objc_release(i8* %zipFile) nounwind, !clang.imprecise_release !0
+; CHECK:   call void @callee()
+; CHECK:   br label %done
+; CHECK: lpad:
+; CHECK:   call void @objc_release(i8* %zipFile) nounwind, !clang.imprecise_release !0
+; CHECK:   call void @callee()
+; CHECK:   br label %done
+; CHECK: done:
+; CHECK-NEXT: ret void
+define void @test1(i8* %zipFile) {
+entry:
+  call i8* @objc_retain(i8* %zipFile) nounwind
+  call void @use_pointer(i8* %zipFile)
+  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*)*)(i8* %zipFile)
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %entry
+  call void @callee()
+  br label %done
+
+lpad:                                             ; preds = %entry
+  call void @callee()
+  br label %done
+
+done:
+  call void @objc_release(i8* %zipFile) nounwind, !clang.imprecise_release !0
+  ret void
+}
+
+!0 = metadata !{}
diff --git a/test/Transforms/ObjCARC/move-and-form-retain-autorelease.ll b/test/Transforms/ObjCARC/move-and-form-retain-autorelease.ll
new file mode 100644
index 0000000..170d0a9
--- /dev/null
+++ b/test/Transforms/ObjCARC/move-and-form-retain-autorelease.ll
@@ -0,0 +1,221 @@
+; RUN: opt -S -objc-arc-contract < %s | FileCheck %s
+
+; The optimizer should be able to move the autorelease past a control triangle
+; and various scary looking things and fold it into an objc_retainAutorelease.
+
+; CHECK: bb57:
+; CHECK: tail call i8* @objc_retainAutorelease(i8* %tmp71x) nounwind
+; CHECK: bb99:
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin11.0.0"
+
+%0 = type { i8* (i8*, %1*, ...)*, i8* }
+%1 = type { i8*, i8* }
+%2 = type { %2*, %2*, %3*, i8* (i8*, i8*)**, %4* }
+%3 = type opaque
+%4 = type { i32, i32, i32, i8*, i8*, %5*, %7*, %10*, i8*, %9* }
+%5 = type { i32, i32, [0 x %6] }
+%6 = type { i8*, i8*, i8* }
+%7 = type { i64, [0 x %8*] }
+%8 = type { i8*, i8*, %7*, %5*, %5*, %5*, %5*, %9*, i32, i32 }
+%9 = type { i32, i32, [0 x %1] }
+%10 = type { i32, i32, [0 x %11] }
+%11 = type { i64*, i8*, i8*, i32, i32 }
+%12 = type { i32*, i32, i8*, i64 }
+%13 = type opaque
+%14 = type opaque
+%15 = type opaque
+%16 = type opaque
+%17 = type opaque
+%18 = type opaque
+%19 = type opaque
+%20 = type opaque
+%21 = type opaque
+%22 = type opaque
+%23 = type opaque
+%24 = type opaque
+%25 = type opaque
+
+@"\01l_objc_msgSend_fixup_alloc" = external hidden global %0, section "__DATA, __objc_msgrefs, coalesced", align 16
+@"\01L_OBJC_SELECTOR_REFERENCES_8" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_3725" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_CLASSLIST_REFERENCES_$_40" = external hidden global %2*, section "__DATA, __objc_classrefs, regular, no_dead_strip", align 8
+@"\01L_OBJC_SELECTOR_REFERENCES_4227" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_4631" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_CLASSLIST_REFERENCES_$_70" = external hidden global %2*, section "__DATA, __objc_classrefs, regular, no_dead_strip", align 8
+@"\01L_OBJC_SELECTOR_REFERENCES_148" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_159" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_188" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_328" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01l_objc_msgSend_fixup_objectAtIndex_" = external hidden global %0, section "__DATA, __objc_msgrefs, coalesced", align 16
+@_unnamed_cfstring_386 = external hidden constant %12, section "__DATA,__cfstring"
+@"\01l_objc_msgSend_fixup_count" = external hidden global %0, section "__DATA, __objc_msgrefs, coalesced", align 16
+@"\01L_OBJC_SELECTOR_REFERENCES_389" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_391" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_393" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@NSPrintHeaderAndFooter = external constant %13*
+@"\01L_OBJC_SELECTOR_REFERENCES_395" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_CLASSLIST_REFERENCES_$_396" = external hidden global %2*, section "__DATA, __objc_classrefs, regular, no_dead_strip", align 8
+@"\01L_OBJC_SELECTOR_REFERENCES_398" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_400" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_402" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_404" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_406" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_408" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_CLASSLIST_REFERENCES_$_409" = external hidden global %2*, section "__DATA, __objc_classrefs, regular, no_dead_strip", align 8
+@"\01L_OBJC_SELECTOR_REFERENCES_411" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_413" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_415" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+
+declare i8* @objc_msgSend(i8*, i8*, ...)
+
+declare i8* @objc_retain(i8*)
+
+declare void @objc_release(i8*)
+
+declare i8* @objc_autorelease(i8*)
+
+declare i8* @objc_explicit_autorelease(i8*)
+
+define hidden %14* @foo(%15* %arg, %16* %arg2) {
+bb:
+  %tmp = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_3725", align 8
+  %tmp4 = bitcast %15* %arg to i8*
+  %tmp5 = tail call %18* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %18* (i8*, i8*)*)(i8* %tmp4, i8* %tmp)
+  %tmp6 = bitcast %18* %tmp5 to i8*
+  %tmp7 = tail call i8* @objc_retain(i8* %tmp6) nounwind
+  %tmp8 = load %2** @"\01L_OBJC_CLASSLIST_REFERENCES_$_40", align 8
+  %tmp9 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_4227", align 8
+  %tmp10 = bitcast %2* %tmp8 to i8*
+  %tmp11 = tail call %19* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %19* (i8*, i8*)*)(i8* %tmp10, i8* %tmp9)
+  %tmp12 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_4631", align 8
+  %tmp13 = bitcast %19* %tmp11 to i8*
+  %tmp14 = tail call signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*, %13*)*)(i8* %tmp13, i8* %tmp12, %13* bitcast (%12* @_unnamed_cfstring_386 to %13*))
+  %tmp15 = bitcast %16* %arg2 to i8*
+  %tmp16 = load i8** bitcast (%0* @"\01l_objc_msgSend_fixup_count" to i8**), align 16
+  %tmp17 = bitcast i8* %tmp16 to i64 (i8*, %1*)*
+  %tmp18 = tail call i64 %tmp17(i8* %tmp15, %1* bitcast (%0* @"\01l_objc_msgSend_fixup_count" to %1*))
+  %tmp19 = icmp eq i64 %tmp18, 0
+  br i1 %tmp19, label %bb22, label %bb20
+
+bb20:                                             ; preds = %bb
+  %tmp21 = icmp eq i8 %tmp14, 0
+  br label %bb25
+
+bb22:                                             ; preds = %bb
+  %tmp23 = bitcast i8* %tmp7 to %18*
+  %tmp24 = icmp eq i8 %tmp14, 0
+  br i1 %tmp24, label %bb46, label %bb25
+
+bb25:                                             ; preds = %bb22, %bb20
+  %tmp26 = phi i1 [ %tmp21, %bb20 ], [ false, %bb22 ]
+  %tmp27 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_188", align 8
+  %tmp28 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %tmp7, i8* %tmp27)
+  %tmp29 = tail call i8* @objc_explicit_autorelease(i8* %tmp28) nounwind
+  %tmp30 = bitcast i8* %tmp29 to %18*
+  tail call void @objc_release(i8* %tmp7) nounwind
+  %tmp31 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_389", align 8
+  %tmp32 = tail call %20* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %20* (i8*, i8*)*)(i8* %tmp29, i8* %tmp31)
+  %tmp33 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_391", align 8
+  %tmp34 = bitcast %20* %tmp32 to i8*
+  tail call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, %16*)*)(i8* %tmp34, i8* %tmp33, %16* %arg2)
+  br i1 %tmp26, label %bb46, label %bb35
+
+bb35:                                             ; preds = %bb25
+  %tmp36 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_389", align 8
+  %tmp37 = tail call %20* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %20* (i8*, i8*)*)(i8* %tmp29, i8* %tmp36)
+  %tmp38 = load %2** @"\01L_OBJC_CLASSLIST_REFERENCES_$_70", align 8
+  %tmp39 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_393", align 8
+  %tmp40 = bitcast %2* %tmp38 to i8*
+  %tmp41 = tail call %21* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %21* (i8*, i8*, i8)*)(i8* %tmp40, i8* %tmp39, i8 signext 1)
+  %tmp42 = bitcast %21* %tmp41 to i8*
+  %tmp43 = load %13** @NSPrintHeaderAndFooter, align 8
+  %tmp44 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_159", align 8
+  %tmp45 = bitcast %20* %tmp37 to i8*
+  tail call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, %13*)*)(i8* %tmp45, i8* %tmp44, i8* %tmp42, %13* %tmp43)
+  br label %bb46
+
+bb46:                                             ; preds = %bb35, %bb25, %bb22
+  %tmp47 = phi %18* [ %tmp30, %bb35 ], [ %tmp30, %bb25 ], [ %tmp23, %bb22 ]
+  %tmp48 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_328", align 8
+  %tmp49 = tail call %22* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %22* (i8*, i8*)*)(i8* %tmp4, i8* %tmp48)
+  %tmp50 = bitcast %22* %tmp49 to i8*
+  %tmp51 = load i8** bitcast (%0* @"\01l_objc_msgSend_fixup_count" to i8**), align 16
+  %tmp52 = bitcast i8* %tmp51 to i64 (i8*, %1*)*
+  %tmp53 = tail call i64 %tmp52(i8* %tmp50, %1* bitcast (%0* @"\01l_objc_msgSend_fixup_count" to %1*))
+  %tmp54 = icmp eq i64 %tmp53, 0
+  br i1 %tmp54, label %bb55, label %bb57
+
+bb55:                                             ; preds = %bb46
+  %tmp56 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_395", align 8
+  tail call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*)*)(i8* %tmp4, i8* %tmp56)
+  br label %bb57
+
+bb57:                                             ; preds = %bb55, %bb46
+  %tmp58 = load %2** @"\01L_OBJC_CLASSLIST_REFERENCES_$_396", align 8
+  %tmp59 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_328", align 8
+  %tmp60 = tail call %22* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %22* (i8*, i8*)*)(i8* %tmp4, i8* %tmp59)
+  %tmp61 = bitcast %22* %tmp60 to i8*
+  %tmp62 = load i8** bitcast (%0* @"\01l_objc_msgSend_fixup_objectAtIndex_" to i8**), align 16
+  %tmp63 = bitcast i8* %tmp62 to i8* (i8*, %1*, i64)*
+  %tmp64 = tail call i8* %tmp63(i8* %tmp61, %1* bitcast (%0* @"\01l_objc_msgSend_fixup_objectAtIndex_" to %1*), i64 0)
+  %tmp65 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_398", align 8
+  %tmp66 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %tmp64, i8* %tmp65)
+  %tmp67 = bitcast i8* %tmp66 to %23*
+  %tmp68 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_400", align 8
+  %tmp69 = bitcast %2* %tmp58 to i8*
+  %tmp70 = tail call %14* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %14* (i8*, i8*, %23*, %18*)*)(i8* %tmp69, i8* %tmp68, %23* %tmp67, %18* %tmp47)
+  %tmp71 = bitcast %14* %tmp70 to i8*
+  ; hack to prevent the optimize from using objc_retainAutoreleasedReturnValue.
+  %tmp71x = getelementptr i8* %tmp71, i64 1
+  %tmp72 = tail call i8* @objc_retain(i8* %tmp71x) nounwind
+  %tmp73 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_402", align 8
+  tail call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8)*)(i8* %tmp72, i8* %tmp73, i8 signext 1)
+  %tmp74 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_404", align 8
+  tail call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8)*)(i8* %tmp72, i8* %tmp74, i8 signext 1)
+  %tmp75 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_328", align 8
+  %tmp76 = tail call %22* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %22* (i8*, i8*)*)(i8* %tmp4, i8* %tmp75)
+  %tmp77 = bitcast %22* %tmp76 to i8*
+  %tmp78 = load i8** bitcast (%0* @"\01l_objc_msgSend_fixup_objectAtIndex_" to i8**), align 16
+  %tmp79 = bitcast i8* %tmp78 to i8* (i8*, %1*, i64)*
+  %tmp80 = tail call i8* %tmp79(i8* %tmp77, %1* bitcast (%0* @"\01l_objc_msgSend_fixup_objectAtIndex_" to %1*), i64 0)
+  %tmp81 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_406", align 8
+  tail call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i64)*)(i8* %tmp80, i8* %tmp81, i64 9223372036854775807)
+  %tmp82 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_408", align 8
+  %tmp83 = tail call %24* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %24* (i8*, i8*)*)(i8* %tmp72, i8* %tmp82)
+  %tmp84 = bitcast %24* %tmp83 to i8*
+  %tmp85 = tail call i8* @objc_retain(i8* %tmp84) nounwind
+  %tmp86 = load %2** @"\01L_OBJC_CLASSLIST_REFERENCES_$_409", align 8
+  %tmp87 = bitcast %2* %tmp86 to i8*
+  %tmp88 = load i8** bitcast (%0* @"\01l_objc_msgSend_fixup_alloc" to i8**), align 16
+  %tmp89 = bitcast i8* %tmp88 to i8* (i8*, %1*)*
+  %tmp90 = tail call i8* %tmp89(i8* %tmp87, %1* bitcast (%0* @"\01l_objc_msgSend_fixup_alloc" to %1*))
+  %tmp91 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_8", align 8
+  %tmp92 = tail call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %tmp90, i8* %tmp91)
+  %tmp93 = tail call i8* @objc_explicit_autorelease(i8* %tmp92) nounwind
+  %tmp94 = bitcast i8* %tmp93 to %25*
+  %tmp95 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_411", align 8
+  tail call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, %25*)*)(i8* %tmp85, i8* %tmp95, %25* %tmp94)
+  tail call void @objc_release(i8* %tmp93) nounwind
+  %tmp96 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_148", align 8
+  %tmp97 = tail call signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*)*)(i8* %tmp4, i8* %tmp96)
+  %tmp98 = icmp eq i8 %tmp97, 0
+  br i1 %tmp98, label %bb99, label %bb104
+
+bb99:                                             ; preds = %bb57
+  %tmp100 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_413", align 8
+  %tmp101 = tail call i64 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i64 (i8*, i8*)*)(i8* %tmp85, i8* %tmp100)
+  %tmp102 = or i64 %tmp101, 12
+  %tmp103 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_415", align 8
+  tail call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i64)*)(i8* %tmp85, i8* %tmp103, i64 %tmp102)
+  br label %bb104
+
+bb104:                                            ; preds = %bb99, %bb57
+  %tmp105 = tail call i8* @objc_autorelease(i8* %tmp72) nounwind
+  %tmp106 = bitcast i8* %tmp105 to %14*
+  tail call void @objc_release(i8* %tmp85) nounwind
+  %tmp107 = bitcast %18* %tmp47 to i8*
+  tail call void @objc_release(i8* %tmp107) nounwind
+  ret %14* %tmp106
+}
diff --git a/test/Transforms/ObjCARC/move-and-merge-autorelease.ll b/test/Transforms/ObjCARC/move-and-merge-autorelease.ll
new file mode 100644
index 0000000..8462c70
--- /dev/null
+++ b/test/Transforms/ObjCARC/move-and-merge-autorelease.ll
@@ -0,0 +1,108 @@
+; RUN: opt -S -objc-arc < %s | FileCheck %s
+
+; The optimizer should be able to move the autorelease past two phi nodes
+; and fold it with the release in bb65.
+
+; CHECK: bb65:
+; CHECK: call i8* @objc_retainAutorelease
+; CHECK: br label %bb76
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin11.0.0"
+
+%0 = type opaque
+%1 = type opaque
+%2 = type opaque
+%3 = type opaque
+%4 = type opaque
+%5 = type opaque
+
+@"\01L_OBJC_SELECTOR_REFERENCES_11" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_421455" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_598" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_620" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_622" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_624" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+@"\01L_OBJC_SELECTOR_REFERENCES_626" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+
+declare i8* @objc_msgSend(i8*, i8*, ...)
+
+declare i8* @objc_retain(i8*)
+
+declare void @objc_release(i8*)
+
+declare i8* @objc_autorelease(i8*)
+
+define hidden %0* @foo(%1* %arg, %3* %arg3) {
+bb:
+  %tmp16 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_620", align 8
+  %tmp17 = bitcast %3* %arg3 to i8*
+  %tmp18 = call %4* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %4* (i8*, i8*)*)(i8* %tmp17, i8* %tmp16)
+  %tmp19 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_622", align 8
+  %tmp20 = bitcast %4* %tmp18 to i8*
+  %tmp21 = call %5* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %5* (i8*, i8*)*)(i8* %tmp20, i8* %tmp19)
+  %tmp22 = bitcast %5* %tmp21 to i8*
+  %tmp23 = call i8* @objc_retain(i8* %tmp22) nounwind
+  %tmp24 = bitcast i8* %tmp23 to %5*
+  %tmp26 = icmp eq i8* %tmp23, null
+  br i1 %tmp26, label %bb81, label %bb27
+
+bb27:                                             ; preds = %bb
+  %tmp29 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_11", align 8
+  %tmp30 = bitcast %1* %arg to i8*
+  %tmp31 = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %tmp30, i8* %tmp29)
+  %tmp34 = call i8* @objc_retain(i8* %tmp31) nounwind
+  %tmp37 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_421455", align 8
+  %tmp39 = call %0* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %0* (i8*, i8*)*)(i8* %tmp34, i8* %tmp37)
+  %tmp40 = bitcast %0* %tmp39 to i8*
+  %tmp41 = call i8* @objc_retain(i8* %tmp40) nounwind
+  %tmp42 = bitcast i8* %tmp41 to %0*
+  %tmp44 = icmp eq i8* %tmp41, null
+  br i1 %tmp44, label %bb45, label %bb55
+
+bb45:                                             ; preds = %bb27
+  %tmp47 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_624", align 8
+  %tmp49 = call %0* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %0* (i8*, i8*)*)(i8* %tmp34, i8* %tmp47)
+  %tmp51 = bitcast %0* %tmp49 to i8*
+  %tmp52 = call i8* @objc_retain(i8* %tmp51) nounwind
+  call void @objc_release(i8* %tmp41) nounwind
+  br label %bb55
+
+bb55:                                             ; preds = %bb27, %bb45
+  %tmp13.0 = phi %0* [ %tmp42, %bb27 ], [ %tmp49, %bb45 ]
+  %tmp57 = icmp eq %0* %tmp13.0, null
+  br i1 %tmp57, label %bb76, label %bb58
+
+bb58:                                             ; preds = %bb55
+  %tmp60 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_598", align 8
+  %tmp61 = bitcast %0* %tmp13.0 to i8*
+  %tmp62 = call signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*)*)(i8* %tmp61, i8* %tmp60)
+  %tmp64 = icmp eq i8 %tmp62, 0
+  br i1 %tmp64, label %bb76, label %bb65
+
+bb65:                                             ; preds = %bb58
+  %tmp68 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_626", align 8
+  %tmp69 = bitcast %0* %tmp13.0 to i8*
+  %tmp70 = call %0* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to %0* (i8*, i8*, %5*)*)(i8* %tmp69, i8* %tmp68, %5* %tmp24)
+  %tmp72 = bitcast %0* %tmp70 to i8*
+  %tmp73 = call i8* @objc_retain(i8* %tmp72) nounwind
+  br label %bb76
+
+bb76:                                             ; preds = %bb58, %bb55, %bb65
+  %tmp10.0 = phi %0* [ %tmp70, %bb65 ], [ null, %bb58 ], [ null, %bb55 ]
+  %tmp78 = bitcast %0* %tmp13.0 to i8*
+  call void @objc_release(i8* %tmp78) nounwind
+  call void @objc_release(i8* %tmp34) nounwind
+  br label %bb81
+
+bb81:                                             ; preds = %bb, %bb76
+  %tmp10.1 = phi %0* [ %tmp10.0, %bb76 ], [ null, %bb ]
+  %tmp83 = bitcast %0* %tmp10.1 to i8*
+  %tmp84 = call i8* @objc_retain(i8* %tmp83) nounwind
+  %tmp88 = bitcast i8* %tmp87 to %0*
+  call void @objc_release(i8* %tmp23) nounwind
+  %tmp87 = call i8* @objc_autorelease(i8* %tmp84) nounwind
+  %tmp92 = bitcast %0* %tmp10.1 to i8*
+  call void @objc_release(i8* %tmp92) nounwind
+  ret %0* %tmp88
+}
diff --git a/test/Transforms/ObjCARC/post-inlining.ll b/test/Transforms/ObjCARC/post-inlining.ll
new file mode 100644
index 0000000..ad69ccd
--- /dev/null
+++ b/test/Transforms/ObjCARC/post-inlining.ll
@@ -0,0 +1,48 @@
+; RUN: opt -S -objc-arc < %s | FileCheck %s
+
+declare void @use_pointer(i8*)
+declare i8* @returner()
+declare i8* @objc_retain(i8*)
+declare i8* @objc_autoreleaseReturnValue(i8*)
+declare i8* @objc_retainAutoreleasedReturnValue(i8*)
+
+; Clean up residue left behind after inlining.
+
+; CHECK: define void @test0(
+; CHECK: entry:
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+define void @test0(i8* %call.i) {
+entry:
+  %0 = tail call i8* @objc_retain(i8* %call.i) nounwind
+  %1 = tail call i8* @objc_autoreleaseReturnValue(i8* %0) nounwind
+  ret void
+}
+
+; Same as test0, but with slightly different use arrangements.
+
+; CHECK: define void @test1(
+; CHECK: entry:
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+define void @test1(i8* %call.i) {
+entry:
+  %0 = tail call i8* @objc_retain(i8* %call.i) nounwind
+  %1 = tail call i8* @objc_autoreleaseReturnValue(i8* %call.i) nounwind
+  ret void
+}
+
+; Delete a retainRV+autoreleaseRV even if the pointer is used.
+
+; CHECK: define void @test24(
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   call void @use_pointer(i8* %p)
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }
+define void @test24(i8* %p) {
+entry:
+  call i8* @objc_autoreleaseReturnValue(i8* %p) nounwind
+  call i8* @objc_retainAutoreleasedReturnValue(i8* %p) nounwind
+  call void @use_pointer(i8* %p)
+  ret void
+}
diff --git a/test/Transforms/ObjCARC/retain-not-declared.ll b/test/Transforms/ObjCARC/retain-not-declared.ll
new file mode 100644
index 0000000..e1fe117
--- /dev/null
+++ b/test/Transforms/ObjCARC/retain-not-declared.ll
@@ -0,0 +1,25 @@
+; RUN: opt -S -objc-arc -objc-arc-contract < %s | FileCheck %s
+
+; Test that the optimizer can create an objc_retainAutoreleaseReturnValue
+; declaration even if no objc_retain declaration exists.
+; rdar://9401303
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+declare i8* @objc_unretainedObject(i8*)
+declare i8* @objc_retainAutoreleasedReturnValue(i8*)
+declare i8* @objc_autoreleaseReturnValue(i8*)
+
+; CHECK:      define i8* @foo(i8* %p) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   %0 = tail call i8* @objc_retainAutoreleaseReturnValue(i8* %p) nounwind
+; CHECK-NEXT:   ret i8* %0
+; CHECK-NEXT: }
+
+define i8* @foo(i8* %p) {
+entry:
+  %call = tail call i8* @objc_unretainedObject(i8* %p)
+  %0 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %call) nounwind
+  %1 = tail call i8* @objc_autoreleaseReturnValue(i8* %0) nounwind
+  ret i8* %1
+}
+
diff --git a/test/Transforms/ObjCARC/rle-s2l.ll b/test/Transforms/ObjCARC/rle-s2l.ll
new file mode 100644
index 0000000..8f8d5c0
--- /dev/null
+++ b/test/Transforms/ObjCARC/rle-s2l.ll
@@ -0,0 +1,135 @@
+; RUN: opt -S -basicaa -objc-arc < %s | FileCheck %s
+
+declare i8* @objc_loadWeak(i8**)
+declare i8* @objc_loadWeakRetained(i8**)
+declare i8* @objc_storeWeak(i8**, i8*)
+declare i8* @objc_initWeak(i8**, i8*)
+declare void @use_pointer(i8*)
+declare void @callee()
+
+; Basic redundant @objc_loadWeak elimination.
+
+; CHECK:      define void @test0(i8** %p) {
+; CHECK-NEXT:   %y = call i8* @objc_loadWeak(i8** %p)
+; CHECK-NEXT:   call void @use_pointer(i8* %y)
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }
+define void @test0(i8** %p) {
+  %x = call i8* @objc_loadWeak(i8** %p)
+  %y = call i8* @objc_loadWeak(i8** %p)
+  call void @use_pointer(i8* %y)
+  ret void
+}
+
+; DCE the @objc_loadWeak.
+
+; CHECK:      define void @test1(i8** %p) {
+; CHECK-NEXT:   %y = call i8* @objc_loadWeakRetained(i8** %p)
+; CHECK-NEXT:   call void @use_pointer(i8* %y)
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }
+define void @test1(i8** %p) {
+  %x = call i8* @objc_loadWeak(i8** %p)
+  %y = call i8* @objc_loadWeakRetained(i8** %p)
+  call void @use_pointer(i8* %y)
+  ret void
+}
+
+; Basic redundant @objc_loadWeakRetained elimination.
+
+; CHECK:      define void @test2(i8** %p) {
+; CHECK-NEXT:   %x = call i8* @objc_loadWeak(i8** %p)
+; CHECK-NEXT:   store i8 3, i8* %x
+; CHECK-NEXT:   %1 = tail call i8* @objc_retain(i8* %x)
+; CHECK-NEXT:   call void @use_pointer(i8* %x)
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }
+define void @test2(i8** %p) {
+  %x = call i8* @objc_loadWeak(i8** %p)
+  store i8 3, i8* %x
+  %y = call i8* @objc_loadWeakRetained(i8** %p)
+  call void @use_pointer(i8* %y)
+  ret void
+}
+
+; Basic redundant @objc_loadWeakRetained elimination, this time
+; with a readonly call instead of a store.
+
+; CHECK:      define void @test3(i8** %p) {
+; CHECK-NEXT:   %x = call i8* @objc_loadWeak(i8** %p)
+; CHECK-NEXT:   call void @use_pointer(i8* %x) readonly
+; CHECK-NEXT:   %1 = tail call i8* @objc_retain(i8* %x)
+; CHECK-NEXT:   call void @use_pointer(i8* %x)
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }
+define void @test3(i8** %p) {
+  %x = call i8* @objc_loadWeak(i8** %p)
+  call void @use_pointer(i8* %x) readonly
+  %y = call i8* @objc_loadWeakRetained(i8** %p)
+  call void @use_pointer(i8* %y)
+  ret void
+}
+
+; A regular call blocks redundant weak load elimination.
+
+; CHECK:      define void @test4(i8** %p) {
+; CHECK-NEXT:   %x = call i8* @objc_loadWeak(i8** %p)
+; CHECK-NEXT:   call void @use_pointer(i8* %x) readonly
+; CHECK-NEXT:   call void @callee()
+; CHECK-NEXT:   %y = call i8* @objc_loadWeak(i8** %p)
+; CHECK-NEXT:   call void @use_pointer(i8* %y)
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }
+define void @test4(i8** %p) {
+  %x = call i8* @objc_loadWeak(i8** %p)
+  call void @use_pointer(i8* %x) readonly
+  call void @callee()
+  %y = call i8* @objc_loadWeak(i8** %p)
+  call void @use_pointer(i8* %y)
+  ret void
+}
+
+; Store to load forwarding.
+
+; CHECK:      define void @test5(i8** %p, i8* %n) {
+; CHECK-NEXT:   %1 = call i8* @objc_storeWeak(i8** %p, i8* %n)
+; CHECK-NEXT:   call void @use_pointer(i8* %n)
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }
+define void @test5(i8** %p, i8* %n) {
+  call i8* @objc_storeWeak(i8** %p, i8* %n)
+  %y = call i8* @objc_loadWeak(i8** %p)
+  call void @use_pointer(i8* %y)
+  ret void
+}
+
+; Store to load forwarding with objc_initWeak.
+
+; CHECK:      define void @test6(i8** %p, i8* %n) {
+; CHECK-NEXT:   %1 = call i8* @objc_initWeak(i8** %p, i8* %n)
+; CHECK-NEXT:   call void @use_pointer(i8* %n)
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }
+define void @test6(i8** %p, i8* %n) {
+  call i8* @objc_initWeak(i8** %p, i8* %n)
+  %y = call i8* @objc_loadWeak(i8** %p)
+  call void @use_pointer(i8* %y)
+  ret void
+}
+
+; Don't forward if there's a may-alias store in the way.
+
+; CHECK:      define void @test7(i8** %p, i8* %n, i8** %q, i8* %m) {
+; CHECK-NEXT:   call i8* @objc_initWeak(i8** %p, i8* %n)
+; CHECK-NEXT:   call i8* @objc_storeWeak(i8** %q, i8* %m)
+; CHECK-NEXT:   %y = call i8* @objc_loadWeak(i8** %p)
+; CHECK-NEXT:   call void @use_pointer(i8* %y)
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }
+define void @test7(i8** %p, i8* %n, i8** %q, i8* %m) {
+  call i8* @objc_initWeak(i8** %p, i8* %n)
+  call i8* @objc_storeWeak(i8** %q, i8* %m)
+  %y = call i8* @objc_loadWeak(i8** %p)
+  call void @use_pointer(i8* %y)
+  ret void
+}
diff --git a/test/Transforms/ObjCARC/rv.ll b/test/Transforms/ObjCARC/rv.ll
new file mode 100644
index 0000000..da53a86
--- /dev/null
+++ b/test/Transforms/ObjCARC/rv.ll
@@ -0,0 +1,331 @@
+; RUN: opt -objc-arc -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64"
+
+declare i8* @objc_retain(i8*)
+declare i8* @objc_retainAutoreleasedReturnValue(i8*)
+declare void @objc_release(i8*)
+declare i8* @objc_autorelease(i8*)
+declare i8* @objc_autoreleaseReturnValue(i8*)
+declare i8* @objc_retainAutoreleaseReturnValue(i8*)
+declare void @objc_autoreleasePoolPop(i8*)
+declare void @objc_autoreleasePoolPush()
+declare i8* @objc_retainBlock(i8*)
+
+declare i8* @objc_retainedObject(i8*)
+declare i8* @objc_unretainedObject(i8*)
+declare i8* @objc_unretainedPointer(i8*)
+
+declare void @use_pointer(i8*)
+declare void @callee()
+declare void @callee_fnptr(void ()*)
+declare void @invokee()
+declare i8* @returner()
+
+; Test that retain+release elimination is suppressed when the
+; retain is an objc_retainAutoreleasedReturnValue, since it's
+; better to do the RV optimization.
+
+; CHECK:      define void @test0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   %x = call i8* @returner
+; CHECK-NEXT:   %0 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %x) nounwind
+; CHECK: t:
+; CHECK-NOT: @objc_
+; CHECK: return:
+; CHECK-NEXT: call void @objc_release(i8* %x)
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+define void @test0(i1 %p) nounwind {
+entry:
+  %x = call i8* @returner()
+  %0 = call i8* @objc_retainAutoreleasedReturnValue(i8* %x)
+  br i1 %p, label %t, label %return
+
+t:
+  call void @use_pointer(i8* %x)
+  store i8 0, i8* %x
+  br label %return
+
+return:
+  call void @objc_release(i8* %x) nounwind
+  ret void
+}
+
+; Delete no-ops.
+
+; CHECK: define void @test2
+; CHECK-NOT: @objc_
+; CHECK: }
+define void @test2() {
+  call i8* @objc_retainAutoreleasedReturnValue(i8* null)
+  call i8* @objc_autoreleaseReturnValue(i8* null)
+  ; call i8* @objc_retainAutoreleaseReturnValue(i8* null) ; TODO
+  ret void
+}
+
+; Delete a redundant retainRV,autoreleaseRV when forwaring a call result
+; directly to a return value.
+
+; CHECK: define i8* @test3
+; CHECK: call i8* @returner()
+; CHECK-NEXT: ret i8* %call
+define i8* @test3() {
+entry:
+  %call = call i8* @returner()
+  %0 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call) nounwind
+  %1 = call i8* @objc_autoreleaseReturnValue(i8* %0) nounwind
+  ret i8* %1
+}
+
+; Delete a redundant retain,autoreleaseRV when forwaring a call result
+; directly to a return value.
+
+; CHECK: define i8* @test4
+; CHECK: call i8* @returner()
+; CHECK-NEXT: ret i8* %call
+define i8* @test4() {
+entry:
+  %call = call i8* @returner()
+  %0 = call i8* @objc_retain(i8* %call) nounwind
+  %1 = call i8* @objc_autoreleaseReturnValue(i8* %0) nounwind
+  ret i8* %1
+}
+
+; Delete a redundant fused retain+autoreleaseRV when forwaring a call result
+; directly to a return value.
+
+; TODO
+; HECK: define i8* @test5
+; HECK: call i8* @returner()
+; HECK-NEXT: ret i8* %call
+;define i8* @test5() {
+;entry:
+;  %call = call i8* @returner()
+;  %0 = call i8* @objc_retainAutoreleaseReturnValue(i8* %call) nounwind
+;  ret i8* %0
+;}
+
+; Don't eliminate objc_retainAutoreleasedReturnValue by merging it into
+; an objc_autorelease.
+; TODO? Merge objc_retainAutoreleasedReturnValue and objc_autorelease into
+; objc_retainAutoreleasedReturnValueAutorelease and merge
+; objc_retainAutoreleasedReturnValue and objc_autoreleaseReturnValue
+; into objc_retainAutoreleasedReturnValueAutoreleaseReturnValue?
+; Those entrypoints don't exist yet though.
+
+; CHECK: define i8* @test7(
+; CHECK: call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
+; CHECK: %t = tail call i8* @objc_autoreleaseReturnValue(i8* %p)
+define i8* @test7() {
+  %p = call i8* @returner()
+  call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
+  %t = call i8* @objc_autoreleaseReturnValue(i8* %p)
+  call void @use_pointer(i8* %t)
+  ret i8* %t
+}
+
+; CHECK: define i8* @test7b(
+; CHECK: call i8* @objc_retain(i8* %p)
+; CHECK: %t = tail call i8* @objc_autoreleaseReturnValue(i8* %p)
+define i8* @test7b() {
+  %p = call i8* @returner()
+  call void @use_pointer(i8* %p)
+  call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
+  %t = call i8* @objc_autoreleaseReturnValue(i8* %p)
+  ret i8* %t
+}
+
+; Turn objc_retain into objc_retainAutoreleasedReturnValue if its operand
+; is a return value.
+
+; CHECK: define void @test8()
+; CHECK: tail call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
+define void @test8() {
+  %p = call i8* @returner()
+  call i8* @objc_retain(i8* %p)
+  ret void
+}
+
+; Don't apply the RV optimization to autorelease if there's no retain.
+
+; CHECK: define i8* @test9(i8* %p)
+; CHECK: tail call i8* @objc_autorelease(i8* %p)
+define i8* @test9(i8* %p) {
+  call i8* @objc_autorelease(i8* %p)
+  ret i8* %p
+}
+
+; Apply the RV optimization.
+
+; CHECK: define i8* @test10(i8* %p)
+; CHECK: tail call i8* @objc_retain(i8* %p) nounwind
+; CHECK: tail call i8* @objc_autoreleaseReturnValue(i8* %p) nounwind
+; CHECK-NEXT: ret i8* %p
+define i8* @test10(i8* %p) {
+  %1 = call i8* @objc_retain(i8* %p)
+  %2 = call i8* @objc_autorelease(i8* %p)
+  ret i8* %p
+}
+
+; Don't do the autoreleaseRV optimization because @use_pointer
+; could undo the retain.
+
+; CHECK: define i8* @test11(i8* %p)
+; CHECK: tail call i8* @objc_retain(i8* %p)
+; CHECK-NEXT: call void @use_pointer(i8* %p)
+; CHECK: tail call i8* @objc_autorelease(i8* %p)
+; CHECK-NEXT: ret i8* %p
+define i8* @test11(i8* %p) {
+  %1 = call i8* @objc_retain(i8* %p)
+  call void @use_pointer(i8* %p)
+  %2 = call i8* @objc_autorelease(i8* %p)
+  ret i8* %p
+}
+
+; Don't spoil the RV optimization.
+
+; CHECK: define i8* @test12(i8* %p)
+; CHECK: tail call i8* @objc_retain(i8* %p)
+; CHECK: call void @use_pointer(i8* %p)
+; CHECK: tail call i8* @objc_autoreleaseReturnValue(i8* %p)
+; CHECK: ret i8* %p
+define i8* @test12(i8* %p) {
+  %1 = call i8* @objc_retain(i8* %p)
+  call void @use_pointer(i8* %p)
+  %2 = call i8* @objc_autoreleaseReturnValue(i8* %p)
+  ret i8* %p
+}
+
+; Don't zap the objc_retainAutoreleasedReturnValue.
+
+; CHECK: define i8* @test13(
+; CHECK: tail call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
+; CHECK: tail call i8* @objc_autorelease(i8* %p)
+; CHECK: ret i8* %p
+define i8* @test13() {
+  %p = call i8* @returner()
+  %1 = call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
+  call void @callee()
+  %2 = call i8* @objc_autorelease(i8* %p)
+  ret i8* %p
+}
+
+; Convert objc_retainAutoreleasedReturnValue to objc_retain if its
+; argument is not a return value.
+
+; CHECK: define void @test14(
+; CHECK-NEXT: tail call i8* @objc_retain(i8* %p) nounwind
+; CHECK-NEXT: ret void
+define void @test14(i8* %p) {
+  call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
+  ret void
+}
+
+; Don't convert objc_retainAutoreleasedReturnValue to objc_retain if its
+; argument is a return value.
+
+; CHECK: define void @test15(
+; CHECK-NEXT: %y = call i8* @returner()
+; CHECK-NEXT: tail call i8* @objc_retainAutoreleasedReturnValue(i8* %y) nounwind
+; CHECK-NEXT: ret void
+define void @test15() {
+  %y = call i8* @returner()
+  call i8* @objc_retainAutoreleasedReturnValue(i8* %y)
+  ret void
+}
+
+; Convert objc_retain to objc_retainAutoreleasedReturnValue if its
+; argument is a return value.
+
+; CHECK: define void @test16(
+; CHECK-NEXT: %y = call i8* @returner()
+; CHECK-NEXT: tail call i8* @objc_retainAutoreleasedReturnValue(i8* %y) nounwind
+; CHECK-NEXT: ret void
+define void @test16() {
+  %y = call i8* @returner()
+  call i8* @objc_retain(i8* %y)
+  ret void
+}
+
+; Don't convert objc_retain to objc_retainAutoreleasedReturnValue if its
+; argument is not a return value.
+
+; CHECK: define void @test17(
+; CHECK-NEXT: tail call i8* @objc_retain(i8* %y) nounwind
+; CHECK-NEXT: ret void
+define void @test17(i8* %y) {
+  call i8* @objc_retain(i8* %y)
+  ret void
+}
+
+; Don't Convert objc_retain to objc_retainAutoreleasedReturnValue if it
+; isn't next to the call providing its return value.
+
+; CHECK: define void @test18(
+; CHECK-NEXT: %y = call i8* @returner()
+; CHECK-NEXT: call void @callee()
+; CHECK-NEXT: tail call i8* @objc_retain(i8* %y) nounwind
+; CHECK-NEXT: ret void
+define void @test18() {
+  %y = call i8* @returner()
+  call void @callee()
+  call i8* @objc_retain(i8* %y)
+  ret void
+}
+
+; Delete autoreleaseRV+retainRV pairs.
+
+; CHECK: define i8* @test19(i8* %p) {
+; CHECK-NEXT: ret i8* %p
+define i8* @test19(i8* %p) {
+  call i8* @objc_autoreleaseReturnValue(i8* %p)
+  call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
+  ret i8* %p
+}
+
+; Like test19 but with plain autorelease.
+
+; CHECK: define i8* @test20(i8* %p) {
+; CHECK-NEXT: call i8* @objc_autorelease(i8* %p)
+; CHECK-NEXT: call i8* @objc_retain(i8* %p)
+; CHECK-NEXT: ret i8* %p
+define i8* @test20(i8* %p) {
+  call i8* @objc_autorelease(i8* %p)
+  call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
+  ret i8* %p
+}
+
+; Like test19 but with plain retain.
+
+; CHECK: define i8* @test21(i8* %p) {
+; CHECK-NEXT: call i8* @objc_autoreleaseReturnValue(i8* %p)
+; CHECK-NEXT: call i8* @objc_retain(i8* %p)
+; CHECK-NEXT: ret i8* %p
+define i8* @test21(i8* %p) {
+  call i8* @objc_autoreleaseReturnValue(i8* %p)
+  call i8* @objc_retain(i8* %p)
+  ret i8* %p
+}
+
+; Like test19 but with plain retain and autorelease.
+
+; CHECK: define i8* @test22(i8* %p) {
+; CHECK-NEXT: call i8* @objc_autorelease(i8* %p)
+; CHECK-NEXT: call i8* @objc_retain(i8* %p)
+; CHECK-NEXT: ret i8* %p
+define i8* @test22(i8* %p) {
+  call i8* @objc_autorelease(i8* %p)
+  call i8* @objc_retain(i8* %p)
+  ret i8* %p
+}
+
+; Convert autoreleaseRV to autorelease.
+
+; CHECK: define void @test23(
+; CHECK: tail call i8* @objc_autorelease(i8* %p) nounwind
+define void @test23(i8* %p) {
+  store i8 0, i8* %p
+  call i8* @objc_autoreleaseReturnValue(i8* %p)
+  ret void
+}
diff --git a/test/Transforms/ObjCARC/weak-contract.ll b/test/Transforms/ObjCARC/weak-contract.ll
new file mode 100644
index 0000000..ca69c70
--- /dev/null
+++ b/test/Transforms/ObjCARC/weak-contract.ll
@@ -0,0 +1,14 @@
+; RUN: opt -objc-arc-contract -S < %s | FileCheck %s
+
+declare i8* @objc_initWeak(i8**, i8*)
+
+; Convert objc_initWeak(p, null) to *p = null.
+
+; CHECK:      define i8* @test0(i8** %p) {
+; CHECK-NEXT:   store i8* null, i8** %p
+; CHECK-NEXT:   ret i8* null
+; CHECK-NEXT: }
+define i8* @test0(i8** %p) {
+  %t = call i8* @objc_initWeak(i8** %p, i8* null)
+  ret i8* %t
+}
diff --git a/test/Transforms/ObjCARC/weak-copies.ll b/test/Transforms/ObjCARC/weak-copies.ll
new file mode 100644
index 0000000..e1a94bb
--- /dev/null
+++ b/test/Transforms/ObjCARC/weak-copies.ll
@@ -0,0 +1,87 @@
+; RUN: opt -S -basicaa -objc-arc < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin11.0.0"
+
+%0 = type { i64, i64, i8*, i8*, i8*, i8* }
+%1 = type <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>
+%struct.__block_descriptor = type { i64, i64 }
+
+@_NSConcreteStackBlock = external global i8*
+@.str = private unnamed_addr constant [6 x i8] c"v8@?0\00"
+@"\01L_OBJC_CLASS_NAME_" = internal global [3 x i8] c"\01@\00", section "__TEXT,__objc_classname,cstring_literals", align 1
+@__block_descriptor_tmp = internal constant %0 { i64 0, i64 40, i8* bitcast (void (i8*, i8*)* @__copy_helper_block_ to i8*), i8* bitcast (void (i8*)* @__destroy_helper_block_ to i8*), i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([3 x i8]* @"\01L_OBJC_CLASS_NAME_", i32 0, i32 0) }
+@"\01L_OBJC_IMAGE_INFO" = internal constant [2 x i32] [i32 0, i32 16], section "__DATA, __objc_imageinfo, regular, no_dead_strip"
+@llvm.used = appending global [2 x i8*] [i8* getelementptr inbounds ([3 x i8]* @"\01L_OBJC_CLASS_NAME_", i32 0, i32 0), i8* bitcast ([2 x i32]* @"\01L_OBJC_IMAGE_INFO" to i8*)], section "llvm.metadata"
+
+; Eliminate unnecessary weak pointer copies.
+
+; CHECK:      define void @foo() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   %call = call i8* @bar()
+; CHECK-NEXT:   call void @use(i8* %call) nounwind
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }
+define void @foo() {
+entry:
+  %w = alloca i8*, align 8
+  %x = alloca i8*, align 8
+  %call = call i8* @bar()
+  %0 = call i8* @objc_initWeak(i8** %w, i8* %call) nounwind
+  %1 = call i8* @objc_loadWeak(i8** %w) nounwind
+  %2 = call i8* @objc_initWeak(i8** %x, i8* %1) nounwind
+  %3 = call i8* @objc_loadWeak(i8** %x) nounwind
+  call void @use(i8* %3) nounwind
+  call void @objc_destroyWeak(i8** %x) nounwind
+  call void @objc_destroyWeak(i8** %w) nounwind
+  ret void
+}
+
+; Eliminate unnecessary weak pointer copies in a block initialization.
+
+; CHECK:      define void @qux(i8* %me) nounwind {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   %block = alloca %1, align 8
+; CHECK-NOT:    alloca
+; CHECK:      }
+define void @qux(i8* %me) nounwind {
+entry:
+  %w = alloca i8*, align 8
+  %block = alloca %1, align 8
+  %0 = call i8* @objc_retain(i8* %me) nounwind
+  %1 = call i8* @objc_initWeak(i8** %w, i8* %0) nounwind
+  %block.isa = getelementptr inbounds %1* %block, i64 0, i32 0
+  store i8* bitcast (i8** @_NSConcreteStackBlock to i8*), i8** %block.isa, align 8
+  %block.flags = getelementptr inbounds %1* %block, i64 0, i32 1
+  store i32 1107296256, i32* %block.flags, align 8
+  %block.reserved = getelementptr inbounds %1* %block, i64 0, i32 2
+  store i32 0, i32* %block.reserved, align 4
+  %block.invoke = getelementptr inbounds %1* %block, i64 0, i32 3
+  store i8* bitcast (void (i8*)* @__qux_block_invoke_0 to i8*), i8** %block.invoke, align 8
+  %block.descriptor = getelementptr inbounds %1* %block, i64 0, i32 4
+  store %struct.__block_descriptor* bitcast (%0* @__block_descriptor_tmp to %struct.__block_descriptor*), %struct.__block_descriptor** %block.descriptor, align 8
+  %block.captured = getelementptr inbounds %1* %block, i64 0, i32 5
+  %2 = call i8* @objc_loadWeak(i8** %w) nounwind
+  %3 = call i8* @objc_initWeak(i8** %block.captured, i8* %2) nounwind
+  %4 = bitcast %1* %block to void ()*
+  call void @use_block(void ()* %4) nounwind
+  call void @objc_destroyWeak(i8** %block.captured) nounwind
+  call void @objc_destroyWeak(i8** %w) nounwind
+  call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
+  ret void
+}
+
+declare i8* @objc_retain(i8*)
+declare void @use_block(void ()*) nounwind
+declare void @__qux_block_invoke_0(i8* %.block_descriptor) nounwind
+declare void @__copy_helper_block_(i8*, i8*) nounwind
+declare void @objc_copyWeak(i8**, i8**)
+declare void @__destroy_helper_block_(i8*) nounwind
+declare void @objc_release(i8*)
+declare i8* @bar()
+declare i8* @objc_initWeak(i8**, i8*)
+declare i8* @objc_loadWeak(i8**)
+declare void @use(i8*) nounwind
+declare void @objc_destroyWeak(i8**)
+
+!0 = metadata !{}
diff --git a/test/Transforms/ObjCARC/weak.ll b/test/Transforms/ObjCARC/weak.ll
new file mode 100644
index 0000000..85a290c
--- /dev/null
+++ b/test/Transforms/ObjCARC/weak.ll
@@ -0,0 +1,57 @@
+; RUN: opt -objc-arc -S < %s | FileCheck %s
+
+declare i8* @objc_initWeak(i8**, i8*)
+declare i8* @objc_storeWeak(i8**, i8*)
+declare i8* @objc_loadWeak(i8**)
+declare void @objc_destroyWeak(i8**)
+declare i8* @objc_loadWeakRetained(i8**)
+declare void @objc_moveWeak(i8**, i8**)
+declare void @objc_copyWeak(i8**, i8**)
+
+; If the pointer-to-weak-pointer is null, it's undefined behavior.
+
+; CHECK: define void @test0(
+; CHECK: store i8* undef, i8** null
+; CHECK: store i8* undef, i8** null
+; CHECK: store i8* undef, i8** null
+; CHECK: store i8* undef, i8** null
+; CHECK: store i8* undef, i8** null
+; CHECK: store i8* undef, i8** null
+; CHECK: store i8* undef, i8** null
+; CHECK: store i8* undef, i8** null
+; CHECK: store i8* undef, i8** null
+; CHECK: store i8* undef, i8** null
+; CHECK: store i8* undef, i8** null
+; CHECK: store i8* undef, i8** null
+; CHECK: store i8* undef, i8** null
+; CHECK: store i8* undef, i8** null
+; CHECK: store i8* undef, i8** null
+; CHECK: store i8* undef, i8** null
+; CHECK: store i8* undef, i8** null
+; CHECK: store i8* undef, i8** null
+; CHECK: ret void
+define void @test0(i8* %p, i8** %q) {
+entry:
+  call i8* @objc_storeWeak(i8** null, i8* %p)
+  call i8* @objc_storeWeak(i8** undef, i8* %p)
+  call i8* @objc_loadWeakRetained(i8** null)
+  call i8* @objc_loadWeakRetained(i8** undef)
+  call i8* @objc_loadWeak(i8** null)
+  call i8* @objc_loadWeak(i8** undef)
+  call i8* @objc_initWeak(i8** null, i8* %p)
+  call i8* @objc_initWeak(i8** undef, i8* %p)
+  call void @objc_destroyWeak(i8** null)
+  call void @objc_destroyWeak(i8** undef)
+
+  call void @objc_copyWeak(i8** null, i8** %q)
+  call void @objc_copyWeak(i8** undef, i8** %q)
+  call void @objc_copyWeak(i8** %q, i8** null)
+  call void @objc_copyWeak(i8** %q, i8** undef)
+
+  call void @objc_moveWeak(i8** null, i8** %q)
+  call void @objc_moveWeak(i8** undef, i8** %q)
+  call void @objc_moveWeak(i8** %q, i8** null)
+  call void @objc_moveWeak(i8** %q, i8** undef)
+
+  ret void
+}
diff --git a/test/Transforms/PhaseOrdering/basic.ll b/test/Transforms/PhaseOrdering/basic.ll
new file mode 100644
index 0000000..e5b2ba4
--- /dev/null
+++ b/test/Transforms/PhaseOrdering/basic.ll
@@ -0,0 +1,118 @@
+; RUN: opt -O3 -S %s | FileCheck %s
+; XFAIL: *
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.6.7"
+
+declare i8* @malloc(i64)
+declare void @free(i8*)
+
+
+; PR2338
+define void @test1() nounwind ssp {
+  %retval = alloca i32, align 4
+  %i = alloca i8*, align 8
+  %call = call i8* @malloc(i64 1)
+  store i8* %call, i8** %i, align 8
+  %tmp = load i8** %i, align 8
+  store i8 1, i8* %tmp
+  %tmp1 = load i8** %i, align 8
+  call void @free(i8* %tmp1)
+  ret void
+
+; CHECK: @test1
+; CHECK-NEXT: ret void
+}
+
+
+; PR6627 - This whole nasty sequence should be flattened down to a single
+; 32-bit comparison.
+define void @test2(i8* %arrayidx) nounwind ssp {
+entry:
+  %xx = bitcast i8* %arrayidx to i32*
+  %x1 = load i32* %xx, align 4
+  %tmp = trunc i32 %x1 to i8
+  %conv = zext i8 %tmp to i32
+  %cmp = icmp eq i32 %conv, 127
+  br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true:                                    ; preds = %entry
+  %arrayidx4 = getelementptr inbounds i8* %arrayidx, i64 1
+  %tmp5 = load i8* %arrayidx4, align 1
+  %conv6 = zext i8 %tmp5 to i32
+  %cmp7 = icmp eq i32 %conv6, 69
+  br i1 %cmp7, label %land.lhs.true9, label %if.end
+
+land.lhs.true9:                                   ; preds = %land.lhs.true
+  %arrayidx12 = getelementptr inbounds i8* %arrayidx, i64 2
+  %tmp13 = load i8* %arrayidx12, align 1
+  %conv14 = zext i8 %tmp13 to i32
+  %cmp15 = icmp eq i32 %conv14, 76
+  br i1 %cmp15, label %land.lhs.true17, label %if.end
+
+land.lhs.true17:                                  ; preds = %land.lhs.true9
+  %arrayidx20 = getelementptr inbounds i8* %arrayidx, i64 3
+  %tmp21 = load i8* %arrayidx20, align 1
+  %conv22 = zext i8 %tmp21 to i32
+  %cmp23 = icmp eq i32 %conv22, 70
+  br i1 %cmp23, label %if.then, label %if.end
+
+if.then:                                          ; preds = %land.lhs.true17
+  %call25 = call i32 (...)* @doo()
+  br label %if.end
+
+if.end:
+  ret void
+
+; CHECK: @test2
+; CHECK: %x1 = load i32* %xx, align 4
+; CHECK-NEXT: icmp eq i32 %x1, 1179403647
+; CHECK-NEXT: br i1 {{.*}}, label %if.then, label %if.end 
+}
+
+declare i32 @doo(...)
+
+; PR6627 - This should all be flattened down to one compare.  This is the same
+; as test2, except that the initial load is done as an i8 instead of i32, thus
+; requiring widening.
+define void @test2a(i8* %arrayidx) nounwind ssp {
+entry:
+  %x1 = load i8* %arrayidx, align 4
+  %conv = zext i8 %x1 to i32
+  %cmp = icmp eq i32 %conv, 127
+  br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true:                                    ; preds = %entry
+  %arrayidx4 = getelementptr inbounds i8* %arrayidx, i64 1
+  %tmp5 = load i8* %arrayidx4, align 1
+  %conv6 = zext i8 %tmp5 to i32
+  %cmp7 = icmp eq i32 %conv6, 69
+  br i1 %cmp7, label %land.lhs.true9, label %if.end
+
+land.lhs.true9:                                   ; preds = %land.lhs.true
+  %arrayidx12 = getelementptr inbounds i8* %arrayidx, i64 2
+  %tmp13 = load i8* %arrayidx12, align 1
+  %conv14 = zext i8 %tmp13 to i32
+  %cmp15 = icmp eq i32 %conv14, 76
+  br i1 %cmp15, label %land.lhs.true17, label %if.end
+
+land.lhs.true17:                                  ; preds = %land.lhs.true9
+  %arrayidx20 = getelementptr inbounds i8* %arrayidx, i64 3
+  %tmp21 = load i8* %arrayidx20, align 1
+  %conv22 = zext i8 %tmp21 to i32
+  %cmp23 = icmp eq i32 %conv22, 70
+  br i1 %cmp23, label %if.then, label %if.end
+
+if.then:                                          ; preds = %land.lhs.true17
+  %call25 = call i32 (...)* @doo()
+  br label %if.end
+
+if.end:
+  ret void
+
+; CHECK: @test2a
+; CHECK: %x1 = load i32* {{.*}}, align 4
+; CHECK-NEXT: icmp eq i32 %x1, 1179403647
+; CHECK-NEXT: br i1 {{.*}}, label %if.then, label %if.end 
+}
+
diff --git a/test/Transforms/PhaseOrdering/dg.exp b/test/Transforms/PhaseOrdering/dg.exp
new file mode 100644
index 0000000..f200589
--- /dev/null
+++ b/test/Transforms/PhaseOrdering/dg.exp
@@ -0,0 +1,3 @@
+load_lib llvm.exp
+
+RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll,c,cpp}]]
diff --git a/test/Transforms/PruneEH/2008-09-05-CGUpdate.ll b/test/Transforms/PruneEH/2008-09-05-CGUpdate.ll
deleted file mode 100644
index 33e0cfa..0000000
--- a/test/Transforms/PruneEH/2008-09-05-CGUpdate.ll
+++ /dev/null
@@ -1,1445 +0,0 @@
-; RUN: opt < %s -prune-eh -inline -print-callgraph \
-; RUN:   -disable-output |& \
-; RUN:     grep {calls.*ce3806g__fxio__put__put_int64__4.1339} | count 2
-	%struct.FRAME.ce3806g = type { %struct.string___XUB, %struct.string___XUB, %struct.string___XUB, %struct.string___XUB }
-	%struct.FRAME.ce3806g__fxio__put__4 = type { i32, i32, i32, %struct.system__file_control_block__pstring*, i32, i32, i8 }
-	%struct.RETURN = type { i8, i32 }
-	%struct.ada__streams__root_stream_type = type { %struct.ada__tags__dispatch_table* }
-	%struct.ada__tags__dispatch_table = type { [1 x i32] }
-	%struct.ada__tags__select_specific_data = type { i32, %struct.ada__tags__select_specific_data_element }
-	%struct.ada__tags__select_specific_data_element = type { i32, i8 }
-	%struct.ada__tags__type_specific_data = type { i32, i32, [2147483647 x i8]*, [2147483647 x i8]*, %struct.ada__tags__dispatch_table*, i8, i32, i32, i32, i32, [2 x %struct.ada__tags__dispatch_table*] }
-	%struct.ada__text_io__text_afcb = type { %struct.system__file_control_block__afcb, i32, i32, i32, i32, i32, %struct.ada__text_io__text_afcb*, i8, i8 }
-	%struct.exception = type { i8, i8, i32, i8*, i8*, i32, i8* }
-	%struct.long_long_float___PAD = type { x86_fp80, [1 x i32] }
-	%struct.string___XUB = type { i32, i32 }
-	%struct.system__file_control_block__afcb = type { %struct.ada__streams__root_stream_type, i32, %struct.system__file_control_block__pstring, %struct.system__file_control_block__pstring, i8, i8, i8, i8, i8, i8, i8, %struct.system__file_control_block__afcb*, %struct.system__file_control_block__afcb* }
-	%struct.system__file_control_block__pstring = type { i8*, %struct.string___XUB* }
-	%struct.system__finalization_implementation__limited_record_controller = type { %struct.system__finalization_root__root_controlled, %struct.system__finalization_root__root_controlled* }
-	%struct.system__finalization_implementation__record_controller = type { %struct.system__finalization_implementation__limited_record_controller, i32 }
-	%struct.system__finalization_root__empty_root_controlled = type { %struct.ada__tags__dispatch_table* }
-	%struct.system__finalization_root__root_controlled = type { %struct.ada__streams__root_stream_type, %struct.system__finalization_root__root_controlled*, %struct.system__finalization_root__root_controlled* }
-	%struct.system__secondary_stack__mark_id = type { i32, i32 }
-	%struct.system__standard_library__exception_data = type { i8, i8, i32, i32, %struct.system__standard_library__exception_data*, i32, void ()* }
-@.str = internal constant [12 x i8] c"system.ads\00\00"		; <[12 x i8]*> [#uses=1]
-@.str1 = internal constant [14 x i8] c"a-tifiio.adb\00\00"		; <[14 x i8]*> [#uses=1]
-@system__soft_links__abort_undefer = external global void ()*		; <void ()**> [#uses=6]
-@.str2 = internal constant [47 x i8] c"a-tifiio.adb:327 instantiated at ce3806g.adb:52"		; <[47 x i8]*> [#uses=1]
-@C.354.2200 = internal constant %struct.string___XUB { i32 1, i32 47 }		; <%struct.string___XUB*> [#uses=1]
-@ada__io_exceptions__data_error = external global %struct.exception		; <%struct.exception*> [#uses=1]
-@constraint_error = external global %struct.exception		; <%struct.exception*> [#uses=2]
-@__gnat_all_others_value = external constant i32		; <i32*> [#uses=21]
-@.str3 = internal constant [10 x i8] c"0123456789"		; <[10 x i8]*> [#uses=2]
-@ada__text_io__current_out = external global %struct.ada__text_io__text_afcb*		; <%struct.ada__text_io__text_afcb**> [#uses=1]
-@.str4 = internal constant [126 x i8] c"CHECK THAT FIXED_IO PUT OPERATES ON FILES OF MODE OUT_FILE AND IF NO FILE IS SPECIFIED THE CURRENT DEFAULT OUTPUT FILE IS USED"		; <[126 x i8]*> [#uses=1]
-@C.131.1559 = internal constant %struct.string___XUB { i32 1, i32 126 }		; <%struct.string___XUB*> [#uses=1]
-@.str5 = internal constant [7 x i8] c"CE3806G"		; <[7 x i8]*> [#uses=1]
-@C.132.1562 = internal constant %struct.string___XUB { i32 1, i32 7 }		; <%struct.string___XUB*> [#uses=1]
-@incompleteF.1176.b = internal global i1 false		; <i1*> [#uses=2]
-@incomplete.1177 = internal global %struct.exception { i8 0, i8 65, i32 23, i8* getelementptr ([23 x i8]* @incompleteE.1174, i32 0, i32 0), i8* null, i32 0, i8* null }		; <%struct.exception*> [#uses=15]
-@incompleteE.1174 = internal global [23 x i8] c"CE3806G.B_1.INCOMPLETE\00"		; <[23 x i8]*> [#uses=1]
-@.str6 = internal constant [0 x i8] zeroinitializer		; <[0 x i8]*> [#uses=1]
-@C.136.1568 = internal constant %struct.string___XUB { i32 1, i32 0 }		; <%struct.string___XUB*> [#uses=1]
-@C.137.1571 = internal constant %struct.string___XUB { i32 1, i32 0 }		; <%struct.string___XUB*> [#uses=1]
-@.str7 = internal constant [50 x i8] c"USE_ERROR RAISED ON TEXT CREATE WITH OUT_FILE MODE"		; <[50 x i8]*> [#uses=1]
-@C.139.1577 = internal constant %struct.string___XUB { i32 1, i32 50 }		; <%struct.string___XUB*> [#uses=1]
-@.str8 = internal constant [14 x i8] c"ce3806g.adb:65"		; <[14 x i8]*> [#uses=1]
-@C.140.1580 = internal constant %struct.string___XUB { i32 1, i32 14 }		; <%struct.string___XUB*> [#uses=1]
-@.str9 = internal constant [51 x i8] c"NAME_ERROR RAISED ON TEXT CREATE WITH OUT_FILE MODE"		; <[51 x i8]*> [#uses=1]
-@C.143.1585 = internal constant %struct.string___XUB { i32 1, i32 51 }		; <%struct.string___XUB*> [#uses=1]
-@.str10 = internal constant [14 x i8] c"ce3806g.adb:69"		; <[14 x i8]*> [#uses=1]
-@C.144.1588 = internal constant %struct.string___XUB { i32 1, i32 14 }		; <%struct.string___XUB*> [#uses=1]
-@C.146.1592 = internal constant %struct.string___XUB { i32 1, i32 0 }		; <%struct.string___XUB*> [#uses=1]
-@C.147.1595 = internal constant %struct.string___XUB { i32 1, i32 0 }		; <%struct.string___XUB*> [#uses=1]
-@C.153.1609 = internal constant %struct.string___XUB { i32 1, i32 0 }		; <%struct.string___XUB*> [#uses=1]
-@C.154.1612 = internal constant %struct.string___XUB { i32 1, i32 0 }		; <%struct.string___XUB*> [#uses=1]
-@.str12 = internal constant [47 x i8] c"USE_ERROR RAISED ON TEXT OPEN WITH IN_FILE MODE"		; <[47 x i8]*> [#uses=1]
-@C.156.1618 = internal constant %struct.string___XUB { i32 1, i32 47 }		; <%struct.string___XUB*> [#uses=1]
-@.str13 = internal constant [14 x i8] c"ce3806g.adb:88"		; <[14 x i8]*> [#uses=1]
-@C.157.1621 = internal constant %struct.string___XUB { i32 1, i32 14 }		; <%struct.string___XUB*> [#uses=1]
-@C.159.1627 = internal constant %struct.string___XUB { i32 1, i32 0 }		; <%struct.string___XUB*> [#uses=1]
-@C.160.1630 = internal constant %struct.string___XUB { i32 1, i32 0 }		; <%struct.string___XUB*> [#uses=1]
-@.str14 = internal constant [33 x i8] c"VALUE INCORRECT - FIXED FROM FILE"		; <[33 x i8]*> [#uses=1]
-@C.162.1637 = internal constant %struct.string___XUB { i32 1, i32 33 }		; <%struct.string___XUB*> [#uses=1]
-@.str15 = internal constant [36 x i8] c"VALUE INCORRECT - FIXED FROM DEFAULT"		; <[36 x i8]*> [#uses=1]
-@C.164.1642 = internal constant %struct.string___XUB { i32 1, i32 36 }		; <%struct.string___XUB*> [#uses=1]
-@ada__io_exceptions__use_error = external global %struct.exception		; <%struct.exception*> [#uses=4]
-@ada__io_exceptions__name_error = external global %struct.exception		; <%struct.exception*> [#uses=2]
-
-define void @_ada_ce3806g() {
-entry:
-	%0 = alloca %struct.system__file_control_block__pstring, align 8		; <%struct.system__file_control_block__pstring*> [#uses=3]
-	%1 = alloca %struct.system__file_control_block__pstring, align 8		; <%struct.system__file_control_block__pstring*> [#uses=3]
-	%2 = alloca %struct.system__file_control_block__pstring, align 8		; <%struct.system__file_control_block__pstring*> [#uses=3]
-	%3 = alloca %struct.system__file_control_block__pstring, align 8		; <%struct.system__file_control_block__pstring*> [#uses=3]
-	%FRAME.356 = alloca %struct.FRAME.ce3806g		; <%struct.FRAME.ce3806g*> [#uses=20]
-	call void @report__test( i8* getelementptr ([7 x i8]* @.str5, i32 0, i32 0), %struct.string___XUB* @C.132.1562, i8* getelementptr ([126 x i8]* @.str4, i32 0, i32 0), %struct.string___XUB* @C.131.1559 )
-	%4 = getelementptr %struct.FRAME.ce3806g* %FRAME.356, i32 0, i32 3		; <%struct.string___XUB*> [#uses=1]
-	call void @system__secondary_stack__ss_mark( %struct.string___XUB* noalias sret %4 )
-	%.b = load i1* @incompleteF.1176.b		; <i1> [#uses=1]
-	br i1 %.b, label %bb11, label %bb
-
-bb:		; preds = %entry
-	invoke void @system__exception_table__register_exception( %struct.system__standard_library__exception_data* bitcast (%struct.exception* @incomplete.1177 to %struct.system__standard_library__exception_data*) )
-			to label %invcont unwind label %lpad
-
-invcont:		; preds = %bb
-	store i1 true, i1* @incompleteF.1176.b
-	br label %bb11
-
-bb11:		; preds = %entry, %invcont
-	%5 = getelementptr %struct.FRAME.ce3806g* %FRAME.356, i32 0, i32 2		; <%struct.string___XUB*> [#uses=1]
-	invoke void @system__secondary_stack__ss_mark( %struct.string___XUB* noalias sret %5 )
-			to label %invcont12 unwind label %lpad228
-
-invcont12:		; preds = %bb11
-	invoke void @report__legal_file_name( %struct.system__file_control_block__pstring* noalias sret %3, i32 1, i8* getelementptr ([0 x i8]* @.str6, i32 0, i32 0), %struct.string___XUB* @C.137.1571 )
-			to label %invcont17 unwind label %lpad232
-
-invcont17:		; preds = %invcont12
-	%elt18 = getelementptr %struct.system__file_control_block__pstring* %3, i32 0, i32 0		; <i8**> [#uses=1]
-	%val19 = load i8** %elt18, align 8		; <i8*> [#uses=1]
-	%elt20 = getelementptr %struct.system__file_control_block__pstring* %3, i32 0, i32 1		; <%struct.string___XUB**> [#uses=1]
-	%val21 = load %struct.string___XUB** %elt20		; <%struct.string___XUB*> [#uses=1]
-	%6 = invoke %struct.ada__text_io__text_afcb* @ada__text_io__create( %struct.ada__text_io__text_afcb* null, i8 2, i8* %val19, %struct.string___XUB* %val21, i8* getelementptr ([0 x i8]* @.str6, i32 0, i32 0), %struct.string___XUB* @C.136.1568 )
-			to label %invcont26 unwind label %lpad232		; <%struct.ada__text_io__text_afcb*> [#uses=2]
-
-invcont26:		; preds = %invcont17
-	%7 = getelementptr %struct.FRAME.ce3806g* %FRAME.356, i32 0, i32 2, i32 0		; <i32*> [#uses=1]
-	%8 = load i32* %7, align 8		; <i32> [#uses=1]
-	%9 = getelementptr %struct.FRAME.ce3806g* %FRAME.356, i32 0, i32 2, i32 1		; <i32*> [#uses=1]
-	%10 = load i32* %9, align 4		; <i32> [#uses=1]
-	invoke void @system__secondary_stack__ss_release( i32 %8, i32 %10 )
-			to label %bb73 unwind label %lpad228
-
-bb32:		; preds = %lpad232
-	call void @__gnat_begin_handler( i8* %eh_ptr233 ) nounwind
-	%11 = load void ()** @system__soft_links__abort_undefer, align 4		; <void ()*> [#uses=1]
-	invoke void %11( )
-			to label %invcont33 unwind label %lpad240
-
-invcont33:		; preds = %bb32
-	invoke void @report__not_applicable( i8* getelementptr ([50 x i8]* @.str7, i32 0, i32 0), %struct.string___XUB* @C.139.1577 )
-			to label %invcont38 unwind label %lpad240
-
-invcont38:		; preds = %invcont33
-	invoke void @__gnat_raise_exception( %struct.system__standard_library__exception_data* bitcast (%struct.exception* @incomplete.1177 to %struct.system__standard_library__exception_data*), i8* getelementptr ([14 x i8]* @.str8, i32 0, i32 0), %struct.string___XUB* @C.140.1580 ) noreturn
-			to label %invcont43 unwind label %lpad240
-
-invcont43:		; preds = %invcont38
-	unreachable
-
-bb47:		; preds = %ppad291
-	call void @__gnat_begin_handler( i8* %eh_ptr233 ) nounwind
-	%12 = load void ()** @system__soft_links__abort_undefer, align 4		; <void ()*> [#uses=1]
-	invoke void %12( )
-			to label %invcont49 unwind label %lpad248
-
-invcont49:		; preds = %bb47
-	invoke void @report__not_applicable( i8* getelementptr ([51 x i8]* @.str9, i32 0, i32 0), %struct.string___XUB* @C.143.1585 )
-			to label %invcont54 unwind label %lpad248
-
-invcont54:		; preds = %invcont49
-	invoke void @__gnat_raise_exception( %struct.system__standard_library__exception_data* bitcast (%struct.exception* @incomplete.1177 to %struct.system__standard_library__exception_data*), i8* getelementptr ([14 x i8]* @.str10, i32 0, i32 0), %struct.string___XUB* @C.144.1588 ) noreturn
-			to label %invcont59 unwind label %lpad248
-
-invcont59:		; preds = %invcont54
-	unreachable
-
-bb73:		; preds = %invcont26
-	invoke void @report__legal_file_name( %struct.system__file_control_block__pstring* noalias sret %2, i32 2, i8* getelementptr ([0 x i8]* @.str6, i32 0, i32 0), %struct.string___XUB* @C.147.1595 )
-			to label %invcont78 unwind label %lpad228
-
-invcont78:		; preds = %bb73
-	%elt79 = getelementptr %struct.system__file_control_block__pstring* %2, i32 0, i32 0		; <i8**> [#uses=1]
-	%val80 = load i8** %elt79, align 8		; <i8*> [#uses=1]
-	%elt81 = getelementptr %struct.system__file_control_block__pstring* %2, i32 0, i32 1		; <%struct.string___XUB**> [#uses=1]
-	%val82 = load %struct.string___XUB** %elt81		; <%struct.string___XUB*> [#uses=1]
-	%13 = invoke %struct.ada__text_io__text_afcb* @ada__text_io__create( %struct.ada__text_io__text_afcb* null, i8 2, i8* %val80, %struct.string___XUB* %val82, i8* getelementptr ([0 x i8]* @.str6, i32 0, i32 0), %struct.string___XUB* @C.146.1592 )
-			to label %invcont87 unwind label %lpad228		; <%struct.ada__text_io__text_afcb*> [#uses=2]
-
-invcont87:		; preds = %invcont78
-	invoke void @ada__text_io__set_output( %struct.ada__text_io__text_afcb* %13 )
-			to label %invcont88 unwind label %lpad228
-
-invcont88:		; preds = %invcont87
-	%14 = getelementptr %struct.FRAME.ce3806g* %FRAME.356, i32 0, i32 1		; <%struct.string___XUB*> [#uses=1]
-	invoke void @system__secondary_stack__ss_mark( %struct.string___XUB* noalias sret %14 )
-			to label %invcont89 unwind label %lpad228
-
-invcont89:		; preds = %invcont88
-	invoke fastcc void @ce3806g__fxio__put.1149( %struct.ada__text_io__text_afcb* %6 )
-			to label %bb94 unwind label %lpad252
-
-bb94:		; preds = %invcont89
-	invoke fastcc void @ce3806g__fxio__put__2.1155( )
-			to label %invcont95 unwind label %lpad252
-
-invcont95:		; preds = %bb94
-	%15 = invoke %struct.ada__text_io__text_afcb* @ada__text_io__close( %struct.ada__text_io__text_afcb* %6 )
-			to label %invcont96 unwind label %lpad252		; <%struct.ada__text_io__text_afcb*> [#uses=1]
-
-invcont96:		; preds = %invcont95
-	%16 = getelementptr %struct.FRAME.ce3806g* %FRAME.356, i32 0, i32 0		; <%struct.string___XUB*> [#uses=1]
-	invoke void @system__secondary_stack__ss_mark( %struct.string___XUB* noalias sret %16 )
-			to label %invcont97 unwind label %lpad252
-
-invcont97:		; preds = %invcont96
-	invoke void @report__legal_file_name( %struct.system__file_control_block__pstring* noalias sret %1, i32 1, i8* getelementptr ([0 x i8]* @.str6, i32 0, i32 0), %struct.string___XUB* @C.154.1612 )
-			to label %invcont102 unwind label %lpad256
-
-invcont102:		; preds = %invcont97
-	%elt103 = getelementptr %struct.system__file_control_block__pstring* %1, i32 0, i32 0		; <i8**> [#uses=1]
-	%val104 = load i8** %elt103, align 8		; <i8*> [#uses=1]
-	%elt105 = getelementptr %struct.system__file_control_block__pstring* %1, i32 0, i32 1		; <%struct.string___XUB**> [#uses=1]
-	%val106 = load %struct.string___XUB** %elt105		; <%struct.string___XUB*> [#uses=1]
-	%17 = invoke %struct.ada__text_io__text_afcb* @ada__text_io__open( %struct.ada__text_io__text_afcb* %15, i8 0, i8* %val104, %struct.string___XUB* %val106, i8* getelementptr ([0 x i8]* @.str6, i32 0, i32 0), %struct.string___XUB* @C.153.1609 )
-			to label %invcont111 unwind label %lpad256		; <%struct.ada__text_io__text_afcb*> [#uses=2]
-
-invcont111:		; preds = %invcont102
-	%18 = getelementptr %struct.FRAME.ce3806g* %FRAME.356, i32 0, i32 0, i32 0		; <i32*> [#uses=1]
-	%19 = load i32* %18, align 8		; <i32> [#uses=1]
-	%20 = getelementptr %struct.FRAME.ce3806g* %FRAME.356, i32 0, i32 0, i32 1		; <i32*> [#uses=1]
-	%21 = load i32* %20, align 4		; <i32> [#uses=1]
-	invoke void @system__secondary_stack__ss_release( i32 %19, i32 %21 )
-			to label %bb143 unwind label %lpad252
-
-bb117:		; preds = %lpad256
-	call void @__gnat_begin_handler( i8* %eh_ptr257 ) nounwind
-	%22 = load void ()** @system__soft_links__abort_undefer, align 4		; <void ()*> [#uses=1]
-	invoke void %22( )
-			to label %invcont119 unwind label %lpad264
-
-invcont119:		; preds = %bb117
-	invoke void @report__not_applicable( i8* getelementptr ([47 x i8]* @.str12, i32 0, i32 0), %struct.string___XUB* @C.156.1618 )
-			to label %invcont124 unwind label %lpad264
-
-invcont124:		; preds = %invcont119
-	invoke void @__gnat_raise_exception( %struct.system__standard_library__exception_data* bitcast (%struct.exception* @incomplete.1177 to %struct.system__standard_library__exception_data*), i8* getelementptr ([14 x i8]* @.str13, i32 0, i32 0), %struct.string___XUB* @C.157.1621 ) noreturn
-			to label %invcont129 unwind label %lpad264
-
-invcont129:		; preds = %invcont124
-	unreachable
-
-bb143:		; preds = %invcont111
-	%23 = invoke %struct.ada__text_io__text_afcb* @ada__text_io__standard_output( )
-			to label %invcont144 unwind label %lpad252		; <%struct.ada__text_io__text_afcb*> [#uses=1]
-
-invcont144:		; preds = %bb143
-	invoke void @ada__text_io__set_output( %struct.ada__text_io__text_afcb* %23 )
-			to label %invcont145 unwind label %lpad252
-
-invcont145:		; preds = %invcont144
-	%24 = invoke %struct.ada__text_io__text_afcb* @ada__text_io__close( %struct.ada__text_io__text_afcb* %13 )
-			to label %invcont146 unwind label %lpad252		; <%struct.ada__text_io__text_afcb*> [#uses=1]
-
-invcont146:		; preds = %invcont145
-	invoke void @report__legal_file_name( %struct.system__file_control_block__pstring* noalias sret %0, i32 2, i8* getelementptr ([0 x i8]* @.str6, i32 0, i32 0), %struct.string___XUB* @C.160.1630 )
-			to label %invcont151 unwind label %lpad252
-
-invcont151:		; preds = %invcont146
-	%elt152 = getelementptr %struct.system__file_control_block__pstring* %0, i32 0, i32 0		; <i8**> [#uses=1]
-	%val153 = load i8** %elt152, align 8		; <i8*> [#uses=1]
-	%elt154 = getelementptr %struct.system__file_control_block__pstring* %0, i32 0, i32 1		; <%struct.string___XUB**> [#uses=1]
-	%val155 = load %struct.string___XUB** %elt154		; <%struct.string___XUB*> [#uses=1]
-	%25 = invoke %struct.ada__text_io__text_afcb* @ada__text_io__open( %struct.ada__text_io__text_afcb* %24, i8 0, i8* %val153, %struct.string___XUB* %val155, i8* getelementptr ([0 x i8]* @.str6, i32 0, i32 0), %struct.string___XUB* @C.159.1627 )
-			to label %invcont160 unwind label %lpad252		; <%struct.ada__text_io__text_afcb*> [#uses=2]
-
-invcont160:		; preds = %invcont151
-	%26 = invoke fastcc i8 @ce3806g__fxio__get.1137( %struct.ada__text_io__text_afcb* %17 ) signext
-			to label %invcont161 unwind label %lpad252		; <i8> [#uses=1]
-
-invcont161:		; preds = %invcont160
-	%27 = icmp eq i8 %26, -3		; <i1> [#uses=1]
-	br i1 %27, label %bb169, label %bb163
-
-bb163:		; preds = %invcont161
-	invoke void @report__failed( i8* getelementptr ([33 x i8]* @.str14, i32 0, i32 0), %struct.string___XUB* @C.162.1637 )
-			to label %bb169 unwind label %lpad252
-
-bb169:		; preds = %invcont161, %bb163
-	%28 = invoke fastcc i8 @ce3806g__fxio__get.1137( %struct.ada__text_io__text_afcb* %25 ) signext
-			to label %invcont170 unwind label %lpad252		; <i8> [#uses=1]
-
-invcont170:		; preds = %bb169
-	%29 = icmp eq i8 %28, -1		; <i1> [#uses=1]
-	br i1 %29, label %bb187, label %bb172
-
-bb172:		; preds = %invcont170
-	invoke void @report__failed( i8* getelementptr ([36 x i8]* @.str15, i32 0, i32 0), %struct.string___XUB* @C.164.1642 )
-			to label %bb187 unwind label %lpad252
-
-bb187:		; preds = %invcont170, %bb172
-	%30 = getelementptr %struct.FRAME.ce3806g* %FRAME.356, i32 0, i32 1, i32 0		; <i32*> [#uses=1]
-	%31 = load i32* %30, align 8		; <i32> [#uses=1]
-	%32 = getelementptr %struct.FRAME.ce3806g* %FRAME.356, i32 0, i32 1, i32 1		; <i32*> [#uses=1]
-	%33 = load i32* %32, align 4		; <i32> [#uses=1]
-	invoke void @system__secondary_stack__ss_release( i32 %31, i32 %33 )
-			to label %bb193 unwind label %lpad228
-
-bb193:		; preds = %bb187
-	%34 = invoke %struct.ada__text_io__text_afcb* @ada__text_io__delete( %struct.ada__text_io__text_afcb* %17 )
-			to label %invcont194 unwind label %lpad268		; <%struct.ada__text_io__text_afcb*> [#uses=0]
-
-invcont194:		; preds = %bb193
-	%35 = invoke %struct.ada__text_io__text_afcb* @ada__text_io__delete( %struct.ada__text_io__text_afcb* %25 )
-			to label %bb221 unwind label %lpad268		; <%struct.ada__text_io__text_afcb*> [#uses=0]
-
-bb196:		; preds = %lpad268
-	call void @__gnat_begin_handler( i8* %eh_ptr269 ) nounwind
-	%36 = load void ()** @system__soft_links__abort_undefer, align 4		; <void ()*> [#uses=1]
-	invoke void %36( )
-			to label %bb203 unwind label %lpad276
-
-bb203:		; preds = %bb196
-	invoke void @__gnat_end_handler( i8* %eh_ptr269 )
-			to label %bb221 unwind label %lpad272
-
-bb205:		; preds = %ppad304
-	call void @__gnat_begin_handler( i8* %eh_exception.1 ) nounwind
-	%37 = load void ()** @system__soft_links__abort_undefer, align 4		; <void ()*> [#uses=1]
-	invoke void %37( )
-			to label %bb212 unwind label %lpad284
-
-bb212:		; preds = %bb205
-	invoke void @__gnat_end_handler( i8* %eh_exception.1 )
-			to label %bb221 unwind label %lpad280
-
-bb221:		; preds = %invcont194, %bb212, %bb203
-	%38 = getelementptr %struct.FRAME.ce3806g* %FRAME.356, i32 0, i32 3, i32 0		; <i32*> [#uses=1]
-	%39 = load i32* %38, align 8		; <i32> [#uses=1]
-	%40 = getelementptr %struct.FRAME.ce3806g* %FRAME.356, i32 0, i32 3, i32 1		; <i32*> [#uses=1]
-	%41 = load i32* %40, align 4		; <i32> [#uses=1]
-	call void @system__secondary_stack__ss_release( i32 %39, i32 %41 )
-	call void @report__result( )
-	ret void
-
-lpad:		; preds = %bb
-	%eh_ptr = call i8* @llvm.eh.exception( )		; <i8*> [#uses=2]
-	%eh_select227 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), i32* @__gnat_all_others_value )		; <i32> [#uses=0]
-	br label %ppad
-
-lpad228:		; preds = %bb187, %ppad294, %invcont88, %invcont87, %invcont78, %bb73, %ppad288, %invcont26, %bb11
-	%eh_ptr229 = call i8* @llvm.eh.exception( )		; <i8*> [#uses=2]
-	%eh_select231 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr229, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), %struct.exception* @incomplete.1177, i32* @__gnat_all_others_value )		; <i32> [#uses=1]
-	br label %ppad304
-
-lpad232:		; preds = %invcont17, %invcont12
-	%eh_ptr233 = call i8* @llvm.eh.exception( )		; <i8*> [#uses=6]
-	%eh_select235 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr233, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), %struct.exception* @ada__io_exceptions__use_error, %struct.exception* @ada__io_exceptions__name_error, %struct.exception* @incomplete.1177, i32* @__gnat_all_others_value )		; <i32> [#uses=3]
-	%eh_typeid = call i32 @llvm.eh.typeid.for.i32( i8* getelementptr (%struct.exception* @ada__io_exceptions__use_error, i32 0, i32 0) )		; <i32> [#uses=1]
-	%42 = icmp eq i32 %eh_select235, %eh_typeid		; <i1> [#uses=1]
-	br i1 %42, label %bb32, label %ppad291
-
-lpad236:		; preds = %lpad240
-	%eh_ptr237 = call i8* @llvm.eh.exception( )		; <i8*> [#uses=2]
-	%eh_select239 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr237, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), %struct.exception* @incomplete.1177, i32* @__gnat_all_others_value )		; <i32> [#uses=1]
-	br label %ppad288
-
-lpad240:		; preds = %invcont38, %invcont33, %bb32
-	%eh_ptr241 = call i8* @llvm.eh.exception( )		; <i8*> [#uses=2]
-	%eh_select243 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr241, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), %struct.exception* @incomplete.1177, i32* @__gnat_all_others_value )		; <i32> [#uses=1]
-	invoke void @__gnat_end_handler( i8* %eh_ptr233 )
-			to label %ppad288 unwind label %lpad236
-
-lpad244:		; preds = %lpad248
-	%eh_ptr245 = call i8* @llvm.eh.exception( )		; <i8*> [#uses=2]
-	%eh_select247 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr245, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), %struct.exception* @incomplete.1177, i32* @__gnat_all_others_value )		; <i32> [#uses=1]
-	br label %ppad288
-
-lpad248:		; preds = %invcont54, %invcont49, %bb47
-	%eh_ptr249 = call i8* @llvm.eh.exception( )		; <i8*> [#uses=2]
-	%eh_select251 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr249, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), %struct.exception* @incomplete.1177, i32* @__gnat_all_others_value )		; <i32> [#uses=1]
-	invoke void @__gnat_end_handler( i8* %eh_ptr233 )
-			to label %ppad288 unwind label %lpad244
-
-lpad252:		; preds = %bb94, %invcont89, %invcont160, %bb169, %bb172, %bb163, %invcont151, %invcont146, %invcont145, %invcont144, %bb143, %ppad295, %invcont111, %invcont96, %invcont95
-	%eh_ptr253 = call i8* @llvm.eh.exception( )		; <i8*> [#uses=2]
-	%eh_select255 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr253, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), %struct.exception* @incomplete.1177, i32* @__gnat_all_others_value )		; <i32> [#uses=1]
-	br label %ppad294
-
-lpad256:		; preds = %invcont102, %invcont97
-	%eh_ptr257 = call i8* @llvm.eh.exception( )		; <i8*> [#uses=4]
-	%eh_select259 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr257, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), %struct.exception* @ada__io_exceptions__use_error, %struct.exception* @incomplete.1177, i32* @__gnat_all_others_value )		; <i32> [#uses=2]
-	%eh_typeid297 = call i32 @llvm.eh.typeid.for.i32( i8* getelementptr (%struct.exception* @ada__io_exceptions__use_error, i32 0, i32 0) )		; <i32> [#uses=1]
-	%43 = icmp eq i32 %eh_select259, %eh_typeid297		; <i1> [#uses=1]
-	br i1 %43, label %bb117, label %ppad295
-
-lpad260:		; preds = %lpad264
-	%eh_ptr261 = call i8* @llvm.eh.exception( )		; <i8*> [#uses=2]
-	%eh_select263 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr261, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), %struct.exception* @incomplete.1177, i32* @__gnat_all_others_value )		; <i32> [#uses=1]
-	br label %ppad295
-
-lpad264:		; preds = %invcont124, %invcont119, %bb117
-	%eh_ptr265 = call i8* @llvm.eh.exception( )		; <i8*> [#uses=2]
-	%eh_select267 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr265, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), %struct.exception* @incomplete.1177, i32* @__gnat_all_others_value )		; <i32> [#uses=1]
-	invoke void @__gnat_end_handler( i8* %eh_ptr257 )
-			to label %ppad295 unwind label %lpad260
-
-lpad268:		; preds = %invcont194, %bb193
-	%eh_ptr269 = call i8* @llvm.eh.exception( )		; <i8*> [#uses=5]
-	%eh_select271 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr269, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), %struct.exception* @ada__io_exceptions__use_error, %struct.exception* @incomplete.1177, i32* @__gnat_all_others_value )		; <i32> [#uses=2]
-	%eh_typeid301 = call i32 @llvm.eh.typeid.for.i32( i8* getelementptr (%struct.exception* @ada__io_exceptions__use_error, i32 0, i32 0) )		; <i32> [#uses=1]
-	%44 = icmp eq i32 %eh_select271, %eh_typeid301		; <i1> [#uses=1]
-	br i1 %44, label %bb196, label %ppad304
-
-lpad272:		; preds = %bb203, %lpad276
-	%eh_ptr273 = call i8* @llvm.eh.exception( )		; <i8*> [#uses=2]
-	%eh_select275 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr273, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), %struct.exception* @incomplete.1177, i32* @__gnat_all_others_value )		; <i32> [#uses=1]
-	br label %ppad304
-
-lpad276:		; preds = %bb196
-	%eh_ptr277 = call i8* @llvm.eh.exception( )		; <i8*> [#uses=2]
-	%eh_select279 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr277, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), %struct.exception* @incomplete.1177, i32* @__gnat_all_others_value )		; <i32> [#uses=1]
-	invoke void @__gnat_end_handler( i8* %eh_ptr269 )
-			to label %ppad304 unwind label %lpad272
-
-lpad280:		; preds = %bb212, %lpad284
-	%eh_ptr281 = call i8* @llvm.eh.exception( )		; <i8*> [#uses=2]
-	%eh_select283 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr281, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), i32* @__gnat_all_others_value )		; <i32> [#uses=0]
-	br label %ppad
-
-lpad284:		; preds = %bb205
-	%eh_ptr285 = call i8* @llvm.eh.exception( )		; <i8*> [#uses=2]
-	%eh_select287 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr285, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), i32* @__gnat_all_others_value )		; <i32> [#uses=0]
-	invoke void @__gnat_end_handler( i8* %eh_exception.1 )
-			to label %ppad unwind label %lpad280
-
-ppad:		; preds = %lpad284, %ppad304, %lpad280, %lpad
-	%eh_exception.2 = phi i8* [ %eh_exception.1, %ppad304 ], [ %eh_ptr281, %lpad280 ], [ %eh_ptr, %lpad ], [ %eh_ptr285, %lpad284 ]		; <i8*> [#uses=1]
-	%45 = getelementptr %struct.FRAME.ce3806g* %FRAME.356, i32 0, i32 3, i32 0		; <i32*> [#uses=1]
-	%46 = load i32* %45, align 8		; <i32> [#uses=1]
-	%47 = getelementptr %struct.FRAME.ce3806g* %FRAME.356, i32 0, i32 3, i32 1		; <i32*> [#uses=1]
-	%48 = load i32* %47, align 4		; <i32> [#uses=1]
-	call void @system__secondary_stack__ss_release( i32 %46, i32 %48 )
-	%49 = call i32 (...)* @_Unwind_Resume( i8* %eh_exception.2 )		; <i32> [#uses=0]
-	unreachable
-
-ppad288:		; preds = %lpad248, %lpad240, %ppad291, %lpad244, %lpad236
-	%eh_exception.0 = phi i8* [ %eh_ptr233, %ppad291 ], [ %eh_ptr245, %lpad244 ], [ %eh_ptr237, %lpad236 ], [ %eh_ptr241, %lpad240 ], [ %eh_ptr249, %lpad248 ]		; <i8*> [#uses=1]
-	%eh_selector.0 = phi i32 [ %eh_select235, %ppad291 ], [ %eh_select247, %lpad244 ], [ %eh_select239, %lpad236 ], [ %eh_select243, %lpad240 ], [ %eh_select251, %lpad248 ]		; <i32> [#uses=1]
-	%50 = getelementptr %struct.FRAME.ce3806g* %FRAME.356, i32 0, i32 2, i32 0		; <i32*> [#uses=1]
-	%51 = load i32* %50, align 8		; <i32> [#uses=1]
-	%52 = getelementptr %struct.FRAME.ce3806g* %FRAME.356, i32 0, i32 2, i32 1		; <i32*> [#uses=1]
-	%53 = load i32* %52, align 4		; <i32> [#uses=1]
-	invoke void @system__secondary_stack__ss_release( i32 %51, i32 %53 )
-			to label %ppad304 unwind label %lpad228
-
-ppad291:		; preds = %lpad232
-	%eh_typeid292 = call i32 @llvm.eh.typeid.for.i32( i8* getelementptr (%struct.exception* @ada__io_exceptions__name_error, i32 0, i32 0) )		; <i32> [#uses=1]
-	%54 = icmp eq i32 %eh_select235, %eh_typeid292		; <i1> [#uses=1]
-	br i1 %54, label %bb47, label %ppad288
-
-ppad294:		; preds = %ppad295, %lpad252
-	%eh_exception.4 = phi i8* [ %eh_ptr253, %lpad252 ], [ %eh_exception.3, %ppad295 ]		; <i8*> [#uses=1]
-	%eh_selector.4 = phi i32 [ %eh_select255, %lpad252 ], [ %eh_selector.3, %ppad295 ]		; <i32> [#uses=1]
-	%55 = getelementptr %struct.FRAME.ce3806g* %FRAME.356, i32 0, i32 1, i32 0		; <i32*> [#uses=1]
-	%56 = load i32* %55, align 8		; <i32> [#uses=1]
-	%57 = getelementptr %struct.FRAME.ce3806g* %FRAME.356, i32 0, i32 1, i32 1		; <i32*> [#uses=1]
-	%58 = load i32* %57, align 4		; <i32> [#uses=1]
-	invoke void @system__secondary_stack__ss_release( i32 %56, i32 %58 )
-			to label %ppad304 unwind label %lpad228
-
-ppad295:		; preds = %lpad264, %lpad256, %lpad260
-	%eh_exception.3 = phi i8* [ %eh_ptr257, %lpad256 ], [ %eh_ptr261, %lpad260 ], [ %eh_ptr265, %lpad264 ]		; <i8*> [#uses=1]
-	%eh_selector.3 = phi i32 [ %eh_select259, %lpad256 ], [ %eh_select263, %lpad260 ], [ %eh_select267, %lpad264 ]		; <i32> [#uses=1]
-	%59 = getelementptr %struct.FRAME.ce3806g* %FRAME.356, i32 0, i32 0, i32 0		; <i32*> [#uses=1]
-	%60 = load i32* %59, align 8		; <i32> [#uses=1]
-	%61 = getelementptr %struct.FRAME.ce3806g* %FRAME.356, i32 0, i32 0, i32 1		; <i32*> [#uses=1]
-	%62 = load i32* %61, align 4		; <i32> [#uses=1]
-	invoke void @system__secondary_stack__ss_release( i32 %60, i32 %62 )
-			to label %ppad294 unwind label %lpad252
-
-ppad304:		; preds = %lpad276, %ppad294, %ppad288, %lpad268, %lpad272, %lpad228
-	%eh_exception.1 = phi i8* [ %eh_ptr229, %lpad228 ], [ %eh_ptr269, %lpad268 ], [ %eh_ptr273, %lpad272 ], [ %eh_exception.0, %ppad288 ], [ %eh_exception.4, %ppad294 ], [ %eh_ptr277, %lpad276 ]		; <i8*> [#uses=4]
-	%eh_selector.1 = phi i32 [ %eh_select231, %lpad228 ], [ %eh_select271, %lpad268 ], [ %eh_select275, %lpad272 ], [ %eh_selector.0, %ppad288 ], [ %eh_selector.4, %ppad294 ], [ %eh_select279, %lpad276 ]		; <i32> [#uses=1]
-	%eh_typeid305 = call i32 @llvm.eh.typeid.for.i32( i8* getelementptr (%struct.exception* @incomplete.1177, i32 0, i32 0) )		; <i32> [#uses=1]
-	%63 = icmp eq i32 %eh_selector.1, %eh_typeid305		; <i1> [#uses=1]
-	br i1 %63, label %bb205, label %ppad
-}
-
-define internal fastcc i8 @ce3806g__fxio__get.1137(%struct.ada__text_io__text_afcb* %file) signext {
-entry:
-	%0 = invoke x86_fp80 @ada__text_io__float_aux__get( %struct.ada__text_io__text_afcb* %file, i32 0 )
-			to label %invcont unwind label %lpad		; <x86_fp80> [#uses=5]
-
-invcont:		; preds = %entry
-	%1 = fcmp ult x86_fp80 %0, 0xKFFFEFFFFFFFFFFFFFFFF		; <i1> [#uses=1]
-	%2 = fcmp ugt x86_fp80 %0, 0xK7FFEFFFFFFFFFFFFFFFF		; <i1> [#uses=1]
-	%or.cond = or i1 %1, %2		; <i1> [#uses=1]
-	br i1 %or.cond, label %bb2, label %bb4
-
-bb2:		; preds = %invcont
-	invoke void @__gnat_rcheck_12( i8* getelementptr ([12 x i8]* @.str, i32 0, i32 0), i32 1 ) noreturn
-			to label %invcont3 unwind label %lpad
-
-invcont3:		; preds = %bb2
-	unreachable
-
-bb4:		; preds = %invcont
-	%3 = fmul x86_fp80 %0, 0xK40008000000000000000		; <x86_fp80> [#uses=1]
-	%4 = fcmp ult x86_fp80 %3, 0xKC0068000000000000000		; <i1> [#uses=1]
-	br i1 %4, label %bb8, label %bb6
-
-bb6:		; preds = %bb4
-	%5 = fmul x86_fp80 %0, 0xK40008000000000000000		; <x86_fp80> [#uses=1]
-	%6 = fcmp ugt x86_fp80 %5, 0xK4005FE00000000000000		; <i1> [#uses=1]
-	br i1 %6, label %bb8, label %bb10
-
-bb8:		; preds = %bb4, %bb6
-	invoke void @__gnat_rcheck_10( i8* getelementptr ([14 x i8]* @.str1, i32 0, i32 0), i32 324 ) noreturn
-			to label %invcont9 unwind label %lpad
-
-invcont9:		; preds = %bb8
-	unreachable
-
-bb10:		; preds = %bb6
-	%7 = fmul x86_fp80 %0, 0xK40008000000000000000		; <x86_fp80> [#uses=3]
-	%8 = fcmp ult x86_fp80 %7, 0xK00000000000000000000		; <i1> [#uses=1]
-	br i1 %8, label %bb13, label %bb12
-
-bb12:		; preds = %bb10
-	%9 = fadd x86_fp80 %7, 0xK3FFDFFFFFFFFFFFFFFFF		; <x86_fp80> [#uses=1]
-	br label %bb14
-
-bb13:		; preds = %bb10
-	%10 = fsub x86_fp80 %7, 0xK3FFDFFFFFFFFFFFFFFFF		; <x86_fp80> [#uses=1]
-	br label %bb14
-
-bb14:		; preds = %bb13, %bb12
-	%iftmp.339.0.in = phi x86_fp80 [ %10, %bb13 ], [ %9, %bb12 ]		; <x86_fp80> [#uses=1]
-	%iftmp.339.0 = fptosi x86_fp80 %iftmp.339.0.in to i8		; <i8> [#uses=3]
-	%11 = add i8 %iftmp.339.0, 20		; <i8> [#uses=1]
-	%12 = icmp ugt i8 %11, 40		; <i1> [#uses=1]
-	br i1 %12, label %bb16, label %bb18
-
-bb16:		; preds = %bb14
-	invoke void @__gnat_rcheck_12( i8* getelementptr ([14 x i8]* @.str1, i32 0, i32 0), i32 324 ) noreturn
-			to label %invcont17 unwind label %lpad
-
-invcont17:		; preds = %bb16
-	unreachable
-
-bb18:		; preds = %bb14
-	%13 = add i8 %iftmp.339.0, 20		; <i8> [#uses=1]
-	%14 = icmp ugt i8 %13, 40		; <i1> [#uses=1]
-	br i1 %14, label %bb20, label %bb22
-
-bb20:		; preds = %bb18
-	invoke void @__gnat_rcheck_12( i8* getelementptr ([14 x i8]* @.str1, i32 0, i32 0), i32 324 ) noreturn
-			to label %invcont21 unwind label %lpad
-
-invcont21:		; preds = %bb20
-	unreachable
-
-bb22:		; preds = %bb18
-	ret i8 %iftmp.339.0
-
-bb23:		; preds = %lpad
-	call void @__gnat_begin_handler( i8* %eh_ptr ) nounwind
-	%15 = load void ()** @system__soft_links__abort_undefer, align 4		; <void ()*> [#uses=1]
-	invoke void %15( )
-			to label %invcont24 unwind label %lpad33
-
-invcont24:		; preds = %bb23
-	invoke void @__gnat_raise_exception( %struct.system__standard_library__exception_data* bitcast (%struct.exception* @ada__io_exceptions__data_error to %struct.system__standard_library__exception_data*), i8* getelementptr ([47 x i8]* @.str2, i32 0, i32 0), %struct.string___XUB* @C.354.2200 ) noreturn
-			to label %invcont27 unwind label %lpad33
-
-invcont27:		; preds = %invcont24
-	unreachable
-
-lpad:		; preds = %bb20, %bb16, %bb8, %bb2, %entry
-	%eh_ptr = call i8* @llvm.eh.exception( )		; <i8*> [#uses=4]
-	%eh_select32 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), %struct.exception* @constraint_error, i32* @__gnat_all_others_value )		; <i32> [#uses=1]
-	%eh_typeid = call i32 @llvm.eh.typeid.for.i32( i8* getelementptr (%struct.exception* @constraint_error, i32 0, i32 0) )		; <i32> [#uses=1]
-	%16 = icmp eq i32 %eh_select32, %eh_typeid		; <i1> [#uses=1]
-	br i1 %16, label %bb23, label %Unwind
-
-lpad33:		; preds = %invcont24, %bb23
-	%eh_ptr34 = call i8* @llvm.eh.exception( )		; <i8*> [#uses=2]
-	%eh_select36 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr34, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), i32* @__gnat_all_others_value )		; <i32> [#uses=0]
-	call void @__gnat_end_handler( i8* %eh_ptr )
-	br label %Unwind
-
-Unwind:		; preds = %lpad, %lpad33
-	%eh_exception.0 = phi i8* [ %eh_ptr, %lpad ], [ %eh_ptr34, %lpad33 ]		; <i8*> [#uses=1]
-	%17 = call i32 (...)* @_Unwind_Resume( i8* %eh_exception.0 )		; <i32> [#uses=0]
-	unreachable
-}
-
-define internal fastcc void @ce3806g__fxio__put.1149(%struct.ada__text_io__text_afcb* %file) {
-entry:
-	%A.301 = alloca %struct.string___XUB		; <%struct.string___XUB*> [#uses=3]
-	%A.292 = alloca %struct.string___XUB		; <%struct.string___XUB*> [#uses=3]
-	%0 = call i8* @llvm.stacksave( )		; <i8*> [#uses=1]
-	%1 = alloca [12 x i8]		; <[12 x i8]*> [#uses=1]
-	%.sub = getelementptr [12 x i8]* %1, i32 0, i32 0		; <i8*> [#uses=2]
-	%2 = getelementptr %struct.string___XUB* %A.292, i32 0, i32 0		; <i32*> [#uses=1]
-	store i32 1, i32* %2, align 8
-	%3 = getelementptr %struct.string___XUB* %A.292, i32 0, i32 1		; <i32*> [#uses=1]
-	store i32 12, i32* %3, align 4
-	%4 = invoke fastcc i32 @ce3806g__fxio__put__4.1215( i8* %.sub, %struct.string___XUB* %A.292, i8 signext -3 )
-			to label %invcont unwind label %lpad		; <i32> [#uses=1]
-
-invcont:		; preds = %entry
-	%5 = getelementptr %struct.string___XUB* %A.301, i32 0, i32 0		; <i32*> [#uses=1]
-	store i32 1, i32* %5, align 8
-	%6 = getelementptr %struct.string___XUB* %A.301, i32 0, i32 1		; <i32*> [#uses=1]
-	store i32 %4, i32* %6, align 4
-	invoke void @ada__text_io__generic_aux__put_item( %struct.ada__text_io__text_afcb* %file, i8* %.sub, %struct.string___XUB* %A.301 )
-			to label %bb60 unwind label %lpad
-
-bb60:		; preds = %invcont
-	ret void
-
-lpad:		; preds = %entry, %invcont
-	%eh_ptr = call i8* @llvm.eh.exception( )		; <i8*> [#uses=2]
-	%eh_select62 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), i32* @__gnat_all_others_value )		; <i32> [#uses=0]
-	call void @llvm.stackrestore( i8* %0 )
-	%7 = call i32 (...)* @_Unwind_Resume( i8* %eh_ptr )		; <i32> [#uses=0]
-	unreachable
-}
-
-define internal fastcc void @ce3806g__fxio__put__2.1155() {
-entry:
-	%A.266 = alloca %struct.string___XUB		; <%struct.string___XUB*> [#uses=3]
-	%A.257 = alloca %struct.string___XUB		; <%struct.string___XUB*> [#uses=3]
-	%0 = call i8* @llvm.stacksave( )		; <i8*> [#uses=1]
-	%1 = alloca [12 x i8]		; <[12 x i8]*> [#uses=1]
-	%.sub = getelementptr [12 x i8]* %1, i32 0, i32 0		; <i8*> [#uses=2]
-	%2 = getelementptr %struct.string___XUB* %A.257, i32 0, i32 0		; <i32*> [#uses=1]
-	store i32 1, i32* %2, align 8
-	%3 = getelementptr %struct.string___XUB* %A.257, i32 0, i32 1		; <i32*> [#uses=1]
-	store i32 12, i32* %3, align 4
-	%4 = invoke fastcc i32 @ce3806g__fxio__put__4.1215( i8* %.sub, %struct.string___XUB* %A.257, i8 signext -1 )
-			to label %invcont unwind label %lpad		; <i32> [#uses=1]
-
-invcont:		; preds = %entry
-	%5 = getelementptr %struct.string___XUB* %A.266, i32 0, i32 0		; <i32*> [#uses=1]
-	store i32 1, i32* %5, align 8
-	%6 = getelementptr %struct.string___XUB* %A.266, i32 0, i32 1		; <i32*> [#uses=1]
-	store i32 %4, i32* %6, align 4
-	%7 = load %struct.ada__text_io__text_afcb** @ada__text_io__current_out, align 4		; <%struct.ada__text_io__text_afcb*> [#uses=1]
-	invoke void @ada__text_io__generic_aux__put_item( %struct.ada__text_io__text_afcb* %7, i8* %.sub, %struct.string___XUB* %A.266 )
-			to label %bb60 unwind label %lpad
-
-bb60:		; preds = %invcont
-	ret void
-
-lpad:		; preds = %entry, %invcont
-	%eh_ptr = call i8* @llvm.eh.exception( )		; <i8*> [#uses=2]
-	%eh_select62 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), i32* @__gnat_all_others_value )		; <i32> [#uses=0]
-	call void @llvm.stackrestore( i8* %0 )
-	%8 = call i32 (...)* @_Unwind_Resume( i8* %eh_ptr )		; <i32> [#uses=0]
-	unreachable
-}
-
-define internal fastcc i32 @ce3806g__fxio__put__4.1215(i8* %to.0, %struct.string___XUB* %to.1, i8 signext %item) {
-entry:
-        %P0 = load i32 * @__gnat_all_others_value, align 4  ; <i32*> [#uses=1]
-        %P = alloca i32, i32 %P0	; <i32*> [#uses=1]
-        call void @ext( i32* %P )
-	%to_addr = alloca %struct.system__file_control_block__pstring		; <%struct.system__file_control_block__pstring*> [#uses=4]
-	%FRAME.358 = alloca %struct.FRAME.ce3806g__fxio__put__4		; <%struct.FRAME.ce3806g__fxio__put__4*> [#uses=65]
-	%0 = getelementptr %struct.system__file_control_block__pstring* %to_addr, i32 0, i32 0		; <i8**> [#uses=1]
-	store i8* %to.0, i8** %0, align 8
-	%1 = getelementptr %struct.system__file_control_block__pstring* %to_addr, i32 0, i32 1		; <%struct.string___XUB**> [#uses=1]
-	store %struct.string___XUB* %to.1, %struct.string___XUB** %1
-	%2 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 3		; <%struct.system__file_control_block__pstring**> [#uses=1]
-	store %struct.system__file_control_block__pstring* %to_addr, %struct.system__file_control_block__pstring** %2, align 4
-	%3 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 0		; <i32*> [#uses=1]
-	store i32 3, i32* %3, align 8
-	%4 = getelementptr %struct.system__file_control_block__pstring* %to_addr, i32 0, i32 1		; <%struct.string___XUB**> [#uses=1]
-	%5 = load %struct.string___XUB** %4, align 4		; <%struct.string___XUB*> [#uses=1]
-	%6 = getelementptr %struct.string___XUB* %5, i32 0, i32 0		; <i32*> [#uses=1]
-	%7 = load i32* %6, align 4		; <i32> [#uses=1]
-	%8 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 2		; <i32*> [#uses=1]
-	store i32 %7, i32* %8, align 8
-	%9 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 2		; <i32*> [#uses=1]
-	%10 = load i32* %9, align 8		; <i32> [#uses=1]
-	%11 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 4		; <i32*> [#uses=1]
-	store i32 %10, i32* %11, align 8
-	%item.lobit = lshr i8 %item, 7		; <i8> [#uses=1]
-	%12 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 6		; <i8*> [#uses=1]
-	store i8 %item.lobit, i8* %12, align 8
-	%13 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 2		; <i32*> [#uses=1]
-	%14 = load i32* %13, align 8		; <i32> [#uses=1]
-	%15 = add i32 %14, -1		; <i32> [#uses=1]
-	%16 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	store i32 %15, i32* %16, align 4
-	%17 = sext i8 %item to i64		; <i64> [#uses=1]
-	%18 = call i64 @system__exn_lli__exn_long_long_integer( i64 10, i32 1 ) readnone		; <i64> [#uses=1]
-	%19 = sub i64 0, %18		; <i64> [#uses=1]
-	%20 = call i64 @system__exn_lli__exn_long_long_integer( i64 10, i32 0 ) readnone		; <i64> [#uses=1]
-	%21 = mul i64 %20, -2		; <i64> [#uses=1]
-	call fastcc void @ce3806g__fxio__put__put_scaled__4.1346( %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i64 %17, i64 %19, i64 %21, i32 0, i32 -1 )
-	%22 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	%23 = load i32* %22, align 4		; <i32> [#uses=1]
-	%24 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 2		; <i32*> [#uses=1]
-	%25 = load i32* %24, align 8		; <i32> [#uses=1]
-	%26 = icmp slt i32 %23, %25		; <i1> [#uses=1]
-	br i1 %26, label %bb71, label %bb72
-
-bb71:		; preds = %entry
-	%27 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 1		; <i32*> [#uses=1]
-	store i32 0, i32* %27, align 4
-	br label %bb72
-
-bb72:		; preds = %entry, %bb102, %bb71
-	%28 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 1		; <i32*> [#uses=1]
-	%29 = load i32* %28, align 4		; <i32> [#uses=1]
-	%30 = icmp slt i32 %29, -1		; <i1> [#uses=1]
-	%31 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	%32 = load i32* %31, align 4		; <i32> [#uses=2]
-	br i1 %30, label %bb103, label %bb74
-
-bb74:		; preds = %bb72
-	%33 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 2		; <i32*> [#uses=1]
-	%34 = load i32* %33, align 8		; <i32> [#uses=1]
-	%35 = add i32 %34, -1		; <i32> [#uses=1]
-	%36 = icmp eq i32 %32, %35		; <i1> [#uses=1]
-	%37 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 1		; <i32*> [#uses=1]
-	%38 = load i32* %37, align 4		; <i32> [#uses=2]
-	br i1 %36, label %bb76, label %bb98
-
-bb76:		; preds = %bb74
-	%39 = icmp slt i32 %38, 1		; <i1> [#uses=1]
-	br i1 %39, label %bb80, label %bb102
-
-bb80:		; preds = %bb76
-	%40 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 1		; <i32*> [#uses=1]
-	%41 = load i32* %40, align 4		; <i32> [#uses=2]
-	%42 = icmp sgt i32 %41, -1		; <i1> [#uses=1]
-	%.op = add i32 %41, 2		; <i32> [#uses=1]
-	%43 = select i1 %42, i32 %.op, i32 2		; <i32> [#uses=1]
-	%44 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 6		; <i8*> [#uses=1]
-	%45 = load i8* %44, align 8		; <i8> [#uses=1]
-	%46 = zext i8 %45 to i32		; <i32> [#uses=1]
-	%47 = add i32 %43, %46		; <i32> [#uses=2]
-	%48 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 0		; <i32*> [#uses=1]
-	%49 = load i32* %48, align 8		; <i32> [#uses=1]
-	%50 = icmp sgt i32 %47, %49		; <i1> [#uses=1]
-	br i1 %50, label %bb88, label %bb85
-
-bb85:		; preds = %bb80, %bb87
-	%j.0 = phi i32 [ %68, %bb87 ], [ %47, %bb80 ]		; <i32> [#uses=2]
-	%51 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	%52 = load i32* %51, align 4		; <i32> [#uses=1]
-	%53 = add i32 %52, 1		; <i32> [#uses=1]
-	%54 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	store i32 %53, i32* %54, align 4
-	%55 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 4		; <i32*> [#uses=1]
-	%56 = load i32* %55, align 8		; <i32> [#uses=1]
-	%57 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 3		; <%struct.system__file_control_block__pstring**> [#uses=1]
-	%58 = load %struct.system__file_control_block__pstring** %57, align 4		; <%struct.system__file_control_block__pstring*> [#uses=1]
-	%59 = getelementptr %struct.system__file_control_block__pstring* %58, i32 0, i32 0		; <i8**> [#uses=1]
-	%60 = load i8** %59, align 4		; <i8*> [#uses=1]
-	%61 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	%62 = load i32* %61, align 4		; <i32> [#uses=1]
-	%63 = sub i32 %62, %56		; <i32> [#uses=1]
-	%64 = getelementptr i8* %60, i32 %63		; <i8*> [#uses=1]
-	store i8 32, i8* %64, align 1
-	%65 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 0		; <i32*> [#uses=1]
-	%66 = load i32* %65, align 8		; <i32> [#uses=1]
-	%67 = icmp eq i32 %66, %j.0		; <i1> [#uses=1]
-	br i1 %67, label %bb88, label %bb87
-
-bb87:		; preds = %bb85
-	%68 = add i32 %j.0, 1		; <i32> [#uses=1]
-	br label %bb85
-
-bb88:		; preds = %bb80, %bb85
-	%69 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 6		; <i8*> [#uses=1]
-	%70 = load i8* %69, align 8		; <i8> [#uses=1]
-	%toBool89 = icmp eq i8 %70, 0		; <i1> [#uses=1]
-	br i1 %toBool89, label %bb91, label %bb90
-
-bb90:		; preds = %bb88
-	%71 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	%72 = load i32* %71, align 4		; <i32> [#uses=1]
-	%73 = add i32 %72, 1		; <i32> [#uses=1]
-	%74 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	store i32 %73, i32* %74, align 4
-	%75 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 4		; <i32*> [#uses=1]
-	%76 = load i32* %75, align 8		; <i32> [#uses=1]
-	%77 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 3		; <%struct.system__file_control_block__pstring**> [#uses=1]
-	%78 = load %struct.system__file_control_block__pstring** %77, align 4		; <%struct.system__file_control_block__pstring*> [#uses=1]
-	%79 = getelementptr %struct.system__file_control_block__pstring* %78, i32 0, i32 0		; <i8**> [#uses=1]
-	%80 = load i8** %79, align 4		; <i8*> [#uses=1]
-	%81 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	%82 = load i32* %81, align 4		; <i32> [#uses=1]
-	%83 = sub i32 %82, %76		; <i32> [#uses=1]
-	%84 = getelementptr i8* %80, i32 %83		; <i8*> [#uses=1]
-	store i8 45, i8* %84, align 1
-	br label %bb91
-
-bb91:		; preds = %bb88, %bb90
-	%85 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 1		; <i32*> [#uses=1]
-	%86 = load i32* %85, align 4		; <i32> [#uses=1]
-	%87 = icmp slt i32 %86, 0		; <i1> [#uses=1]
-	br i1 %87, label %bb93, label %bb97
-
-bb93:		; preds = %bb91
-	%88 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	%89 = load i32* %88, align 4		; <i32> [#uses=1]
-	%90 = add i32 %89, 1		; <i32> [#uses=1]
-	%91 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	store i32 %90, i32* %91, align 4
-	%92 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 4		; <i32*> [#uses=1]
-	%93 = load i32* %92, align 8		; <i32> [#uses=1]
-	%94 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 3		; <%struct.system__file_control_block__pstring**> [#uses=1]
-	%95 = load %struct.system__file_control_block__pstring** %94, align 4		; <%struct.system__file_control_block__pstring*> [#uses=1]
-	%96 = getelementptr %struct.system__file_control_block__pstring* %95, i32 0, i32 0		; <i8**> [#uses=1]
-	%97 = load i8** %96, align 4		; <i8*> [#uses=1]
-	%98 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	%99 = load i32* %98, align 4		; <i32> [#uses=1]
-	%100 = sub i32 %99, %93		; <i32> [#uses=1]
-	%101 = getelementptr i8* %97, i32 %100		; <i8*> [#uses=1]
-	store i8 48, i8* %101, align 1
-	%102 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	%103 = load i32* %102, align 4		; <i32> [#uses=1]
-	%104 = add i32 %103, 1		; <i32> [#uses=1]
-	%105 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	store i32 %104, i32* %105, align 4
-	%106 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 4		; <i32*> [#uses=1]
-	%107 = load i32* %106, align 8		; <i32> [#uses=1]
-	%108 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 3		; <%struct.system__file_control_block__pstring**> [#uses=1]
-	%109 = load %struct.system__file_control_block__pstring** %108, align 4		; <%struct.system__file_control_block__pstring*> [#uses=1]
-	%110 = getelementptr %struct.system__file_control_block__pstring* %109, i32 0, i32 0		; <i8**> [#uses=1]
-	%111 = load i8** %110, align 4		; <i8*> [#uses=1]
-	%112 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	%113 = load i32* %112, align 4		; <i32> [#uses=1]
-	%114 = sub i32 %113, %107		; <i32> [#uses=1]
-	%115 = getelementptr i8* %111, i32 %114		; <i8*> [#uses=1]
-	store i8 46, i8* %115, align 1
-	%116 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 1		; <i32*> [#uses=1]
-	%117 = load i32* %116, align 4		; <i32> [#uses=1]
-	br label %bb94
-
-bb94:		; preds = %bb96, %bb93
-	%j8.0 = phi i32 [ %117, %bb93 ], [ %133, %bb96 ]		; <i32> [#uses=2]
-	%118 = icmp sgt i32 %j8.0, -2		; <i1> [#uses=1]
-	br i1 %118, label %bb97, label %bb96
-
-bb96:		; preds = %bb94
-	%119 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	%120 = load i32* %119, align 4		; <i32> [#uses=1]
-	%121 = add i32 %120, 1		; <i32> [#uses=1]
-	%122 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	store i32 %121, i32* %122, align 4
-	%123 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 4		; <i32*> [#uses=1]
-	%124 = load i32* %123, align 8		; <i32> [#uses=1]
-	%125 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 3		; <%struct.system__file_control_block__pstring**> [#uses=1]
-	%126 = load %struct.system__file_control_block__pstring** %125, align 4		; <%struct.system__file_control_block__pstring*> [#uses=1]
-	%127 = getelementptr %struct.system__file_control_block__pstring* %126, i32 0, i32 0		; <i8**> [#uses=1]
-	%128 = load i8** %127, align 4		; <i8*> [#uses=1]
-	%129 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	%130 = load i32* %129, align 4		; <i32> [#uses=1]
-	%131 = sub i32 %130, %124		; <i32> [#uses=1]
-	%132 = getelementptr i8* %128, i32 %131		; <i8*> [#uses=1]
-	store i8 48, i8* %132, align 1
-	%133 = add i32 %j8.0, 1		; <i32> [#uses=1]
-	br label %bb94
-
-bb97:		; preds = %bb91, %bb94
-	%134 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	%135 = load i32* %134, align 4		; <i32> [#uses=1]
-	%136 = add i32 %135, 1		; <i32> [#uses=1]
-	%137 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	store i32 %136, i32* %137, align 4
-	%138 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 4		; <i32*> [#uses=1]
-	%139 = load i32* %138, align 8		; <i32> [#uses=1]
-	%140 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 3		; <%struct.system__file_control_block__pstring**> [#uses=1]
-	%141 = load %struct.system__file_control_block__pstring** %140, align 4		; <%struct.system__file_control_block__pstring*> [#uses=1]
-	%142 = getelementptr %struct.system__file_control_block__pstring* %141, i32 0, i32 0		; <i8**> [#uses=1]
-	%143 = load i8** %142, align 4		; <i8*> [#uses=1]
-	%144 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	%145 = load i32* %144, align 4		; <i32> [#uses=1]
-	%146 = sub i32 %145, %139		; <i32> [#uses=1]
-	%147 = getelementptr i8* %143, i32 %146		; <i8*> [#uses=1]
-	store i8 48, i8* %147, align 1
-	br label %bb102
-
-bb98:		; preds = %bb74
-	%148 = icmp eq i32 %38, -1		; <i1> [#uses=1]
-	br i1 %148, label %bb100, label %bb101
-
-bb100:		; preds = %bb98
-	%149 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	%150 = load i32* %149, align 4		; <i32> [#uses=1]
-	%151 = add i32 %150, 1		; <i32> [#uses=1]
-	%152 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	store i32 %151, i32* %152, align 4
-	%153 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 4		; <i32*> [#uses=1]
-	%154 = load i32* %153, align 8		; <i32> [#uses=1]
-	%155 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 3		; <%struct.system__file_control_block__pstring**> [#uses=1]
-	%156 = load %struct.system__file_control_block__pstring** %155, align 4		; <%struct.system__file_control_block__pstring*> [#uses=1]
-	%157 = getelementptr %struct.system__file_control_block__pstring* %156, i32 0, i32 0		; <i8**> [#uses=1]
-	%158 = load i8** %157, align 4		; <i8*> [#uses=1]
-	%159 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	%160 = load i32* %159, align 4		; <i32> [#uses=1]
-	%161 = sub i32 %160, %154		; <i32> [#uses=1]
-	%162 = getelementptr i8* %158, i32 %161		; <i8*> [#uses=1]
-	store i8 46, i8* %162, align 1
-	br label %bb101
-
-bb101:		; preds = %bb98, %bb100
-	%163 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	%164 = load i32* %163, align 4		; <i32> [#uses=1]
-	%165 = add i32 %164, 1		; <i32> [#uses=1]
-	%166 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	store i32 %165, i32* %166, align 4
-	%167 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 4		; <i32*> [#uses=1]
-	%168 = load i32* %167, align 8		; <i32> [#uses=1]
-	%169 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 3		; <%struct.system__file_control_block__pstring**> [#uses=1]
-	%170 = load %struct.system__file_control_block__pstring** %169, align 4		; <%struct.system__file_control_block__pstring*> [#uses=1]
-	%171 = getelementptr %struct.system__file_control_block__pstring* %170, i32 0, i32 0		; <i8**> [#uses=1]
-	%172 = load i8** %171, align 4		; <i8*> [#uses=1]
-	%173 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 5		; <i32*> [#uses=1]
-	%174 = load i32* %173, align 4		; <i32> [#uses=1]
-	%175 = sub i32 %174, %168		; <i32> [#uses=1]
-	%176 = getelementptr i8* %172, i32 %175		; <i8*> [#uses=1]
-	store i8 48, i8* %176, align 1
-	br label %bb102
-
-bb102:		; preds = %bb76, %bb101, %bb97
-	%177 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 1		; <i32*> [#uses=1]
-	%178 = load i32* %177, align 4		; <i32> [#uses=1]
-	%179 = add i32 %178, -1		; <i32> [#uses=1]
-	%180 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %FRAME.358, i32 0, i32 1		; <i32*> [#uses=1]
-	store i32 %179, i32* %180, align 4
-	br label %bb72
-
-bb103:		; preds = %bb72
-	ret i32 %32
-}
-
-declare x86_fp80 @ada__text_io__float_aux__get(%struct.ada__text_io__text_afcb*, i32)
-
-declare void @__gnat_rcheck_12(i8*, i32) noreturn
-
-declare void @__gnat_rcheck_10(i8*, i32) noreturn
-
-declare i8* @llvm.eh.exception() nounwind
-
-declare i32 @llvm.eh.selector.i32(i8*, i8*, ...) nounwind
-
-declare i32 @llvm.eh.typeid.for.i32(i8*) nounwind
-
-declare void @__gnat_begin_handler(i8*) nounwind
-
-declare void @__gnat_raise_exception(%struct.system__standard_library__exception_data*, i8*, %struct.string___XUB*) noreturn
-
-declare void @__gnat_end_handler(i8*)
-
-declare i32 @__gnat_eh_personality(...)
-
-declare i32 @_Unwind_Resume(...)
-
-define internal fastcc void @ce3806g__fxio__put__put_int64__4.1339(%struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i64 %x, i32 %scale) {
-entry:
-	%0 = icmp eq i64 %x, 0		; <i1> [#uses=1]
-	br i1 %0, label %return, label %bb
-
-bb:		; preds = %entry
-	%1 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 1		; <i32*> [#uses=1]
-	store i32 %scale, i32* %1, align 4
-	%2 = add i64 %x, 9		; <i64> [#uses=1]
-	%3 = icmp ugt i64 %2, 18		; <i1> [#uses=1]
-	br i1 %3, label %bb18, label %bb19
-
-bb18:		; preds = %bb
-	%4 = add i32 %scale, 1		; <i32> [#uses=1]
-	%5 = sdiv i64 %x, 10		; <i64> [#uses=1]
-	call fastcc void @ce3806g__fxio__put__put_int64__4.1339( %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i64 %5, i32 %4 )
-	br label %bb19
-
-bb19:		; preds = %bb, %bb18
-	%6 = srem i64 %x, 10		; <i64> [#uses=3]
-	%neg = sub i64 0, %6		; <i64> [#uses=1]
-	%abscond = icmp sgt i64 %6, -1		; <i1> [#uses=1]
-	%abs = select i1 %abscond, i64 %6, i64 %neg		; <i64> [#uses=3]
-	%7 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	%8 = load i32* %7, align 4		; <i32> [#uses=1]
-	%9 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 2		; <i32*> [#uses=1]
-	%10 = load i32* %9, align 4		; <i32> [#uses=1]
-	%11 = add i32 %10, -1		; <i32> [#uses=1]
-	%12 = icmp eq i32 %8, %11		; <i1> [#uses=1]
-	br i1 %12, label %bb23, label %bb44
-
-bb23:		; preds = %bb19
-	%13 = icmp ne i64 %abs, 0		; <i1> [#uses=1]
-	%14 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 1		; <i32*> [#uses=1]
-	%15 = load i32* %14, align 4		; <i32> [#uses=1]
-	%16 = icmp slt i32 %15, 1		; <i1> [#uses=1]
-	%17 = or i1 %13, %16		; <i1> [#uses=1]
-	br i1 %17, label %bb27, label %bb48
-
-bb27:		; preds = %bb23
-	%18 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 1		; <i32*> [#uses=1]
-	%19 = load i32* %18, align 4		; <i32> [#uses=2]
-	%20 = icmp sgt i32 %19, -1		; <i1> [#uses=1]
-	%.op = add i32 %19, 2		; <i32> [#uses=1]
-	%21 = select i1 %20, i32 %.op, i32 2		; <i32> [#uses=1]
-	%22 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 6		; <i8*> [#uses=1]
-	%23 = load i8* %22, align 1		; <i8> [#uses=1]
-	%24 = zext i8 %23 to i32		; <i32> [#uses=1]
-	%25 = add i32 %21, %24		; <i32> [#uses=2]
-	%26 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 0		; <i32*> [#uses=1]
-	%27 = load i32* %26, align 4		; <i32> [#uses=1]
-	%28 = icmp sgt i32 %25, %27		; <i1> [#uses=1]
-	br i1 %28, label %bb34, label %bb31
-
-bb31:		; preds = %bb27, %bb33
-	%j.0 = phi i32 [ %46, %bb33 ], [ %25, %bb27 ]		; <i32> [#uses=2]
-	%29 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	%30 = load i32* %29, align 4		; <i32> [#uses=1]
-	%31 = add i32 %30, 1		; <i32> [#uses=1]
-	%32 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	store i32 %31, i32* %32, align 4
-	%33 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 4		; <i32*> [#uses=1]
-	%34 = load i32* %33, align 4		; <i32> [#uses=1]
-	%35 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 3		; <%struct.system__file_control_block__pstring**> [#uses=1]
-	%36 = load %struct.system__file_control_block__pstring** %35, align 4		; <%struct.system__file_control_block__pstring*> [#uses=1]
-	%37 = getelementptr %struct.system__file_control_block__pstring* %36, i32 0, i32 0		; <i8**> [#uses=1]
-	%38 = load i8** %37, align 4		; <i8*> [#uses=1]
-	%39 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	%40 = load i32* %39, align 4		; <i32> [#uses=1]
-	%41 = sub i32 %40, %34		; <i32> [#uses=1]
-	%42 = getelementptr i8* %38, i32 %41		; <i8*> [#uses=1]
-	store i8 32, i8* %42, align 1
-	%43 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 0		; <i32*> [#uses=1]
-	%44 = load i32* %43, align 4		; <i32> [#uses=1]
-	%45 = icmp eq i32 %44, %j.0		; <i1> [#uses=1]
-	br i1 %45, label %bb34, label %bb33
-
-bb33:		; preds = %bb31
-	%46 = add i32 %j.0, 1		; <i32> [#uses=1]
-	br label %bb31
-
-bb34:		; preds = %bb27, %bb31
-	%47 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 6		; <i8*> [#uses=1]
-	%48 = load i8* %47, align 1		; <i8> [#uses=1]
-	%toBool35 = icmp eq i8 %48, 0		; <i1> [#uses=1]
-	br i1 %toBool35, label %bb37, label %bb36
-
-bb36:		; preds = %bb34
-	%49 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	%50 = load i32* %49, align 4		; <i32> [#uses=1]
-	%51 = add i32 %50, 1		; <i32> [#uses=1]
-	%52 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	store i32 %51, i32* %52, align 4
-	%53 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 4		; <i32*> [#uses=1]
-	%54 = load i32* %53, align 4		; <i32> [#uses=1]
-	%55 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 3		; <%struct.system__file_control_block__pstring**> [#uses=1]
-	%56 = load %struct.system__file_control_block__pstring** %55, align 4		; <%struct.system__file_control_block__pstring*> [#uses=1]
-	%57 = getelementptr %struct.system__file_control_block__pstring* %56, i32 0, i32 0		; <i8**> [#uses=1]
-	%58 = load i8** %57, align 4		; <i8*> [#uses=1]
-	%59 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	%60 = load i32* %59, align 4		; <i32> [#uses=1]
-	%61 = sub i32 %60, %54		; <i32> [#uses=1]
-	%62 = getelementptr i8* %58, i32 %61		; <i8*> [#uses=1]
-	store i8 45, i8* %62, align 1
-	br label %bb37
-
-bb37:		; preds = %bb34, %bb36
-	%63 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 1		; <i32*> [#uses=1]
-	%64 = load i32* %63, align 4		; <i32> [#uses=1]
-	%65 = icmp slt i32 %64, 0		; <i1> [#uses=1]
-	br i1 %65, label %bb39, label %bb43
-
-bb39:		; preds = %bb37
-	%66 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	%67 = load i32* %66, align 4		; <i32> [#uses=1]
-	%68 = add i32 %67, 1		; <i32> [#uses=1]
-	%69 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	store i32 %68, i32* %69, align 4
-	%70 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 4		; <i32*> [#uses=1]
-	%71 = load i32* %70, align 4		; <i32> [#uses=1]
-	%72 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 3		; <%struct.system__file_control_block__pstring**> [#uses=1]
-	%73 = load %struct.system__file_control_block__pstring** %72, align 4		; <%struct.system__file_control_block__pstring*> [#uses=1]
-	%74 = getelementptr %struct.system__file_control_block__pstring* %73, i32 0, i32 0		; <i8**> [#uses=1]
-	%75 = load i8** %74, align 4		; <i8*> [#uses=1]
-	%76 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	%77 = load i32* %76, align 4		; <i32> [#uses=1]
-	%78 = sub i32 %77, %71		; <i32> [#uses=1]
-	%79 = getelementptr i8* %75, i32 %78		; <i8*> [#uses=1]
-	store i8 48, i8* %79, align 1
-	%80 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	%81 = load i32* %80, align 4		; <i32> [#uses=1]
-	%82 = add i32 %81, 1		; <i32> [#uses=1]
-	%83 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	store i32 %82, i32* %83, align 4
-	%84 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 4		; <i32*> [#uses=1]
-	%85 = load i32* %84, align 4		; <i32> [#uses=1]
-	%86 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 3		; <%struct.system__file_control_block__pstring**> [#uses=1]
-	%87 = load %struct.system__file_control_block__pstring** %86, align 4		; <%struct.system__file_control_block__pstring*> [#uses=1]
-	%88 = getelementptr %struct.system__file_control_block__pstring* %87, i32 0, i32 0		; <i8**> [#uses=1]
-	%89 = load i8** %88, align 4		; <i8*> [#uses=1]
-	%90 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	%91 = load i32* %90, align 4		; <i32> [#uses=1]
-	%92 = sub i32 %91, %85		; <i32> [#uses=1]
-	%93 = getelementptr i8* %89, i32 %92		; <i8*> [#uses=1]
-	store i8 46, i8* %93, align 1
-	%94 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 1		; <i32*> [#uses=1]
-	%95 = load i32* %94, align 4		; <i32> [#uses=1]
-	br label %bb40
-
-bb40:		; preds = %bb42, %bb39
-	%j15.0 = phi i32 [ %95, %bb39 ], [ %111, %bb42 ]		; <i32> [#uses=2]
-	%96 = icmp sgt i32 %j15.0, -2		; <i1> [#uses=1]
-	br i1 %96, label %bb43, label %bb42
-
-bb42:		; preds = %bb40
-	%97 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	%98 = load i32* %97, align 4		; <i32> [#uses=1]
-	%99 = add i32 %98, 1		; <i32> [#uses=1]
-	%100 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	store i32 %99, i32* %100, align 4
-	%101 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 4		; <i32*> [#uses=1]
-	%102 = load i32* %101, align 4		; <i32> [#uses=1]
-	%103 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 3		; <%struct.system__file_control_block__pstring**> [#uses=1]
-	%104 = load %struct.system__file_control_block__pstring** %103, align 4		; <%struct.system__file_control_block__pstring*> [#uses=1]
-	%105 = getelementptr %struct.system__file_control_block__pstring* %104, i32 0, i32 0		; <i8**> [#uses=1]
-	%106 = load i8** %105, align 4		; <i8*> [#uses=1]
-	%107 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	%108 = load i32* %107, align 4		; <i32> [#uses=1]
-	%109 = sub i32 %108, %102		; <i32> [#uses=1]
-	%110 = getelementptr i8* %106, i32 %109		; <i8*> [#uses=1]
-	store i8 48, i8* %110, align 1
-	%111 = add i32 %j15.0, 1		; <i32> [#uses=1]
-	br label %bb40
-
-bb43:		; preds = %bb37, %bb40
-	%112 = trunc i64 %abs to i32		; <i32> [#uses=1]
-	%113 = getelementptr [10 x i8]* @.str3, i32 0, i32 %112		; <i8*> [#uses=1]
-	%114 = load i8* %113, align 1		; <i8> [#uses=1]
-	%115 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	%116 = load i32* %115, align 4		; <i32> [#uses=1]
-	%117 = add i32 %116, 1		; <i32> [#uses=1]
-	%118 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	store i32 %117, i32* %118, align 4
-	%119 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 4		; <i32*> [#uses=1]
-	%120 = load i32* %119, align 4		; <i32> [#uses=1]
-	%121 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 3		; <%struct.system__file_control_block__pstring**> [#uses=1]
-	%122 = load %struct.system__file_control_block__pstring** %121, align 4		; <%struct.system__file_control_block__pstring*> [#uses=1]
-	%123 = getelementptr %struct.system__file_control_block__pstring* %122, i32 0, i32 0		; <i8**> [#uses=1]
-	%124 = load i8** %123, align 4		; <i8*> [#uses=1]
-	%125 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	%126 = load i32* %125, align 4		; <i32> [#uses=1]
-	%127 = sub i32 %126, %120		; <i32> [#uses=1]
-	%128 = getelementptr i8* %124, i32 %127		; <i8*> [#uses=1]
-	store i8 %114, i8* %128, align 1
-	br label %bb48
-
-bb44:		; preds = %bb19
-	%129 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 1		; <i32*> [#uses=1]
-	%130 = load i32* %129, align 4		; <i32> [#uses=1]
-	%131 = icmp eq i32 %130, -1		; <i1> [#uses=1]
-	br i1 %131, label %bb46, label %bb47
-
-bb46:		; preds = %bb44
-	%132 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	%133 = load i32* %132, align 4		; <i32> [#uses=1]
-	%134 = add i32 %133, 1		; <i32> [#uses=1]
-	%135 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	store i32 %134, i32* %135, align 4
-	%136 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 4		; <i32*> [#uses=1]
-	%137 = load i32* %136, align 4		; <i32> [#uses=1]
-	%138 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 3		; <%struct.system__file_control_block__pstring**> [#uses=1]
-	%139 = load %struct.system__file_control_block__pstring** %138, align 4		; <%struct.system__file_control_block__pstring*> [#uses=1]
-	%140 = getelementptr %struct.system__file_control_block__pstring* %139, i32 0, i32 0		; <i8**> [#uses=1]
-	%141 = load i8** %140, align 4		; <i8*> [#uses=1]
-	%142 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	%143 = load i32* %142, align 4		; <i32> [#uses=1]
-	%144 = sub i32 %143, %137		; <i32> [#uses=1]
-	%145 = getelementptr i8* %141, i32 %144		; <i8*> [#uses=1]
-	store i8 46, i8* %145, align 1
-	br label %bb47
-
-bb47:		; preds = %bb44, %bb46
-	%146 = trunc i64 %abs to i32		; <i32> [#uses=1]
-	%147 = getelementptr [10 x i8]* @.str3, i32 0, i32 %146		; <i8*> [#uses=1]
-	%148 = load i8* %147, align 1		; <i8> [#uses=1]
-	%149 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	%150 = load i32* %149, align 4		; <i32> [#uses=1]
-	%151 = add i32 %150, 1		; <i32> [#uses=1]
-	%152 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	store i32 %151, i32* %152, align 4
-	%153 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 4		; <i32*> [#uses=1]
-	%154 = load i32* %153, align 4		; <i32> [#uses=1]
-	%155 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 3		; <%struct.system__file_control_block__pstring**> [#uses=1]
-	%156 = load %struct.system__file_control_block__pstring** %155, align 4		; <%struct.system__file_control_block__pstring*> [#uses=1]
-	%157 = getelementptr %struct.system__file_control_block__pstring* %156, i32 0, i32 0		; <i8**> [#uses=1]
-	%158 = load i8** %157, align 4		; <i8*> [#uses=1]
-	%159 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 5		; <i32*> [#uses=1]
-	%160 = load i32* %159, align 4		; <i32> [#uses=1]
-	%161 = sub i32 %160, %154		; <i32> [#uses=1]
-	%162 = getelementptr i8* %158, i32 %161		; <i8*> [#uses=1]
-	store i8 %148, i8* %162, align 1
-	br label %bb48
-
-bb48:		; preds = %bb23, %bb47, %bb43
-	%163 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 1		; <i32*> [#uses=1]
-	%164 = load i32* %163, align 4		; <i32> [#uses=1]
-	%165 = add i32 %164, -1		; <i32> [#uses=1]
-	%166 = getelementptr %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.361, i32 0, i32 1		; <i32*> [#uses=1]
-	store i32 %165, i32* %166, align 4
-	ret void
-
-return:		; preds = %entry
-	ret void
-}
-
-define internal fastcc void @ce3806g__fxio__put__put_scaled__4.1346(%struct.FRAME.ce3806g__fxio__put__4* %CHAIN.365, i64 %x, i64 %y, i64 %z, i32 %a, i32 %e) {
-entry:
-	%0 = alloca { i64, i64 }		; <{ i64, i64 }*> [#uses=3]
-	%1 = call i8* @llvm.stacksave( )		; <i8*> [#uses=1]
-	%2 = add i32 %a, 17		; <i32> [#uses=2]
-	%3 = sdiv i32 %2, 18		; <i32> [#uses=3]
-	%4 = add i32 %3, 1		; <i32> [#uses=7]
-	%5 = icmp sgt i32 %4, -1		; <i1> [#uses=1]
-	%max53 = select i1 %5, i32 %4, i32 0		; <i32> [#uses=1]
-	%6 = alloca i64, i32 %max53		; <i64*> [#uses=21]
-	%7 = icmp sgt i32 %4, 0		; <i1> [#uses=1]
-	br i1 %7, label %bb55, label %bb58
-
-bb55:		; preds = %entry, %bb57
-	%J60b.0 = phi i32 [ %11, %bb57 ], [ 1, %entry ]		; <i32> [#uses=3]
-	%8 = add i32 %J60b.0, -1		; <i32> [#uses=1]
-	%9 = getelementptr i64* %6, i32 %8		; <i64*> [#uses=1]
-	store i64 0, i64* %9, align 8
-	%10 = icmp eq i32 %4, %J60b.0		; <i1> [#uses=1]
-	br i1 %10, label %bb58, label %bb57
-
-bb57:		; preds = %bb55
-	%11 = add i32 %J60b.0, 1		; <i32> [#uses=1]
-	br label %bb55
-
-bb58:		; preds = %entry, %bb55
-	%12 = icmp sgt i32 %4, 0		; <i1> [#uses=1]
-	br i1 %12, label %bb61, label %bb91
-
-bb61:		; preds = %bb58, %bb90
-	%j2.0 = phi i32 [ %88, %bb90 ], [ 1, %bb58 ]		; <i32> [#uses=11]
-	%aa.0 = phi i32 [ %86, %bb90 ], [ %a, %bb58 ]		; <i32> [#uses=6]
-	%yy.0 = phi i64 [ %84, %bb90 ], [ %y, %bb58 ]		; <i64> [#uses=3]
-	%xx.0 = phi i64 [ %21, %bb90 ], [ %x, %bb58 ]		; <i64> [#uses=2]
-	%13 = icmp eq i64 %xx.0, 0		; <i1> [#uses=1]
-	br i1 %13, label %bb91, label %bb63
-
-bb63:		; preds = %bb61
-	%14 = icmp eq i32 %aa.0, 0		; <i1> [#uses=1]
-	%15 = zext i1 %14 to i8		; <i8> [#uses=1]
-	invoke void @system__arith_64__scaled_divide( { i64, i64 }* noalias sret %0, i64 %xx.0, i64 %yy.0, i64 %z, i8 %15 )
-			to label %invcont unwind label %lpad
-
-invcont:		; preds = %bb63
-	%16 = getelementptr { i64, i64 }* %0, i32 0, i32 0		; <i64*> [#uses=1]
-	%17 = load i64* %16, align 8		; <i64> [#uses=1]
-	%18 = add i32 %j2.0, -1		; <i32> [#uses=1]
-	%19 = getelementptr i64* %6, i32 %18		; <i64*> [#uses=1]
-	store i64 %17, i64* %19, align 8
-	%20 = getelementptr { i64, i64 }* %0, i32 0, i32 1		; <i64*> [#uses=1]
-	%21 = load i64* %20, align 8		; <i64> [#uses=1]
-	%22 = add i32 %j2.0, -1		; <i32> [#uses=1]
-	%23 = getelementptr i64* %6, i32 %22		; <i64*> [#uses=1]
-	%24 = load i64* %23, align 8		; <i64> [#uses=1]
-	%25 = icmp eq i64 %24, %yy.0		; <i1> [#uses=1]
-	%26 = add i32 %j2.0, -1		; <i32> [#uses=1]
-	%27 = getelementptr i64* %6, i32 %26		; <i64*> [#uses=1]
-	%28 = load i64* %27, align 8		; <i64> [#uses=1]
-	%29 = sub i64 0, %28		; <i64> [#uses=1]
-	%30 = icmp eq i64 %yy.0, %29		; <i1> [#uses=1]
-	%31 = or i1 %25, %30		; <i1> [#uses=1]
-	%32 = icmp sgt i32 %j2.0, 1		; <i1> [#uses=1]
-	%or.cond = and i1 %31, %32		; <i1> [#uses=1]
-	br i1 %or.cond, label %bb69, label %bb83
-
-bb69:		; preds = %invcont
-	%33 = add i32 %j2.0, -1		; <i32> [#uses=1]
-	%34 = getelementptr i64* %6, i32 %33		; <i64*> [#uses=1]
-	%35 = load i64* %34, align 8		; <i64> [#uses=1]
-	%36 = icmp slt i64 %35, 0		; <i1> [#uses=1]
-	%37 = add i32 %j2.0, -2		; <i32> [#uses=1]
-	%38 = getelementptr i64* %6, i32 %37		; <i64*> [#uses=1]
-	%39 = load i64* %38, align 8		; <i64> [#uses=2]
-	br i1 %36, label %bb71, label %bb72
-
-bb71:		; preds = %bb69
-	%40 = add i64 %39, 1		; <i64> [#uses=1]
-	%41 = add i32 %j2.0, -2		; <i32> [#uses=1]
-	%42 = getelementptr i64* %6, i32 %41		; <i64*> [#uses=1]
-	store i64 %40, i64* %42, align 8
-	br label %bb73
-
-bb72:		; preds = %bb69
-	%43 = add i64 %39, -1		; <i64> [#uses=1]
-	%44 = add i32 %j2.0, -2		; <i32> [#uses=1]
-	%45 = getelementptr i64* %6, i32 %44		; <i64*> [#uses=1]
-	store i64 %43, i64* %45, align 8
-	br label %bb73
-
-bb73:		; preds = %bb72, %bb71
-	%46 = add i32 %j2.0, -1		; <i32> [#uses=1]
-	%47 = getelementptr i64* %6, i32 %46		; <i64*> [#uses=1]
-	store i64 0, i64* %47, align 8
-	br label %bb74
-
-bb74:		; preds = %bb82, %bb73
-	%j1.0 = phi i32 [ %4, %bb73 ], [ %81, %bb82 ]		; <i32> [#uses=12]
-	%48 = icmp slt i32 %j1.0, 2		; <i1> [#uses=1]
-	br i1 %48, label %bb83, label %bb76
-
-bb76:		; preds = %bb74
-	%49 = add i32 %j1.0, -1		; <i32> [#uses=1]
-	%50 = getelementptr i64* %6, i32 %49		; <i64*> [#uses=1]
-	%51 = load i64* %50, align 8		; <i64> [#uses=1]
-	%52 = icmp sgt i64 %51, 999999999999999999		; <i1> [#uses=1]
-	br i1 %52, label %bb78, label %bb79
-
-bb78:		; preds = %bb76
-	%53 = add i32 %j1.0, -2		; <i32> [#uses=1]
-	%54 = getelementptr i64* %6, i32 %53		; <i64*> [#uses=1]
-	%55 = load i64* %54, align 8		; <i64> [#uses=1]
-	%56 = add i64 %55, 1		; <i64> [#uses=1]
-	%57 = add i32 %j1.0, -2		; <i32> [#uses=1]
-	%58 = getelementptr i64* %6, i32 %57		; <i64*> [#uses=1]
-	store i64 %56, i64* %58, align 8
-	%59 = add i32 %j1.0, -1		; <i32> [#uses=1]
-	%60 = getelementptr i64* %6, i32 %59		; <i64*> [#uses=1]
-	%61 = load i64* %60, align 8		; <i64> [#uses=1]
-	%62 = add i64 %61, -1000000000000000000		; <i64> [#uses=1]
-	%63 = add i32 %j1.0, -1		; <i32> [#uses=1]
-	%64 = getelementptr i64* %6, i32 %63		; <i64*> [#uses=1]
-	store i64 %62, i64* %64, align 8
-	br label %bb82
-
-bb79:		; preds = %bb76
-	%65 = add i32 %j1.0, -1		; <i32> [#uses=1]
-	%66 = getelementptr i64* %6, i32 %65		; <i64*> [#uses=1]
-	%67 = load i64* %66, align 8		; <i64> [#uses=1]
-	%68 = icmp slt i64 %67, -999999999999999999		; <i1> [#uses=1]
-	br i1 %68, label %bb81, label %bb82
-
-bb81:		; preds = %bb79
-	%69 = add i32 %j1.0, -2		; <i32> [#uses=1]
-	%70 = getelementptr i64* %6, i32 %69		; <i64*> [#uses=1]
-	%71 = load i64* %70, align 8		; <i64> [#uses=1]
-	%72 = add i64 %71, -1		; <i64> [#uses=1]
-	%73 = add i32 %j1.0, -2		; <i32> [#uses=1]
-	%74 = getelementptr i64* %6, i32 %73		; <i64*> [#uses=1]
-	store i64 %72, i64* %74, align 8
-	%75 = add i32 %j1.0, -1		; <i32> [#uses=1]
-	%76 = getelementptr i64* %6, i32 %75		; <i64*> [#uses=1]
-	%77 = load i64* %76, align 8		; <i64> [#uses=1]
-	%78 = add i64 %77, 1000000000000000000		; <i64> [#uses=1]
-	%79 = add i32 %j1.0, -1		; <i32> [#uses=1]
-	%80 = getelementptr i64* %6, i32 %79		; <i64*> [#uses=1]
-	store i64 %78, i64* %80, align 8
-	br label %bb82
-
-bb82:		; preds = %bb79, %bb81, %bb78
-	%81 = add i32 %j1.0, -1		; <i32> [#uses=1]
-	br label %bb74
-
-bb83:		; preds = %invcont, %bb74
-	%82 = icmp slt i32 %aa.0, 19		; <i1> [#uses=1]
-	%min = select i1 %82, i32 %aa.0, i32 18		; <i32> [#uses=1]
-	%83 = invoke i64 @system__exn_lli__exn_long_long_integer( i64 10, i32 %min ) readnone
-			to label %invcont86 unwind label %lpad		; <i64> [#uses=1]
-
-invcont86:		; preds = %bb83
-	%84 = sub i64 0, %83		; <i64> [#uses=1]
-	%85 = icmp slt i32 %aa.0, 19		; <i1> [#uses=1]
-	%min87 = select i1 %85, i32 %aa.0, i32 18		; <i32> [#uses=1]
-	%86 = sub i32 %aa.0, %min87		; <i32> [#uses=1]
-	%87 = icmp eq i32 %4, %j2.0		; <i1> [#uses=1]
-	br i1 %87, label %bb91, label %bb90
-
-bb90:		; preds = %invcont86
-	%88 = add i32 %j2.0, 1		; <i32> [#uses=1]
-	br label %bb61
-
-bb91:		; preds = %bb58, %bb61, %invcont86
-	%89 = icmp slt i32 %2, 18		; <i1> [#uses=1]
-	br i1 %89, label %bb98, label %bb94
-
-bb94:		; preds = %bb91, %bb97
-	%j.0 = phi i32 [ %97, %bb97 ], [ 1, %bb91 ]		; <i32> [#uses=4]
-	%90 = mul i32 %j.0, 18		; <i32> [#uses=1]
-	%91 = add i32 %90, -18		; <i32> [#uses=1]
-	%92 = sub i32 %e, %91		; <i32> [#uses=1]
-	%93 = add i32 %j.0, -1		; <i32> [#uses=1]
-	%94 = getelementptr i64* %6, i32 %93		; <i64*> [#uses=1]
-	%95 = load i64* %94, align 8		; <i64> [#uses=1]
-	invoke fastcc void @ce3806g__fxio__put__put_int64__4.1339( %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.365, i64 %95, i32 %92 )
-			to label %invcont95 unwind label %lpad
-
-invcont95:		; preds = %bb94
-	%96 = icmp eq i32 %3, %j.0		; <i1> [#uses=1]
-	br i1 %96, label %bb98, label %bb97
-
-bb97:		; preds = %invcont95
-	%97 = add i32 %j.0, 1		; <i32> [#uses=1]
-	br label %bb94
-
-bb98:		; preds = %bb91, %invcont95
-	%98 = sub i32 %e, %a		; <i32> [#uses=1]
-	%99 = getelementptr i64* %6, i32 %3		; <i64*> [#uses=1]
-	%100 = load i64* %99, align 8		; <i64> [#uses=1]
-	invoke fastcc void @ce3806g__fxio__put__put_int64__4.1339( %struct.FRAME.ce3806g__fxio__put__4* %CHAIN.365, i64 %100, i32 %98 )
-			to label %bb101 unwind label %lpad
-
-bb101:		; preds = %bb98
-	ret void
-
-lpad:		; preds = %bb98, %bb94, %bb83, %bb63
-	%eh_ptr = call i8* @llvm.eh.exception( )		; <i8*> [#uses=2]
-	%eh_select103 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32( i8* %eh_ptr, i8* bitcast (i32 (...)* @__gnat_eh_personality to i8*), i32* @__gnat_all_others_value )		; <i32> [#uses=0]
-	call void @llvm.stackrestore( i8* %1 )
-	%101 = call i32 (...)* @_Unwind_Resume( i8* %eh_ptr )		; <i32> [#uses=0]
-	unreachable
-}
-
-declare i8* @llvm.stacksave() nounwind
-
-declare void @system__arith_64__scaled_divide({ i64, i64 }* noalias sret, i64, i64, i64, i8)
-
-declare i64 @system__exn_lli__exn_long_long_integer(i64, i32) readnone
-
-declare void @llvm.stackrestore(i8*) nounwind
-
-declare i32 @system__img_real__set_image_real(x86_fp80, i8*, %struct.string___XUB*, i32, i32, i32, i32)
-
-declare void @ada__text_io__generic_aux__put_item(%struct.ada__text_io__text_afcb*, i8*, %struct.string___XUB*)
-
-declare void @report__test(i8*, %struct.string___XUB*, i8*, %struct.string___XUB*)
-
-declare void @system__secondary_stack__ss_mark(%struct.string___XUB* noalias sret)
-
-declare void @system__exception_table__register_exception(%struct.system__standard_library__exception_data*)
-
-declare void @report__legal_file_name(%struct.system__file_control_block__pstring* noalias sret, i32, i8*, %struct.string___XUB*)
-
-declare %struct.ada__text_io__text_afcb* @ada__text_io__create(%struct.ada__text_io__text_afcb*, i8, i8*, %struct.string___XUB*, i8*, %struct.string___XUB*)
-
-declare void @system__secondary_stack__ss_release(i32, i32)
-
-declare void @report__not_applicable(i8*, %struct.string___XUB*)
-
-declare void @ada__text_io__set_output(%struct.ada__text_io__text_afcb*)
-
-declare %struct.ada__text_io__text_afcb* @ada__text_io__close(%struct.ada__text_io__text_afcb*)
-
-declare %struct.ada__text_io__text_afcb* @ada__text_io__open(%struct.ada__text_io__text_afcb*, i8, i8*, %struct.string___XUB*, i8*, %struct.string___XUB*)
-
-declare %struct.ada__text_io__text_afcb* @ada__text_io__standard_output()
-
-declare void @report__failed(i8*, %struct.string___XUB*)
-
-declare void @ext(i32*)
-
-declare %struct.ada__text_io__text_afcb* @ada__text_io__delete(%struct.ada__text_io__text_afcb*)
-
-declare void @report__result()
diff --git a/test/Transforms/Reassociate/secondary.ll b/test/Transforms/Reassociate/secondary.ll
new file mode 100644
index 0000000..a52000a
--- /dev/null
+++ b/test/Transforms/Reassociate/secondary.ll
@@ -0,0 +1,24 @@
+; RUN: opt -S -reassociate < %s | FileCheck %s
+; rdar://9167457
+
+; Reassociate shouldn't break this testcase involving a secondary
+; reassociation.
+
+; CHECK:     define
+; CHECK-NOT: undef
+; CHECK:     %factor = mul i32 %tmp3, -2
+; CHECK-NOT: undef
+; CHECK:     }
+
+define void @x0f2f640ab6718391b59ce96d9fdeda54(i32 %arg, i32 %arg1, i32 %arg2, i32* %.out) nounwind {
+_:
+  %tmp = sub i32 %arg, %arg1
+  %tmp3 = mul i32 %tmp, -1268345047
+  %tmp4 = add i32 %tmp3, 2014710503
+  %tmp5 = add i32 %tmp3, -1048397418
+  %tmp6 = sub i32 %tmp4, %tmp5
+  %tmp7 = sub i32 -2014710503, %tmp3
+  %tmp8 = add i32 %tmp6, %tmp7
+  store i32 %tmp8, i32* %.out
+  ret void
+}
diff --git a/test/Transforms/SCCP/2002-05-02-EdgeFailure.ll b/test/Transforms/SCCP/2002-05-02-EdgeFailure.ll
deleted file mode 100644
index bb0cf04..0000000
--- a/test/Transforms/SCCP/2002-05-02-EdgeFailure.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; edgefailure - This function illustrates how SCCP is not doing it's job.  This
-; function should be optimized almost completely away: the loop should be
-; analyzed to detect that the body executes exactly once, and thus the branch
-; can be eliminated and code becomes trivially dead.  This is distilled from a
-; real benchmark (mst from Olden benchmark, MakeGraph function).  When SCCP is
-; fixed, this should be eliminated by a single SCCP application.
-;
-; RUN: opt < %s -sccp -S | not grep loop
-
-define i32* @test() {
-bb1:
-	%A = malloc i32		; <i32*> [#uses=2]
-	br label %bb2
-bb2:		; preds = %bb2, %bb1
-        ;; Always 0
-	%i = phi i32 [ %i2, %bb2 ], [ 0, %bb1 ]		; <i32> [#uses=2]
-        ;; Always 1
-	%i2 = add i32 %i, 1		; <i32> [#uses=2]
-	store i32 %i, i32* %A
-        ;; Always false
-  	%loop = icmp sle i32 %i2, 0		; <i1> [#uses=1]
-	br i1 %loop, label %bb2, label %bb3
-bb3:		; preds = %bb2
-	ret i32* %A
-}
-
diff --git a/test/Transforms/SCCP/apint-basictest.ll b/test/Transforms/SCCP/apint-basictest.ll
index c03bfef..f6ef1ab 100644
--- a/test/Transforms/SCCP/apint-basictest.ll
+++ b/test/Transforms/SCCP/apint-basictest.ll
@@ -1,4 +1,4 @@
-; This is a basic sanity check for constant propogation.  The add instruction 
+; This is a basic sanity check for constant propagation.  The add instruction
 ; should be eliminated.
 
 ; RUN: opt < %s -sccp -S | not grep add
diff --git a/test/Transforms/SCCP/apint-basictest2.ll b/test/Transforms/SCCP/apint-basictest2.ll
index 1734827..ad8b4a4 100644
--- a/test/Transforms/SCCP/apint-basictest2.ll
+++ b/test/Transforms/SCCP/apint-basictest2.ll
@@ -1,4 +1,4 @@
-; This is a basic sanity check for constant propogation.  The add instruction 
+; This is a basic sanity check for constant propagation.  The add instruction
 ; and phi instruction should be eliminated.
 
 ; RUN: opt < %s -sccp -S | not grep phi
diff --git a/test/Transforms/SCCP/apint-basictest3.ll b/test/Transforms/SCCP/apint-basictest3.ll
index 47671bf..b8fcca6 100644
--- a/test/Transforms/SCCP/apint-basictest3.ll
+++ b/test/Transforms/SCCP/apint-basictest3.ll
@@ -1,4 +1,4 @@
-; This is a basic sanity check for constant propogation.  It tests the basic 
+; This is a basic sanity check for constant propagation.  It tests the basic
 ; arithmatic operations.
 
 
diff --git a/test/Transforms/SCCP/apint-basictest4.ll b/test/Transforms/SCCP/apint-basictest4.ll
index 41036ea..8624260 100644
--- a/test/Transforms/SCCP/apint-basictest4.ll
+++ b/test/Transforms/SCCP/apint-basictest4.ll
@@ -1,4 +1,4 @@
-; This is a basic sanity check for constant propogation.  It tests the basic 
+; This is a basic sanity check for constant propagation.  It tests the basic
 ; logic operations.
 
 
diff --git a/test/Transforms/SRETPromotion/2008-03-11-attributes.ll b/test/Transforms/SRETPromotion/2008-03-11-attributes.ll
deleted file mode 100644
index 55abec5..0000000
--- a/test/Transforms/SRETPromotion/2008-03-11-attributes.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: opt < %s -sretpromotion -disable-output
-	%struct.ObjPoint = type { double, double, double, double, double, double }
-
-define void @RotatePoint(%struct.ObjPoint* sret  %agg.result, %struct.ObjPoint* byval  %a, double %rx, double %ry, double %rz) nounwind  {
-entry:
-	unreachable
-}
diff --git a/test/Transforms/SRETPromotion/2008-06-04-function-pointer-passing.ll b/test/Transforms/SRETPromotion/2008-06-04-function-pointer-passing.ll
deleted file mode 100644
index 1168b0b..0000000
--- a/test/Transforms/SRETPromotion/2008-06-04-function-pointer-passing.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; This test lures sretpromotion into promoting the sret argument of foo, even
-; when the function is used as an argument to bar. It used to not check for
-; this, assuming that all users of foo were direct calls, resulting in an
-; assertion failure later on.
-
-; We're mainly testing for opt not to crash, but we'll check to see if the sret
-; attribute is still there for good measure.
-; RUN: opt < %s -sretpromotion -S | grep sret
-
-%struct.S = type <{ i32, i32 }>
-
-define i32 @main() {
-entry:
-	%tmp = alloca %struct.S		; <%struct.S*> [#uses=1]
-	call void @bar( %struct.S* sret  %tmp, void (%struct.S*, ...)* @foo )
-	ret i32 undef
-}
-
-declare void @bar(%struct.S* sret , void (%struct.S*, ...)*)
-
-define internal void @foo(%struct.S* sret  %agg.result, ...) {
-entry:
-	ret void
-}
diff --git a/test/Transforms/SRETPromotion/2008-06-05-non-call-use.ll b/test/Transforms/SRETPromotion/2008-06-05-non-call-use.ll
deleted file mode 100644
index 26c6a6e..0000000
--- a/test/Transforms/SRETPromotion/2008-06-05-non-call-use.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; This test shows an sret function that is used as an operand to a bitcast.
-; StructRetPromotion used to assume that a function was only used by call or
-; invoke instructions, making this code cause an assertion failure.
-
-; We're mainly testing for opt not to crash, but we'll check to see if the sret
-; attribute is still there for good measure.
-; RUN: opt < %s -sretpromotion -S | grep sret
-
-%struct.S = type <{ i32, i32 }>
-
-define i32 @main() {
-entry:
-        %bar = bitcast void (%struct.S*)* @foo to i32 ()*
-	ret i32 undef
-}
-
-define internal void @foo(%struct.S* sret) {
-entry:
-	ret void
-}
diff --git a/test/Transforms/SRETPromotion/basictest.ll b/test/Transforms/SRETPromotion/basictest.ll
deleted file mode 100644
index ff047dc..0000000
--- a/test/Transforms/SRETPromotion/basictest.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-; RUN: opt < %s -sretpromotion -S > %t
-; RUN: cat %t | grep sret | count 1
-
-; This function is promotable
-define internal void @promotable({i32, i32}* sret %s) {
-  %A = getelementptr {i32, i32}* %s, i32 0, i32 0
-  store i32 0, i32* %A
-  %B = getelementptr {i32, i32}* %s, i32 0, i32 0
-  store i32 1, i32* %B
-  ret void
-}
-
-; This function is not promotable (due to it's use below)
-define internal void @notpromotable({i32, i32}* sret %s) {
-  %A = getelementptr {i32, i32}* %s, i32 0, i32 0
-  store i32 0, i32* %A
-  %B = getelementptr {i32, i32}* %s, i32 0, i32 0
-  store i32 1, i32* %B
-  ret void
-}
-
-define void @caller({i32, i32}* %t) {
-  %s = alloca {i32, i32}
-  call void @promotable({i32, i32}* %s)
-  %A = getelementptr {i32, i32}* %s, i32 0, i32 0
-  %a = load i32* %A
-  %B = getelementptr {i32, i32}* %s, i32 0, i32 0
-  %b = load i32* %B
-  ; This passes in something that's not an alloca, which makes the argument not
-  ; promotable
-  call void @notpromotable({i32, i32}* %t)
-  ret void
-}
diff --git a/test/Transforms/ScalarRepl/2007-11-03-bigendian_apint.ll b/test/Transforms/ScalarRepl/2007-11-03-bigendian_apint.ll
index 81b6746..48abffe 100644
--- a/test/Transforms/ScalarRepl/2007-11-03-bigendian_apint.ll
+++ b/test/Transforms/ScalarRepl/2007-11-03-bigendian_apint.ll
@@ -2,7 +2,7 @@
 
 %struct.S = type { i16 }
 
-define i1 @f(i16 signext  %b) zeroext  {
+define zeroext i1 @f(i16 signext  %b)   {
 entry:
 	%b_addr = alloca i16		; <i16*> [#uses=2]
 	%retval = alloca i32		; <i32*> [#uses=2]
diff --git a/test/Transforms/ScalarRepl/2008-06-05-loadstore-agg.ll b/test/Transforms/ScalarRepl/2008-06-05-loadstore-agg.ll
index 87a08b7..ce70a1b 100644
--- a/test/Transforms/ScalarRepl/2008-06-05-loadstore-agg.ll
+++ b/test/Transforms/ScalarRepl/2008-06-05-loadstore-agg.ll
@@ -13,7 +13,7 @@ define i32 @foo() {
 	%res2 = insertvalue { i32, i32 } %res1, i32 2, 1		; <{ i32, i32 }> [#uses=1]
         ; And store it
 	store { i32, i32 } %res2, { i32, i32 }* %target
-        ; Actually use %target, so it doesn't get removed alltogether
+        ; Actually use %target, so it doesn't get removed altogether
         %ptr = getelementptr { i32, i32 }* %target, i32 0, i32 0
         %val = load i32* %ptr
 	ret i32 %val
@@ -26,7 +26,7 @@ define i32 @bar() {
 	%res2 = insertvalue [ 2 x i32 ] %res1, i32 2, 1		; <{ i32, i32 }> [#uses=1]
         ; And store it
 	store [ 2 x i32 ] %res2, [ 2 x i32 ]* %target
-        ; Actually use %target, so it doesn't get removed alltogether
+        ; Actually use %target, so it doesn't get removed altogether
         %ptr = getelementptr [ 2 x i32 ]* %target, i32 0, i32 0
         %val = load i32* %ptr
 	ret i32 %val
diff --git a/test/Transforms/ScalarRepl/2011-05-06-CapturedAlloca.ll b/test/Transforms/ScalarRepl/2011-05-06-CapturedAlloca.ll
new file mode 100644
index 0000000..816cb60
--- /dev/null
+++ b/test/Transforms/ScalarRepl/2011-05-06-CapturedAlloca.ll
@@ -0,0 +1,26 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; PR9820
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+@func_1.l_10 = internal unnamed_addr constant [4 x i32] [i32 1, i32 0, i32 0, i32 0], align 16
+
+define i32* @noop(i32* %p_29) nounwind readnone {
+entry:
+  ret i32* %p_29
+}
+
+define i32 @main() nounwind {
+entry:
+  %l_10 = alloca [4 x i32], align 16
+  %tmp = bitcast [4 x i32]* %l_10 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp, i8* bitcast ([4 x i32]* @func_1.l_10 to i8*), i64 16, i32 16, i1 false)
+; CHECK: call void @llvm.memcpy
+  %arrayidx = getelementptr inbounds [4 x i32]* %l_10, i64 0, i64 0
+  %call = call i32* @noop(i32* %arrayidx)
+  store i32 0, i32* %call
+  ret i32 0
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
diff --git a/test/Transforms/ScalarRepl/2011-06-08-VectorExtractValue.ll b/test/Transforms/ScalarRepl/2011-06-08-VectorExtractValue.ll
new file mode 100644
index 0000000..32e67fb
--- /dev/null
+++ b/test/Transforms/ScalarRepl/2011-06-08-VectorExtractValue.ll
@@ -0,0 +1,62 @@
+; RUN: opt < %s -S -scalarrepl | FileCheck %s
+; RUN: opt < %s -S -scalarrepl-ssa | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.7.0"
+
+%0 = type { <2 x float>, float }
+%struct.PointC3 = type { %struct.array }
+%struct.Point_3 = type { %struct.PointC3 }
+%struct.array = type { [3 x float], [4 x i8] }
+
+; CHECK: main
+; CHECK-NOT: alloca
+; CHECK: extractelement <2 x float> zeroinitializer
+
+define void @main() uwtable ssp {
+entry:
+  %ref.tmp2 = alloca %0, align 16
+  %tmpcast = bitcast %0* %ref.tmp2 to %struct.Point_3*
+  %0 = getelementptr %0* %ref.tmp2, i64 0, i32 0
+  store <2 x float> zeroinitializer, <2 x float>* %0, align 16
+  %1 = getelementptr inbounds %struct.Point_3* %tmpcast, i64 0, i32 0
+  %base.i.i.i = getelementptr inbounds %struct.PointC3* %1, i64 0, i32 0
+  %arrayidx.i.i.i.i = getelementptr inbounds %struct.array* %base.i.i.i, i64 0, i32 0, i64 0
+  %tmp5.i.i = load float* %arrayidx.i.i.i.i, align 4
+  ret void
+}
+
+; CHECK: test1
+; CHECK-NOT: alloca
+; CHECK: extractelement <2 x float> zeroinitializer
+
+define void @test1() uwtable ssp {
+entry:
+  %ref.tmp2 = alloca {<2 x float>, float}, align 16
+  %tmpcast = bitcast {<2 x float>, float}* %ref.tmp2 to float*
+  %0 = getelementptr {<2 x float>, float}* %ref.tmp2, i64 0, i32 0
+  store <2 x float> zeroinitializer, <2 x float>* %0, align 16
+  %tmp5.i.i = load float* %tmpcast, align 4
+  ret void
+}
+
+; CHECK: test2
+; CHECK-NOT: alloca
+; CHECK: and i128
+; CHECK: or i128
+; CHECK: trunc i128
+; CHECK-NOT: insertelement
+; CHECK-NOT: extractelement
+
+define float @test2() uwtable ssp {
+entry:
+  %ref.tmp2 = alloca {<2 x float>, float}, align 16
+  %tmpcast = bitcast {<2 x float>, float}* %ref.tmp2 to float*
+  %tmpcast2 = getelementptr {<2 x float>, float}* %ref.tmp2, i64 0, i32 1
+  %0 = getelementptr {<2 x float>, float}* %ref.tmp2, i64 0, i32 0
+  store <2 x float> zeroinitializer, <2 x float>* %0, align 16
+  store float 1.0, float* %tmpcast2, align 4
+  %r1 = load float* %tmpcast, align 4
+  %r2 = load float* %tmpcast2, align 4
+  %r = fadd float %r1, %r2
+  ret float %r
+}
diff --git a/test/Transforms/ScalarRepl/crash.ll b/test/Transforms/ScalarRepl/crash.ll
index 7b62f09..83daaaf 100644
--- a/test/Transforms/ScalarRepl/crash.ll
+++ b/test/Transforms/ScalarRepl/crash.ll
@@ -245,10 +245,12 @@ entry:
 ; VLAs.
 define void @test12() {
 bb4.i:
-        %0 = malloc [0 x %struct.Item]          ; <[0 x %struct.Item]*> [#uses=1]
+        %malloccall = tail call i8* @malloc(i32 0)
+        %0 = bitcast i8* %malloccall to [0 x %struct.Item]*
         %.sub.i.c.i = getelementptr [0 x %struct.Item]* %0, i32 0, i32 0                ; <%struct.Item*> [#uses=0]
         unreachable
 }
+declare noalias i8* @malloc(i32)
 
 ; PR8680
 define void @test13() nounwind {
diff --git a/test/Transforms/ScalarRepl/debuginfo-preserved.ll b/test/Transforms/ScalarRepl/debuginfo-preserved.ll
new file mode 100644
index 0000000..c149134
--- /dev/null
+++ b/test/Transforms/ScalarRepl/debuginfo-preserved.ll
@@ -0,0 +1,61 @@
+; RUN: opt < %s -scalarrepl -S | FileCheck %s
+; RUN: opt < %s -scalarrepl-ssa -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.6.0"
+
+; CHECK: f
+; CHECK-NOT: llvm.dbg.declare
+; CHECK: llvm.dbg.value
+; CHECK: llvm.dbg.value
+; CHECK: llvm.dbg.value
+; CHECK: llvm.dbg.value
+; CHECK: llvm.dbg.value
+
+define i32 @f(i32 %a, i32 %b) nounwind ssp {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca i32, align 4
+  %c = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !6), !dbg !7
+  store i32 %b, i32* %b.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %b.addr}, metadata !8), !dbg !9
+  call void @llvm.dbg.declare(metadata !{i32* %c}, metadata !10), !dbg !12
+  %tmp = load i32* %a.addr, align 4, !dbg !13
+  store i32 %tmp, i32* %c, align 4, !dbg !13
+  %tmp1 = load i32* %a.addr, align 4, !dbg !14
+  %tmp2 = load i32* %b.addr, align 4, !dbg !14
+  %add = add nsw i32 %tmp1, %tmp2, !dbg !14
+  store i32 %add, i32* %a.addr, align 4, !dbg !14
+  %tmp3 = load i32* %c, align 4, !dbg !15
+  %tmp4 = load i32* %b.addr, align 4, !dbg !15
+  %sub = sub nsw i32 %tmp3, %tmp4, !dbg !15
+  store i32 %sub, i32* %b.addr, align 4, !dbg !15
+  %tmp5 = load i32* %a.addr, align 4, !dbg !16
+  %tmp6 = load i32* %b.addr, align 4, !dbg !16
+  %add7 = add nsw i32 %tmp5, %tmp6, !dbg !16
+  ret i32 %add7, !dbg !16
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+!llvm.dbg.sp = !{!1}
+
+!0 = metadata !{i32 589841, i32 0, i32 12, metadata !"/d/j/debug-test.c", metadata !"/Volumes/Data/b", metadata !"clang version 3.0 (trunk 131941)", i1 true, i1 false, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{i32 589870, i32 0, metadata !2, metadata !"f", metadata !"f", metadata !"", metadata !2, i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i32 (i32, i32)* @f, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{i32 589865, metadata !"/d/j/debug-test.c", metadata !"/Volumes/Data/b", metadata !0} ; [ DW_TAG_file_type ]
+!3 = metadata !{i32 589845, metadata !2, metadata !"", metadata !2, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 589860, metadata !0, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{i32 590081, metadata !1, metadata !"a", metadata !2, i32 16777217, metadata !5, i32 0} ; [ DW_TAG_arg_variable ]
+!7 = metadata !{i32 1, i32 11, metadata !1, null}
+!8 = metadata !{i32 590081, metadata !1, metadata !"b", metadata !2, i32 33554433, metadata !5, i32 0} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{i32 1, i32 18, metadata !1, null}
+!10 = metadata !{i32 590080, metadata !11, metadata !"c", metadata !2, i32 2, metadata !5, i32 0} ; [ DW_TAG_auto_variable ]
+!11 = metadata !{i32 589835, metadata !1, i32 1, i32 21, metadata !2, i32 0} ; [ DW_TAG_lexical_block ]
+!12 = metadata !{i32 2, i32 9, metadata !11, null}
+!13 = metadata !{i32 2, i32 14, metadata !11, null}
+!14 = metadata !{i32 3, i32 5, metadata !11, null}
+!15 = metadata !{i32 4, i32 5, metadata !11, null}
+!16 = metadata !{i32 5, i32 5, metadata !11, null}
diff --git a/test/Transforms/ScalarRepl/debuginfo.ll b/test/Transforms/ScalarRepl/debuginfo.ll
index 6b8422c..ae2c6cc 100644
--- a/test/Transforms/ScalarRepl/debuginfo.ll
+++ b/test/Transforms/ScalarRepl/debuginfo.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -scalarrepl -S | not grep alloca
+; RUN: opt < %s -scalarrepl-ssa -S | not grep alloca
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
 	%llvm.dbg.anchor.type = type { i32, i32 }
 	%llvm.dbg.basictype.type = type { i32, { }*, i8*, { }*, i32, i64, i64, i64, i32, i32 }
diff --git a/test/Transforms/ScalarRepl/dg.exp b/test/Transforms/ScalarRepl/dg.exp
index f200589..39954d8 100644
--- a/test/Transforms/ScalarRepl/dg.exp
+++ b/test/Transforms/ScalarRepl/dg.exp
@@ -1,3 +1,3 @@
 load_lib llvm.exp
 
-RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll,c,cpp}]]
+RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll}]]
diff --git a/test/Transforms/ScalarRepl/vector_promote.ll b/test/Transforms/ScalarRepl/vector_promote.ll
index ef701c6..c51ef10 100644
--- a/test/Transforms/ScalarRepl/vector_promote.ll
+++ b/test/Transforms/ScalarRepl/vector_promote.ll
@@ -202,3 +202,64 @@ define float @test13(<4 x float> %x, <2 x i32> %y) {
 ; CHECK-NOT: alloca
 ; CHECK: bitcast <4 x float> %x to i128
 }
+
+define <3 x float> @test14(<3 x float> %x)  {
+entry:
+  %x.addr = alloca <3 x float>, align 16
+  %r = alloca <3 x i32>, align 16
+  %extractVec = shufflevector <3 x float> %x, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+  %storetmp = bitcast <3 x float>* %x.addr to <4 x float>*
+  store <4 x float> %extractVec, <4 x float>* %storetmp, align 16
+  %tmp = load <3 x float>* %x.addr, align 16
+  %cmp = fcmp une <3 x float> %tmp, zeroinitializer
+  %sext = sext <3 x i1> %cmp to <3 x i32>
+  %and = and <3 x i32> <i32 1065353216, i32 1065353216, i32 1065353216>, %sext
+  %extractVec1 = shufflevector <3 x i32> %and, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+  %storetmp2 = bitcast <3 x i32>* %r to <4 x i32>*
+  store <4 x i32> %extractVec1, <4 x i32>* %storetmp2, align 16
+  %tmp3 = load <3 x i32>* %r, align 16
+  %0 = bitcast <3 x i32> %tmp3 to <3 x float>
+  %tmp4 = load <3 x float>* %x.addr, align 16
+  ret <3 x float> %tmp4
+; CHECK: @test14
+; CHECK-NOT: alloca
+; CHECK: shufflevector <4 x i32> %extractVec1, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+}
+
+define void @test15(<3 x i64>* sret %agg.result, <3 x i64> %x, <3 x i64> %min) {
+entry:
+  %x.addr = alloca <3 x i64>, align 32
+  %min.addr = alloca <3 x i64>, align 32
+  %extractVec = shufflevector <3 x i64> %x, <3 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+  %storetmp = bitcast <3 x i64>* %x.addr to <4 x i64>*
+  store <4 x i64> %extractVec, <4 x i64>* %storetmp, align 32
+  %extractVec1 = shufflevector <3 x i64> %min, <3 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+  %storetmp2 = bitcast <3 x i64>* %min.addr to <4 x i64>*
+  store <4 x i64> %extractVec1, <4 x i64>* %storetmp2, align 32
+  %tmp = load <3 x i64>* %x.addr
+  %tmp5 = extractelement <3 x i64> %tmp, i32 0
+  %tmp11 = insertelement <3 x i64> %tmp, i64 %tmp5, i32 0
+  store <3 x i64> %tmp11, <3 x i64>* %x.addr
+  %tmp30 = load <3 x i64>* %x.addr, align 32
+  store <3 x i64> %tmp30, <3 x i64>* %agg.result
+  ret void
+; CHECK: @test15
+; CHECK-NOT: alloca
+; CHECK: shufflevector <4 x i64> %tmpV2, <4 x i64> undef, <3 x i32> <i32 0, i32 1, i32 2>
+}
+
+define <4 x float> @test16(<4 x float> %x, i64 %y0, i64 %y1) {
+entry:
+  %tmp8 = bitcast <4 x float> undef to <2 x double>
+  %tmp9 = bitcast i64 %y0 to double
+  %tmp10 = insertelement <2 x double> %tmp8, double %tmp9, i32 0
+  %tmp11 = bitcast <2 x double> %tmp10 to <4 x float>
+  %tmp3 = bitcast <4 x float> %tmp11 to <2 x double>
+  %tmp4 = bitcast i64 %y1 to double
+  %tmp5 = insertelement <2 x double> %tmp3, double %tmp4, i32 1
+  %tmp6 = bitcast <2 x double> %tmp5 to <4 x float>
+	ret <4 x float> %tmp6
+; CHECK: @test16
+; CHECK-NOT: alloca
+; CHECK: bitcast <4 x float> %tmp11 to <2 x double>
+}
diff --git a/test/Transforms/SimplifyCFG/2006-08-03-Crash.ll b/test/Transforms/SimplifyCFG/2006-08-03-Crash.ll
index 2c84c93..70fbddf 100644
--- a/test/Transforms/SimplifyCFG/2006-08-03-Crash.ll
+++ b/test/Transforms/SimplifyCFG/2006-08-03-Crash.ll
@@ -1,7 +1,5 @@
-; RUN: opt < %s -gvn -simplifycfg \
-; RUN:   -disable-output
+; RUN: opt < %s -gvn -simplifycfg -disable-output
 ; PR867
-; END.
 
 target datalayout = "E-p:32:32"
 target triple = "powerpc-apple-darwin8"
diff --git a/test/Transforms/SimplifyCFG/PR9946.ll b/test/Transforms/SimplifyCFG/PR9946.ll
new file mode 100644
index 0000000..4a61b84
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/PR9946.ll
@@ -0,0 +1,18 @@
+; RUN: opt  %s -simplifycfg -disable-output
+
+@foo = external constant i32
+
+define i32 @f() {
+entry:
+  br i1 icmp eq (i64 and (i64 ptrtoint (i32* @foo to i64), i64 15), i64 0), label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  br label %return
+
+if.end:                                           ; preds = %entry
+  br label %return
+
+return:                                           ; preds = %if.end, %if.then
+  %storemerge = phi i32 [ 1, %if.end ], [ 0, %if.then ]
+  ret i32 %storemerge
+}
diff --git a/test/Transforms/SimplifyCFG/PhiBlockMerge.ll b/test/Transforms/SimplifyCFG/PhiBlockMerge.ll
index c28d0ba..36b52f5 100644
--- a/test/Transforms/SimplifyCFG/PhiBlockMerge.ll
+++ b/test/Transforms/SimplifyCFG/PhiBlockMerge.ll
@@ -6,6 +6,7 @@
 define i32 @test(i1 %a, i1 %b) {
 ; CHECK: br i1 %a
         br i1 %a, label %M, label %O
+; CHECK: O:
 O:              ; preds = %0
 ; CHECK: select i1 %b, i32 0, i32 1
 ; CHECK-NOT: phi
diff --git a/test/Transforms/SimplifyCFG/PhiEliminate2.ll b/test/Transforms/SimplifyCFG/PhiEliminate2.ll
index c0f6781..0b3893d 100644
--- a/test/Transforms/SimplifyCFG/PhiEliminate2.ll
+++ b/test/Transforms/SimplifyCFG/PhiEliminate2.ll
@@ -1,14 +1,17 @@
 ; RUN: opt < %s -simplifycfg -S | not grep br
 
-define i32 @test(i1 %C, i32 %V1, i32 %V2) {
+define i32 @test(i1 %C, i32 %V1, i32 %V2, i16 %V3) {
 entry:
-        br i1 %C, label %then, label %Cont
+        br i1 %C, label %then, label %else
 then:           ; preds = %entry
-        %V3 = or i32 %V2, %V1           ; <i32> [#uses=1]
+        %V4 = or i32 %V2, %V1           ; <i32> [#uses=1]
         br label %Cont
-Cont:           ; preds = %then, %entry
-        %V4 = phi i32 [ %V1, %entry ], [ %V3, %then ]           ; <i32> [#uses=0]
-        call i32 @test( i1 false, i32 0, i32 0 )                ; <i32>:0 [#uses=0]
+else:           ; preds = %entry
+        %V5 = sext i16 %V3 to i32       ; <i32> [#uses=1]
+        br label %Cont
+Cont:           ; preds = %then, %else
+        %V6 = phi i32 [ %V5, %else ], [ %V4, %then ]            ; <i32> [#uses=0]
+        call i32 @test( i1 false, i32 0, i32 0, i16 0 )         ; <i32>:0 [#uses=0]
         ret i32 %V1
 }
 
diff --git a/test/Transforms/SimplifyCFG/PhiEliminate3.ll b/test/Transforms/SimplifyCFG/PhiEliminate3.ll
new file mode 100644
index 0000000..3566b87
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/PhiEliminate3.ll
@@ -0,0 +1,34 @@
+; Test merging of blocks containing complex expressions,
+; with various folding thresholds
+;
+; RUN: opt < %s -simplifycfg -S -phi-node-folding-threshold=1 | grep N:
+; RUN: opt < %s -simplifycfg -S -phi-node-folding-threshold=2 | not grep N:
+; RUN: opt < %s -simplifycfg -S -phi-node-folding-threshold=2 | grep M:
+; RUN: opt < %s -simplifycfg -S -phi-node-folding-threshold=7 | not grep M:
+;
+
+define i32 @test(i1 %a, i1 %b, i32 %i, i32 %j, i32 %k) {
+entry:
+        br i1 %a, label %M, label %O
+O:
+        br i1 %b, label %P, label %Q
+P:
+        %iaj = add i32 %i, %j
+        %iajak = add i32 %iaj, %k
+        br label %N
+Q:
+        %ixj = xor i32 %i, %j
+        %ixjxk = xor i32 %ixj, %k
+        br label %N
+N:
+        ; This phi should be foldable if threshold >= 2
+        %Wp = phi i32 [ %iajak, %P ], [ %ixjxk, %Q ]
+        %Wp2 = add i32 %Wp, %Wp
+        br label %M
+M:
+        ; This phi should be foldable if threshold >= 7
+        %W = phi i32 [ %Wp2, %N ], [ 2, %entry ]
+        %R = add i32 %W, 1
+        ret i32 %R
+}
+
diff --git a/test/Transforms/SimplifyCFG/dce-cond-after-folding-terminator.ll b/test/Transforms/SimplifyCFG/dce-cond-after-folding-terminator.ll
new file mode 100644
index 0000000..3996efd
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/dce-cond-after-folding-terminator.ll
@@ -0,0 +1,52 @@
+; RUN: opt -S <%s -simplifycfg | FileCheck %s
+
+define void @test_br(i32 %x) {
+entry:
+; CHECK: @test_br
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret void
+  %cmp = icmp eq i32 %x, 10
+  br i1 %cmp, label %if.then, label %if.then
+
+if.then:                                          ; preds = %entry
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+define void @test_switch(i32 %x) nounwind {
+entry:
+; CHECK: @test_switch
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret void
+  %rem = srem i32 %x, 3
+  switch i32 %rem, label %sw.bb [
+    i32 1, label %sw.bb
+    i32 10, label %sw.bb
+  ]
+
+sw.bb:                                            ; preds = %sw.default, %entry, %entry
+  br label %sw.epilog
+
+sw.epilog:                                        ; preds = %sw.bb
+  ret void
+}
+
+define void @test_indirectbr(i32 %x) {
+entry:
+; CHECK: @test_indirectbr
+; CHECK-NEXT: entry:
+; Ideally this should now check:
+;   CHK-NEXT: ret void
+; But that doesn't happen yet. Instead:
+; CHECK-NEXT: br label %L1
+
+  %label = bitcast i8* blockaddress(@test_indirectbr, %L1) to i8*
+  indirectbr i8* %label, [label %L1, label %L2]
+
+L1:                                               ; preds = %entry
+  ret void
+L2:                                               ; preds = %entry
+  ret void
+}
diff --git a/test/Transforms/SimplifyCFG/indirectbr.ll b/test/Transforms/SimplifyCFG/indirectbr.ll
index 7fb4def..7853e9a 100644
--- a/test/Transforms/SimplifyCFG/indirectbr.ll
+++ b/test/Transforms/SimplifyCFG/indirectbr.ll
@@ -180,3 +180,72 @@ L3:
 ; before SimplifyCFG even looks at the indirectbr.
   indirectbr i8* %anchor, [label %L1, label %L2]
 }
+
+; PR10072
+
+@xblkx.bbs = internal unnamed_addr constant [9 x i8*] [i8* blockaddress(@indbrtest7, %xblkx.begin), i8* blockaddress(@indbrtest7, %xblkx.begin3), i8* blockaddress(@indbrtest7, %xblkx.begin4), i8* blockaddress(@indbrtest7, %xblkx.begin5), i8* blockaddress(@indbrtest7, %xblkx.begin6), i8* blockaddress(@indbrtest7, %xblkx.begin7), i8* blockaddress(@indbrtest7, %xblkx.begin8), i8* blockaddress(@indbrtest7, %xblkx.begin9), i8* blockaddress(@indbrtest7, %xblkx.end)]
+
+define void @indbrtest7() {
+escape-string.top:
+  %xval202x = call i32 @xfunc5x()
+  br label %xlab5x
+
+xlab8x:                                           ; preds = %xlab5x
+  %xvaluex = call i32 @xselectorx()
+  %xblkx.x = getelementptr [9 x i8*]* @xblkx.bbs, i32 0, i32 %xvaluex
+  %xblkx.load = load i8** %xblkx.x
+  indirectbr i8* %xblkx.load, [label %xblkx.begin, label %xblkx.begin3, label %xblkx.begin4, label %xblkx.begin5, label %xblkx.begin6, label %xblkx.begin7, label %xblkx.begin8, label %xblkx.begin9, label %xblkx.end]
+
+xblkx.begin:
+  br label %xblkx.end
+
+xblkx.begin3:
+  br label %xblkx.end
+
+xblkx.begin4:
+  br label %xblkx.end
+
+xblkx.begin5:
+  br label %xblkx.end
+
+xblkx.begin6:
+  br label %xblkx.end
+
+xblkx.begin7:
+  br label %xblkx.end
+
+xblkx.begin8:
+  br label %xblkx.end
+
+xblkx.begin9:
+  br label %xblkx.end
+
+xblkx.end:
+  %yes.0 = phi i1 [ false, %xblkx.begin ], [ true, %xlab8x ], [ false, %xblkx.begin9 ], [ false, %xblkx.begin8 ], [ false, %xblkx.begin7 ], [ false, %xblkx.begin6 ], [ false, %xblkx.begin5 ], [ true, %xblkx.begin4 ], [ false, %xblkx.begin3 ]
+  br i1 %yes.0, label %v2j, label %xlab17x
+
+v2j:
+; CHECK: %xunusedx = call i32 @xactionx()
+  %xunusedx = call i32 @xactionx()
+  br label %xlab4x
+
+xlab17x:
+  br label %xlab4x
+
+xlab4x:
+  %incr19 = add i32 %xval704x.0, 1
+  br label %xlab5x
+
+xlab5x:
+  %xval704x.0 = phi i32 [ 0, %escape-string.top ], [ %incr19, %xlab4x ]
+  %xval10x = icmp ult i32 %xval704x.0, %xval202x
+  br i1 %xval10x, label %xlab8x, label %xlab9x
+
+xlab9x:
+  ret void
+}
+
+declare i32 @xfunc5x()
+declare i8 @xfunc7x()
+declare i32 @xselectorx()
+declare i32 @xactionx()
diff --git a/test/Transforms/SimplifyCFG/switch-masked-bits.ll b/test/Transforms/SimplifyCFG/switch-masked-bits.ll
new file mode 100644
index 0000000..fc83ec2
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/switch-masked-bits.ll
@@ -0,0 +1,38 @@
+; RUN: opt -S -simplifycfg < %s | FileCheck %s
+
+define i32 @test1(i32 %x) nounwind {
+  %i = shl i32 %x, 1
+  switch i32 %i, label %a [
+    i32 21, label %b
+    i32 24, label %c
+  ]
+
+a:
+  ret i32 0
+b:
+  ret i32 3
+c:
+  ret i32 5
+; CHECK: @test1
+; CHECK: %cond = icmp eq i32 %i, 24
+; CHECK: %merge = select i1 %cond, i32 5, i32 0
+; CHECK: ret i32 %merge
+}
+
+
+define i32 @test2(i32 %x) nounwind {
+  %i = shl i32 %x, 1
+  switch i32 %i, label %a [
+    i32 21, label %b
+    i32 23, label %c
+  ]
+
+a:
+  ret i32 0
+b:
+  ret i32 3
+c:
+  ret i32 5
+; CHECK: @test2
+; CHECK: ret i32 0
+}
diff --git a/test/Transforms/SimplifyCFG/trap-debugloc.ll b/test/Transforms/SimplifyCFG/trap-debugloc.ll
new file mode 100644
index 0000000..24540e5
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/trap-debugloc.ll
@@ -0,0 +1,19 @@
+; RUN: opt -S -simplifycfg < %s | FileCheck %s
+; Radar 9342286
+; Assign DebugLoc to trap instruction.
+define void @foo() nounwind ssp {
+; CHECK: call void @llvm.trap(), !dbg
+  store i32 42, i32* null, !dbg !5
+  ret void, !dbg !7
+}
+
+!llvm.dbg.sp = !{!0}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata !"foo", metadata !"foo", metadata !"", metadata !1, i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, void ()* @foo} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589865, metadata !"foo.c", metadata !"/private/tmp", metadata !2} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 589841, i32 0, i32 12, metadata !"foo.c", metadata !"/private/tmp", metadata !"Apple clang version 3.0 (tags/Apple/clang-206.1) (based on LLVM 3.0svn)", i1 true, i1 false, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{null}
+!5 = metadata !{i32 4, i32 2, metadata !6, null}
+!6 = metadata !{i32 589835, metadata !0, i32 3, i32 12, metadata !1, i32 0} ; [ DW_TAG_lexical_block ]
+!7 = metadata !{i32 5, i32 1, metadata !6, null}
diff --git a/test/Transforms/TailCallElim/setjmp.ll b/test/Transforms/TailCallElim/setjmp.ll
new file mode 100644
index 0000000..7b7fe56
--- /dev/null
+++ b/test/Transforms/TailCallElim/setjmp.ll
@@ -0,0 +1,16 @@
+; RUN: opt < %s -tailcallelim -S | FileCheck %s
+
+; Test that we don't tail call in a functions that calls setjmp.
+
+; CHECK-NOT: tail call void @bar()
+
+define void @foo(i32* %x) {
+bb:
+  %tmp75 = tail call i32 @setjmp(i32* %x)
+  call void @bar()
+  ret void
+}
+
+declare i32 @setjmp(i32*)
+
+declare void @bar()
diff --git a/test/Transforms/TailDup/X86/dg.exp b/test/Transforms/TailDup/X86/dg.exp
new file mode 100644
index 0000000..7b7bd4e
--- /dev/null
+++ b/test/Transforms/TailDup/X86/dg.exp
@@ -0,0 +1,5 @@
+load_lib llvm.exp
+
+if { [llvm_supports_target X86] } {
+  RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll}]]
+}
diff --git a/test/Transforms/TailDup/if-tail-dup.ll b/test/Transforms/TailDup/X86/if-tail-dup.ll
index 2e4f5be..2e4f5be 100644
--- a/test/Transforms/TailDup/if-tail-dup.ll
+++ b/test/Transforms/TailDup/X86/if-tail-dup.ll
diff --git a/test/Verifier/2005-03-21-UndefinedTypeReference.ll b/test/Verifier/2005-03-21-UndefinedTypeReference.ll
deleted file mode 100644
index 5299397..0000000
--- a/test/Verifier/2005-03-21-UndefinedTypeReference.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: not llvm-as < %s |& grep {use of undefined type named 'InvalidType'}
-
-define void @test() {
-        malloc %InvalidType
-        ret void
-}
-
diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in
index 3588aa6..a61920f 100644
--- a/test/lit.site.cfg.in
+++ b/test/lit.site.cfg.in
@@ -5,6 +5,7 @@ config.llvm_obj_root = "@LLVM_BINARY_DIR@"
 config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
 config.llvmgcc_dir = "@LLVMGCCDIR@"
 config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@"
+config.llvm_build_modes = "@LLVM_BUILD_MODE@".split('+')
 config.python_executable = "@PYTHON_EXECUTABLE@"
 config.enable_shared = @ENABLE_SHARED@
 
diff --git a/tools/bugpoint/ExecutionDriver.cpp b/tools/bugpoint/ExecutionDriver.cpp
index 9be9dfd..77c01ac 100644
--- a/tools/bugpoint/ExecutionDriver.cpp
+++ b/tools/bugpoint/ExecutionDriver.cpp
@@ -475,7 +475,7 @@ bool BugDriver::createReferenceFile(Module *M, const std::string &Filename) {
 /// diffProgram - This method executes the specified module and diffs the
 /// output against the file specified by ReferenceOutputFile.  If the output
 /// is different, 1 is returned.  If there is a problem with the code
-/// generator (e.g., llc crashes), this will return -1 and set Error.
+/// generator (e.g., llc crashes), this will set ErrMsg.
 ///
 bool BugDriver::diffProgram(const Module *Program,
                             const std::string &BitcodeFile,
diff --git a/tools/bugpoint/Miscompilation.cpp b/tools/bugpoint/Miscompilation.cpp
index a9db38f..1834fe1 100644
--- a/tools/bugpoint/Miscompilation.cpp
+++ b/tools/bugpoint/Miscompilation.cpp
@@ -624,9 +624,10 @@ DebugAMiscompilation(BugDriver &BD,
   if (!BugpointIsInterrupted)
     ReduceMiscompilingFunctions(BD, TestFn).reduceList(MiscompiledFunctions,
                                                        Error);
-  if (!Error.empty())
+  if (!Error.empty()) {
+    errs() << "\n***Cannot reduce functions: ";
     return MiscompiledFunctions;
-
+  }
   outs() << "\n*** The following function"
          << (MiscompiledFunctions.size() == 1 ? " is" : "s are")
          << " being miscompiled: ";
diff --git a/tools/bugpoint/OptimizerDriver.cpp b/tools/bugpoint/OptimizerDriver.cpp
index c6be271..336c83d 100644
--- a/tools/bugpoint/OptimizerDriver.cpp
+++ b/tools/bugpoint/OptimizerDriver.cpp
@@ -223,7 +223,7 @@ bool BugDriver::runPasses(Module *Program,
       if (result == -1)
         outs() << "Execute failed: " << ErrMsg << "\n";
       else
-        outs() << "Crashed with signal #" << abs(result) << "\n";
+        outs() << "Crashed: " << ErrMsg << "\n";
     }
     if (result & 0x01000000)
       outs() << "Dumped core\n";
diff --git a/tools/bugpoint/ToolRunner.cpp b/tools/bugpoint/ToolRunner.cpp
index 1719703..0d98262 100644
--- a/tools/bugpoint/ToolRunner.cpp
+++ b/tools/bugpoint/ToolRunner.cpp
@@ -758,8 +758,7 @@ int GCC::ExecuteProgram(const std::string &ProgramFile,
       // For ARM architectures we don't want this flag. bugpoint isn't
       // explicitly told what architecture it is working on, so we get
       // it from gcc flags
-      if ((TargetTriple.getOS() == Triple::Darwin) &&
-          !IsARMArchitecture(GCCArgs))
+      if (TargetTriple.isOSDarwin() && !IsARMArchitecture(GCCArgs))
         GCCArgs.push_back("-force_cpusubtype_ALL");
     }
   }
@@ -855,9 +854,18 @@ int GCC::ExecuteProgram(const std::string &ProgramFile,
 
   if (RemoteClientPath.isEmpty()) {
     DEBUG(errs() << "<run locally>");
-    return RunProgramWithTimeout(OutputBinary, &ProgramArgs[0],
+    int ExitCode = RunProgramWithTimeout(OutputBinary, &ProgramArgs[0],
         sys::Path(InputFile), sys::Path(OutputFile), sys::Path(OutputFile),
         Timeout, MemoryLimit, Error);
+    // Treat a signal (usually SIGSEGV) or timeout as part of the program output
+    // so that crash-causing miscompilation is handled seamlessly.
+    if (ExitCode < -1) {
+      std::ofstream outFile(OutputFile.c_str(), std::ios_base::app);
+      outFile << *Error << '\n';
+      outFile.close();
+      Error->clear();
+    }
+    return ExitCode;
   } else {
     outs() << "<run remotely>"; outs().flush();
     return RunProgramRemotelyWithTimeout(sys::Path(RemoteClientPath),
@@ -900,7 +908,7 @@ int GCC::MakeSharedObject(const std::string &InputFile, FileType fileType,
   GCCArgs.push_back("none");
   if (TargetTriple.getArch() == Triple::sparc)
     GCCArgs.push_back("-G");       // Compile a shared library, `-G' for Sparc
-  else if (TargetTriple.getOS() == Triple::Darwin) {
+  else if (TargetTriple.isOSDarwin()) {
     // link all source files into a single module in data segment, rather than
     // generating blocks. dynamic_lookup requires that you set
     // MACOSX_DEPLOYMENT_TARGET=10.3 in your env.  FIXME: it would be better for
diff --git a/tools/bugpoint/bugpoint.cpp b/tools/bugpoint/bugpoint.cpp
index f9c9e18..e25414f 100644
--- a/tools/bugpoint/bugpoint.cpp
+++ b/tools/bugpoint/bugpoint.cpp
@@ -22,7 +22,7 @@
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/PluginLoader.h"
 #include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/StandardPasses.h"
+#include "llvm/Support/PassManagerBuilder.h"
 #include "llvm/Support/Process.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/Valgrind.h"
@@ -69,6 +69,18 @@ static cl::opt<bool>
 StandardLinkOpts("std-link-opts", 
                  cl::desc("Include the standard link time optimizations"));
 
+static cl::opt<bool>
+OptLevelO1("O1",
+           cl::desc("Optimization level 1. Similar to llvm-gcc -O1"));
+
+static cl::opt<bool>
+OptLevelO2("O2",
+           cl::desc("Optimization level 2. Similar to llvm-gcc -O2"));
+
+static cl::opt<bool>
+OptLevelO3("O3",
+           cl::desc("Optimization level 3. Similar to llvm-gcc -O3"));
+
 static cl::opt<std::string>
 OverrideTriple("mtriple", cl::desc("Override target triple for module"));
 
@@ -83,10 +95,10 @@ static void BugpointInterruptFunction() {
 
 // Hack to capture a pass list.
 namespace {
-  class AddToDriver : public PassManager {
+  class AddToDriver : public FunctionPassManager {
     BugDriver &D;
   public:
-    AddToDriver(BugDriver &_D) : D(_D) {}
+    AddToDriver(BugDriver &_D) : FunctionPassManager(0), D(_D) {}
     
     virtual void add(Pass *P) {
       const void *ID = P->getPassID();
@@ -146,20 +158,32 @@ int main(int argc, char **argv) {
   
   AddToDriver PM(D);
   if (StandardCompileOpts) {
-    createStandardModulePasses(&PM, 3,
-                               /*OptimizeSize=*/ false,
-                               /*UnitAtATime=*/ true,
-                               /*UnrollLoops=*/ true,
-                               /*SimplifyLibCalls=*/ true,
-                               /*HaveExceptions=*/ true,
-                               createFunctionInliningPass());
+    PassManagerBuilder Builder;
+    Builder.OptLevel = 3;
+    Builder.Inliner = createFunctionInliningPass();
+    Builder.populateModulePassManager(PM);
   }
       
-  if (StandardLinkOpts)
-    createStandardLTOPasses(&PM, /*Internalize=*/true,
-                            /*RunInliner=*/true,
-                            /*VerifyEach=*/false);
+  if (StandardLinkOpts) {
+    PassManagerBuilder Builder;
+    Builder.populateLTOPassManager(PM, /*Internalize=*/true,
+                                   /*RunInliner=*/true);
+  }
 
+  if (OptLevelO1 || OptLevelO2 || OptLevelO3) {
+    PassManagerBuilder Builder;
+    if (OptLevelO1)
+      Builder.Inliner = createAlwaysInlinerPass();
+    else if (OptLevelO2)
+      Builder.Inliner = createFunctionInliningPass(225);
+    else
+      Builder.Inliner = createFunctionInliningPass(275);
+
+    // Note that although clang/llvm-gcc use two separate passmanagers
+    // here, it shouldn't normally make a difference.
+    Builder.populateFunctionPassManager(PM);
+    Builder.populateModulePassManager(PM);
+  }
 
   for (std::vector<const PassInfo*>::iterator I = PassList.begin(),
          E = PassList.end();
diff --git a/tools/gold/CMakeLists.txt b/tools/gold/CMakeLists.txt
index d8633e6..eb4b6e6 100644
--- a/tools/gold/CMakeLists.txt
+++ b/tools/gold/CMakeLists.txt
@@ -1,7 +1,9 @@
-set(LLVM_BINUTILS_INCDIR "/usr/include" CACHE PATH
+set(LLVM_BINUTILS_INCDIR "" CACHE PATH
   "PATH to binutils/include containing plugin-api.h for gold plugin.")
 
-if( NOT EXISTS "${LLVM_BINUTILS_INCDIR}/plugin-api.h" )
+if( NOT LLVM_BINUTILS_INCDIR )
+  # Nothing to say.
+elseif( NOT EXISTS "${LLVM_BINUTILS_INCDIR}/plugin-api.h" )
   message(STATUS "plugin-api.h not found. gold plugin excluded from the build.")
 else()
   include_directories( ${LLVM_BINUTILS_INCDIR} )
diff --git a/tools/gold/Makefile b/tools/gold/Makefile
index 66a0271..759406f 100644
--- a/tools/gold/Makefile
+++ b/tools/gold/Makefile
@@ -22,10 +22,10 @@ SHARED_LIBRARY = 1
 LOADABLE_MODULE = 1
 
 LINK_COMPONENTS := support
-LIBS += -llto
 
 # Because off_t is used in the public API, the largefile parts are required for
 # ABI compatibility.
-CXXFLAGS+=-I$(BINUTILS_INCDIR) -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -lLTO
+CXXFLAGS+=-I$(BINUTILS_INCDIR) -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64
+CXXFLAGS+=$(SharedLibDir)/$(SharedPrefix)LTO$(SHLIBEXT)
 
 include $(LEVEL)/Makefile.common
diff --git a/tools/gold/gold-plugin.cpp b/tools/gold/gold-plugin.cpp
index dd66eae..9e43bef 100644
--- a/tools/gold/gold-plugin.cpp
+++ b/tools/gold/gold-plugin.cpp
@@ -408,7 +408,6 @@ static ld_plugin_status all_symbols_read_hook(void) {
     if (options::generate_bc_file == options::BC_ONLY)
       exit(0);
   }
-  size_t bufsize = 0;
   const char *objPath;
   if (lto_codegen_compile_to_file(code_gen, &objPath)) {
     (*message)(LDPL_ERROR, "Could not produce a combined object file\n");
diff --git a/tools/llc/llc.cpp b/tools/llc/llc.cpp
index 7f5fa3f..162d6c8 100644
--- a/tools/llc/llc.cpp
+++ b/tools/llc/llc.cpp
@@ -99,16 +99,14 @@ cl::opt<bool> NoVerify("disable-verify", cl::Hidden,
 cl::opt<bool> DisableDotLoc("disable-dot-loc", cl::Hidden,
                             cl::desc("Do not use .loc entries"));
 
+cl::opt<bool> DisableCFI("disable-cfi", cl::Hidden,
+                         cl::desc("Do not use .cfi_* directives"));
+
 static cl::opt<bool>
 DisableRedZone("disable-red-zone",
   cl::desc("Do not emit code that uses the red zone."),
   cl::init(false));
 
-static cl::opt<bool>
-NoImplicitFloats("no-implicit-float",
-  cl::desc("Don't generate implicit floating point instructions (x86-only)"),
-  cl::init(false));
-
 // GetFileNameRoot - Helper function to get the basename of a filename.
 static inline std::string
 GetFileNameRoot(const std::string &InputFilename) {
@@ -278,18 +276,14 @@ int main(int argc, char **argv) {
 
   if (DisableDotLoc)
     Target.setMCUseLoc(false);
-  if (TheTriple.getOS() == Triple::Darwin) {
-    switch (TheTriple.getDarwinMajorNumber()) {
-    case 7:
-    case 8:
-    case 9:
-      // disable .loc support for older darwin OS.
-      Target.setMCUseLoc(false);
-      break;
-    default:
-      break;
-    }
-  }
+
+  if (DisableCFI)
+    Target.setMCUseCFI(false);
+
+  // Disable .loc support for older OS X versions.
+  if (TheTriple.isMacOSX() &&
+      TheTriple.isMacOSXVersionLT(10, 6))
+    Target.setMCUseLoc(false);
 
   // Figure out where we are going to send the output...
   OwningPtr<tool_output_file> Out
diff --git a/tools/llvm-diff/DifferenceEngine.cpp b/tools/llvm-diff/DifferenceEngine.cpp
index 3cf178e..ba2cec2 100644
--- a/tools/llvm-diff/DifferenceEngine.cpp
+++ b/tools/llvm-diff/DifferenceEngine.cpp
@@ -14,6 +14,7 @@
 
 #include "DifferenceEngine.h"
 
+#include "llvm/Constants.h"
 #include "llvm/Function.h"
 #include "llvm/Instructions.h"
 #include "llvm/Module.h"
@@ -266,7 +267,7 @@ class FunctionDifferenceEngine {
     } else if (isa<PHINode>(L)) {
       // FIXME: implement.
 
-      // This is really wierd;  type uniquing is broken?
+      // This is really weird;  type uniquing is broken?
       if (L->getType() != R->getType()) {
         if (!L->getType()->isPointerTy() || !R->getType()->isPointerTy()) {
           if (Complain) Engine.log("different phi types");
diff --git a/tools/llvm-ld/Optimize.cpp b/tools/llvm-ld/Optimize.cpp
index ef4502b..ca6a477 100644
--- a/tools/llvm-ld/Optimize.cpp
+++ b/tools/llvm-ld/Optimize.cpp
@@ -12,9 +12,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Module.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Support/StandardPasses.h"
+#include "llvm/Support/PassManagerBuilder.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/DynamicLibrary.h"
 #include "llvm/Target/TargetData.h"
@@ -71,11 +70,10 @@ static inline void addPass(PassManager &PM, Pass *P) {
 }
 
 namespace llvm {
-
 /// Optimize - Perform link time optimizations. This will run the scalar
 /// optimizations, any loaded plugin-optimization modules, and then the
 /// inter-procedural optimizations if applicable.
-void Optimize(Module* M) {
+void Optimize(Module *M) {
 
   // Instantiate the pass manager to organize the passes.
   PassManager Passes;
@@ -88,8 +86,8 @@ void Optimize(Module* M) {
   addPass(Passes, new TargetData(M));
 
   if (!DisableOptimizations)
-    createStandardLTOPasses(&Passes, !DisableInternalize, !DisableInline,
-                            VerifyEach);
+    PassManagerBuilder().populateLTOPassManager(Passes, !DisableInternalize,
+                                                !DisableInline);
 
   // If the -s or -S command line options were specified, strip the symbols out
   // of the resulting program to make it smaller.  -s and -S are GNU ld options
diff --git a/tools/llvm-mc/Disassembler.cpp b/tools/llvm-mc/Disassembler.cpp
index 41b92a1..93b9723 100644
--- a/tools/llvm-mc/Disassembler.cpp
+++ b/tools/llvm-mc/Disassembler.cpp
@@ -39,7 +39,7 @@ private:
   const ByteArrayTy &Bytes;
 public:
   VectorMemoryObject(const ByteArrayTy &bytes) : Bytes(bytes) {}
-  
+
   uint64_t getBase() const { return 0; }
   uint64_t getExtent() const { return Bytes.size(); }
 
@@ -57,15 +57,15 @@ static bool PrintInsts(const MCDisassembler &DisAsm,
                        SourceMgr &SM, raw_ostream &Out) {
   // Wrap the vector in a MemoryObject.
   VectorMemoryObject memoryObject(Bytes);
-  
+
   // Disassemble it to strings.
   uint64_t Size;
   uint64_t Index;
-  
+
   for (Index = 0; Index < Bytes.size(); Index += Size) {
     MCInst Inst;
-    
-    if (DisAsm.getInstruction(Inst, Size, memoryObject, Index, 
+
+    if (DisAsm.getInstruction(Inst, Size, memoryObject, Index,
                                /*REMOVE*/ nulls())) {
       Printer.printInst(&Inst, Out);
       Out << "\n";
@@ -76,12 +76,12 @@ static bool PrintInsts(const MCDisassembler &DisAsm,
         Size = 1; // skip illegible bytes
     }
   }
-  
+
   return false;
 }
 
-static bool ByteArrayFromString(ByteArrayTy &ByteArray, 
-                                StringRef &Str, 
+static bool ByteArrayFromString(ByteArrayTy &ByteArray,
+                                StringRef &Str,
                                 SourceMgr &SM) {
   while (!Str.empty()) {
     // Strip horizontal whitespace.
@@ -89,7 +89,7 @@ static bool ByteArrayFromString(ByteArrayTy &ByteArray,
       Str = Str.substr(Pos);
       continue;
     }
-    
+
     // If this is the end of a line or start of a comment, remove the rest of
     // the line.
     if (Str[0] == '\n' || Str[0] == '#') {
@@ -104,11 +104,11 @@ static bool ByteArrayFromString(ByteArrayTy &ByteArray,
       }
       continue;
     }
-    
+
     // Get the current token.
     size_t Next = Str.find_first_of(" \t\n\r#");
     StringRef Value = Str.substr(0, Next);
-    
+
     // Convert to a byte and add to the byte vector.
     unsigned ByteVal;
     if (Value.getAsInteger(0, ByteVal) || ByteVal > 255) {
@@ -119,11 +119,11 @@ static bool ByteArrayFromString(ByteArrayTy &ByteArray,
       ByteArray.clear();
       continue;
     }
-    
+
     ByteArray.push_back(std::make_pair((unsigned char)ByteVal, Value.data()));
     Str = Str.substr(Next);
   }
-  
+
   return false;
 }
 
@@ -133,18 +133,18 @@ int Disassembler::disassemble(const Target &T,  TargetMachine &TM,
                               raw_ostream &Out) {
   // Set up disassembler.
   OwningPtr<const MCAsmInfo> AsmInfo(T.createAsmInfo(Triple));
-  
+
   if (!AsmInfo) {
     errs() << "error: no assembly info for target " << Triple << "\n";
     return -1;
   }
-  
+
   OwningPtr<const MCDisassembler> DisAsm(T.createMCDisassembler());
   if (!DisAsm) {
     errs() << "error: no disassembler for target " << Triple << "\n";
     return -1;
   }
-  
+
   int AsmPrinterVariant = AsmInfo->getAssemblerDialect();
   OwningPtr<MCInstPrinter> IP(T.createMCInstPrinter(TM, AsmPrinterVariant,
                                                     *AsmInfo));
@@ -152,67 +152,67 @@ int Disassembler::disassemble(const Target &T,  TargetMachine &TM,
     errs() << "error: no instruction printer for target " << Triple << '\n';
     return -1;
   }
-  
+
   bool ErrorOccurred = false;
-  
+
   SourceMgr SM;
   SM.AddNewSourceBuffer(&Buffer, SMLoc());
-  
+
   // Convert the input to a vector for disassembly.
   ByteArrayTy ByteArray;
   StringRef Str = Buffer.getBuffer();
-  
+
   ErrorOccurred |= ByteArrayFromString(ByteArray, Str, SM);
-  
+
   if (!ByteArray.empty())
     ErrorOccurred |= PrintInsts(*DisAsm, *IP, ByteArray, SM, Out);
-    
+
   return ErrorOccurred;
 }
 
 static int byteArrayReader(uint8_t *B, uint64_t A, void *Arg) {
   ByteArrayTy &ByteArray = *((ByteArrayTy*)Arg);
-  
+
   if (A >= ByteArray.size())
     return -1;
-  
+
   *B = ByteArray[A].first;
-  
+
   return 0;
 }
 
 static int verboseEvaluator(uint64_t *V, unsigned R, void *Arg) {
   EDDisassembler &disassembler = *(EDDisassembler *)((void **)Arg)[0];
   raw_ostream &Out = *(raw_ostream *)((void **)Arg)[1];
-  
+
   if (const char *regName = disassembler.nameWithRegisterID(R))
     Out << "[" << regName << "/" << R << "]";
-  
+
   if (disassembler.registerIsStackPointer(R))
     Out << "(sp)";
   if (disassembler.registerIsProgramCounter(R))
     Out << "(pc)";
-  
+
   *V = 0;
   return 0;
 }
 
-int Disassembler::disassembleEnhanced(const std::string &TS, 
+int Disassembler::disassembleEnhanced(const std::string &TS,
                                       MemoryBuffer &Buffer,
                                       raw_ostream &Out) {
   ByteArrayTy ByteArray;
   StringRef Str = Buffer.getBuffer();
   SourceMgr SM;
-  
+
   SM.AddNewSourceBuffer(&Buffer, SMLoc());
-  
+
   if (ByteArrayFromString(ByteArray, Str, SM)) {
     return -1;
   }
-  
+
   Triple T(TS);
   EDDisassembler::AssemblySyntax AS;
-  
+
   switch (T.getArch()) {
   default:
     errs() << "error: no default assembly syntax for " << TS.c_str() << "\n";
@@ -226,53 +226,53 @@ int Disassembler::disassembleEnhanced(const std::string &TS,
     AS = EDDisassembler::kEDAssemblySyntaxX86ATT;
     break;
   }
-  
+
   EDDisassembler::initialize();
   OwningPtr<EDDisassembler>
     disassembler(EDDisassembler::getDisassembler(TS.c_str(), AS));
-  
+
   if (disassembler == 0) {
     errs() << "error: couldn't get disassembler for " << TS << '\n';
     return -1;
   }
-  
+
   while (ByteArray.size()) {
     OwningPtr<EDInst>
       inst(disassembler->createInst(byteArrayReader, 0, &ByteArray));
-  
-    ByteArray.erase (ByteArray.begin(), ByteArray.begin() + inst->byteSize());
-                               
+
     if (inst == 0) {
       errs() << "error: Didn't get an instruction\n";
       return -1;
     }
-    
+
+    ByteArray.erase (ByteArray.begin(), ByteArray.begin() + inst->byteSize());
+
     unsigned numTokens = inst->numTokens();
     if ((int)numTokens < 0) {
       errs() << "error: couldn't count the instruction's tokens\n";
       return -1;
     }
-    
+
     for (unsigned tokenIndex = 0; tokenIndex != numTokens; ++tokenIndex) {
       EDToken *token;
-      
+
       if (inst->getToken(token, tokenIndex)) {
         errs() << "error: Couldn't get token\n";
         return -1;
       }
-      
+
       const char *buf;
       if (token->getString(buf)) {
         errs() << "error: Couldn't get string for token\n";
         return -1;
       }
-      
+
       Out << '[';
       int operandIndex = token->operandID();
-      
+
       if (operandIndex >= 0)
         Out << operandIndex << "-";
-      
+
       switch (token->type()) {
       default: Out << "?"; break;
       case EDToken::kTokenWhitespace: Out << "w"; break;
@@ -281,9 +281,9 @@ int Disassembler::disassembleEnhanced(const std::string &TS,
       case EDToken::kTokenLiteral: Out << "l"; break;
       case EDToken::kTokenRegister: Out << "r"; break;
       }
-      
+
       Out << ":" << buf;
-    
+
       if (token->type() == EDToken::kTokenLiteral) {
         Out << "=";
         if (token->literalSign())
@@ -303,33 +303,34 @@ int Disassembler::disassembleEnhanced(const std::string &TS,
         }
         Out << "r" << regID;
       }
-      
+
       Out << "]";
     }
-    
+
     Out << " ";
-      
+
     if (inst->isBranch())
       Out << "<br> ";
     if (inst->isMove())
       Out << "<mov> ";
-    
+
     unsigned numOperands = inst->numOperands();
-    
+
     if ((int)numOperands < 0) {
       errs() << "error: Couldn't count operands\n";
       return -1;
     }
-    
-    for (unsigned operandIndex = 0; operandIndex != numOperands; ++operandIndex) {
+
+    for (unsigned operandIndex = 0; operandIndex != numOperands;
+         ++operandIndex) {
       Out << operandIndex << ":";
-      
+
       EDOperand *operand;
       if (inst->getOperand(operand, operandIndex)) {
         errs() << "error: couldn't get operand\n";
         return -1;
       }
-      
+
       uint64_t evaluatedResult;
       void *Arg[] = { disassembler.get(), &Out };
       if (operand->evaluate(evaluatedResult, verboseEvaluator, Arg)) {
@@ -338,10 +339,10 @@ int Disassembler::disassembleEnhanced(const std::string &TS,
       }
       Out << "=" << evaluatedResult << " ";
     }
-    
+
     Out << '\n';
   }
-  
+
   return 0;
 }
 
diff --git a/tools/llvm-mc/Disassembler.h b/tools/llvm-mc/Disassembler.h
index aaf77b5..d738ee7 100644
--- a/tools/llvm-mc/Disassembler.h
+++ b/tools/llvm-mc/Disassembler.h
@@ -31,12 +31,12 @@ public:
                          const std::string &tripleString,
                          MemoryBuffer &buffer,
                          raw_ostream &Out);
-  
+
   static int disassembleEnhanced(const std::string &tripleString,
                                  MemoryBuffer &buffer,
                                  raw_ostream &Out);
 };
-  
+
 } // namespace llvm
 
 #endif
diff --git a/tools/llvm-mc/llvm-mc.cpp b/tools/llvm-mc/llvm-mc.cpp
index f76b6ed..eb23a1a 100644
--- a/tools/llvm-mc/llvm-mc.cpp
+++ b/tools/llvm-mc/llvm-mc.cpp
@@ -183,10 +183,10 @@ static int AsLexInput(const char *ProgName) {
   MemoryBuffer *Buffer = BufferPtr.take();
 
   SourceMgr SrcMgr;
-  
+
   // Tell SrcMgr about this buffer, which is what TGParser will pick up.
   SrcMgr.AddNewSourceBuffer(Buffer, SMLoc());
-  
+
   // Record the location of the include directories so that the lexer can find
   // it later.
   SrcMgr.setIncludeDirs(IncludeDirs);
@@ -279,7 +279,7 @@ static int AsLexInput(const char *ProgName) {
 
   // Keep output if no errors.
   if (Error == 0) Out->keep();
- 
+
   return Error;
 }
 
@@ -294,20 +294,20 @@ static int AssembleInput(const char *ProgName) {
     return 1;
   }
   MemoryBuffer *Buffer = BufferPtr.take();
-  
+
   SourceMgr SrcMgr;
-  
+
   // Tell SrcMgr about this buffer, which is what the parser will pick up.
   SrcMgr.AddNewSourceBuffer(Buffer, SMLoc());
-  
+
   // Record the location of the include directories so that the lexer can find
   // it later.
   SrcMgr.setIncludeDirs(IncludeDirs);
-  
-  
+
+
   llvm::OwningPtr<MCAsmInfo> MAI(TheTarget->createAsmInfo(TripleName));
   assert(MAI && "Unable to create target asm info!");
-  
+
   // Package up features to be passed to target/subtarget
   std::string FeaturesStr;
   if (MCPU.size()) {
@@ -356,7 +356,8 @@ static int AssembleInput(const char *ProgName) {
       TAB = TheTarget->createAsmBackend(TripleName);
     }
     Str.reset(TheTarget->createAsmStreamer(Ctx, FOS, /*asmverbose*/true,
-                                           /*useLoc*/ true, IP, CE, TAB,
+                                           /*useLoc*/ true,
+                                           /*useCFI*/ true, IP, CE, TAB,
                                            ShowInst));
   } else if (FileType == OFT_Null) {
     Str.reset(createNullStreamer(Ctx));
@@ -377,7 +378,7 @@ static int AssembleInput(const char *ProgName) {
                                                    *Str.get(), *MAI));
   OwningPtr<TargetAsmParser> TAP(TheTarget->createAsmParser(*Parser, *TM));
   if (!TAP) {
-    errs() << ProgName 
+    errs() << ProgName
            << ": error: this target does not support assembly parsing.\n";
     return 1;
   }
@@ -403,7 +404,7 @@ static int DisassembleInput(const char *ProgName, bool Enhanced) {
     errs() << ProgName << ": " << ec.message() << '\n';
     return 1;
   }
-  
+
   OwningPtr<tool_output_file> Out(GetOutputStream());
   if (!Out)
     return 1;
@@ -458,7 +459,7 @@ int main(int argc, char **argv) {
   llvm::InitializeAllAsmPrinters();
   llvm::InitializeAllAsmParsers();
   llvm::InitializeAllDisassemblers();
-  
+
   cl::ParseCommandLineOptions(argc, argv, "llvm machine code playground\n");
   TripleName = Triple::normalize(TripleName);
 
@@ -473,7 +474,7 @@ int main(int argc, char **argv) {
   case AC_EDisassemble:
     return DisassembleInput(argv[0], true);
   }
-  
+
   return 0;
 }
 
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index de01656..a17624a 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -45,7 +45,6 @@
 #include <cctype>
 #include <cerrno>
 #include <cstring>
-#include <vector>
 using namespace llvm;
 using namespace object;
 
diff --git a/tools/llvm-rtdyld/llvm-rtdyld.cpp b/tools/llvm-rtdyld/llvm-rtdyld.cpp
index ddd6683..ec9d652 100644
--- a/tools/llvm-rtdyld/llvm-rtdyld.cpp
+++ b/tools/llvm-rtdyld/llvm-rtdyld.cpp
@@ -24,8 +24,9 @@
 using namespace llvm;
 using namespace llvm::object;
 
-static cl::opt<std::string>
-InputFile(cl::Positional, cl::desc("<input file>"), cl::init("-"));
+static cl::list<std::string>
+InputFileList(cl::Positional, cl::ZeroOrMore,
+              cl::desc("<input file>"));
 
 enum ActionType {
   AC_Execute
@@ -38,15 +39,22 @@ Action(cl::desc("Action to perform:"),
                              "Load, link, and execute the inputs."),
                   clEnumValEnd));
 
+static cl::opt<std::string>
+EntryPoint("entry",
+           cl::desc("Function to call as entry point."),
+           cl::init("_main"));
+
 /* *** */
 
 // A trivial memory manager that doesn't do anything fancy, just uses the
 // support library allocation routines directly.
 class TrivialMemoryManager : public RTDyldMemoryManager {
 public:
+  SmallVector<sys::MemoryBlock, 16> FunctionMemory;
+
   uint8_t *startFunctionBody(const char *Name, uintptr_t &Size);
   void endFunctionBody(const char *Name, uint8_t *FunctionStart,
-                       uint8_t *FunctionEnd) {}
+                       uint8_t *FunctionEnd);
 };
 
 uint8_t *TrivialMemoryManager::startFunctionBody(const char *Name,
@@ -54,6 +62,13 @@ uint8_t *TrivialMemoryManager::startFunctionBody(const char *Name,
   return (uint8_t*)sys::Memory::AllocateRWX(Size, 0, 0).base();
 }
 
+void TrivialMemoryManager::endFunctionBody(const char *Name,
+                                           uint8_t *FunctionStart,
+                                           uint8_t *FunctionEnd) {
+  uintptr_t Size = FunctionEnd - FunctionStart + 1;
+  FunctionMemory.push_back(sys::MemoryBlock(FunctionStart, Size));
+}
+
 static const char *ProgramName;
 
 static void Message(const char *Type, const Twine &Msg) {
@@ -68,40 +83,54 @@ static int Error(const Twine &Msg) {
 /* *** */
 
 static int executeInput() {
-  // Load the input memory buffer.
-  OwningPtr<MemoryBuffer> InputBuffer;
-  if (error_code ec = MemoryBuffer::getFileOrSTDIN(InputFile, InputBuffer))
-    return Error("unable to read input: '" + ec.message() + "'");
-
   // Instantiate a dynamic linker.
-  RuntimeDyld Dyld(new TrivialMemoryManager);
-
-  // Load the object file into it.
-  if (Dyld.loadObject(InputBuffer.take())) {
-    return Error(Dyld.getErrorString());
+  TrivialMemoryManager *MemMgr = new TrivialMemoryManager;
+  RuntimeDyld Dyld(MemMgr);
+
+  // If we don't have any input files, read from stdin.
+  if (!InputFileList.size())
+    InputFileList.push_back("-");
+  for(unsigned i = 0, e = InputFileList.size(); i != e; ++i) {
+    // Load the input memory buffer.
+    OwningPtr<MemoryBuffer> InputBuffer;
+    if (error_code ec = MemoryBuffer::getFileOrSTDIN(InputFileList[i],
+                                                     InputBuffer))
+      return Error("unable to read input: '" + ec.message() + "'");
+
+    // Load the object file into it.
+    if (Dyld.loadObject(InputBuffer.take())) {
+      return Error(Dyld.getErrorString());
+    }
   }
 
-  // Get the address of "_main".
-  uint64_t MainAddress = Dyld.getSymbolAddress("_main");
-  if (MainAddress == 0)
-    return Error("no definition for '_main'");
+  // Resolve all the relocations we can.
+  Dyld.resolveRelocations();
 
-  // Invalidate the instruction cache.
-  sys::MemoryBlock Data = Dyld.getMemoryBlock();
-  sys::Memory::InvalidateInstructionCache(Data.base(), Data.size());
+  // FIXME: Error out if there are unresolved relocations.
 
-  // Make sure the memory is executable.
-  std::string ErrorStr;
-  if (!sys::Memory::setExecutable(Data, &ErrorStr))
-    return Error("unable to mark function executable: '" + ErrorStr + "'");
+  // Get the address of the entry point (_main by default).
+  void *MainAddress = Dyld.getSymbolAddress(EntryPoint);
+  if (MainAddress == 0)
+    return Error("no definition for '" + EntryPoint + "'");
+
+  // Invalidate the instruction cache for each loaded function.
+  for (unsigned i = 0, e = MemMgr->FunctionMemory.size(); i != e; ++i) {
+    sys::MemoryBlock &Data = MemMgr->FunctionMemory[i];
+    // Make sure the memory is executable.
+    std::string ErrorStr;
+    sys::Memory::InvalidateInstructionCache(Data.base(), Data.size());
+    if (!sys::Memory::setExecutable(Data, &ErrorStr))
+      return Error("unable to mark function executable: '" + ErrorStr + "'");
+  }
 
   // Dispatch to _main().
-  errs() << "loaded '_main' at: " << (void*)MainAddress << "\n";
+  errs() << "loaded '" << EntryPoint << "' at: " << (void*)MainAddress << "\n";
 
   int (*Main)(int, const char**) =
     (int(*)(int,const char**)) uintptr_t(MainAddress);
   const char **Argv = new const char*[2];
-  Argv[0] = InputFile.c_str();
+  // Use the name of the first input object module as argv[0] for the target.
+  Argv[0] = InputFileList[0].c_str();
   Argv[1] = 0;
   return Main(1, Argv);
 }
diff --git a/tools/llvmc/doc/LLVMC-Reference.rst b/tools/llvmc/doc/LLVMC-Reference.rst
index ec9098b..041aedf 100644
--- a/tools/llvmc/doc/LLVMC-Reference.rst
+++ b/tools/llvmc/doc/LLVMC-Reference.rst
@@ -18,17 +18,16 @@ Introduction
 ============
 
 LLVMC is a generic compiler driver, designed to be customizable and
-extensible. It plays the same role for LLVM as the ``gcc`` program
-does for GCC - LLVMC's job is essentially to transform a set of input
-files into a set of targets depending on configuration rules and user
-options. What makes LLVMC different is that these transformation rules
-are completely customizable - in fact, LLVMC knows nothing about the
-specifics of transformation (even the command-line options are mostly
-not hard-coded) and regards the transformation structure as an
-abstract graph. The structure of this graph is completely determined
-by plugins, which can be either statically or dynamically linked. This
-makes it possible to easily adapt LLVMC for other purposes - for
-example, as a build tool for game resources.
+extensible. It plays the same role for LLVM as the ``gcc`` program does for
+GCC - LLVMC's job is essentially to transform a set of input files into a set of
+targets depending on configuration rules and user options. What makes LLVMC
+different is that these transformation rules are completely customizable - in
+fact, LLVMC knows nothing about the specifics of transformation (even the
+command-line options are mostly not hard-coded) and regards the transformation
+structure as an abstract graph. The structure of this graph is described in
+high-level TableGen code, from which an efficient C++ representation is
+automatically derived. This makes it possible to adapt LLVMC for other
+purposes - for example, as a build tool for game resources.
 
 Because LLVMC employs TableGen_ as its configuration language, you
 need to be familiar with it to customize LLVMC.
@@ -36,8 +35,8 @@ need to be familiar with it to customize LLVMC.
 .. _TableGen: http://llvm.org/docs/TableGenFundamentals.html
 
 
-Compiling with LLVMC
-====================
+Compiling with ``llvmc``
+========================
 
 LLVMC tries hard to be as compatible with ``gcc`` as possible,
 although there are some small differences. Most of the time, however,
@@ -78,17 +77,13 @@ possible to choose the ``clang`` compiler with the ``-clang`` option.
 Predefined options
 ==================
 
-LLVMC has some built-in options that can't be overridden in the
-configuration libraries:
+LLVMC has some built-in options that can't be overridden in the TableGen code:
 
 * ``-o FILE`` - Output file name.
 
 * ``-x LANGUAGE`` - Specify the language of the following input files
   until the next -x option.
 
-* ``-load PLUGIN_NAME`` - Load the specified plugin DLL. Example:
-  ``-load $LLVM_DIR/Release/lib/LLVMCSimple.so``.
-
 * ``-v`` - Enable verbose mode, i.e. print out all executed commands.
 
 * ``--save-temps`` - Write temporary files to the current directory and do not
@@ -103,124 +98,90 @@ configuration libraries:
   precedence.
 
 * ``--check-graph`` - Check the compilation for common errors like mismatched
-  output/input language names, multiple default edges and cycles. Because of
-  plugins, these checks can't be performed at compile-time. Exit with code zero
-  if no errors were found, and return the number of found errors
-  otherwise. Hidden option, useful for debugging LLVMC plugins.
+  output/input language names, multiple default edges and cycles. Exit with code
+  zero if no errors were found, and return the number of found errors
+  otherwise. Hidden option, useful for debugging.
 
 * ``--view-graph`` - Show a graphical representation of the compilation graph
   and exit. Requires that you have ``dot`` and ``gv`` programs installed. Hidden
-  option, useful for debugging LLVMC plugins.
+  option, useful for debugging.
 
 * ``--write-graph`` - Write a ``compilation-graph.dot`` file in the current
   directory with the compilation graph description in Graphviz format (identical
   to the file used by the ``--view-graph`` option). The ``-o`` option can be
-  used to set the output file name. Hidden option, useful for debugging LLVMC
-  plugins.
+  used to set the output file name. Hidden option, useful for debugging.
 
 * ``--help``, ``--help-hidden``, ``--version`` - These options have
   their standard meaning.
 
-Compiling LLVMC plugins
-=======================
+Compiling LLVMC-based drivers
+=============================
 
-It's easiest to start working on your own LLVMC plugin by copying the
-skeleton project which lives under ``$LLVMC_DIR/plugins/Simple``::
+It's easiest to start working on your own LLVMC driver by copying the skeleton
+project which lives under ``$LLVMC_DIR/examples/Skeleton``::
 
-   $ cd $LLVMC_DIR/plugins
-   $ cp -r Simple MyPlugin
-   $ cd MyPlugin
+   $ cd $LLVMC_DIR/examples
+   $ cp -r Skeleton MyDriver
+   $ cd MyDriver
    $ ls
-   Makefile PluginMain.cpp Simple.td
+   AutoGenerated.td  Hooks.cpp  Main.cpp  Makefile
 
-As you can see, our basic plugin consists of only two files (not
-counting the build script). ``Simple.td`` contains TableGen
-description of the compilation graph; its format is documented in the
-following sections. ``PluginMain.cpp`` is just a helper file used to
-compile the auto-generated C++ code produced from TableGen source. It
-can also contain hook definitions (see `below`__).
+As you can see, our basic driver consists of only three files (not counting the
+build script). ``AutoGenerated.td`` contains TableGen description of the
+compilation graph; its format is documented in the following
+sections. ``Hooks.cpp`` is an empty file that should be used for hook
+definitions (see `below`__). ``Main.cpp`` is just a helper used to compile the
+auto-generated C++ code produced from TableGen source.
 
 __ hooks_
 
-The first thing that you should do is to change the ``LLVMC_PLUGIN``
-variable in the ``Makefile`` to avoid conflicts (since this variable
-is used to name the resulting library)::
-
-   LLVMC_PLUGIN=MyPlugin
-
-It is also a good idea to rename ``Simple.td`` to something less
-generic::
+The first thing that you should do is to change the ``LLVMC_BASED_DRIVER``
+variable in the ``Makefile``::
 
-   $ mv Simple.td MyPlugin.td
+   LLVMC_BASED_DRIVER=MyDriver
 
-To build your plugin as a dynamic library, just ``cd`` to its source
-directory and run ``make``. The resulting file will be called
-``plugin_llvmc_$(LLVMC_PLUGIN).$(DLL_EXTENSION)`` (in our case,
-``plugin_llvmc_MyPlugin.so``). This library can be then loaded in with the
-``-load`` option. Example::
+It can also be a good idea to put your TableGen code into a file with a less
+generic name::
 
-    $ cd $LLVMC_DIR/plugins/Simple
-    $ make
-    $ llvmc -load $LLVM_DIR/Release/lib/plugin_llvmc_Simple.so
+   $ touch MyDriver.td
+   $ vim AutoGenerated.td
+   [...]
+   include "MyDriver.td"
 
-Compiling standalone LLVMC-based drivers
-========================================
+If you have more than one TableGen source file, they all should be included from
+``AutoGenerated.td``, since this file is used by the build system to generate
+C++ code.
 
-By default, the ``llvmc`` executable consists of a driver core plus several
-statically linked plugins (``Base`` and ``Clang`` at the moment). You can
-produce a standalone LLVMC-based driver executable by linking the core with your
-own plugins. The recommended way to do this is by starting with the provided
-``Skeleton`` example (``$LLVMC_DIR/example/Skeleton``)::
-
-    $ cd $LLVMC_DIR/example/
-    $ cp -r Skeleton mydriver
-    $ cd mydriver
-    $ vim Makefile
-    [...]
-    $ make
+To build your driver, just ``cd`` to its source directory and run ``make``. The
+resulting executable will be put into ``$LLVM_OBJ_DIR/$(BuildMode)/bin``.
 
 If you're compiling LLVM with different source and object directories, then you
 must perform the following additional steps before running ``make``::
 
     # LLVMC_SRC_DIR = $LLVM_SRC_DIR/tools/llvmc/
     # LLVMC_OBJ_DIR = $LLVM_OBJ_DIR/tools/llvmc/
-    $ cp $LLVMC_SRC_DIR/example/mydriver/Makefile \
-      $LLVMC_OBJ_DIR/example/mydriver/
-    $ cd $LLVMC_OBJ_DIR/example/mydriver
+    $ mkdir $LLVMC_OBJ_DIR/examples/MyDriver/
+    $ cp $LLVMC_SRC_DIR/examples/MyDriver/Makefile \
+      $LLVMC_OBJ_DIR/examples/MyDriver/
+    $ cd $LLVMC_OBJ_DIR/examples/MyDriver
     $ make
 
-Another way to do the same thing is by using the following command::
-
-    $ cd $LLVMC_DIR
-    $ make LLVMC_BUILTIN_PLUGINS=MyPlugin LLVMC_BASED_DRIVER_NAME=mydriver
-
-This works with both srcdir == objdir and srcdir != objdir, but assumes that the
-plugin source directory was placed under ``$LLVMC_DIR/plugins``.
-
-Sometimes, you will want a 'bare-bones' version of LLVMC that has no
-built-in plugins. It can be compiled with the following command::
-
-    $ cd $LLVMC_DIR
-    $ make LLVMC_BUILTIN_PLUGINS=""
-
 
 Customizing LLVMC: the compilation graph
 ========================================
 
-Each TableGen configuration file should include the common
-definitions::
+Each TableGen configuration file should include the common definitions::
 
    include "llvm/CompilerDriver/Common.td"
 
-Internally, LLVMC stores information about possible source
-transformations in form of a graph. Nodes in this graph represent
-tools, and edges between two nodes represent a transformation path. A
-special "root" node is used to mark entry points for the
-transformations. LLVMC also assigns a weight to each edge (more on
-this later) to choose between several alternative edges.
+Internally, LLVMC stores information about possible source transformations in
+form of a graph. Nodes in this graph represent tools, and edges between two
+nodes represent a transformation path. A special "root" node is used to mark
+entry points for the transformations. LLVMC also assigns a weight to each edge
+(more on this later) to choose between several alternative edges.
 
-The definition of the compilation graph (see file
-``plugins/Base/Base.td`` for an example) is just a list of edges::
+The definition of the compilation graph (see file ``llvmc/src/Base.td`` for an
+example) is just a list of edges::
 
     def CompilationGraph : CompilationGraph<[
         Edge<"root", "llvm_gcc_c">,
@@ -245,43 +206,37 @@ The definition of the compilation graph (see file
 
         ]>;
 
-As you can see, the edges can be either default or optional, where
-optional edges are differentiated by an additional ``case`` expression
-used to calculate the weight of this edge. Notice also that we refer
-to tools via their names (as strings). This makes it possible to add
-edges to an existing compilation graph in plugins without having to
-know about all tool definitions used in the graph.
-
-The default edges are assigned a weight of 1, and optional edges get a
-weight of 0 + 2*N where N is the number of tests that evaluated to
-true in the ``case`` expression. It is also possible to provide an
-integer parameter to ``inc_weight`` and ``dec_weight`` - in this case,
-the weight is increased (or decreased) by the provided value instead
-of the default 2. It is also possible to change the default weight of
-an optional edge by using the ``default`` clause of the ``case``
+As you can see, the edges can be either default or optional, where optional
+edges are differentiated by an additional ``case`` expression used to calculate
+the weight of this edge. Notice also that we refer to tools via their names (as
+strings). This makes it possible to add edges to an existing compilation graph
+without having to know about all tool definitions used in the graph.
+
+The default edges are assigned a weight of 1, and optional edges get a weight of
+0 + 2*N where N is the number of tests that evaluated to true in the ``case``
+expression. It is also possible to provide an integer parameter to
+``inc_weight`` and ``dec_weight`` - in this case, the weight is increased (or
+decreased) by the provided value instead of the default 2. Default weight of an
+optional edge can be changed by using the ``default`` clause of the ``case``
 construct.
 
-When passing an input file through the graph, LLVMC picks the edge
-with the maximum weight. To avoid ambiguity, there should be only one
-default edge between two nodes (with the exception of the root node,
-which gets a special treatment - there you are allowed to specify one
-default edge *per language*).
+When passing an input file through the graph, LLVMC picks the edge with the
+maximum weight. To avoid ambiguity, there should be only one default edge
+between two nodes (with the exception of the root node, which gets a special
+treatment - there you are allowed to specify one default edge *per language*).
 
-When multiple plugins are loaded, their compilation graphs are merged
-together. Since multiple edges that have the same end nodes are not
-allowed (i.e. the graph is not a multigraph), an edge defined in
-several plugins will be replaced by the definition from the plugin
-that was loaded last. Plugin load order can be controlled by using the
-plugin priority feature described above.
+When multiple compilation graphs are defined, they are merged together. Multiple
+edges with the same end nodes are not allowed (i.e. the graph is not a
+multigraph), and will lead to a compile-time error.
 
-To get a visual representation of the compilation graph (useful for
-debugging), run ``llvmc --view-graph``. You will need ``dot`` and
-``gsview`` installed for this to work properly.
+To get a visual representation of the compilation graph (useful for debugging),
+run ``llvmc --view-graph``. You will need ``dot`` and ``gsview`` installed for
+this to work properly.
 
 Describing options
 ==================
 
-Command-line options that the plugin supports are defined by using an
+Command-line options supported by the driver are defined by using an
 ``OptionList``::
 
     def Options : OptionList<[
@@ -290,11 +245,10 @@ Command-line options that the plugin supports are defined by using an
     ...
     ]>;
 
-As you can see, the option list is just a list of DAGs, where each DAG
-is an option description consisting of the option name and some
-properties. A plugin can define more than one option list (they are
-all merged together in the end), which can be handy if one wants to
-separate option groups syntactically.
+As you can see, the option list is just a list of DAGs, where each DAG is an
+option description consisting of the option name and some properties. More than
+one option list can be defined (they are all merged together in the end), which
+can be handy if one wants to separate option groups syntactically.
 
 * Possible option types:
 
@@ -306,7 +260,7 @@ separate option groups syntactically.
      sign: ``-std c99``. At most one occurrence is allowed.
 
    - ``parameter_list_option`` - same as the above, but more than one option
-     occurence is allowed.
+     occurrence is allowed.
 
    - ``prefix_option`` - same as the parameter_option, but the option name and
      argument do not have to be separated. Example: ``-ofile``. This can be also
@@ -314,7 +268,7 @@ separate option groups syntactically.
      (``=file`` will be interpreted as option value). At most one occurrence is
      allowed.
 
-   - ``prefix_list_option`` - same as the above, but more than one occurence of
+   - ``prefix_list_option`` - same as the above, but more than one occurrence of
      the option is allowed; example: ``-lm -lpthread``.
 
    - ``alias_option`` - a special option type for creating aliases. Unlike other
@@ -380,42 +334,17 @@ separate option groups syntactically.
      Usage examples: ``(switch_option "foo", (init true))``; ``(prefix_option
      "bar", (init "baz"))``.
 
-   - ``extern`` - this option is defined in some other plugin, see `below`__.
-
-   __ extern_
-
-.. _extern:
-
-External options
-----------------
-
-Sometimes, when linking several plugins together, one plugin needs to
-access options defined in some other plugin. Because of the way
-options are implemented, such options must be marked as
-``extern``. This is what the ``extern`` option property is
-for. Example::
-
-     ...
-     (switch_option "E", (extern))
-     ...
-
-If an external option has additional attributes besides 'extern', they are
-ignored. See also the section on plugin `priorities`__.
-
-__ priorities_
-
 .. _case:
 
 Conditional evaluation
 ======================
 
-The 'case' construct is the main means by which programmability is
-achieved in LLVMC. It can be used to calculate edge weights, program
-actions and modify the shell commands to be executed. The 'case'
-expression is designed after the similarly-named construct in
-functional languages and takes the form ``(case (test_1), statement_1,
-(test_2), statement_2, ... (test_N), statement_N)``. The statements
-are evaluated only if the corresponding tests evaluate to true.
+The 'case' construct is the main means by which programmability is achieved in
+LLVMC. It can be used to calculate edge weights, program actions and modify the
+shell commands to be executed. The 'case' expression is designed after the
+similarly-named construct in functional languages and takes the form ``(case
+(test_1), statement_1, (test_2), statement_2, ... (test_N), statement_N)``. The
+statements are evaluated only if the corresponding tests evaluate to true.
 
 Examples::
 
@@ -439,20 +368,19 @@ Examples::
         (switch_on "B"), "cmdline2",
         (default), "cmdline3")
 
-Note the slight difference in 'case' expression handling in contexts
-of edge weights and command line specification - in the second example
-the value of the ``"B"`` switch is never checked when switch ``"A"`` is
-enabled, and the whole expression always evaluates to ``"cmdline1"`` in
-that case.
+Note the slight difference in 'case' expression handling in contexts of edge
+weights and command line specification - in the second example the value of the
+``"B"`` switch is never checked when switch ``"A"`` is enabled, and the whole
+expression always evaluates to ``"cmdline1"`` in that case.
 
 Case expressions can also be nested, i.e. the following is legal::
 
     (case (switch_on "E"), (case (switch_on "o"), ..., (default), ...)
           (default), ...)
 
-You should, however, try to avoid doing that because it hurts
-readability. It is usually better to split tool descriptions and/or
-use TableGen inheritance instead.
+You should, however, try to avoid doing that because it hurts readability. It is
+usually better to split tool descriptions and/or use TableGen inheritance
+instead.
 
 * Possible tests are:
 
@@ -526,72 +454,75 @@ use TableGen inheritance instead.
     Example: ``(not (or (test1), (test2), ... (testN)))``.
 
 
-
 Writing a tool description
 ==========================
 
-As was said earlier, nodes in the compilation graph represent tools,
-which are described separately. A tool definition looks like this
-(taken from the ``include/llvm/CompilerDriver/Tools.td`` file)::
+As was said earlier, nodes in the compilation graph represent tools, which are
+described separately. A tool definition looks like this (taken from the
+``llvmc/src/Base.td`` file)::
 
   def llvm_gcc_cpp : Tool<[
       (in_language "c++"),
       (out_language "llvm-assembler"),
       (output_suffix "bc"),
-      (cmd_line "llvm-g++ -c $INFILE -o $OUTFILE -emit-llvm"),
+      (command "llvm-g++ -c -emit-llvm"),
       (sink)
       ]>;
 
 This defines a new tool called ``llvm_gcc_cpp``, which is an alias for
-``llvm-g++``. As you can see, a tool definition is just a list of
-properties; most of them should be self-explanatory. The ``sink``
-property means that this tool should be passed all command-line
-options that aren't mentioned in the option list.
+``llvm-g++``. As you can see, a tool definition is just a list of properties;
+most of them should be self-explanatory. The ``sink`` property means that this
+tool should be passed all command-line options that aren't mentioned in the
+option list.
 
 The complete list of all currently implemented tool properties follows.
 
 * Possible tool properties:
 
   - ``in_language`` - input language name. Can be given multiple arguments, in
-    case the tool supports multiple input languages.
+    case the tool supports multiple input languages. Used for typechecking and
+    mapping file extensions to tools.
 
   - ``out_language`` - output language name. Multiple output languages are
-    allowed.
+    allowed. Used for typechecking the compilation graph.
 
-  - ``output_suffix`` - output file suffix. Can also be changed
-    dynamically, see documentation on actions.
+  - ``output_suffix`` - output file suffix. Can also be changed dynamically, see
+    documentation on `actions`__.
+
+__ actions_
 
-  - ``cmd_line`` - the actual command used to run the tool. You can
-    use ``$INFILE`` and ``$OUTFILE`` variables, output redirection
-    with ``>``, hook invocations (``$CALL``), environment variables
+  - ``command`` - the actual command used to run the tool. You can use output
+    redirection with ``>``, hook invocations (``$CALL``), environment variables
     (via ``$ENV``) and the ``case`` construct.
 
-  - ``join`` - this tool is a "join node" in the graph, i.e. it gets a
-    list of input files and joins them together. Used for linkers.
+  - ``join`` - this tool is a "join node" in the graph, i.e. it gets a list of
+    input files and joins them together. Used for linkers.
 
-  - ``sink`` - all command-line options that are not handled by other
-    tools are passed to this tool.
+  - ``sink`` - all command-line options that are not handled by other tools are
+    passed to this tool.
 
-  - ``actions`` - A single big ``case`` expression that specifies how
-    this tool reacts on command-line options (described in more detail
-    `below`__).
+  - ``actions`` - A single big ``case`` expression that specifies how this tool
+    reacts on command-line options (described in more detail `below`__).
 
 __ actions_
 
+  - ``out_file_option``, ``in_file_option`` - Options appended to the
+    ``command`` string to designate output and input files. Default values are
+    ``"-o"`` and ``""``, respectively.
+
 .. _actions:
 
 Actions
 -------
 
-A tool often needs to react to command-line options, and this is
-precisely what the ``actions`` property is for. The next example
-illustrates this feature::
+A tool often needs to react to command-line options, and this is precisely what
+the ``actions`` property is for. The next example illustrates this feature::
 
   def llvm_gcc_linker : Tool<[
       (in_language "object-code"),
       (out_language "executable"),
       (output_suffix "out"),
-      (cmd_line "llvm-gcc $INFILE -o $OUTFILE"),
+      (command "llvm-gcc"),
       (join),
       (actions (case (not_empty "L"), (forward "L"),
                      (not_empty "l"), (forward "l"),
@@ -599,18 +530,17 @@ illustrates this feature::
                                [(append_cmd "-dummy1"), (append_cmd "-dummy2")])
       ]>;
 
-The ``actions`` tool property is implemented on top of the omnipresent
-``case`` expression. It associates one or more different *actions*
-with given conditions - in the example, the actions are ``forward``,
-which forwards a given option unchanged, and ``append_cmd``, which
-appends a given string to the tool execution command. Multiple actions
-can be associated with a single condition by using a list of actions
-(used in the example to append some dummy options). The same ``case``
-construct can also be used in the ``cmd_line`` property to modify the
-tool command line.
+The ``actions`` tool property is implemented on top of the omnipresent ``case``
+expression. It associates one or more different *actions* with given
+conditions - in the example, the actions are ``forward``, which forwards a given
+option unchanged, and ``append_cmd``, which appends a given string to the tool
+execution command. Multiple actions can be associated with a single condition by
+using a list of actions (used in the example to append some dummy options). The
+same ``case`` construct can also be used in the ``cmd_line`` property to modify
+the tool command line.
 
-The "join" property used in the example means that this tool behaves
-like a linker.
+The "join" property used in the example means that this tool behaves like a
+linker.
 
 The list of all possible actions follows.
 
@@ -656,10 +586,10 @@ The list of all possible actions follows.
 Language map
 ============
 
-If you are adding support for a new language to LLVMC, you'll need to
-modify the language map, which defines mappings from file extensions
-to language names. It is used to choose the proper toolchain(s) for a
-given input file set. Language map definition looks like this::
+If you are adding support for a new language to LLVMC, you'll need to modify the
+language map, which defines mappings from file extensions to language names. It
+is used to choose the proper toolchain(s) for a given input file set. Language
+map definition looks like this::
 
     def LanguageMap : LanguageMap<
         [LangToSuffixes<"c++", ["cc", "cp", "cxx", "cpp", "CPP", "c++", "C"]>,
@@ -673,9 +603,7 @@ For example, without those definitions the following command wouldn't work::
     llvmc: Unknown suffix: cpp
 
 The language map entries are needed only for the tools that are linked from the
-root node. Since a tool can't have multiple output languages, for inner nodes of
-the graph the input and output languages should match. This is enforced at
-compile-time.
+root node. A tool can have multiple output languages.
 
 Option preprocessor
 ===================
@@ -686,7 +614,7 @@ implemented as switches, we might want to output a warning if the user invokes
 the driver with both of these options enabled.
 
 The ``OptionPreprocessor`` feature is reserved specially for these
-occasions. Example (adapted from the built-in Base plugin)::
+occasions. Example (adapted from ``llvm/src/Base.td.in``)::
 
 
     def Preprocess : OptionPreprocessor<
@@ -705,7 +633,7 @@ that they are not forwarded to the compiler. If no optimization options are
 specified, ``-O2`` is enabled.
 
 ``OptionPreprocessor`` is basically a single big ``case`` expression, which is
-evaluated only once right after the plugin is loaded. The only allowed actions
+evaluated only once right after the driver is started. The only allowed actions
 in ``OptionPreprocessor`` are ``error``, ``warning``, and two special actions:
 ``unset_option`` and ``set_option``. As their names suggest, they can be used to
 set or unset a given option. To set an option with ``set_option``, use the
@@ -726,30 +654,28 @@ More advanced topics
 Hooks and environment variables
 -------------------------------
 
-Normally, LLVMC executes programs from the system ``PATH``. Sometimes,
-this is not sufficient: for example, we may want to specify tool paths
-or names in the configuration file. This can be easily achieved via
-the hooks mechanism. To write your own hooks, just add their
-definitions to the ``PluginMain.cpp`` or drop a ``.cpp`` file into the
-your plugin directory. Hooks should live in the ``hooks`` namespace
-and have the signature ``std::string hooks::MyHookName ([const char*
-Arg0 [ const char* Arg2 [, ...]]])``. They can be used from the
-``cmd_line`` tool property::
+Normally, LLVMC searches for programs in the system ``PATH``. Sometimes, this is
+not sufficient: for example, we may want to specify tool paths or names in the
+configuration file. This can be achieved via the hooks mechanism. To write your
+own hooks, add their definitions to the ``Hooks.cpp`` or drop a ``.cpp`` file
+into your driver directory. Hooks should live in the ``hooks`` namespace and
+have the signature ``std::string hooks::MyHookName ([const char* Arg0 [ const
+char* Arg2 [, ...]]])``. They can be used from the ``command`` tool property::
 
-    (cmd_line "$CALL(MyHook)/path/to/file -o $CALL(AnotherHook)")
+    (command "$CALL(MyHook)/path/to/file -o $CALL(AnotherHook)")
 
 To pass arguments to hooks, use the following syntax::
 
-    (cmd_line "$CALL(MyHook, 'Arg1', 'Arg2', 'Arg # 3')/path/to/file -o1 -o2")
+    (command "$CALL(MyHook, 'Arg1', 'Arg2', 'Arg # 3')/path/to/file -o1 -o2")
 
 It is also possible to use environment variables in the same manner::
 
-   (cmd_line "$ENV(VAR1)/path/to/file -o $ENV(VAR2)")
+   (command "$ENV(VAR1)/path/to/file -o $ENV(VAR2)")
 
 To change the command line string based on user-provided options use
 the ``case`` expression (documented `above`__)::
 
-    (cmd_line
+    (command
       (case
         (switch_on "E"),
            "llvm-g++ -E -x c $INFILE -o $OUTFILE",
@@ -758,42 +684,21 @@ the ``case`` expression (documented `above`__)::
 
 __ case_
 
-.. _priorities:
-
-How plugins are loaded
-----------------------
-
-It is possible for LLVMC plugins to depend on each other. For example,
-one can create edges between nodes defined in some other plugin. To
-make this work, however, that plugin should be loaded first. To
-achieve this, the concept of plugin priority was introduced. By
-default, every plugin has priority zero; to specify the priority
-explicitly, put the following line in your plugin's TableGen file::
-
-    def Priority : PluginPriority<$PRIORITY_VALUE>;
-    # Where PRIORITY_VALUE is some integer > 0
-
-Plugins are loaded in order of their (increasing) priority, starting
-with 0. Therefore, the plugin with the highest priority value will be
-loaded last.
-
 Debugging
 ---------
 
-When writing LLVMC plugins, it can be useful to get a visual view of
-the resulting compilation graph. This can be achieved via the command
-line option ``--view-graph``. This command assumes that Graphviz_ and
-Ghostview_ are installed. There is also a ``--write-graph`` option that
-creates a Graphviz source file (``compilation-graph.dot``) in the
-current directory.
-
-Another useful ``llvmc`` option is ``--check-graph``. It checks the
-compilation graph for common errors like mismatched output/input
-language names, multiple default edges and cycles. These checks can't
-be performed at compile-time because the plugins can load code
-dynamically. When invoked with ``--check-graph``, ``llvmc`` doesn't
-perform any compilation tasks and returns the number of encountered
-errors as its status code.
+When writing LLVMC-based drivers, it can be useful to get a visual view of the
+resulting compilation graph. This can be achieved via the command line option
+``--view-graph`` (which assumes that Graphviz_ and Ghostview_ are
+installed). There is also a ``--write-graph`` option that creates a Graphviz
+source file (``compilation-graph.dot``) in the current directory.
+
+Another useful ``llvmc`` option is ``--check-graph``. It checks the compilation
+graph for common errors like mismatched output/input language names, multiple
+default edges and cycles. When invoked with ``--check-graph``, ``llvmc`` doesn't
+perform any compilation tasks and returns the number of encountered errors as
+its status code. In the future, these checks will be performed at compile-time
+and this option will disappear.
 
 .. _Graphviz: http://www.graphviz.org/
 .. _Ghostview: http://pages.cs.wisc.edu/~ghost/
@@ -821,7 +726,7 @@ accessible only in the C++ code (i.e. hooks). Use the following code::
 
 In general, you're encouraged not to make the behaviour dependent on the
 executable file name, and use command-line switches instead. See for example how
-the ``Base`` plugin behaves when it needs to choose the correct linker options
+the ``llvmc`` program behaves when it needs to choose the correct linker options
 (think ``g++`` vs. ``gcc``).
 
 .. raw:: html
diff --git a/tools/llvmc/doc/LLVMC-Tutorial.rst b/tools/llvmc/doc/LLVMC-Tutorial.rst
index e7e8f08..fc4c124 100644
--- a/tools/llvmc/doc/LLVMC-Tutorial.rst
+++ b/tools/llvmc/doc/LLVMC-Tutorial.rst
@@ -17,59 +17,54 @@ Tutorial - Using LLVMC
 Introduction
 ============
 
-LLVMC is a generic compiler driver, which plays the same role for LLVM
-as the ``gcc`` program does for GCC - the difference being that LLVMC
-is designed to be more adaptable and easier to customize. Most of
-LLVMC functionality is implemented via plugins, which can be loaded
-dynamically or compiled in. This tutorial describes the basic usage
-and configuration of LLVMC.
+LLVMC is a generic compiler driver, which plays the same role for LLVM as the
+``gcc`` program does for GCC - the difference being that LLVMC is designed to be
+more adaptable and easier to customize. Most of LLVMC functionality is
+implemented via high-level TableGen code, from which a corresponding C++ source
+file is automatically generated. This tutorial describes the basic usage and
+configuration of LLVMC.
 
 
-Compiling with LLVMC
-====================
+Using the ``llvmc`` program
+===========================
 
-In general, LLVMC tries to be command-line compatible with ``gcc`` as
-much as possible, so most of the familiar options work::
+In general, ``llvmc`` tries to be command-line compatible with ``gcc`` as much
+as possible, so most of the familiar options work::
 
      $ llvmc -O3 -Wall hello.cpp
      $ ./a.out
      hello
 
-This will invoke ``llvm-g++`` under the hood (you can see which
-commands are executed by using the ``-v`` option). For further help on
-command-line LLVMC usage, refer to the ``llvmc --help`` output.
+This will invoke ``llvm-g++`` under the hood (you can see which commands are
+executed by using the ``-v`` option). For further help on command-line LLVMC
+usage, refer to the ``llvmc --help`` output.
 
 
 Using LLVMC to generate toolchain drivers
 =========================================
 
-LLVMC plugins are written mostly using TableGen_, so you need to
-be familiar with it to get anything done.
+LLVMC-based drivers are written mostly using TableGen_, so you need to be
+familiar with it to get anything done.
 
 .. _TableGen: http://llvm.org/docs/TableGenFundamentals.html
 
 Start by compiling ``example/Simple``, which is a primitive wrapper for
 ``gcc``::
 
-    $ cd $LLVM_DIR/tools/llvmc
-    $ cp -r example/Simple plugins/Simple
-
-      # NB: A less verbose way to compile standalone LLVMC-based drivers is
-      # described in the reference manual.
-
-    $ make LLVMC_BASED_DRIVER_NAME=mygcc LLVMC_BUILTIN_PLUGINS=Simple
+    $ cd $LLVM_OBJ_DIR/tools/examples/Simple
+    $ make
     $ cat > hello.c
-    [...]
-    $ mygcc hello.c
+    #include <stdio.h>
+    int main() { printf("Hello\n"); }
+    $ $LLVM_BIN_DIR/Simple -v hello.c
+    gcc hello.c -o hello.out
     $ ./hello.out
     Hello
 
-Here we link our plugin with the LLVMC core statically to form an executable
-file called ``mygcc``. It is also possible to build our plugin as a dynamic
-library to be loaded by the ``llvmc`` executable (or any other LLVMC-based
-standalone driver); this is described in the reference manual.
-
-Contents of the file ``Simple.td`` look like this::
+We have thus produced a simple driver called, appropriately, ``Simple``, from
+the input TableGen file ``Simple.td``. The ``llvmc`` program itself is generated
+using a similar process (see ``llvmc/src``). Contents of the file ``Simple.td``
+look like this::
 
     // Include common definitions
     include "llvm/CompilerDriver/Common.td"
@@ -79,37 +74,40 @@ Contents of the file ``Simple.td`` look like this::
     [(in_language "c"),
      (out_language "executable"),
      (output_suffix "out"),
-     (cmd_line "gcc $INFILE -o $OUTFILE"),
-     (sink)
+     (command "gcc"),
+     (sink),
+
+     // -o is what is used by default, out_file_option here is included for
+     // instructive purposes.
+     (out_file_option "-o")
     ]>;
 
     // Language map
-    def LanguageMap : LanguageMap<[LangToSuffixes<"c", ["c"]>]>;
+    def LanguageMap : LanguageMap<[(lang_to_suffixes "c", "c")]>;
 
     // Compilation graph
-    def CompilationGraph : CompilationGraph<[Edge<"root", "gcc">]>;
+    def CompilationGraph : CompilationGraph<[(edge "root", "gcc")]>;
 
-As you can see, this file consists of three parts: tool descriptions,
-language map, and the compilation graph definition.
+As you can see, this file consists of three parts: tool descriptions, language
+map, and the compilation graph definition.
 
-At the heart of LLVMC is the idea of a compilation graph: vertices in
-this graph are tools, and edges represent a transformation path
-between two tools (for example, assembly source produced by the
-compiler can be transformed into executable code by an assembler). The
-compilation graph is basically a list of edges; a special node named
-``root`` is used to mark graph entry points.
+At the heart of LLVMC is the idea of a compilation graph: vertices in this graph
+are tools, and edges represent a transformation path between two tools (for
+example, assembly source produced by the compiler can be transformed into
+executable code by an assembler). The compilation graph is basically a list of
+edges; a special node named ``root`` is used to mark graph entry points.
 
-Tool descriptions are represented as property lists: most properties
-in the example above should be self-explanatory; the ``sink`` property
-means that all options lacking an explicit description should be
-forwarded to this tool.
+Tool descriptions are represented as property lists: most properties in the
+example above should be self-explanatory; the ``sink`` property means that all
+options lacking an explicit description should be forwarded to this tool.
 
-The ``LanguageMap`` associates a language name with a list of suffixes
-and is used for deciding which toolchain corresponds to a given input
-file.
+The ``LanguageMap`` associates a language name with a list of suffixes and is
+used for deciding which toolchain corresponds to a given input file.
 
-To learn more about LLVMC customization, refer to the reference
-manual and plugin source code in the ``plugins`` directory.
+To learn more about writing your own drivers with LLVMC, refer to the reference
+manual and examples in the ``examples`` directory. Of a particular interest is
+the ``Skeleton`` example, which can serve as a template for your LLVMC-based
+drivers.
 
 .. raw:: html
 
diff --git a/tools/llvmc/examples/Skeleton/README b/tools/llvmc/examples/Skeleton/README
index 61ff6fb..282ee15 100644
--- a/tools/llvmc/examples/Skeleton/README
+++ b/tools/llvmc/examples/Skeleton/README
@@ -1,6 +1,6 @@
 
 This is a template that can be used to create your own LLVMC-based drivers. Just
 copy the `Skeleton` directory to the location of your preference and edit
-`Skeleton/Makefile` and `Skeleton/AutoGenerated.inc`.
+`Skeleton/Makefile` and `Skeleton/AutoGenerated.td`.
 
 The build system assumes that your project is based on LLVM.
diff --git a/tools/llvmc/src/Base.td.in b/tools/llvmc/src/Base.td.in
index 50533f1..84e39e7 100644
--- a/tools/llvmc/src/Base.td.in
+++ b/tools/llvmc/src/Base.td.in
@@ -191,7 +191,7 @@ class llvm_gcc_based <string cmd, string in_lang, string E_ext, dag out_lang,
          // ('-S' && '-emit-llvm') && !('opt') -> output .ll
          (and (switch_on "emit-llvm", "S"), (not (switch_on "opt"))),
               [(forward "S"), (output_suffix "ll")],
-         // Ususally just output .bc
+         // Usually just output .bc
          (not (switch_on "fsyntax-only")),
               [(append_cmd "-c"), (append_cmd "-emit-llvm")],
 
@@ -301,7 +301,7 @@ def llc : Tool<
 [(in_language "llvm-bitcode", "llvm-assembler"),
  (out_language "assembler"),
  (output_suffix "s"),
- (command "llc"),
+ (command "llc -disable-cfi"),
  (actions (case
           (switch_on "S"), (stop_compilation),
           (switch_on "O0"), (forward "O0"),
diff --git a/tools/lto/LTOCodeGenerator.cpp b/tools/lto/LTOCodeGenerator.cpp
index d95f354..3abd641 100644
--- a/tools/lto/LTOCodeGenerator.cpp
+++ b/tools/lto/LTOCodeGenerator.cpp
@@ -14,7 +14,6 @@
 
 #include "LTOModule.h"
 #include "LTOCodeGenerator.h"
-
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Linker.h"
@@ -37,7 +36,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/StandardPasses.h"
+#include "llvm/Support/PassManagerBuilder.h"
 #include "llvm/Support/SystemUtils.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Support/Host.h"
@@ -355,9 +354,8 @@ void LTOCodeGenerator::applyScopeRestrictions() {
 }
 
 /// Optimize merged modules using various IPO passes
-bool LTOCodeGenerator::generateObjectFile(raw_ostream& out,
-                                          std::string& errMsg)
-{
+bool LTOCodeGenerator::generateObjectFile(raw_ostream &out,
+                                          std::string &errMsg) {
     if ( this->determineTarget(errMsg) ) 
         return true;
 
@@ -380,13 +378,13 @@ bool LTOCodeGenerator::generateObjectFile(raw_ostream& out,
     // Add an appropriate TargetData instance for this module...
     passes.add(new TargetData(*_target->getTargetData()));
     
-    createStandardLTOPasses(&passes, /*Internalize=*/ false, !DisableInline,
-                            /*VerifyEach=*/ false);
+    PassManagerBuilder().populateLTOPassManager(passes, /*Internalize=*/ false,
+                                                !DisableInline);
 
     // Make sure everything is still good.
     passes.add(createVerifierPass());
 
-    FunctionPassManager* codeGenPasses = new FunctionPassManager(mergedModule);
+    FunctionPassManager *codeGenPasses = new FunctionPassManager(mergedModule);
 
     codeGenPasses->add(new TargetData(*_target->getTargetData()));
 
diff --git a/tools/lto/LTOModule.cpp b/tools/lto/LTOModule.cpp
index 78e6f28..8f2b1f4 100644
--- a/tools/lto/LTOModule.cpp
+++ b/tools/lto/LTOModule.cpp
@@ -155,7 +155,7 @@ LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer,
   if (!march)
     return NULL;
 
-  // construct LTModule, hand over ownership of module and target
+  // construct LTOModule, hand over ownership of module and target
   SubtargetFeatures Features;
   Features.getDefaultSubtargetFeatures("" /* cpu */, llvm::Triple(Triple));
   std::string FeatureStr = Features.getString();
@@ -582,11 +582,9 @@ namespace {
                                 uint64_t Size, unsigned ByteAlignment) {}
     virtual void EmitBytes(StringRef Data, unsigned AddrSpace) {}
     virtual void EmitValueImpl(const MCExpr *Value, unsigned Size,
-                               bool isPCRel, unsigned AddrSpace) {}
-    virtual void EmitULEB128Value(const MCExpr *Value,
-                                  unsigned AddrSpace = 0) {}
-    virtual void EmitSLEB128Value(const MCExpr *Value,
-                                  unsigned AddrSpace = 0) {}
+                               unsigned AddrSpace) {}
+    virtual void EmitULEB128Value(const MCExpr *Value) {}
+    virtual void EmitSLEB128Value(const MCExpr *Value) {}
     virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
                                       unsigned ValueSize,
                                       unsigned MaxBytesToEmit) {}
diff --git a/tools/lto/lto.cpp b/tools/lto/lto.cpp
index fe19921..dd658d1 100644
--- a/tools/lto/lto.cpp
+++ b/tools/lto/lto.cpp
@@ -281,7 +281,7 @@ bool lto_codegen_write_merged_modules(lto_code_gen_t cg, const char* path)
 
 //
 // Generates code for all added modules into one native object file.
-// On sucess returns a pointer to a generated mach-o/ELF buffer and
+// On success returns a pointer to a generated mach-o/ELF buffer and
 // length set to the buffer size.  The buffer is owned by the 
 // lto_code_gen_t and will be freed when lto_codegen_dispose()
 // is called, or lto_codegen_compile() is called again.
diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp
index 25474c4..aa375c5 100644
--- a/tools/opt/opt.cpp
+++ b/tools/opt/opt.cpp
@@ -35,7 +35,7 @@
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/PluginLoader.h"
 #include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/StandardPasses.h"
+#include "llvm/Support/PassManagerBuilder.h"
 #include "llvm/Support/SystemUtils.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/LinkAllPasses.h"
@@ -387,10 +387,12 @@ struct BreakpointPrinter : public ModulePass {
     AU.setPreservesAll();
   }
 };
+ 
+} // anonymous namespace
 
 char BreakpointPrinter::ID = 0;
 
-inline void addPass(PassManagerBase &PM, Pass *P) {
+static inline void addPass(PassManagerBase &PM, Pass *P) {
   // Add the pass to the pass manager...
   PM.add(P);
 
@@ -403,31 +405,30 @@ inline void addPass(PassManagerBase &PM, Pass *P) {
 /// duplicates llvm-gcc behaviour.
 ///
 /// OptLevel - Optimization Level
-void AddOptimizationPasses(PassManagerBase &MPM, PassManagerBase &FPM,
-                           unsigned OptLevel) {
-  createStandardFunctionPasses(&FPM, OptLevel);
+static void AddOptimizationPasses(PassManagerBase &MPM,FunctionPassManager &FPM,
+                                  unsigned OptLevel) {
+  PassManagerBuilder Builder;
+  Builder.OptLevel = OptLevel;
 
-  llvm::Pass *InliningPass = 0;
   if (DisableInline) {
     // No inlining pass
-  } else if (OptLevel) {
+  } else if (OptLevel > 1) {
     unsigned Threshold = 225;
     if (OptLevel > 2)
       Threshold = 275;
-    InliningPass = createFunctionInliningPass(Threshold);
+    Builder.Inliner = createFunctionInliningPass(Threshold);
   } else {
-    InliningPass = createAlwaysInlinerPass();
+    Builder.Inliner = createAlwaysInlinerPass();
   }
-  createStandardModulePasses(&MPM, OptLevel,
-                             /*OptimizeSize=*/ false,
-                             UnitAtATime,
-                             /*UnrollLoops=*/ OptLevel > 1,
-                             !DisableSimplifyLibCalls,
-                             /*HaveExceptions=*/ true,
-                             InliningPass);
+  Builder.DisableUnitAtATime = !UnitAtATime;
+  Builder.DisableUnrollLoops = OptLevel == 0;
+  Builder.DisableSimplifyLibCalls = DisableSimplifyLibCalls;
+  
+  Builder.populateFunctionPassManager(FPM);
+  Builder.populateModulePassManager(MPM);
 }
 
-void AddStandardCompilePasses(PassManagerBase &PM) {
+static void AddStandardCompilePasses(PassManagerBase &PM) {
   PM.add(createVerifierPass());                  // Verify that input is correct
 
   addPass(PM, createLowerSetJmpPass());          // Lower llvm.setjmp/.longjmp
@@ -438,19 +439,16 @@ void AddStandardCompilePasses(PassManagerBase &PM) {
 
   if (DisableOptimizations) return;
 
-  llvm::Pass *InliningPass = !DisableInline ? createFunctionInliningPass() : 0;
-
   // -std-compile-opts adds the same module passes as -O3.
-  createStandardModulePasses(&PM, 3,
-                             /*OptimizeSize=*/ false,
-                             /*UnitAtATime=*/ true,
-                             /*UnrollLoops=*/ true,
-                             !DisableSimplifyLibCalls,
-                             /*HaveExceptions=*/ true,
-                             InliningPass);
+  PassManagerBuilder Builder;
+  if (!DisableInline)
+    Builder.Inliner = createFunctionInliningPass();
+  Builder.OptLevel = 3;
+  Builder.DisableSimplifyLibCalls = DisableSimplifyLibCalls;
+  Builder.populateModulePassManager(PM);
 }
 
-void AddStandardLinkPasses(PassManagerBase &PM) {
+static void AddStandardLinkPasses(PassManagerBase &PM) {
   PM.add(createVerifierPass());                  // Verify that input is correct
 
   // If the -strip-debug command line option was specified, do it.
@@ -459,13 +457,11 @@ void AddStandardLinkPasses(PassManagerBase &PM) {
 
   if (DisableOptimizations) return;
 
-  createStandardLTOPasses(&PM, /*Internalize=*/ !DisableInternalize,
-                          /*RunInliner=*/ !DisableInline,
-                          /*VerifyEach=*/ VerifyEach);
+  PassManagerBuilder Builder;
+  Builder.populateLTOPassManager(PM, /*Internalize=*/ !DisableInternalize,
+                                 /*RunInliner=*/ !DisableInline);
 }
 
-} // anonymous namespace
-
 
 //===----------------------------------------------------------------------===//
 // main for opt
@@ -566,9 +562,9 @@ int main(int argc, char **argv) {
   if (TD)
     Passes.add(TD);
 
-  OwningPtr<PassManager> FPasses;
+  OwningPtr<FunctionPassManager> FPasses;
   if (OptLevelO1 || OptLevelO2 || OptLevelO3) {
-    FPasses.reset(new PassManager());
+    FPasses.reset(new FunctionPassManager(M.get()));
     if (TD)
       FPasses->add(new TargetData(*TD));
   }
@@ -686,8 +682,12 @@ int main(int argc, char **argv) {
   if (OptLevelO3)
     AddOptimizationPasses(Passes, *FPasses, 3);
 
-  if (OptLevelO1 || OptLevelO2 || OptLevelO3)
-    FPasses->run(*M.get());
+  if (OptLevelO1 || OptLevelO2 || OptLevelO3) {
+    FPasses->doInitialization();
+    for (Module::iterator F = M->begin(), E = M->end(); F != E; ++F)
+      FPasses->run(*F);
+    FPasses->doFinalization();
+  }
 
   // Check that the module is well formed on completion of optimization
   if (!NoVerify && !VerifyEach)
diff --git a/unittests/ADT/APIntTest.cpp b/unittests/ADT/APIntTest.cpp
index dbd0cb7..1f78cd3 100644
--- a/unittests/ADT/APIntTest.cpp
+++ b/unittests/ADT/APIntTest.cpp
@@ -320,6 +320,52 @@ TEST(APIntTest, StringBitsNeeded16) {
   EXPECT_EQ(9U, APInt::getBitsNeeded("-20", 16));
 }
 
+TEST(APIntTest, toString) {
+  SmallString<16> S;
+  bool isSigned;
+
+  APInt(8, 0).toString(S, 2, true, true);
+  EXPECT_EQ(S.str().str(), "0b0");
+  S.clear();
+  APInt(8, 0).toString(S, 8, true, true);
+  EXPECT_EQ(S.str().str(), "00");
+  S.clear();
+  APInt(8, 0).toString(S, 10, true, true);
+  EXPECT_EQ(S.str().str(), "0");
+  S.clear();
+  APInt(8, 0).toString(S, 16, true, true);
+  EXPECT_EQ(S.str().str(), "0x0");
+  S.clear();
+
+  isSigned = false;
+  APInt(8, 255, isSigned).toString(S, 2, isSigned, true);
+  EXPECT_EQ(S.str().str(), "0b11111111");
+  S.clear();
+  APInt(8, 255, isSigned).toString(S, 8, isSigned, true);
+  EXPECT_EQ(S.str().str(), "0377");
+  S.clear();
+  APInt(8, 255, isSigned).toString(S, 10, isSigned, true);
+  EXPECT_EQ(S.str().str(), "255");
+  S.clear();
+  APInt(8, 255, isSigned).toString(S, 16, isSigned, true);
+  EXPECT_EQ(S.str().str(), "0xFF");
+  S.clear();
+
+  isSigned = true;
+  APInt(8, 255, isSigned).toString(S, 2, isSigned, true);
+  EXPECT_EQ(S.str().str(), "-0b1");
+  S.clear();
+  APInt(8, 255, isSigned).toString(S, 8, isSigned, true);
+  EXPECT_EQ(S.str().str(), "-01");
+  S.clear();
+  APInt(8, 255, isSigned).toString(S, 10, isSigned, true);
+  EXPECT_EQ(S.str().str(), "-1");
+  S.clear();
+  APInt(8, 255, isSigned).toString(S, 16, isSigned, true);
+  EXPECT_EQ(S.str().str(), "-0x1");
+  S.clear();
+}
+
 TEST(APIntTest, Log2) {
   EXPECT_EQ(APInt(15, 7).logBase2(), 2U);
   EXPECT_EQ(APInt(15, 7).ceilLogBase2(), 3U);
diff --git a/unittests/ADT/PackedVectorTest.cpp b/unittests/ADT/PackedVectorTest.cpp
new file mode 100644
index 0000000..55b5d8d
--- /dev/null
+++ b/unittests/ADT/PackedVectorTest.cpp
@@ -0,0 +1,115 @@
+//===- llvm/unittest/ADT/PackedVectorTest.cpp - PackedVector tests --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// BitVectorTest tests fail on PowerPC for unknown reasons, so disable this
+// as well since it depends on a BitVector.
+#ifndef __ppc__
+
+#include "llvm/ADT/PackedVector.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+TEST(PackedVectorTest, Operation) {
+  PackedVector<unsigned, 2> Vec;
+  EXPECT_EQ(0U, Vec.size());
+  EXPECT_TRUE(Vec.empty());
+
+  Vec.resize(5);
+  EXPECT_EQ(5U, Vec.size());
+  EXPECT_FALSE(Vec.empty());
+
+  Vec.resize(11);
+  EXPECT_EQ(11U, Vec.size());
+  EXPECT_FALSE(Vec.empty());
+
+  PackedVector<unsigned, 2> Vec2(3);
+  EXPECT_EQ(3U, Vec2.size());
+  EXPECT_FALSE(Vec2.empty());
+
+  Vec.clear();
+  EXPECT_EQ(0U, Vec.size());
+  EXPECT_TRUE(Vec.empty());
+
+  Vec.push_back(2);
+  Vec.push_back(0);
+  Vec.push_back(1);
+  Vec.push_back(3);
+
+  EXPECT_EQ(2U, Vec[0]);
+  EXPECT_EQ(0U, Vec[1]);
+  EXPECT_EQ(1U, Vec[2]);
+  EXPECT_EQ(3U, Vec[3]);
+
+  EXPECT_FALSE(Vec == Vec2);
+  EXPECT_TRUE(Vec != Vec2);
+
+  Vec2.swap(Vec);
+  EXPECT_EQ(3U, Vec.size());
+  EXPECT_FALSE(Vec.empty());
+  EXPECT_EQ(0U, Vec[0]);
+  EXPECT_EQ(0U, Vec[1]);
+  EXPECT_EQ(0U, Vec[2]);
+
+  EXPECT_EQ(2U, Vec2[0]);
+  EXPECT_EQ(0U, Vec2[1]);
+  EXPECT_EQ(1U, Vec2[2]);
+  EXPECT_EQ(3U, Vec2[3]);
+
+  Vec = Vec2;
+  EXPECT_TRUE(Vec == Vec2);
+  EXPECT_FALSE(Vec != Vec2);
+
+  Vec[1] = 1;
+  Vec2[1] = 2;
+  Vec |= Vec2;
+  EXPECT_EQ(3U, Vec[1]);
+}
+
+#ifdef EXPECT_DEBUG_DEATH
+
+TEST(PackedVectorTest, UnsignedValues) {
+  PackedVector<unsigned, 2> Vec(1);
+  Vec[0] = 0;
+  Vec[0] = 1;
+  Vec[0] = 2;
+  Vec[0] = 3;
+  EXPECT_DEBUG_DEATH(Vec[0] = 4, "value is too big");
+  EXPECT_DEBUG_DEATH(Vec[0] = -1, "value is too big");
+  EXPECT_DEBUG_DEATH(Vec[0] = 0x100, "value is too big");
+
+  PackedVector<unsigned, 3> Vec2(1);
+  Vec2[0] = 0;
+  Vec2[0] = 7;
+  EXPECT_DEBUG_DEATH(Vec[0] = 8, "value is too big");
+}
+
+TEST(PackedVectorTest, SignedValues) {
+  PackedVector<signed, 2> Vec(1);
+  Vec[0] = -2;
+  Vec[0] = -1;
+  Vec[0] = 0;
+  Vec[0] = 1;
+  EXPECT_DEBUG_DEATH(Vec[0] = -3, "value is too big");
+  EXPECT_DEBUG_DEATH(Vec[0] = 2, "value is too big");
+
+  PackedVector<signed, 3> Vec2(1);
+  Vec2[0] = -4;
+  Vec2[0] = 3;
+  EXPECT_DEBUG_DEATH(Vec[0] = -5, "value is too big");
+  EXPECT_DEBUG_DEATH(Vec[0] = 4, "value is too big");
+}
+
+#endif
+
+}
+
+#endif
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index da4a652..0d17c40 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -20,7 +20,7 @@ set_target_properties(UnitTests PROPERTIES FOLDER "Tests")
 
 include_directories(${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include)
 add_definitions(-DGTEST_HAS_RTTI=0)
-if( CMAKE_COMPILER_IS_GNUCXX )
+if( LLVM_COMPILER_IS_GCC_COMPATIBLE )
   llvm_replace_compiler_option(CMAKE_CXX_FLAGS "-frtti" "-fno-rtti")
 elseif( MSVC )
   llvm_replace_compiler_option(CMAKE_CXX_FLAGS "/GR" "/GR-")
@@ -64,6 +64,7 @@ add_llvm_unittest(ADT
   ADT/ImmutableSetTest.cpp
   ADT/IntEqClassesTest.cpp
   ADT/IntervalMapTest.cpp
+  ADT/PackedVectorTest.cpp
   ADT/SmallBitVectorTest.cpp
   ADT/SmallStringTest.cpp
   ADT/SmallVectorTest.cpp
diff --git a/unittests/ExecutionEngine/JIT/JITMemoryManagerTest.cpp b/unittests/ExecutionEngine/JIT/JITMemoryManagerTest.cpp
index ff5af3b..0bc1966 100644
--- a/unittests/ExecutionEngine/JIT/JITMemoryManagerTest.cpp
+++ b/unittests/ExecutionEngine/JIT/JITMemoryManagerTest.cpp
@@ -14,6 +14,7 @@
 #include "llvm/Function.h"
 #include "llvm/GlobalValue.h"
 #include "llvm/LLVMContext.h"
+#include "llvm/ADT/ArrayRef.h"
 
 using namespace llvm;
 
diff --git a/unittests/Support/IRBuilderTest.cpp b/unittests/Support/IRBuilderTest.cpp
new file mode 100644
index 0000000..5d635ae
--- /dev/null
+++ b/unittests/Support/IRBuilderTest.cpp
@@ -0,0 +1,70 @@
+//===- llvm/unittest/Support/IRBuilderTest.cpp - IRBuilder tests ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Function.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/OwningPtr.h"
+
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+class IRBuilderTest : public testing::Test {
+protected:
+  virtual void SetUp() {
+    M.reset(new Module("MyModule", getGlobalContext()));
+    FunctionType *FTy = FunctionType::get(Type::getVoidTy(getGlobalContext()),
+                                          /*isVarArg=*/false);
+    Function *F = Function::Create(FTy, Function::ExternalLinkage, "", M.get());
+    BB = BasicBlock::Create(getGlobalContext(), "", F);
+  }
+
+  virtual void TearDown() {
+    BB = 0;
+    M.reset();
+  }
+
+  OwningPtr<Module> M;
+  BasicBlock *BB;
+};
+
+TEST_F(IRBuilderTest, Lifetime) {
+  IRBuilder<> Builder(BB);
+  AllocaInst *Var1 = Builder.CreateAlloca(Builder.getInt8Ty());
+  AllocaInst *Var2 = Builder.CreateAlloca(Builder.getInt32Ty());
+  AllocaInst *Var3 = Builder.CreateAlloca(Builder.getInt8Ty(),
+                                          Builder.getInt32(123));
+
+  CallInst *Start1 = Builder.CreateLifetimeStart(Var1);
+  CallInst *Start2 = Builder.CreateLifetimeStart(Var2);
+  CallInst *Start3 = Builder.CreateLifetimeStart(Var3, Builder.getInt64(100));
+
+  EXPECT_EQ(Start1->getArgOperand(0), Builder.getInt64(-1));
+  EXPECT_EQ(Start2->getArgOperand(0), Builder.getInt64(-1));
+  EXPECT_EQ(Start3->getArgOperand(0), Builder.getInt64(100));
+
+  EXPECT_EQ(Start1->getArgOperand(1), Var1);
+  EXPECT_NE(Start2->getArgOperand(1), Var2);
+  EXPECT_EQ(Start3->getArgOperand(1), Var3);
+
+  Value *End1 = Builder.CreateLifetimeEnd(Var1);
+  Builder.CreateLifetimeEnd(Var2);
+  Builder.CreateLifetimeEnd(Var3);
+
+  IntrinsicInst *II_Start1 = dyn_cast<IntrinsicInst>(Start1);
+  IntrinsicInst *II_End1 = dyn_cast<IntrinsicInst>(End1);
+  ASSERT_TRUE(II_Start1 != NULL);
+  EXPECT_EQ(II_Start1->getIntrinsicID(), Intrinsic::lifetime_start);
+  ASSERT_TRUE(II_End1 != NULL);
+  EXPECT_EQ(II_End1->getIntrinsicID(), Intrinsic::lifetime_end);
+}
diff --git a/unittests/Support/TypeBuilderTest.cpp b/unittests/Support/TypeBuilderTest.cpp
index e805827..5a82883 100644
--- a/unittests/Support/TypeBuilderTest.cpp
+++ b/unittests/Support/TypeBuilderTest.cpp
@@ -9,6 +9,7 @@
 
 #include "llvm/Support/TypeBuilder.h"
 #include "llvm/LLVMContext.h"
+#include "llvm/ADT/ArrayRef.h"
 
 #include "gtest/gtest.h"
 
diff --git a/unittests/Transforms/Utils/Cloning.cpp b/unittests/Transforms/Utils/Cloning.cpp
index b65ac34..1ce549d 100644
--- a/unittests/Transforms/Utils/Cloning.cpp
+++ b/unittests/Transforms/Utils/Cloning.cpp
@@ -9,6 +9,7 @@
 
 #include "gtest/gtest.h"
 #include "llvm/Argument.h"
+#include "llvm/Constant.h"
 #include "llvm/Instructions.h"
 #include "llvm/LLVMContext.h"
 #include "llvm/ADT/SmallPtrSet.h"
diff --git a/unittests/VMCore/InstructionsTest.cpp b/unittests/VMCore/InstructionsTest.cpp
index d286c73..9624b81 100644
--- a/unittests/VMCore/InstructionsTest.cpp
+++ b/unittests/VMCore/InstructionsTest.cpp
@@ -9,6 +9,7 @@
 
 #include "llvm/Instructions.h"
 #include "llvm/BasicBlock.h"
+#include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/LLVMContext.h"
 #include "llvm/ADT/STLExtras.h"
@@ -113,11 +114,19 @@ TEST(InstructionsTest, CastInst) {
   const Type* Int8Ty = Type::getInt8Ty(C);
   const Type* Int64Ty = Type::getInt64Ty(C);
   const Type* V8x8Ty = VectorType::get(Int8Ty, 8);
+  const Type* V8x64Ty = VectorType::get(Int64Ty, 8);
   const Type* X86MMXTy = Type::getX86_MMXTy(C);
 
+  const Constant* c8 = Constant::getNullValue(V8x8Ty);
+  const Constant* c64 = Constant::getNullValue(V8x64Ty);
+
   EXPECT_TRUE(CastInst::isCastable(V8x8Ty, X86MMXTy));
   EXPECT_TRUE(CastInst::isCastable(X86MMXTy, V8x8Ty));
   EXPECT_FALSE(CastInst::isCastable(Int64Ty, X86MMXTy));
+  EXPECT_TRUE(CastInst::isCastable(V8x64Ty, V8x8Ty));
+  EXPECT_TRUE(CastInst::isCastable(V8x8Ty, V8x64Ty));
+  EXPECT_EQ(CastInst::getCastOpcode(c64, true, V8x8Ty, true), CastInst::Trunc);
+  EXPECT_EQ(CastInst::getCastOpcode(c8, true, V8x64Ty, true), CastInst::SExt);
 }
 
 }  // end anonymous namespace
diff --git a/unittests/VMCore/MetadataTest.cpp b/unittests/VMCore/MetadataTest.cpp
index 942b848..0b2c012 100644
--- a/unittests/VMCore/MetadataTest.cpp
+++ b/unittests/VMCore/MetadataTest.cpp
@@ -87,10 +87,10 @@ TEST_F(MDNodeTest, Simple) {
   V.push_back(CI);
   V.push_back(s2);
 
-  MDNode *n1 = MDNode::get(Context, &V[0], 3);
+  MDNode *n1 = MDNode::get(Context, V);
   Value *const c1 = n1;
-  MDNode *n2 = MDNode::get(Context, &c1, 1);
-  MDNode *n3 = MDNode::get(Context, &V[0], 3);
+  MDNode *n2 = MDNode::get(Context, c1);
+  MDNode *n3 = MDNode::get(Context, V);
   EXPECT_NE(n1, n2);
 #ifdef ENABLE_MDNODE_UNIQUING
   EXPECT_EQ(n1, n3);
@@ -112,7 +112,7 @@ TEST_F(MDNodeTest, Delete) {
   Instruction *I = new BitCastInst(C, Type::getInt32Ty(getGlobalContext()));
 
   Value *const V = I;
-  MDNode *n = MDNode::get(Context, &V, 1);
+  MDNode *n = MDNode::get(Context, V);
   WeakVH wvh = n;
 
   EXPECT_EQ(n, wvh);
@@ -127,8 +127,8 @@ TEST(NamedMDNodeTest, Search) {
 
   Value *const V = C;
   Value *const V2 = C2;
-  MDNode *n = MDNode::get(Context, &V, 1);
-  MDNode *n2 = MDNode::get(Context, &V2, 1);
+  MDNode *n = MDNode::get(Context, V);
+  MDNode *n2 = MDNode::get(Context, V2);
 
   Module M("MyModule", Context);
   const char *Name = "llvm.NMD1";
diff --git a/unittests/VMCore/ValueMapTest.cpp b/unittests/VMCore/ValueMapTest.cpp
index 152e8ea..b493920 100644
--- a/unittests/VMCore/ValueMapTest.cpp
+++ b/unittests/VMCore/ValueMapTest.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/ValueMap.h"
+#include "llvm/Constants.h"
 #include "llvm/Instructions.h"
 #include "llvm/LLVMContext.h"
 #include "llvm/ADT/OwningPtr.h"
diff --git a/utils/DSAextract.py b/utils/DSAextract.py
index 134e945..89dece1 100644
--- a/utils/DSAextract.py
+++ b/utils/DSAextract.py
@@ -58,7 +58,7 @@ node_set = set()
 #read the file one line at a time
 buffer = input.readline()
 while buffer != '':
-	#filter out the unecessary checks on all the edge lines
+	#filter out the unnecessary checks on all the edge lines
 	if not arrowexp.search(buffer):
 		#check to see if this is a node we are looking for
 		for regexp in regexp_list:
diff --git a/utils/FileCheck/FileCheck.cpp b/utils/FileCheck/FileCheck.cpp
index 5d4cb0c..f225594 100644
--- a/utils/FileCheck/FileCheck.cpp
+++ b/utils/FileCheck/FileCheck.cpp
@@ -131,26 +131,34 @@ bool Pattern::ParsePattern(StringRef PatternStr, SourceMgr &SM) {
   }
 
   // Paren value #0 is for the fully matched string.  Any new parenthesized
-  // values add from their.
+  // values add from there.
   unsigned CurParen = 1;
 
   // Otherwise, there is at least one regex piece.  Build up the regex pattern
   // by escaping scary characters in fixed strings, building up one big regex.
   while (!PatternStr.empty()) {
     // RegEx matches.
-    if (PatternStr.size() >= 2 &&
-        PatternStr[0] == '{' && PatternStr[1] == '{') {
+    if (PatternStr.startswith("{{")) {
 
       // Otherwise, this is the start of a regex match.  Scan for the }}.
       size_t End = PatternStr.find("}}");
       if (End == StringRef::npos) {
         SM.PrintMessage(SMLoc::getFromPointer(PatternStr.data()),
-                        "found start of regex string with no end '}}'", "error");
+                        "found start of regex string with no end '}}'","error");
         return true;
       }
 
+      // Enclose {{}} patterns in parens just like [[]] even though we're not
+      // capturing the result for any purpose.  This is required in case the
+      // expression contains an alternation like: CHECK:  abc{{x|z}}def.  We
+      // want this to turn into: "abc(x|z)def" not "abcx|zdef".
+      RegExStr += '(';
+      ++CurParen;
+
       if (AddRegExToRegEx(PatternStr.substr(2, End-2), CurParen, SM))
         return true;
+      RegExStr += ')';
+
       PatternStr = PatternStr.substr(End+2);
       continue;
     }
@@ -160,8 +168,7 @@ bool Pattern::ParsePattern(StringRef PatternStr, SourceMgr &SM) {
     // second form is [[foo]] which is a reference to foo.  The variable name
     // itself must be of the form "[a-zA-Z_][0-9a-zA-Z_]*", otherwise we reject
     // it.  This is to catch some common errors.
-    if (PatternStr.size() >= 2 &&
-        PatternStr[0] == '[' && PatternStr[1] == '[') {
+    if (PatternStr.startswith("[[")) {
       // Verify that it is terminated properly.
       size_t End = PatternStr.find("]]");
       if (End == StringRef::npos) {
@@ -185,10 +192,7 @@ bool Pattern::ParsePattern(StringRef PatternStr, SourceMgr &SM) {
 
       // Verify that the name is well formed.
       for (unsigned i = 0, e = Name.size(); i != e; ++i)
-        if (Name[i] != '_' &&
-            (Name[i] < 'a' || Name[i] > 'z') &&
-            (Name[i] < 'A' || Name[i] > 'Z') &&
-            (Name[i] < '0' || Name[i] > '9')) {
+        if (Name[i] != '_' && !isalnum(Name[i])) {
           SM.PrintMessage(SMLoc::getFromPointer(Name.data()+i),
                           "invalid name in named regex", "error");
           return true;
diff --git a/utils/KillTheDoctor/KillTheDoctor.cpp b/utils/KillTheDoctor/KillTheDoctor.cpp
index 7a89dd3..1ddae0b 100644
--- a/utils/KillTheDoctor/KillTheDoctor.cpp
+++ b/utils/KillTheDoctor/KillTheDoctor.cpp
@@ -169,14 +169,14 @@ namespace {
 static error_code GetFileNameFromHandle(HANDLE FileHandle,
                                         std::string& Name) {
   char Filename[MAX_PATH+1];
-  bool Sucess = false;
+  bool Success = false;
   Name.clear();
 
   // Get the file size.
   LARGE_INTEGER FileSize;
-  Sucess = ::GetFileSizeEx(FileHandle, &FileSize);
+  Success = ::GetFileSizeEx(FileHandle, &FileSize);
 
-  if (!Sucess)
+  if (!Success)
     return windows_error(::GetLastError());
 
   // Create a file mapping object.
@@ -198,12 +198,12 @@ static error_code GetFileNameFromHandle(HANDLE FileHandle,
   if (!MappedFile)
     return windows_error(::GetLastError());
 
-  Sucess = ::GetMappedFileNameA(::GetCurrentProcess(),
+  Success = ::GetMappedFileNameA(::GetCurrentProcess(),
                                 MappedFile,
                                 Filename,
                                 array_lengthof(Filename) - 1);
 
-  if (!Sucess)
+  if (!Success)
     return windows_error(::GetLastError());
   else {
     Name = Filename;
diff --git a/utils/NewNightlyTest.pl b/utils/NewNightlyTest.pl
index 1b48168..da806e9 100755
--- a/utils/NewNightlyTest.pl
+++ b/utils/NewNightlyTest.pl
@@ -794,7 +794,7 @@ my %hash_of_data = (
   'endtime' => $endtime,
   'target_triple' => $targetTriple,
 
-  # Unused, but left around for backwards compatability.
+  # Unused, but left around for backwards compatibility.
   'warnings' => "",
   'cvsusercommitlist' => "",
   'cvsuserupdatelist' => "",
diff --git a/utils/TableGen/ARMDecoderEmitter.cpp b/utils/TableGen/ARMDecoderEmitter.cpp
index e48ac1e..62bd1c6 100644
--- a/utils/TableGen/ARMDecoderEmitter.cpp
+++ b/utils/TableGen/ARMDecoderEmitter.cpp
@@ -607,7 +607,7 @@ void ARMFilter::recurse() {
     for (bitIndex = 0; bitIndex < NumBits; bitIndex++)
       BitValueArray[StartBit + bitIndex] = BIT_UNSET;
 
-    // Delegates to an inferior filter chooser for futher processing on this
+    // Delegates to an inferior filter chooser for further processing on this
     // group of instructions whose segment values are variable.
     FilterChooserMap.insert(std::pair<unsigned, ARMFilterChooser*>(
                               (unsigned)-1,
@@ -639,7 +639,7 @@ void ARMFilter::recurse() {
         BitValueArray[StartBit + bitIndex] = BIT_FALSE;
     }
 
-    // Delegates to an inferior filter chooser for futher processing on this
+    // Delegates to an inferior filter chooser for further processing on this
     // category of instructions.
     FilterChooserMap.insert(std::pair<unsigned, ARMFilterChooser*>(
                               mapIterator->first,
@@ -1624,6 +1624,10 @@ ARMDEBackend::populateInstruction(const CodeGenInstruction &CGI,
     if (Name == "tBL" || Name == "tBLXi" || Name == "tBLXr")
       return false;
 
+    // A8.6.25 BX.  Use the generic tBX_Rm, ignore tBX_RET and tBX_RET_vararg.
+    if (Name == "tBX_RET" || Name == "tBX_RET_vararg")
+      return false;
+
     // Ignore the TPsoft (TLS) instructions, which conflict with tBLr9.
     if (Name == "tTPsoft" || Name == "t2TPsoft")
       return false;
@@ -1648,6 +1652,11 @@ ARMDEBackend::populateInstruction(const CodeGenInstruction &CGI,
         Name == "t2ADDrSPi12" || Name == "t2SUBrSPi12")
       return false;
 
+    // FIXME: Use ldr.n to work around a Darwin assembler bug.
+    // Introduce a workaround with tLDRpciDIS opcode.
+    if (Name == "tLDRpci")
+      return false;
+
     // Ignore t2LDRDpci, prefer the generic t2LDRDi8, t2LDRD_PRE, t2LDRD_POST.
     if (Name == "t2LDRDpci")
       return false;
diff --git a/utils/TableGen/AsmMatcherEmitter.cpp b/utils/TableGen/AsmMatcherEmitter.cpp
index 369ec90..1d7a67b 100644
--- a/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/utils/TableGen/AsmMatcherEmitter.cpp
@@ -88,7 +88,7 @@
 //   2. The operand matcher will try every possible entry with the same
 //      mnemonic and will check if the target feature for this mnemonic also
 //      matches. After that, if the operand to be matched has its index
-//      present in the mask, a successfull match occurs. Otherwise, fallback
+//      present in the mask, a successful match occurs. Otherwise, fallback
 //      to the regular operand parsing.
 //
 //   3. For a match success, each operand class that has a 'ParserMethod'
@@ -258,7 +258,7 @@ public:
       return ValueName < RHS.ValueName;
 
     default:
-      // This class preceeds the RHS if it is a proper subset of the RHS.
+      // This class precedes the RHS if it is a proper subset of the RHS.
       if (isSubsetOf(RHS))
         return true;
       if (RHS.isSubsetOf(*this))
@@ -896,8 +896,8 @@ BuildRegisterClasses(SmallPtrSet<Record*, 16> &SingletonRegisters) {
   // Gather the defined sets.
   for (std::vector<CodeGenRegisterClass>::const_iterator it =
        RegClassList.begin(), ie = RegClassList.end(); it != ie; ++it)
-    RegisterSets.insert(std::set<Record*>(it->Elements.begin(),
-                                          it->Elements.end()));
+    RegisterSets.insert(std::set<Record*>(it->getOrder().begin(),
+                                          it->getOrder().end()));
 
   // Add any required singleton sets.
   for (SmallPtrSet<Record*, 16>::iterator it = SingletonRegisters.begin(),
@@ -971,8 +971,8 @@ BuildRegisterClasses(SmallPtrSet<Record*, 16> &SingletonRegisters) {
   // Name the register classes which correspond to a user defined RegisterClass.
   for (std::vector<CodeGenRegisterClass>::const_iterator
        it = RegClassList.begin(), ie = RegClassList.end(); it != ie; ++it) {
-    ClassInfo *CI = RegisterSetClasses[std::set<Record*>(it->Elements.begin(),
-                                                         it->Elements.end())];
+    ClassInfo *CI = RegisterSetClasses[std::set<Record*>(it->getOrder().begin(),
+                                                         it->getOrder().end())];
     if (CI->ValueName.empty()) {
       CI->ClassName = it->getName();
       CI->Name = "MCK_" + it->getName();
@@ -1265,7 +1265,7 @@ void AsmMatcherInfo::BuildInfo() {
       II->BuildAliasResultOperands();
   }
 
-  // Reorder classes so that classes preceed super classes.
+  // Reorder classes so that classes precede super classes.
   std::sort(Classes.begin(), Classes.end(), less_ptr<ClassInfo>());
 }
 
@@ -1538,7 +1538,7 @@ static void EmitConvertToMCInst(CodeGenTarget &Target, StringRef ClassName,
         // operand from the earlier one.We can only tie single MCOperand values.
         //assert(OpInfo.MINumOperands == 1 && "Not a singular MCOperand");
         unsigned TiedOp = OpInfo.TiedOperandNum;
-        assert(i > TiedOp && "Tied operand preceeds its target!");
+        assert(i > TiedOp && "Tied operand precedes its target!");
         CaseOS << "    Inst.addOperand(Inst.getOperand(" << TiedOp << "));\n";
         Signature += "__Tie" + utostr(TiedOp);
         break;
@@ -2321,7 +2321,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
   OS << "    for (unsigned i = 0; i != " << MaxNumOperands << "; ++i) {\n";
   OS << "      if (i + 1 >= Operands.size()) {\n";
   OS << "        OperandsValid = (it->Classes[i] == " <<"InvalidMatchClass);\n";
-  OS << "        break;";
+  OS << "        break;\n";
   OS << "      }\n";
   OS << "      if (ValidateOperandClass(Operands[i+1], it->Classes[i]))\n";
   OS << "        continue;\n";
diff --git a/utils/TableGen/AsmMatcherEmitter.h b/utils/TableGen/AsmMatcherEmitter.h
index 729c938..c13adf3 100644
--- a/utils/TableGen/AsmMatcherEmitter.h
+++ b/utils/TableGen/AsmMatcherEmitter.h
@@ -16,8 +16,6 @@
 #define ASMMATCHER_EMITTER_H
 
 #include "TableGenBackend.h"
-#include <map>
-#include <vector>
 #include <cassert>
 
 namespace llvm {
diff --git a/utils/TableGen/AsmWriterEmitter.cpp b/utils/TableGen/AsmWriterEmitter.cpp
index 05bc113..818053a 100644
--- a/utils/TableGen/AsmWriterEmitter.cpp
+++ b/utils/TableGen/AsmWriterEmitter.cpp
@@ -670,8 +670,8 @@ public:
 
     for (std::map<StringRef, unsigned>::iterator
            I = OpMap.begin(), E = OpMap.end(); I != E; ++I)
-      O.indent(6) << "OpMap[\"" << I->first << "\"] = "
-                  << I->second << ";\n";
+      O.indent(6) << "OpMap.push_back(std::make_pair(\"" << I->first << "\", "
+                  << I->second << "));\n";
 
     O.indent(6) << "break;\n";
     O.indent(4) << '}';
@@ -754,6 +754,20 @@ static void EmitComputeAvailableFeatures(AsmWriterInfo &Info,
   O << "}\n\n";
 }
 
+static void EmitGetMapOperandNumber(raw_ostream &O) {
+  O << "static unsigned getMapOperandNumber("
+    << "const SmallVectorImpl<std::pair<StringRef, unsigned> > &OpMap,\n";
+  O << "                                    StringRef Name) {\n";
+  O << "  for (SmallVectorImpl<std::pair<StringRef, unsigned> >::"
+    << "const_iterator\n";
+  O << "         I = OpMap.begin(), E = OpMap.end(); I != E; ++I)\n";
+  O << "    if (I->first == Name)\n";
+  O << "      return I->second;\n";
+  O << "  assert(false && \"Operand not in map!\");\n";
+  O << "  return 0;\n";
+  O << "}\n\n";
+}
+
 void AsmWriterEmitter::EmitRegIsInRegClass(raw_ostream &O) {
   CodeGenTarget Target(Records);
 
@@ -791,16 +805,16 @@ void AsmWriterEmitter::EmitRegIsInRegClass(raw_ostream &O) {
     O << "  case RC_" << Name << ":\n";
   
     // Emit the register list now.
-    unsigned IE = RC.Elements.size();
+    unsigned IE = RC.getOrder().size();
     if (IE == 1) {
-      O << "    if (Reg == " << getQualifiedName(RC.Elements[0]) << ")\n";
+      O << "    if (Reg == " << getQualifiedName(RC.getOrder()[0]) << ")\n";
       O << "      return true;\n";
     } else {
       O << "    switch (Reg) {\n";
       O << "    default: break;\n";
 
       for (unsigned II = 0; II != IE; ++II) {
-        Record *Reg = RC.Elements[II];
+        Record *Reg = RC.getOrder()[II];
         O << "    case " << getQualifiedName(Reg) << ":\n";
       }
 
@@ -816,10 +830,46 @@ void AsmWriterEmitter::EmitRegIsInRegClass(raw_ostream &O) {
   O << "}\n\n";
 }
 
+static unsigned CountNumOperands(StringRef AsmString) {
+  unsigned NumOps = 0;
+  std::pair<StringRef, StringRef> ASM = AsmString.split(' ');
+
+  while (!ASM.second.empty()) {
+    ++NumOps;
+    ASM = ASM.second.split(' ');
+  }
+
+  return NumOps;
+}
+
+static unsigned CountResultNumOperands(StringRef AsmString) {
+  unsigned NumOps = 0;
+  std::pair<StringRef, StringRef> ASM = AsmString.split('\t');
+
+  if (!ASM.second.empty()) {
+    size_t I = ASM.second.find('{');
+    StringRef Str = ASM.second;
+    if (I != StringRef::npos)
+      Str = ASM.second.substr(I, ASM.second.find('|', I));
+
+    ASM = Str.split(' ');
+
+    do {
+      ++NumOps;
+      ASM = ASM.second.split(' ');
+    } while (!ASM.second.empty());
+  }
+
+  return NumOps;
+}
+
 void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
   CodeGenTarget Target(Records);
   Record *AsmWriter = Target.getAsmWriter();
 
+  if (!AsmWriter->getValueAsBit("isMCAsmWriter"))
+    return;
+
   O << "\n#ifdef PRINT_ALIAS_INSTR\n";
   O << "#undef PRINT_ALIAS_INSTR\n\n";
 
@@ -828,9 +878,6 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
   // Emit the method that prints the alias instruction.
   std::string ClassName = AsmWriter->getValueAsString("AsmWriterClassName");
 
-  bool isMC = AsmWriter->getValueAsBit("isMCAsmWriter");
-  const char *MachineInstrClassName = isMC ? "MCInst" : "MachineInstr";
-
   std::vector<Record*> AllInstAliases =
     Records.getAllDerivedDefinitions("InstAlias");
 
@@ -840,6 +887,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
          I = AllInstAliases.begin(), E = AllInstAliases.end(); I != E; ++I) {
     CodeGenInstAlias *Alias = new CodeGenInstAlias(*I, Target);
     const Record *R = *I;
+    if (!R->getValueAsBit("EmitAlias"))
+      continue; // We were told not to emit the alias, but to emit the aliasee.
     const DagInit *DI = R->getValueAsDag("ResultInst");
     const DefInit *Op = dynamic_cast<const DefInit*>(DI->getOperator());
     AliasMap[getQualifiedName(Op->getDef())].push_back(Alias);
@@ -857,13 +906,18 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
     for (std::vector<CodeGenInstAlias*>::iterator
            II = Aliases.begin(), IE = Aliases.end(); II != IE; ++II) {
       const CodeGenInstAlias *CGA = *II;
+      unsigned LastOpNo = CGA->ResultInstOperandIndex.size();
+      unsigned NumResultOps =
+        CountResultNumOperands(CGA->ResultInst->AsmString);
+
+      // Don't emit the alias if it has more operands than what it's aliasing.
+      if (NumResultOps < CountNumOperands(CGA->AsmString))
+        continue;
+
       IAPrinter *IAP = new IAPrinter(AWI, CGA->Result->getAsString(),
                                      CGA->AsmString);
-
       IAP->addReqFeatures(CGA->TheDef->getValueAsListOfDefs("Predicates"));
 
-      unsigned LastOpNo = CGA->ResultInstOperandIndex.size();
-
       std::string Cond;
       Cond = std::string("MI->getNumOperands() == ") + llvm::utostr(LastOpNo);
       IAP->addCond(Cond);
@@ -898,7 +952,7 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
             }
           } else {
             assert(Rec->isSubClassOf("Operand") && "Unexpected operand!");
-            // FIXME: We need to handle these situations.
+            // FIXME: We may need to handle these situations.
             delete IAP;
             IAP = 0;
             CantHandle = true;
@@ -932,9 +986,12 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
   EmitSubtargetFeatureFlagEnumeration(AWI, O);
   EmitComputeAvailableFeatures(AWI, AsmWriter, Target, O);
 
-  O << "bool " << Target.getName() << ClassName
-    << "::printAliasInstr(const " << MachineInstrClassName
-    << " *MI, raw_ostream &OS) {\n";
+  std::string Header;
+  raw_string_ostream HeaderO(Header);
+
+  HeaderO << "bool " << Target.getName() << ClassName
+          << "::printAliasInstr(const MCInst"
+          << " *MI, raw_ostream &OS) {\n";
 
   std::string Cases;
   raw_string_ostream CasesO(Cases);
@@ -973,22 +1030,26 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
       CasesO << '\n';
     }
 
-    CasesO.indent(4) << "return true;\n";
+    CasesO.indent(4) << "return false;\n";
   }
 
-  if (CasesO.str().empty() || !isMC) {
-    O << "  return true;\n";
+  if (CasesO.str().empty()) {
+    O << HeaderO.str();
+    O << "  return false;\n";
     O << "}\n\n";
     O << "#endif // PRINT_ALIAS_INSTR\n";
     return;
   }
 
+  EmitGetMapOperandNumber(O);
+
+  O << HeaderO.str();
   O.indent(2) << "StringRef AsmString;\n";
-  O.indent(2) << "std::map<StringRef, unsigned> OpMap;\n";
+  O.indent(2) << "SmallVector<std::pair<StringRef, unsigned>, 4> OpMap;\n";
   if (NeedAvailableFeatures)
     O.indent(2) << "unsigned AvailableFeatures = getAvailableFeatures();\n\n";
   O.indent(2) << "switch (MI->getOpcode()) {\n";
-  O.indent(2) << "default: return true;\n";
+  O.indent(2) << "default: return false;\n";
   O << CasesO.str();
   O.indent(2) << "}\n\n";
 
@@ -1010,14 +1071,14 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
   O << "                *I == '_'))\n";
   O << "          ++I;\n";
   O << "        StringRef Name(Start, I - Start);\n";
-  O << "        printOperand(MI, OpMap[Name], OS);\n";
+  O << "        printOperand(MI, getMapOperandNumber(OpMap, Name), OS);\n";
   O << "      } else {\n";
   O << "        OS << *I++;\n";
   O << "      }\n";
   O << "    }\n";
   O << "  }\n\n";
   
-  O << "  return false;\n";
+  O << "  return true;\n";
   O << "}\n\n";
 
   O << "#endif // PRINT_ALIAS_INSTR\n";
diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt
index 514b191..a24c921 100644
--- a/utils/TableGen/CMakeLists.txt
+++ b/utils/TableGen/CMakeLists.txt
@@ -16,6 +16,7 @@ add_llvm_utility(tblgen
   CodeEmitterGen.cpp
   CodeGenDAGPatterns.cpp
   CodeGenInstruction.cpp
+  CodeGenRegisters.cpp
   CodeGenTarget.cpp
   DAGISelEmitter.cpp
   DAGISelMatcherEmitter.cpp
@@ -34,6 +35,7 @@ add_llvm_utility(tblgen
   OptParserEmitter.cpp
   Record.cpp
   RegisterInfoEmitter.cpp
+  SetTheory.cpp
   StringMatcher.cpp
   SubtargetEmitter.cpp
   TGLexer.cpp
diff --git a/utils/TableGen/CallingConvEmitter.h b/utils/TableGen/CallingConvEmitter.h
index 7fc2507..431c33b 100644
--- a/utils/TableGen/CallingConvEmitter.h
+++ b/utils/TableGen/CallingConvEmitter.h
@@ -16,8 +16,6 @@
 #define CALLINGCONV_EMITTER_H
 
 #include "TableGenBackend.h"
-#include <map>
-#include <vector>
 #include <cassert>
 
 namespace llvm {
diff --git a/utils/TableGen/ClangASTNodesEmitter.cpp b/utils/TableGen/ClangASTNodesEmitter.cpp
index 187ab46..d9d5a3c 100644
--- a/utils/TableGen/ClangASTNodesEmitter.cpp
+++ b/utils/TableGen/ClangASTNodesEmitter.cpp
@@ -155,10 +155,13 @@ void ClangDeclContextEmitter::run(raw_ostream &OS) {
     }
   }
 
-  for (RecordSet::iterator i = DeclContexts.begin(), e = DeclContexts.end();
-       i != e; ++i) {
-    OS << "DECL_CONTEXT(" << (*i)->getName() << ")\n";
-  }
+  // To keep identical order, RecordVector may be used
+  // instead of RecordSet.
+  for (RecordVector::iterator
+         i = DeclContextsVector.begin(), e = DeclContextsVector.end();
+       i != e; ++i)
+    if (DeclContexts.find(*i) != DeclContexts.end())
+      OS << "DECL_CONTEXT(" << (*i)->getName() << ")\n";
 
   OS << "#undef DECL_CONTEXT\n";
   OS << "#undef DECL_CONTEXT_BASE\n";
diff --git a/utils/TableGen/ClangDiagnosticsEmitter.cpp b/utils/TableGen/ClangDiagnosticsEmitter.cpp
index 60e67c4..0a48e75 100644
--- a/utils/TableGen/ClangDiagnosticsEmitter.cpp
+++ b/utils/TableGen/ClangDiagnosticsEmitter.cpp
@@ -18,9 +18,11 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/VectorExtras.h"
-#include <set>
 #include <map>
+#include <algorithm>
+#include <functional>
 using namespace llvm;
 
 //===----------------------------------------------------------------------===//
@@ -121,7 +123,6 @@ namespace {
 } // end anonymous namespace.
 
 
-
 //===----------------------------------------------------------------------===//
 // Warning Tables (.inc file) generation.
 //===----------------------------------------------------------------------===//
@@ -162,7 +163,7 @@ void ClangDiagsDefsEmitter::run(raw_ostream &OS) {
       OS << ", \"";
       OS.write_escaped(DI->getDef()->getValueAsString("GroupName")) << '"';
     } else {
-      OS << ", 0";
+      OS << ", \"\"";
     }
 
     // SFINAE bit
@@ -179,6 +180,14 @@ void ClangDiagsDefsEmitter::run(raw_ostream &OS) {
 
     // Category number.
     OS << ", " << CategoryIDs.getID(getDiagnosticCategory(&R, DGParentMap));
+
+    // Brief
+    OS << ", \"";
+    OS.write_escaped(R.getValueAsString("Brief")) << '"';
+
+    // Explanation 
+    OS << ", \"";
+    OS.write_escaped(R.getValueAsString("Explanation")) << '"';
     OS << ")\n";
   }
 }
@@ -187,6 +196,15 @@ void ClangDiagsDefsEmitter::run(raw_ostream &OS) {
 // Warning Group Tables generation
 //===----------------------------------------------------------------------===//
 
+static std::string getDiagCategoryEnum(llvm::StringRef name) {
+  if (name.empty())
+    return "DiagCat_None";
+  llvm::SmallString<256> enumName = llvm::StringRef("DiagCat_");
+  for (llvm::StringRef::iterator I = name.begin(), E = name.end(); I != E; ++I)
+    enumName += isalnum(*I) ? *I : '_';
+  return enumName.str();
+}
+
 namespace {
 struct GroupInfo {
   std::vector<const Record*> DiagsInGroup;
@@ -267,7 +285,9 @@ void ClangDiagGroupsEmitter::run(raw_ostream &OS) {
   for (std::map<std::string, GroupInfo>::iterator
        I = DiagsInGroup.begin(), E = DiagsInGroup.end(); I != E; ++I) {
     // Group option string.
-    OS << "  { \"";
+    OS << "  { ";
+    OS << I->first.size() << ", ";
+    OS << "\"";
     OS.write_escaped(I->first) << "\","
                                << std::string(MaxLen-I->first.size()+1, ' ');
     
@@ -291,6 +311,52 @@ void ClangDiagGroupsEmitter::run(raw_ostream &OS) {
   OS << "\n#ifdef GET_CATEGORY_TABLE\n";
   for (DiagCategoryIDMap::iterator I = CategoriesByID.begin(),
        E = CategoriesByID.end(); I != E; ++I)
-    OS << "CATEGORY(\"" << *I << "\")\n";
+    OS << "CATEGORY(\"" << *I << "\", " << getDiagCategoryEnum(*I) << ")\n";
   OS << "#endif // GET_CATEGORY_TABLE\n\n";
 }
+
+//===----------------------------------------------------------------------===//
+// Diagnostic name index generation
+//===----------------------------------------------------------------------===//
+
+namespace {
+struct RecordIndexElement
+{
+  RecordIndexElement() {}
+  explicit RecordIndexElement(Record const &R):
+    Name(R.getName()) {}
+  
+  std::string Name;
+};
+
+struct RecordIndexElementSorter :
+  public std::binary_function<RecordIndexElement, RecordIndexElement, bool> {
+  
+  bool operator()(RecordIndexElement const &Lhs,
+                  RecordIndexElement const &Rhs) const {
+    return Lhs.Name < Rhs.Name;
+  }
+  
+};
+
+} // end anonymous namespace.
+
+void ClangDiagsIndexNameEmitter::run(raw_ostream &OS) {
+  const std::vector<Record*> &Diags =
+    Records.getAllDerivedDefinitions("Diagnostic");
+  
+  std::vector<RecordIndexElement> Index;
+  Index.reserve(Diags.size());
+  for (unsigned i = 0, e = Diags.size(); i != e; ++i) {
+    const Record &R = *(Diags[i]);    
+    Index.push_back(RecordIndexElement(R));
+  }
+  
+  std::sort(Index.begin(), Index.end(), RecordIndexElementSorter());
+  
+  for (unsigned i = 0, e = Index.size(); i != e; ++i) {
+    const RecordIndexElement &R = Index[i];
+    
+    OS << "DIAG_NAME_INDEX(" << R.Name << ")\n";
+  }
+}
diff --git a/utils/TableGen/ClangDiagnosticsEmitter.h b/utils/TableGen/ClangDiagnosticsEmitter.h
index edd062a..1e4c8b7 100644
--- a/utils/TableGen/ClangDiagnosticsEmitter.h
+++ b/utils/TableGen/ClangDiagnosticsEmitter.h
@@ -33,13 +33,21 @@ public:
 };
 
 class ClangDiagGroupsEmitter : public TableGenBackend {
-    RecordKeeper &Records;
+  RecordKeeper &Records;
 public:
   explicit ClangDiagGroupsEmitter(RecordKeeper &R) : Records(R) {}
     
   void run(raw_ostream &OS);
 };
 
+class ClangDiagsIndexNameEmitter : public TableGenBackend {
+  RecordKeeper &Records;
+public:
+  explicit ClangDiagsIndexNameEmitter(RecordKeeper &R) : Records(R) {}
+  
+  void run(raw_ostream &OS);
+};
+
   
 } // End llvm namespace
 
diff --git a/utils/TableGen/CodeEmitterGen.cpp b/utils/TableGen/CodeEmitterGen.cpp
index 957dd19..9d4dc5c4 100644
--- a/utils/TableGen/CodeEmitterGen.cpp
+++ b/utils/TableGen/CodeEmitterGen.cpp
@@ -63,10 +63,14 @@ void CodeEmitterGen::reverseBits(std::vector<Record*> &Insts) {
 // return the variable bit position.  Otherwise return -1.
 int CodeEmitterGen::getVariableBit(const std::string &VarName,
                                    BitsInit *BI, int bit) {
-  if (VarBitInit *VBI = dynamic_cast<VarBitInit*>(BI->getBit(bit)))
+  if (VarBitInit *VBI = dynamic_cast<VarBitInit*>(BI->getBit(bit))) {
     if (VarInit *VI = dynamic_cast<VarInit*>(VBI->getVariable()))
       if (VI->getName() == VarName)
         return VBI->getBitNum();
+  } else if (VarInit *VI = dynamic_cast<VarInit*>(BI->getBit(bit))) {
+    if (VI->getName() == VarName)
+      return 0;
+  }
 
   return -1;
 }
diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp
index 79cf18a..a08cde6 100644
--- a/utils/TableGen/CodeGenDAGPatterns.cpp
+++ b/utils/TableGen/CodeGenDAGPatterns.cpp
@@ -580,34 +580,29 @@ typedef std::map<std::string, int> DepVarMap;
 /// Const iterator shorthand for DepVarMap
 typedef DepVarMap::const_iterator DepVarMap_citer;
 
-namespace {
-void FindDepVarsOf(TreePatternNode *N, DepVarMap &DepMap) {
+static void FindDepVarsOf(TreePatternNode *N, DepVarMap &DepMap) {
   if (N->isLeaf()) {
-    if (dynamic_cast<DefInit*>(N->getLeafValue()) != NULL) {
+    if (dynamic_cast<DefInit*>(N->getLeafValue()) != NULL)
       DepMap[N->getName()]++;
-    }
   } else {
     for (size_t i = 0, e = N->getNumChildren(); i != e; ++i)
       FindDepVarsOf(N->getChild(i), DepMap);
   }
 }
-
-//! Find dependent variables within child patterns
-/*!
- */
-void FindDepVars(TreePatternNode *N, MultipleUseVarSet &DepVars) {
+  
+/// Find dependent variables within child patterns
+static void FindDepVars(TreePatternNode *N, MultipleUseVarSet &DepVars) {
   DepVarMap depcounts;
   FindDepVarsOf(N, depcounts);
   for (DepVarMap_citer i = depcounts.begin(); i != depcounts.end(); ++i) {
-    if (i->second > 1) {            // std::pair<std::string, int>
+    if (i->second > 1)            // std::pair<std::string, int>
       DepVars.insert(i->first);
-    }
   }
 }
 
-//! Dump the dependent variable set:
 #ifndef NDEBUG
-void DumpDepVars(MultipleUseVarSet &DepVars) {
+/// Dump the dependent variable set:
+static void DumpDepVars(MultipleUseVarSet &DepVars) {
   if (DepVars.empty()) {
     DEBUG(errs() << "<empty set>");
   } else {
@@ -621,6 +616,66 @@ void DumpDepVars(MultipleUseVarSet &DepVars) {
 }
 #endif
 
+
+//===----------------------------------------------------------------------===//
+// TreePredicateFn Implementation
+//===----------------------------------------------------------------------===//
+
+/// TreePredicateFn constructor.  Here 'N' is a subclass of PatFrag.
+TreePredicateFn::TreePredicateFn(TreePattern *N) : PatFragRec(N) {
+  assert((getPredCode().empty() || getImmCode().empty()) &&
+        ".td file corrupt: can't have a node predicate *and* an imm predicate");
+}
+
+std::string TreePredicateFn::getPredCode() const {
+  return PatFragRec->getRecord()->getValueAsCode("PredicateCode");
+}
+
+std::string TreePredicateFn::getImmCode() const {
+  return PatFragRec->getRecord()->getValueAsCode("ImmediateCode");
+}
+
+
+/// isAlwaysTrue - Return true if this is a noop predicate.
+bool TreePredicateFn::isAlwaysTrue() const {
+  return getPredCode().empty() && getImmCode().empty();
+}
+
+/// Return the name to use in the generated code to reference this, this is
+/// "Predicate_foo" if from a pattern fragment "foo".
+std::string TreePredicateFn::getFnName() const {
+  return "Predicate_" + PatFragRec->getRecord()->getName();
+}
+
+/// getCodeToRunOnSDNode - Return the code for the function body that
+/// evaluates this predicate.  The argument is expected to be in "Node",
+/// not N.  This handles casting and conversion to a concrete node type as
+/// appropriate.
+std::string TreePredicateFn::getCodeToRunOnSDNode() const {
+  // Handle immediate predicates first.
+  std::string ImmCode = getImmCode();
+  if (!ImmCode.empty()) {
+    std::string Result =
+      "    int64_t Imm = cast<ConstantSDNode>(Node)->getSExtValue();\n";
+    return Result + ImmCode;
+  }
+  
+  // Handle arbitrary node predicates.
+  assert(!getPredCode().empty() && "Don't have any predicate code!");
+  std::string ClassName;
+  if (PatFragRec->getOnlyTree()->isLeaf())
+    ClassName = "SDNode";
+  else {
+    Record *Op = PatFragRec->getOnlyTree()->getOperator();
+    ClassName = PatFragRec->getDAGPatterns().getSDNodeInfo(Op).getSDClassName();
+  }
+  std::string Result;
+  if (ClassName == "SDNode")
+    Result = "    SDNode *N = Node;\n";
+  else
+    Result = "    " + ClassName + "*N = cast<" + ClassName + ">(Node);\n";
+  
+  return Result + getPredCode();
 }
 
 //===----------------------------------------------------------------------===//
@@ -1015,7 +1070,7 @@ void TreePatternNode::print(raw_ostream &OS) const {
   }
 
   for (unsigned i = 0, e = PredicateFns.size(); i != e; ++i)
-    OS << "<<P:" << PredicateFns[i] << ">>";
+    OS << "<<P:" << PredicateFns[i].getFnName() << ">>";
   if (TransformFn)
     OS << "<<X:" << TransformFn->getName() << ">>";
   if (!getName().empty())
@@ -1150,9 +1205,9 @@ TreePatternNode *TreePatternNode::InlinePatternFragments(TreePattern &TP) {
 
   TreePatternNode *FragTree = Frag->getOnlyTree()->clone();
 
-  std::string Code = Op->getValueAsCode("Predicate");
-  if (!Code.empty())
-    FragTree->addPredicateFn("Predicate_"+Op->getName());
+  TreePredicateFn PredFn(Frag);
+  if (!PredFn.isAlwaysTrue())
+    FragTree->addPredicateFn(PredFn);
 
   // Resolve formal arguments to their actual value.
   if (Frag->getNumArgs()) {
@@ -2063,9 +2118,9 @@ void CodeGenDAGPatterns::ParsePatternFragments() {
 
     // If there is a code init for this fragment, keep track of the fact that
     // this fragment uses it.
-    std::string Code = Fragments[i]->getValueAsCode("Predicate");
-    if (!Code.empty())
-      P->getOnlyTree()->addPredicateFn("Predicate_"+Fragments[i]->getName());
+    TreePredicateFn PredFn(P);
+    if (!PredFn.isAlwaysTrue())
+      P->getOnlyTree()->addPredicateFn(PredFn);
 
     // If there is a node transformation corresponding to this, keep track of
     // it.
diff --git a/utils/TableGen/CodeGenDAGPatterns.h b/utils/TableGen/CodeGenDAGPatterns.h
index 946dcee..e4e8574 100644
--- a/utils/TableGen/CodeGenDAGPatterns.h
+++ b/utils/TableGen/CodeGenDAGPatterns.h
@@ -239,6 +239,57 @@ public:
     return MadeChange;
   }
 };
+  
+/// TreePredicateFn - This is an abstraction that represents the predicates on
+/// a PatFrag node.  This is a simple one-word wrapper around a pointer to
+/// provide nice accessors.
+class TreePredicateFn {
+  /// PatFragRec - This is the TreePattern for the PatFrag that we
+  /// originally came from.
+  TreePattern *PatFragRec;
+public:
+  /// TreePredicateFn constructor.  Here 'N' is a subclass of PatFrag.
+  TreePredicateFn(TreePattern *N);
+
+  
+  TreePattern *getOrigPatFragRecord() const { return PatFragRec; }
+  
+  /// isAlwaysTrue - Return true if this is a noop predicate.
+  bool isAlwaysTrue() const;
+  
+  bool isImmediatePattern() const { return !getImmCode().empty(); }
+  
+  /// getImmediatePredicateCode - Return the code that evaluates this pattern if
+  /// this is an immediate predicate.  It is an error to call this on a
+  /// non-immediate pattern.
+  std::string getImmediatePredicateCode() const {
+    std::string Result = getImmCode();
+    assert(!Result.empty() && "Isn't an immediate pattern!");
+    return Result;
+  }
+  
+  
+  bool operator==(const TreePredicateFn &RHS) const {
+    return PatFragRec == RHS.PatFragRec;
+  }
+
+  bool operator!=(const TreePredicateFn &RHS) const { return !(*this == RHS); }
+
+  /// Return the name to use in the generated code to reference this, this is
+  /// "Predicate_foo" if from a pattern fragment "foo".
+  std::string getFnName() const;
+  
+  /// getCodeToRunOnSDNode - Return the code for the function body that
+  /// evaluates this predicate.  The argument is expected to be in "Node",
+  /// not N.  This handles casting and conversion to a concrete node type as
+  /// appropriate.
+  std::string getCodeToRunOnSDNode() const;
+  
+private:
+  std::string getPredCode() const;
+  std::string getImmCode() const;
+};
+  
 
 /// FIXME: TreePatternNode's can be shared in some cases (due to dag-shaped
 /// patterns), and as such should be ref counted.  We currently just leak all
@@ -263,7 +314,7 @@ class TreePatternNode {
 
   /// PredicateFns - The predicate functions to execute on this node to check
   /// for a match.  If this list is empty, no predicate is involved.
-  std::vector<std::string> PredicateFns;
+  std::vector<TreePredicateFn> PredicateFns;
 
   /// TransformFn - The transformation function to execute on this node before
   /// it can be substituted into the resulting instruction on a pattern match.
@@ -323,14 +374,18 @@ public:
     return false;
   }
 
-  const std::vector<std::string> &getPredicateFns() const {return PredicateFns;}
+  bool hasAnyPredicate() const { return !PredicateFns.empty(); }
+  
+  const std::vector<TreePredicateFn> &getPredicateFns() const {
+    return PredicateFns;
+  }
   void clearPredicateFns() { PredicateFns.clear(); }
-  void setPredicateFns(const std::vector<std::string> &Fns) {
+  void setPredicateFns(const std::vector<TreePredicateFn> &Fns) {
     assert(PredicateFns.empty() && "Overwriting non-empty predicate list!");
     PredicateFns = Fns;
   }
-  void addPredicateFn(const std::string &Fn) {
-    assert(!Fn.empty() && "Empty predicate string!");
+  void addPredicateFn(const TreePredicateFn &Fn) {
+    assert(!Fn.isAlwaysTrue() && "Empty predicate string!");
     if (std::find(PredicateFns.begin(), PredicateFns.end(), Fn) ==
           PredicateFns.end())
       PredicateFns.push_back(Fn);
diff --git a/utils/TableGen/CodeGenInstruction.cpp b/utils/TableGen/CodeGenInstruction.cpp
index 5b0aedf..73fe916 100644
--- a/utils/TableGen/CodeGenInstruction.cpp
+++ b/utils/TableGen/CodeGenInstruction.cpp
@@ -417,7 +417,8 @@ bool CodeGenInstAlias::tryAliasOpMatch(DagInit *Result, unsigned AliasOpNo,
     if (!InstOpRec->isSubClassOf("RegisterClass"))
       return false;
 
-    if (!T.getRegisterClass(InstOpRec).containsRegister(ADI->getDef()))
+    if (!T.getRegisterClass(InstOpRec)
+        .contains(T.getRegBank().getReg(ADI->getDef())))
       throw TGError(Loc, "fixed register " +ADI->getDef()->getName()
                     + " is not a member of the " + InstOpRec->getName() +
                     " register class!");
diff --git a/utils/TableGen/CodeGenInstruction.h b/utils/TableGen/CodeGenInstruction.h
index 6d2d8fb..5f1e0be 100644
--- a/utils/TableGen/CodeGenInstruction.h
+++ b/utils/TableGen/CodeGenInstruction.h
@@ -137,6 +137,7 @@ namespace llvm {
     bool isVariadic;
 
     // Provide transparent accessors to the operand list.
+    bool empty() const { return OperandList.empty(); }
     unsigned size() const { return OperandList.size(); }
     const OperandInfo &operator[](unsigned i) const { return OperandList[i]; }
     OperandInfo &operator[](unsigned i) { return OperandList[i]; }
diff --git a/utils/TableGen/CodeGenIntrinsics.h b/utils/TableGen/CodeGenIntrinsics.h
index 3208c0d..3f6ba61 100644
--- a/utils/TableGen/CodeGenIntrinsics.h
+++ b/utils/TableGen/CodeGenIntrinsics.h
@@ -69,6 +69,9 @@ namespace llvm {
 
     /// isCommutative - True if the intrinsic is commutative.
     bool isCommutative;
+
+    /// canThrow - True if the intrinsic can throw.
+    bool canThrow;
     
     enum ArgAttribute {
       NoCapture
diff --git a/utils/TableGen/CodeGenRegisters.cpp b/utils/TableGen/CodeGenRegisters.cpp
new file mode 100644
index 0000000..37952fc
--- /dev/null
+++ b/utils/TableGen/CodeGenRegisters.cpp
@@ -0,0 +1,493 @@
+//===- CodeGenRegisters.cpp - Register and RegisterClass Info -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines structures to encapsulate information gleaned from the
+// target register and register class definitions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CodeGenRegisters.h"
+#include "CodeGenTarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//                              CodeGenRegister
+//===----------------------------------------------------------------------===//
+
+CodeGenRegister::CodeGenRegister(Record *R, unsigned Enum)
+  : TheDef(R),
+    EnumValue(Enum),
+    CostPerUse(R->getValueAsInt("CostPerUse")),
+    SubRegsComplete(false)
+{}
+
+const std::string &CodeGenRegister::getName() const {
+  return TheDef->getName();
+}
+
+namespace {
+  struct Orphan {
+    CodeGenRegister *SubReg;
+    Record *First, *Second;
+    Orphan(CodeGenRegister *r, Record *a, Record *b)
+      : SubReg(r), First(a), Second(b) {}
+  };
+}
+
+const CodeGenRegister::SubRegMap &
+CodeGenRegister::getSubRegs(CodeGenRegBank &RegBank) {
+  // Only compute this map once.
+  if (SubRegsComplete)
+    return SubRegs;
+  SubRegsComplete = true;
+
+  std::vector<Record*> SubList = TheDef->getValueAsListOfDefs("SubRegs");
+  std::vector<Record*> Indices = TheDef->getValueAsListOfDefs("SubRegIndices");
+  if (SubList.size() != Indices.size())
+    throw TGError(TheDef->getLoc(), "Register " + getName() +
+                  " SubRegIndices doesn't match SubRegs");
+
+  // First insert the direct subregs and make sure they are fully indexed.
+  for (unsigned i = 0, e = SubList.size(); i != e; ++i) {
+    CodeGenRegister *SR = RegBank.getReg(SubList[i]);
+    if (!SubRegs.insert(std::make_pair(Indices[i], SR)).second)
+      throw TGError(TheDef->getLoc(), "SubRegIndex " + Indices[i]->getName() +
+                    " appears twice in Register " + getName());
+  }
+
+  // Keep track of inherited subregs and how they can be reached.
+  SmallVector<Orphan, 8> Orphans;
+
+  // Clone inherited subregs and place duplicate entries on Orphans.
+  // Here the order is important - earlier subregs take precedence.
+  for (unsigned i = 0, e = SubList.size(); i != e; ++i) {
+    CodeGenRegister *SR = RegBank.getReg(SubList[i]);
+    const SubRegMap &Map = SR->getSubRegs(RegBank);
+
+    // Add this as a super-register of SR now all sub-registers are in the list.
+    // This creates a topological ordering, the exact order depends on the
+    // order getSubRegs is called on all registers.
+    SR->SuperRegs.push_back(this);
+
+    for (SubRegMap::const_iterator SI = Map.begin(), SE = Map.end(); SI != SE;
+         ++SI) {
+      if (!SubRegs.insert(*SI).second)
+        Orphans.push_back(Orphan(SI->second, Indices[i], SI->first));
+
+      // Noop sub-register indexes are possible, so avoid duplicates.
+      if (SI->second != SR)
+        SI->second->SuperRegs.push_back(this);
+    }
+  }
+
+  // Process the composites.
+  ListInit *Comps = TheDef->getValueAsListInit("CompositeIndices");
+  for (unsigned i = 0, e = Comps->size(); i != e; ++i) {
+    DagInit *Pat = dynamic_cast<DagInit*>(Comps->getElement(i));
+    if (!Pat)
+      throw TGError(TheDef->getLoc(), "Invalid dag '" +
+                    Comps->getElement(i)->getAsString() +
+                    "' in CompositeIndices");
+    DefInit *BaseIdxInit = dynamic_cast<DefInit*>(Pat->getOperator());
+    if (!BaseIdxInit || !BaseIdxInit->getDef()->isSubClassOf("SubRegIndex"))
+      throw TGError(TheDef->getLoc(), "Invalid SubClassIndex in " +
+                    Pat->getAsString());
+
+    // Resolve list of subreg indices into R2.
+    CodeGenRegister *R2 = this;
+    for (DagInit::const_arg_iterator di = Pat->arg_begin(),
+         de = Pat->arg_end(); di != de; ++di) {
+      DefInit *IdxInit = dynamic_cast<DefInit*>(*di);
+      if (!IdxInit || !IdxInit->getDef()->isSubClassOf("SubRegIndex"))
+        throw TGError(TheDef->getLoc(), "Invalid SubClassIndex in " +
+                      Pat->getAsString());
+      const SubRegMap &R2Subs = R2->getSubRegs(RegBank);
+      SubRegMap::const_iterator ni = R2Subs.find(IdxInit->getDef());
+      if (ni == R2Subs.end())
+        throw TGError(TheDef->getLoc(), "Composite " + Pat->getAsString() +
+                      " refers to bad index in " + R2->getName());
+      R2 = ni->second;
+    }
+
+    // Insert composite index. Allow overriding inherited indices etc.
+    SubRegs[BaseIdxInit->getDef()] = R2;
+
+    // R2 is no longer an orphan.
+    for (unsigned j = 0, je = Orphans.size(); j != je; ++j)
+      if (Orphans[j].SubReg == R2)
+          Orphans[j].SubReg = 0;
+  }
+
+  // Now Orphans contains the inherited subregisters without a direct index.
+  // Create inferred indexes for all missing entries.
+  for (unsigned i = 0, e = Orphans.size(); i != e; ++i) {
+    Orphan &O = Orphans[i];
+    if (!O.SubReg)
+      continue;
+    SubRegs[RegBank.getCompositeSubRegIndex(O.First, O.Second, true)] =
+      O.SubReg;
+  }
+  return SubRegs;
+}
+
+void
+CodeGenRegister::addSubRegsPreOrder(SetVector<CodeGenRegister*> &OSet) const {
+  assert(SubRegsComplete && "Must precompute sub-registers");
+  std::vector<Record*> Indices = TheDef->getValueAsListOfDefs("SubRegIndices");
+  for (unsigned i = 0, e = Indices.size(); i != e; ++i) {
+    CodeGenRegister *SR = SubRegs.find(Indices[i])->second;
+    if (OSet.insert(SR))
+      SR->addSubRegsPreOrder(OSet);
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                            CodeGenRegisterClass
+//===----------------------------------------------------------------------===//
+
+CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R)
+  : TheDef(R) {
+  // Rename anonymous register classes.
+  if (R->getName().size() > 9 && R->getName()[9] == '.') {
+    static unsigned AnonCounter = 0;
+    R->setName("AnonRegClass_"+utostr(AnonCounter++));
+  }
+
+  std::vector<Record*> TypeList = R->getValueAsListOfDefs("RegTypes");
+  for (unsigned i = 0, e = TypeList.size(); i != e; ++i) {
+    Record *Type = TypeList[i];
+    if (!Type->isSubClassOf("ValueType"))
+      throw "RegTypes list member '" + Type->getName() +
+        "' does not derive from the ValueType class!";
+    VTs.push_back(getValueType(Type));
+  }
+  assert(!VTs.empty() && "RegisterClass must contain at least one ValueType!");
+
+  Elements = RegBank.getSets().expand(R);
+  for (unsigned i = 0, e = Elements->size(); i != e; ++i)
+    Members.insert(RegBank.getReg((*Elements)[i]));
+
+  // SubRegClasses is a list<dag> containing (RC, subregindex, ...) dags.
+  ListInit *SRC = R->getValueAsListInit("SubRegClasses");
+  for (ListInit::const_iterator i = SRC->begin(), e = SRC->end(); i != e; ++i) {
+    DagInit *DAG = dynamic_cast<DagInit*>(*i);
+    if (!DAG) throw "SubRegClasses must contain DAGs";
+    DefInit *DAGOp = dynamic_cast<DefInit*>(DAG->getOperator());
+    Record *RCRec;
+    if (!DAGOp || !(RCRec = DAGOp->getDef())->isSubClassOf("RegisterClass"))
+      throw "Operator '" + DAG->getOperator()->getAsString() +
+        "' in SubRegClasses is not a RegisterClass";
+    // Iterate over args, all SubRegIndex instances.
+    for (DagInit::const_arg_iterator ai = DAG->arg_begin(), ae = DAG->arg_end();
+         ai != ae; ++ai) {
+      DefInit *Idx = dynamic_cast<DefInit*>(*ai);
+      Record *IdxRec;
+      if (!Idx || !(IdxRec = Idx->getDef())->isSubClassOf("SubRegIndex"))
+        throw "Argument '" + (*ai)->getAsString() +
+          "' in SubRegClasses is not a SubRegIndex";
+      if (!SubRegClasses.insert(std::make_pair(IdxRec, RCRec)).second)
+        throw "SubRegIndex '" + IdxRec->getName() + "' mentioned twice";
+    }
+  }
+
+  // Allow targets to override the size in bits of the RegisterClass.
+  unsigned Size = R->getValueAsInt("Size");
+
+  Namespace = R->getValueAsString("Namespace");
+  SpillSize = Size ? Size : EVT(VTs[0]).getSizeInBits();
+  SpillAlignment = R->getValueAsInt("Alignment");
+  CopyCost = R->getValueAsInt("CopyCost");
+  Allocatable = R->getValueAsBit("isAllocatable");
+  MethodBodies = R->getValueAsCode("MethodBodies");
+  MethodProtos = R->getValueAsCode("MethodProtos");
+}
+
+bool CodeGenRegisterClass::contains(const CodeGenRegister *Reg) const {
+  return Members.count(Reg);
+}
+
+// Returns true if RC is a strict subclass.
+// RC is a sub-class of this class if it is a valid replacement for any
+// instruction operand where a register of this classis required. It must
+// satisfy these conditions:
+//
+// 1. All RC registers are also in this.
+// 2. The RC spill size must not be smaller than our spill size.
+// 3. RC spill alignment must be compatible with ours.
+//
+bool CodeGenRegisterClass::hasSubClass(const CodeGenRegisterClass *RC) const {
+  return SpillAlignment && RC->SpillAlignment % SpillAlignment == 0 &&
+    SpillSize <= RC->SpillSize &&
+    std::includes(Members.begin(), Members.end(),
+                  RC->Members.begin(), RC->Members.end());
+}
+
+const std::string &CodeGenRegisterClass::getName() const {
+  return TheDef->getName();
+}
+
+//===----------------------------------------------------------------------===//
+//                               CodeGenRegBank
+//===----------------------------------------------------------------------===//
+
+CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records) : Records(Records) {
+  // Configure register Sets to understand register classes.
+  Sets.addFieldExpander("RegisterClass", "MemberList");
+
+  // Read in the user-defined (named) sub-register indices.
+  // More indices will be synthesized later.
+  SubRegIndices = Records.getAllDerivedDefinitions("SubRegIndex");
+  std::sort(SubRegIndices.begin(), SubRegIndices.end(), LessRecord());
+  NumNamedIndices = SubRegIndices.size();
+
+  // Read in the register definitions.
+  std::vector<Record*> Regs = Records.getAllDerivedDefinitions("Register");
+  std::sort(Regs.begin(), Regs.end(), LessRecord());
+  Registers.reserve(Regs.size());
+  // Assign the enumeration values.
+  for (unsigned i = 0, e = Regs.size(); i != e; ++i)
+    Registers.push_back(CodeGenRegister(Regs[i], i + 1));
+
+  // Read in register class definitions.
+  std::vector<Record*> RCs = Records.getAllDerivedDefinitions("RegisterClass");
+  if (RCs.empty())
+    throw std::string("No 'RegisterClass' subclasses defined!");
+
+  RegClasses.reserve(RCs.size());
+  for (unsigned i = 0, e = RCs.size(); i != e; ++i)
+    RegClasses.push_back(CodeGenRegisterClass(*this, RCs[i]));
+}
+
+CodeGenRegister *CodeGenRegBank::getReg(Record *Def) {
+  if (Def2Reg.empty())
+    for (unsigned i = 0, e = Registers.size(); i != e; ++i)
+      Def2Reg[Registers[i].TheDef] = &Registers[i];
+
+  if (CodeGenRegister *Reg = Def2Reg[Def])
+    return Reg;
+
+  throw TGError(Def->getLoc(), "Not a known Register!");
+}
+
+CodeGenRegisterClass *CodeGenRegBank::getRegClass(Record *Def) {
+  if (Def2RC.empty())
+    for (unsigned i = 0, e = RegClasses.size(); i != e; ++i)
+      Def2RC[RegClasses[i].TheDef] = &RegClasses[i];
+
+  if (CodeGenRegisterClass *RC = Def2RC[Def])
+    return RC;
+
+  throw TGError(Def->getLoc(), "Not a known RegisterClass!");
+}
+
+Record *CodeGenRegBank::getCompositeSubRegIndex(Record *A, Record *B,
+                                                bool create) {
+  // Look for an existing entry.
+  Record *&Comp = Composite[std::make_pair(A, B)];
+  if (Comp || !create)
+    return Comp;
+
+  // None exists, synthesize one.
+  std::string Name = A->getName() + "_then_" + B->getName();
+  Comp = new Record(Name, SMLoc(), Records);
+  Records.addDef(Comp);
+  SubRegIndices.push_back(Comp);
+  return Comp;
+}
+
+unsigned CodeGenRegBank::getSubRegIndexNo(Record *idx) {
+  std::vector<Record*>::const_iterator i =
+    std::find(SubRegIndices.begin(), SubRegIndices.end(), idx);
+  assert(i != SubRegIndices.end() && "Not a SubRegIndex");
+  return (i - SubRegIndices.begin()) + 1;
+}
+
+void CodeGenRegBank::computeComposites() {
+  // Precompute all sub-register maps. This will create Composite entries for
+  // all inferred sub-register indices.
+  for (unsigned i = 0, e = Registers.size(); i != e; ++i)
+    Registers[i].getSubRegs(*this);
+
+  for (unsigned i = 0, e = Registers.size(); i != e; ++i) {
+    CodeGenRegister *Reg1 = &Registers[i];
+    const CodeGenRegister::SubRegMap &SRM1 = Reg1->getSubRegs();
+    for (CodeGenRegister::SubRegMap::const_iterator i1 = SRM1.begin(),
+         e1 = SRM1.end(); i1 != e1; ++i1) {
+      Record *Idx1 = i1->first;
+      CodeGenRegister *Reg2 = i1->second;
+      // Ignore identity compositions.
+      if (Reg1 == Reg2)
+        continue;
+      const CodeGenRegister::SubRegMap &SRM2 = Reg2->getSubRegs();
+      // Try composing Idx1 with another SubRegIndex.
+      for (CodeGenRegister::SubRegMap::const_iterator i2 = SRM2.begin(),
+           e2 = SRM2.end(); i2 != e2; ++i2) {
+        std::pair<Record*, Record*> IdxPair(Idx1, i2->first);
+        CodeGenRegister *Reg3 = i2->second;
+        // Ignore identity compositions.
+        if (Reg2 == Reg3)
+          continue;
+        // OK Reg1:IdxPair == Reg3. Find the index with Reg:Idx == Reg3.
+        for (CodeGenRegister::SubRegMap::const_iterator i1d = SRM1.begin(),
+             e1d = SRM1.end(); i1d != e1d; ++i1d) {
+          if (i1d->second == Reg3) {
+            std::pair<CompositeMap::iterator, bool> Ins =
+              Composite.insert(std::make_pair(IdxPair, i1d->first));
+            // Conflicting composition? Emit a warning but allow it.
+            if (!Ins.second && Ins.first->second != i1d->first) {
+              errs() << "Warning: SubRegIndex " << getQualifiedName(Idx1)
+                     << " and " << getQualifiedName(IdxPair.second)
+                     << " compose ambiguously as "
+                     << getQualifiedName(Ins.first->second) << " or "
+                     << getQualifiedName(i1d->first) << "\n";
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // We don't care about the difference between (Idx1, Idx2) -> Idx2 and invalid
+  // compositions, so remove any mappings of that form.
+  for (CompositeMap::iterator i = Composite.begin(), e = Composite.end();
+       i != e;) {
+    CompositeMap::iterator j = i;
+    ++i;
+    if (j->first.second == j->second)
+      Composite.erase(j);
+  }
+}
+
+// Compute sets of overlapping registers.
+//
+// The standard set is all super-registers and all sub-registers, but the
+// target description can add arbitrary overlapping registers via the 'Aliases'
+// field. This complicates things, but we can compute overlapping sets using
+// the following rules:
+//
+// 1. The relation overlap(A, B) is reflexive and symmetric but not transitive.
+//
+// 2. overlap(A, B) implies overlap(A, S) for all S in supers(B).
+//
+// Alternatively:
+//
+//    overlap(A, B) iff there exists:
+//    A' in { A, subregs(A) } and B' in { B, subregs(B) } such that:
+//    A' = B' or A' in aliases(B') or B' in aliases(A').
+//
+// Here subregs(A) is the full flattened sub-register set returned by
+// A.getSubRegs() while aliases(A) is simply the special 'Aliases' field in the
+// description of register A.
+//
+// This also implies that registers with a common sub-register are considered
+// overlapping. This can happen when forming register pairs:
+//
+//    P0 = (R0, R1)
+//    P1 = (R1, R2)
+//    P2 = (R2, R3)
+//
+// In this case, we will infer an overlap between P0 and P1 because of the
+// shared sub-register R1. There is no overlap between P0 and P2.
+//
+void CodeGenRegBank::
+computeOverlaps(std::map<const CodeGenRegister*, CodeGenRegister::Set> &Map) {
+  assert(Map.empty());
+
+  // Collect overlaps that don't follow from rule 2.
+  for (unsigned i = 0, e = Registers.size(); i != e; ++i) {
+    CodeGenRegister *Reg = &Registers[i];
+    CodeGenRegister::Set &Overlaps = Map[Reg];
+
+    // Reg overlaps itself.
+    Overlaps.insert(Reg);
+
+    // All super-registers overlap.
+    const CodeGenRegister::SuperRegList &Supers = Reg->getSuperRegs();
+    Overlaps.insert(Supers.begin(), Supers.end());
+
+    // Form symmetrical relations from the special Aliases[] lists.
+    std::vector<Record*> RegList = Reg->TheDef->getValueAsListOfDefs("Aliases");
+    for (unsigned i2 = 0, e2 = RegList.size(); i2 != e2; ++i2) {
+      CodeGenRegister *Reg2 = getReg(RegList[i2]);
+      CodeGenRegister::Set &Overlaps2 = Map[Reg2];
+      const CodeGenRegister::SuperRegList &Supers2 = Reg2->getSuperRegs();
+      // Reg overlaps Reg2 which implies it overlaps supers(Reg2).
+      Overlaps.insert(Reg2);
+      Overlaps.insert(Supers2.begin(), Supers2.end());
+      Overlaps2.insert(Reg);
+      Overlaps2.insert(Supers.begin(), Supers.end());
+    }
+  }
+
+  // Apply rule 2. and inherit all sub-register overlaps.
+  for (unsigned i = 0, e = Registers.size(); i != e; ++i) {
+    CodeGenRegister *Reg = &Registers[i];
+    CodeGenRegister::Set &Overlaps = Map[Reg];
+    const CodeGenRegister::SubRegMap &SRM = Reg->getSubRegs();
+    for (CodeGenRegister::SubRegMap::const_iterator i2 = SRM.begin(),
+         e2 = SRM.end(); i2 != e2; ++i2) {
+      CodeGenRegister::Set &Overlaps2 = Map[i2->second];
+      Overlaps.insert(Overlaps2.begin(), Overlaps2.end());
+    }
+  }
+}
+
+void CodeGenRegBank::computeDerivedInfo() {
+  computeComposites();
+}
+
+/// getRegisterClassForRegister - Find the register class that contains the
+/// specified physical register.  If the register is not in a register class,
+/// return null. If the register is in multiple classes, and the classes have a
+/// superset-subset relationship and the same set of types, return the
+/// superclass.  Otherwise return null.
+const CodeGenRegisterClass*
+CodeGenRegBank::getRegClassForRegister(Record *R) {
+  const CodeGenRegister *Reg = getReg(R);
+  const std::vector<CodeGenRegisterClass> &RCs = getRegClasses();
+  const CodeGenRegisterClass *FoundRC = 0;
+  for (unsigned i = 0, e = RCs.size(); i != e; ++i) {
+    const CodeGenRegisterClass &RC = RCs[i];
+    if (!RC.contains(Reg))
+      continue;
+
+    // If this is the first class that contains the register,
+    // make a note of it and go on to the next class.
+    if (!FoundRC) {
+      FoundRC = &RC;
+      continue;
+    }
+
+    // If a register's classes have different types, return null.
+    if (RC.getValueTypes() != FoundRC->getValueTypes())
+      return 0;
+
+    // Check to see if the previously found class that contains
+    // the register is a subclass of the current class. If so,
+    // prefer the superclass.
+    if (RC.hasSubClass(FoundRC)) {
+      FoundRC = &RC;
+      continue;
+    }
+
+    // Check to see if the previously found class that contains
+    // the register is a superclass of the current class. If so,
+    // prefer the superclass.
+    if (FoundRC->hasSubClass(&RC))
+      continue;
+
+    // Multiple classes, and neither is a superclass of the other.
+    // Return null.
+    return 0;
+  }
+  return FoundRC;
+}
diff --git a/utils/TableGen/CodeGenRegisters.h b/utils/TableGen/CodeGenRegisters.h
index f6d6af8..55f0b9b 100644
--- a/utils/TableGen/CodeGenRegisters.h
+++ b/utils/TableGen/CodeGenRegisters.h
@@ -15,34 +15,85 @@
 #ifndef CODEGEN_REGISTERS_H
 #define CODEGEN_REGISTERS_H
 
+#include "Record.h"
+#include "SetTheory.h"
 #include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+#include <cstdlib>
+#include <map>
 #include <string>
-#include <vector>
 #include <set>
-#include <cstdlib>
+#include <vector>
 
 namespace llvm {
-  class Record;
+  class CodeGenRegBank;
 
   /// CodeGenRegister - Represents a register definition.
   struct CodeGenRegister {
     Record *TheDef;
-    const std::string &getName() const;
-    unsigned DeclaredSpillSize, DeclaredSpillAlignment;
     unsigned EnumValue;
-    CodeGenRegister(Record *R);
+    unsigned CostPerUse;
+
+    // Map SubRegIndex -> Register.
+    typedef std::map<Record*, CodeGenRegister*, LessRecord> SubRegMap;
+
+    CodeGenRegister(Record *R, unsigned Enum);
+
+    const std::string &getName() const;
+
+    // Get a map of sub-registers computed lazily.
+    // This includes unique entries for all sub-sub-registers.
+    const SubRegMap &getSubRegs(CodeGenRegBank&);
+
+    const SubRegMap &getSubRegs() const {
+      assert(SubRegsComplete && "Must precompute sub-registers");
+      return SubRegs;
+    }
+
+    // Add sub-registers to OSet following a pre-order defined by the .td file.
+    void addSubRegsPreOrder(SetVector<CodeGenRegister*> &OSet) const;
+
+    // List of super-registers in topological order, small to large.
+    typedef std::vector<CodeGenRegister*> SuperRegList;
+
+    // Get the list of super-registers.
+    // This is only valid after computeDerivedInfo has visited all registers.
+    const SuperRegList &getSuperRegs() const {
+      assert(SubRegsComplete && "Must precompute sub-registers");
+      return SuperRegs;
+    }
+
+    // Order CodeGenRegister pointers by EnumValue.
+    struct Less {
+      bool operator()(const CodeGenRegister *A,
+                      const CodeGenRegister *B) const {
+        return A->EnumValue < B->EnumValue;
+      }
+    };
+
+    // Canonically ordered set.
+    typedef std::set<const CodeGenRegister*, Less> Set;
+
+  private:
+    bool SubRegsComplete;
+    SubRegMap SubRegs;
+    SuperRegList SuperRegs;
   };
 
 
-  struct CodeGenRegisterClass {
+  class CodeGenRegisterClass {
+    CodeGenRegister::Set Members;
+    const std::vector<Record*> *Elements;
+  public:
     Record *TheDef;
     std::string Namespace;
-    std::vector<Record*> Elements;
     std::vector<MVT::SimpleValueType> VTs;
     unsigned SpillSize;
     unsigned SpillAlignment;
     int CopyCost;
+    bool Allocatable;
     // Map SubRegIndex -> RegisterClass
     DenseMap<Record*,Record*> SubRegClasses;
     std::string MethodProtos, MethodBodies;
@@ -58,13 +109,10 @@ namespace llvm {
       abort();
     }
 
-    bool containsRegister(Record *R) const {
-      for (unsigned i = 0, e = Elements.size(); i != e; ++i)
-        if (Elements[i] == R) return true;
-      return false;
-    }
+    // Return true if this this class contains the register.
+    bool contains(const CodeGenRegister*) const;
 
-    // Returns true if RC is a strict subclass.
+    // Returns true if RC is a subclass.
     // RC is a sub-class of this class if it is a valid replacement for any
     // instruction operand where a register of this classis required. It must
     // satisfy these conditions:
@@ -73,29 +121,86 @@ namespace llvm {
     // 2. The RC spill size must not be smaller than our spill size.
     // 3. RC spill alignment must be compatible with ours.
     //
-    bool hasSubClass(const CodeGenRegisterClass *RC) const {
+    bool hasSubClass(const CodeGenRegisterClass *RC) const;
 
-      if (RC->Elements.size() > Elements.size() ||
-          (SpillAlignment && RC->SpillAlignment % SpillAlignment) ||
-          SpillSize > RC->SpillSize)
-        return false;
+    // Returns an ordered list of class members.
+    // The order of registers is the same as in the .td file.
+    ArrayRef<Record*> getOrder() const {
+      return *Elements;
+    }
 
-      std::set<Record*> RegSet;
-      for (unsigned i = 0, e = Elements.size(); i != e; ++i) {
-        Record *Reg = Elements[i];
-        RegSet.insert(Reg);
-      }
+    CodeGenRegisterClass(CodeGenRegBank&, Record *R);
+  };
 
-      for (unsigned i = 0, e = RC->Elements.size(); i != e; ++i) {
-        Record *Reg = RC->Elements[i];
-        if (!RegSet.count(Reg))
-          return false;
-      }
+  // CodeGenRegBank - Represent a target's registers and the relations between
+  // them.
+  class CodeGenRegBank {
+    RecordKeeper &Records;
+    SetTheory Sets;
+
+    std::vector<Record*> SubRegIndices;
+    unsigned NumNamedIndices;
+    std::vector<CodeGenRegister> Registers;
+    DenseMap<Record*, CodeGenRegister*> Def2Reg;
+
+    std::vector<CodeGenRegisterClass> RegClasses;
+    DenseMap<Record*, CodeGenRegisterClass*> Def2RC;
+
+    // Composite SubRegIndex instances.
+    // Map (SubRegIndex, SubRegIndex) -> SubRegIndex.
+    typedef DenseMap<std::pair<Record*, Record*>, Record*> CompositeMap;
+    CompositeMap Composite;
+
+    // Populate the Composite map from sub-register relationships.
+    void computeComposites();
+
+  public:
+    CodeGenRegBank(RecordKeeper&);
+
+    SetTheory &getSets() { return Sets; }
 
-      return true;
+    // Sub-register indices. The first NumNamedIndices are defined by the user
+    // in the .td files. The rest are synthesized such that all sub-registers
+    // have a unique name.
+    const std::vector<Record*> &getSubRegIndices() { return SubRegIndices; }
+    unsigned getNumNamedIndices() { return NumNamedIndices; }
+
+    // Map a SubRegIndex Record to its enum value.
+    unsigned getSubRegIndexNo(Record *idx);
+
+    // Find or create a sub-register index representing the A+B composition.
+    Record *getCompositeSubRegIndex(Record *A, Record *B, bool create = false);
+
+    const std::vector<CodeGenRegister> &getRegisters() { return Registers; }
+
+    // Find a register from its Record def.
+    CodeGenRegister *getReg(Record*);
+
+    const std::vector<CodeGenRegisterClass> &getRegClasses() {
+      return RegClasses;
     }
 
-    CodeGenRegisterClass(Record *R);
+    // Find a register class from its def.
+    CodeGenRegisterClass *getRegClass(Record*);
+
+    /// getRegisterClassForRegister - Find the register class that contains the
+    /// specified physical register.  If the register is not in a register
+    /// class, return null. If the register is in multiple classes, and the
+    /// classes have a superset-subset relationship and the same set of types,
+    /// return the superclass.  Otherwise return null.
+    const CodeGenRegisterClass* getRegClassForRegister(Record *R);
+
+    // Computed derived records such as missing sub-register indices.
+    void computeDerivedInfo();
+
+    // Compute full overlap sets for every register. These sets include the
+    // rarely used aliases that are neither sub nor super-registers.
+    //
+    // Map[R1].count(R2) is reflexive and symmetric, but not transitive.
+    //
+    // If R1 is a sub-register of R2, Map[R1] is a subset of Map[R2].
+    void computeOverlaps(std::map<const CodeGenRegister*,
+                                  CodeGenRegister::Set> &Map);
   };
 }
 
diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp
index cc09c8d..4ce8022 100644
--- a/utils/TableGen/CodeGenTarget.cpp
+++ b/utils/TableGen/CodeGenTarget.cpp
@@ -90,6 +90,7 @@ std::string llvm::getEnumName(MVT::SimpleValueType T) {
   case MVT::Metadata: return "MVT::Metadata";
   case MVT::iPTR:     return "MVT::iPTR";
   case MVT::iPTRAny:  return "MVT::iPTRAny";
+  case MVT::untyped:  return "MVT::untyped";
   default: assert(0 && "ILLEGAL VALUE TYPE!"); return "";
   }
 }
@@ -98,17 +99,18 @@ std::string llvm::getEnumName(MVT::SimpleValueType T) {
 /// namespace qualifier if the record contains one.
 ///
 std::string llvm::getQualifiedName(const Record *R) {
-  std::string Namespace = R->getValueAsString("Namespace");
+  std::string Namespace;
+  if (R->getValue("Namespace"))
+     Namespace = R->getValueAsString("Namespace");
   if (Namespace.empty()) return R->getName();
   return Namespace + "::" + R->getName();
 }
 
 
-
-
 /// getTarget - Return the current instance of the Target class.
 ///
-CodeGenTarget::CodeGenTarget(RecordKeeper &records) : Records(records) {
+CodeGenTarget::CodeGenTarget(RecordKeeper &records)
+  : Records(records), RegBank(0) {
   std::vector<Record*> Targets = Records.getAllDerivedDefinitions("Target");
   if (Targets.size() == 0)
     throw std::string("ERROR: No 'Target' subclasses defined!");
@@ -156,47 +158,16 @@ Record *CodeGenTarget::getAsmWriter() const {
   return LI[AsmWriterNum];
 }
 
-void CodeGenTarget::ReadRegisters() const {
-  std::vector<Record*> Regs = Records.getAllDerivedDefinitions("Register");
-  if (Regs.empty())
-    throw std::string("No 'Register' subclasses defined!");
-  std::sort(Regs.begin(), Regs.end(), LessRecord());
-
-  Registers.reserve(Regs.size());
-  Registers.assign(Regs.begin(), Regs.end());
-  // Assign the enumeration values.
-  for (unsigned i = 0, e = Registers.size(); i != e; ++i)
-    Registers[i].EnumValue = i + 1;
-}
-
-CodeGenRegister::CodeGenRegister(Record *R) : TheDef(R) {
-  DeclaredSpillSize = R->getValueAsInt("SpillSize");
-  DeclaredSpillAlignment = R->getValueAsInt("SpillAlignment");
-}
-
-const std::string &CodeGenRegister::getName() const {
-  return TheDef->getName();
-}
-
-void CodeGenTarget::ReadSubRegIndices() const {
-  SubRegIndices = Records.getAllDerivedDefinitions("SubRegIndex");
-  std::sort(SubRegIndices.begin(), SubRegIndices.end(), LessRecord());
-}
-
-void CodeGenTarget::ReadRegisterClasses() const {
-  std::vector<Record*> RegClasses =
-    Records.getAllDerivedDefinitions("RegisterClass");
-  if (RegClasses.empty())
-    throw std::string("No 'RegisterClass' subclasses defined!");
-
-  RegisterClasses.reserve(RegClasses.size());
-  RegisterClasses.assign(RegClasses.begin(), RegClasses.end());
+CodeGenRegBank &CodeGenTarget::getRegBank() const {
+  if (!RegBank)
+    RegBank = new CodeGenRegBank(Records);
+  return *RegBank;
 }
 
 /// getRegisterByName - If there is a register with the specific AsmName,
 /// return it.
 const CodeGenRegister *CodeGenTarget::getRegisterByName(StringRef Name) const {
-  const std::vector<CodeGenRegister> &Regs = getRegisters();
+  const std::vector<CodeGenRegister> &Regs = getRegBank().getRegisters();
   for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
     const CodeGenRegister &Reg = Regs[i];
     if (Reg.TheDef->getValueAsString("AsmName") == Name)
@@ -208,15 +179,14 @@ const CodeGenRegister *CodeGenTarget::getRegisterByName(StringRef Name) const {
 
 std::vector<MVT::SimpleValueType> CodeGenTarget::
 getRegisterVTs(Record *R) const {
+  const CodeGenRegister *Reg = getRegBank().getReg(R);
   std::vector<MVT::SimpleValueType> Result;
   const std::vector<CodeGenRegisterClass> &RCs = getRegisterClasses();
   for (unsigned i = 0, e = RCs.size(); i != e; ++i) {
-    const CodeGenRegisterClass &RC = RegisterClasses[i];
-    for (unsigned ei = 0, ee = RC.Elements.size(); ei != ee; ++ei) {
-      if (R == RC.Elements[ei]) {
-        const std::vector<MVT::SimpleValueType> &InVTs = RC.getValueTypes();
-        Result.insert(Result.end(), InVTs.begin(), InVTs.end());
-      }
+    const CodeGenRegisterClass &RC = RCs[i];
+    if (RC.contains(Reg)) {
+      const std::vector<MVT::SimpleValueType> &InVTs = RC.getValueTypes();
+      Result.insert(Result.end(), InVTs.begin(), InVTs.end());
     }
   }
 
@@ -227,70 +197,6 @@ getRegisterVTs(Record *R) const {
 }
 
 
-CodeGenRegisterClass::CodeGenRegisterClass(Record *R) : TheDef(R) {
-  // Rename anonymous register classes.
-  if (R->getName().size() > 9 && R->getName()[9] == '.') {
-    static unsigned AnonCounter = 0;
-    R->setName("AnonRegClass_"+utostr(AnonCounter++));
-  }
-
-  std::vector<Record*> TypeList = R->getValueAsListOfDefs("RegTypes");
-  for (unsigned i = 0, e = TypeList.size(); i != e; ++i) {
-    Record *Type = TypeList[i];
-    if (!Type->isSubClassOf("ValueType"))
-      throw "RegTypes list member '" + Type->getName() +
-        "' does not derive from the ValueType class!";
-    VTs.push_back(getValueType(Type));
-  }
-  assert(!VTs.empty() && "RegisterClass must contain at least one ValueType!");
-
-  std::vector<Record*> RegList = R->getValueAsListOfDefs("MemberList");
-  for (unsigned i = 0, e = RegList.size(); i != e; ++i) {
-    Record *Reg = RegList[i];
-    if (!Reg->isSubClassOf("Register"))
-      throw "Register Class member '" + Reg->getName() +
-            "' does not derive from the Register class!";
-    Elements.push_back(Reg);
-  }
-
-  // SubRegClasses is a list<dag> containing (RC, subregindex, ...) dags.
-  ListInit *SRC = R->getValueAsListInit("SubRegClasses");
-  for (ListInit::const_iterator i = SRC->begin(), e = SRC->end(); i != e; ++i) {
-    DagInit *DAG = dynamic_cast<DagInit*>(*i);
-    if (!DAG) throw "SubRegClasses must contain DAGs";
-    DefInit *DAGOp = dynamic_cast<DefInit*>(DAG->getOperator());
-    Record *RCRec;
-    if (!DAGOp || !(RCRec = DAGOp->getDef())->isSubClassOf("RegisterClass"))
-      throw "Operator '" + DAG->getOperator()->getAsString() +
-        "' in SubRegClasses is not a RegisterClass";
-    // Iterate over args, all SubRegIndex instances.
-    for (DagInit::const_arg_iterator ai = DAG->arg_begin(), ae = DAG->arg_end();
-         ai != ae; ++ai) {
-      DefInit *Idx = dynamic_cast<DefInit*>(*ai);
-      Record *IdxRec;
-      if (!Idx || !(IdxRec = Idx->getDef())->isSubClassOf("SubRegIndex"))
-        throw "Argument '" + (*ai)->getAsString() +
-          "' in SubRegClasses is not a SubRegIndex";
-      if (!SubRegClasses.insert(std::make_pair(IdxRec, RCRec)).second)
-        throw "SubRegIndex '" + IdxRec->getName() + "' mentioned twice";
-    }
-  }
-
-  // Allow targets to override the size in bits of the RegisterClass.
-  unsigned Size = R->getValueAsInt("Size");
-
-  Namespace = R->getValueAsString("Namespace");
-  SpillSize = Size ? Size : EVT(VTs[0]).getSizeInBits();
-  SpillAlignment = R->getValueAsInt("Alignment");
-  CopyCost = R->getValueAsInt("CopyCost");
-  MethodBodies = R->getValueAsCode("MethodBodies");
-  MethodProtos = R->getValueAsCode("MethodProtos");
-}
-
-const std::string &CodeGenRegisterClass::getName() const {
-  return TheDef->getName();
-}
-
 void CodeGenTarget::ReadLegalValueTypes() const {
   const std::vector<CodeGenRegisterClass> &RCs = getRegisterClasses();
   for (unsigned i = 0, e = RCs.size(); i != e; ++i)
@@ -454,6 +360,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
   ModRef = ReadWriteMem;
   isOverloaded = false;
   isCommutative = false;
+  canThrow = false;
 
   if (DefName.size() <= 4 ||
       std::string(DefName.begin(), DefName.begin() + 4) != "int_")
@@ -576,10 +483,15 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
       ModRef = ReadWriteArgMem;
     else if (Property->getName() == "Commutative")
       isCommutative = true;
+    else if (Property->getName() == "Throws")
+      canThrow = true;
     else if (Property->isSubClassOf("NoCapture")) {
       unsigned ArgNo = Property->getValueAsInt("ArgNo");
       ArgumentAttributes.push_back(std::make_pair(ArgNo, NoCapture));
     } else
       assert(0 && "Unknown property!");
   }
+
+  // Sort the argument attributes for later benefit.
+  std::sort(ArgumentAttributes.begin(), ArgumentAttributes.end());
 }
diff --git a/utils/TableGen/CodeGenTarget.h b/utils/TableGen/CodeGenTarget.h
index 4e04154..2516515 100644
--- a/utils/TableGen/CodeGenTarget.h
+++ b/utils/TableGen/CodeGenTarget.h
@@ -65,13 +65,8 @@ class CodeGenTarget {
   Record *TargetRec;
 
   mutable DenseMap<const Record*, CodeGenInstruction*> Instructions;
-  mutable std::vector<CodeGenRegister> Registers;
-  mutable std::vector<Record*> SubRegIndices;
-  mutable std::vector<CodeGenRegisterClass> RegisterClasses;
+  mutable CodeGenRegBank *RegBank;
   mutable std::vector<MVT::SimpleValueType> LegalValueTypes;
-  void ReadRegisters() const;
-  void ReadSubRegIndices() const;
-  void ReadRegisterClasses() const;
   void ReadInstructions() const;
   void ReadLegalValueTypes() const;
 
@@ -98,95 +93,23 @@ public:
   ///
   Record *getAsmWriter() const;
 
+  /// getRegBank - Return the register bank description.
+  CodeGenRegBank &getRegBank() const;
+
   const std::vector<CodeGenRegister> &getRegisters() const {
-    if (Registers.empty()) ReadRegisters();
-    return Registers;
+    return getRegBank().getRegisters();
   }
 
   /// getRegisterByName - If there is a register with the specific AsmName,
   /// return it.
   const CodeGenRegister *getRegisterByName(StringRef Name) const;
 
-  const std::vector<Record*> &getSubRegIndices() const {
-    if (SubRegIndices.empty()) ReadSubRegIndices();
-    return SubRegIndices;
-  }
-
-  // Map a SubRegIndex Record to its number.
-  unsigned getSubRegIndexNo(Record *idx) const {
-    if (SubRegIndices.empty()) ReadSubRegIndices();
-    std::vector<Record*>::const_iterator i =
-      std::find(SubRegIndices.begin(), SubRegIndices.end(), idx);
-    assert(i != SubRegIndices.end() && "Not a SubRegIndex");
-    return (i - SubRegIndices.begin()) + 1;
-  }
-
   const std::vector<CodeGenRegisterClass> &getRegisterClasses() const {
-    if (RegisterClasses.empty()) ReadRegisterClasses();
-    return RegisterClasses;
+    return getRegBank().getRegClasses();
   }
 
   const CodeGenRegisterClass &getRegisterClass(Record *R) const {
-    const std::vector<CodeGenRegisterClass> &RC = getRegisterClasses();
-    for (unsigned i = 0, e = RC.size(); i != e; ++i)
-      if (RC[i].TheDef == R)
-        return RC[i];
-    assert(0 && "Didn't find the register class");
-    abort();
-  }
-
-  /// getRegisterClassForRegister - Find the register class that contains the
-  /// specified physical register.  If the register is not in a register
-  /// class, return null. If the register is in multiple classes, and the
-  /// classes have a superset-subset relationship and the same set of
-  /// types, return the superclass.  Otherwise return null.
-  const CodeGenRegisterClass *getRegisterClassForRegister(Record *R) const {
-    const std::vector<CodeGenRegisterClass> &RCs = getRegisterClasses();
-    const CodeGenRegisterClass *FoundRC = 0;
-    for (unsigned i = 0, e = RCs.size(); i != e; ++i) {
-      const CodeGenRegisterClass &RC = RegisterClasses[i];
-      for (unsigned ei = 0, ee = RC.Elements.size(); ei != ee; ++ei) {
-        if (R != RC.Elements[ei])
-          continue;
-
-        // If a register's classes have different types, return null.
-        if (FoundRC && RC.getValueTypes() != FoundRC->getValueTypes())
-          return 0;
-
-        // If this is the first class that contains the register,
-        // make a note of it and go on to the next class.
-        if (!FoundRC) {
-          FoundRC = &RC;
-          break;
-        }
-
-        std::vector<Record *> Elements(RC.Elements);
-        std::vector<Record *> FoundElements(FoundRC->Elements);
-        std::sort(Elements.begin(), Elements.end());
-        std::sort(FoundElements.begin(), FoundElements.end());
-
-        // Check to see if the previously found class that contains
-        // the register is a subclass of the current class. If so,
-        // prefer the superclass.
-        if (std::includes(Elements.begin(), Elements.end(),
-                          FoundElements.begin(), FoundElements.end())) {
-          FoundRC = &RC;
-          break;
-        }
-
-        // Check to see if the previously found class that contains
-        // the register is a superclass of the current class. If so,
-        // prefer the superclass.
-        if (std::includes(FoundElements.begin(), FoundElements.end(),
-                          Elements.begin(), Elements.end()))
-          break;
-
-        // Multiple classes, and neither is a superclass of the other.
-        // Return null.
-        return 0;
-      }
-    }
-    return FoundRC;
+    return *getRegBank().getRegClass(R);
   }
 
   /// getRegisterVTs - Find the union of all possible SimpleValueTypes for the
diff --git a/utils/TableGen/DAGISelEmitter.h b/utils/TableGen/DAGISelEmitter.h
index 2117e65..35ab550 100644
--- a/utils/TableGen/DAGISelEmitter.h
+++ b/utils/TableGen/DAGISelEmitter.h
@@ -16,7 +16,6 @@
 
 #include "TableGenBackend.h"
 #include "CodeGenDAGPatterns.h"
-#include <set>
 
 namespace llvm {
 
diff --git a/utils/TableGen/DAGISelMatcher.cpp b/utils/TableGen/DAGISelMatcher.cpp
index 2afa2b9..b12e101 100644
--- a/utils/TableGen/DAGISelMatcher.cpp
+++ b/utils/TableGen/DAGISelMatcher.cpp
@@ -83,6 +83,15 @@ ScopeMatcher::~ScopeMatcher() {
 }
 
 
+CheckPredicateMatcher::CheckPredicateMatcher(const TreePredicateFn &pred)
+  : Matcher(CheckPredicate), Pred(pred.getOrigPatFragRecord()) {}
+
+TreePredicateFn CheckPredicateMatcher::getPredicate() const {
+  return TreePredicateFn(Pred);
+}
+
+
+
 // printImpl methods.
 
 void ScopeMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
@@ -129,7 +138,7 @@ printImpl(raw_ostream &OS, unsigned indent) const {
 }
 
 void CheckPredicateMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
-  OS.indent(indent) << "CheckPredicate " << PredName << '\n';
+  OS.indent(indent) << "CheckPredicate " << getPredicate().getFnName() << '\n';
 }
 
 void CheckOpcodeMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
@@ -263,7 +272,7 @@ unsigned CheckPatternPredicateMatcher::getHashImpl() const {
 }
 
 unsigned CheckPredicateMatcher::getHashImpl() const {
-  return HashString(PredName);
+  return HashString(getPredicate().getFnName());
 }
 
 unsigned CheckOpcodeMatcher::getHashImpl() const {
@@ -301,7 +310,6 @@ bool CheckOpcodeMatcher::isEqualImpl(const Matcher *M) const {
           Opcode.getEnumName();
 }
 
-
 bool EmitNodeMatcherCommon::isEqualImpl(const Matcher *m) const {
   const EmitNodeMatcherCommon *M = cast<EmitNodeMatcherCommon>(m);
   return M->OpcodeName == OpcodeName && M->VTs == VTs &&
diff --git a/utils/TableGen/DAGISelMatcher.h b/utils/TableGen/DAGISelMatcher.h
index 8ffe412..dcb8da7 100644
--- a/utils/TableGen/DAGISelMatcher.h
+++ b/utils/TableGen/DAGISelMatcher.h
@@ -25,6 +25,8 @@ namespace llvm {
   class ComplexPattern;
   class Record;
   class SDNodeInfo;
+  class TreePredicateFn;
+  class TreePattern;
 
 Matcher *ConvertPatternToMatcher(const PatternToMatch &Pattern,unsigned Variant,
                                  const CodeGenDAGPatterns &CGP);
@@ -419,12 +421,11 @@ private:
 /// CheckPredicateMatcher - This checks the target-specific predicate to
 /// see if the node is acceptable.
 class CheckPredicateMatcher : public Matcher {
-  StringRef PredName;
+  TreePattern *Pred;
 public:
-  CheckPredicateMatcher(StringRef predname)
-    : Matcher(CheckPredicate), PredName(predname) {}
+  CheckPredicateMatcher(const TreePredicateFn &pred);
 
-  StringRef getPredicateName() const { return PredName; }
+  TreePredicateFn getPredicate() const;
 
   static inline bool classof(const Matcher *N) {
     return N->getKind() == CheckPredicate;
@@ -436,7 +437,7 @@ public:
 private:
   virtual void printImpl(raw_ostream &OS, unsigned indent) const;
   virtual bool isEqualImpl(const Matcher *M) const {
-    return cast<CheckPredicateMatcher>(M)->PredName == PredName;
+    return cast<CheckPredicateMatcher>(M)->Pred == Pred;
   }
   virtual unsigned getHashImpl() const;
 };
diff --git a/utils/TableGen/DAGISelMatcherEmitter.cpp b/utils/TableGen/DAGISelMatcherEmitter.cpp
index 0b69af4..acb0135 100644
--- a/utils/TableGen/DAGISelMatcherEmitter.cpp
+++ b/utils/TableGen/DAGISelMatcherEmitter.cpp
@@ -33,8 +33,12 @@ OmitComments("omit-comments", cl::desc("Do not generate comments"),
 namespace {
 class MatcherTableEmitter {
   const CodeGenDAGPatterns &CGP;
-  StringMap<unsigned> NodePredicateMap, PatternPredicateMap;
-  std::vector<std::string> NodePredicates, PatternPredicates;
+  
+  DenseMap<TreePattern *, unsigned> NodePredicateMap;
+  std::vector<TreePredicateFn> NodePredicates;
+  
+  StringMap<unsigned> PatternPredicateMap;
+  std::vector<std::string> PatternPredicates;
 
   DenseMap<const ComplexPattern*, unsigned> ComplexPatternMap;
   std::vector<const ComplexPattern*> ComplexPatterns;
@@ -57,14 +61,15 @@ private:
   unsigned EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx,
                        formatted_raw_ostream &OS);
 
-  unsigned getNodePredicate(StringRef PredName) {
-    unsigned &Entry = NodePredicateMap[PredName];
+  unsigned getNodePredicate(TreePredicateFn Pred) {
+    unsigned &Entry = NodePredicateMap[Pred.getOrigPatFragRecord()];
     if (Entry == 0) {
-      NodePredicates.push_back(PredName.str());
+      NodePredicates.push_back(Pred);
       Entry = NodePredicates.size();
     }
     return Entry-1;
   }
+  
   unsigned getPatternPredicate(StringRef PredName) {
     unsigned &Entry = PatternPredicateMap[PredName];
     if (Entry == 0) {
@@ -73,7 +78,6 @@ private:
     }
     return Entry-1;
   }
-
   unsigned getComplexPat(const ComplexPattern &P) {
     unsigned &Entry = ComplexPatternMap[&P];
     if (Entry == 0) {
@@ -239,7 +243,7 @@ EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx,
     return 2;
 
   case Matcher::CheckPatternPredicate: {
-    StringRef Pred = cast<CheckPatternPredicateMatcher>(N)->getPredicate();
+    StringRef Pred =cast<CheckPatternPredicateMatcher>(N)->getPredicate();
     OS << "OPC_CheckPatternPredicate, " << getPatternPredicate(Pred) << ',';
     if (!OmitComments)
       OS.PadToColumn(CommentIndent) << "// " << Pred;
@@ -247,10 +251,10 @@ EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx,
     return 2;
   }
   case Matcher::CheckPredicate: {
-    StringRef Pred = cast<CheckPredicateMatcher>(N)->getPredicateName();
+    TreePredicateFn Pred = cast<CheckPredicateMatcher>(N)->getPredicate();
     OS << "OPC_CheckPredicate, " << getNodePredicate(Pred) << ',';
     if (!OmitComments)
-      OS.PadToColumn(CommentIndent) << "// " << Pred;
+      OS.PadToColumn(CommentIndent) << "// " << Pred.getFnName();
     OS << '\n';
     return 2;
   }
@@ -617,25 +621,13 @@ void MatcherTableEmitter::EmitPredicateFunctions(formatted_raw_ostream &OS) {
     OS << "  switch (PredNo) {\n";
     OS << "  default: assert(0 && \"Invalid predicate in table?\");\n";
     for (unsigned i = 0, e = NodePredicates.size(); i != e; ++i) {
-      // FIXME: Storing this by name is horrible.
-      TreePattern *P =PFsByName[NodePredicates[i].substr(strlen("Predicate_"))];
-      assert(P && "Unknown name?");
-
       // Emit the predicate code corresponding to this pattern.
-      std::string Code = P->getRecord()->getValueAsCode("Predicate");
-      assert(!Code.empty() && "No code in this predicate");
-      OS << "  case " << i << ": { // " << NodePredicates[i] << '\n';
-      std::string ClassName;
-      if (P->getOnlyTree()->isLeaf())
-        ClassName = "SDNode";
-      else
-        ClassName =
-          CGP.getSDNodeInfo(P->getOnlyTree()->getOperator()).getSDClassName();
-      if (ClassName == "SDNode")
-        OS << "    SDNode *N = Node;\n";
-      else
-        OS << "    " << ClassName << "*N = cast<" << ClassName << ">(Node);\n";
-      OS << Code << "\n  }\n";
+      TreePredicateFn PredFn = NodePredicates[i];
+      
+      assert(!PredFn.isAlwaysTrue() && "No code in this predicate");
+      OS << "  case " << i << ": { // " << NodePredicates[i].getFnName() <<'\n';
+      
+      OS << PredFn.getCodeToRunOnSDNode() << "\n  }\n";
     }
     OS << "  }\n";
     OS << "}\n\n";
diff --git a/utils/TableGen/DAGISelMatcherGen.cpp b/utils/TableGen/DAGISelMatcherGen.cpp
index 393ac69..a8736fa 100644
--- a/utils/TableGen/DAGISelMatcherGen.cpp
+++ b/utils/TableGen/DAGISelMatcherGen.cpp
@@ -25,12 +25,12 @@ static MVT::SimpleValueType getRegisterValueType(Record *R,
                                                  const CodeGenTarget &T) {
   bool FoundRC = false;
   MVT::SimpleValueType VT = MVT::Other;
+  const CodeGenRegister *Reg = T.getRegBank().getReg(R);
   const std::vector<CodeGenRegisterClass> &RCs = T.getRegisterClasses();
-  std::vector<Record*>::const_iterator Element;
 
   for (unsigned rc = 0, e = RCs.size(); rc != e; ++rc) {
     const CodeGenRegisterClass &RC = RCs[rc];
-    if (!std::count(RC.Elements.begin(), RC.Elements.end(), R))
+    if (!RC.contains(Reg))
       continue;
 
     if (!FoundRC) {
@@ -646,6 +646,35 @@ GetInstPatternNode(const DAGInstruction &Inst, const TreePatternNode *N) {
   return InstPatNode;
 }
 
+static bool
+mayInstNodeLoadOrStore(const TreePatternNode *N,
+                       const CodeGenDAGPatterns &CGP) {
+  Record *Op = N->getOperator();
+  const CodeGenTarget &CGT = CGP.getTargetInfo();
+  CodeGenInstruction &II = CGT.getInstruction(Op);
+  return II.mayLoad || II.mayStore;
+}
+
+static unsigned
+numNodesThatMayLoadOrStore(const TreePatternNode *N,
+                           const CodeGenDAGPatterns &CGP) {
+  if (N->isLeaf())
+    return 0;
+
+  Record *OpRec = N->getOperator();
+  if (!OpRec->isSubClassOf("Instruction"))
+    return 0;
+
+  unsigned Count = 0;
+  if (mayInstNodeLoadOrStore(N, CGP))
+    ++Count;
+
+  for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i)
+    Count += numNodesThatMayLoadOrStore(N->getChild(i), CGP);
+
+  return Count;
+}
+
 void MatcherGen::
 EmitResultInstructionAsOperand(const TreePatternNode *N,
                                SmallVectorImpl<unsigned> &OutputOps) {
@@ -772,21 +801,26 @@ EmitResultInstructionAsOperand(const TreePatternNode *N,
       (Pattern.getSrcPattern()->NodeHasProperty(SDNPVariadic, CGP)))
     NumFixedArityOperands = Pattern.getSrcPattern()->getNumChildren();
 
-  // If this is the root node and any of the nodes matched nodes in the input
-  // pattern have MemRefs in them, have the interpreter collect them and plop
-  // them onto this node.
-  //
-  // FIXME3: This is actively incorrect for result patterns where the root of
-  // the pattern is not the memory reference and is also incorrect when the
-  // result pattern has multiple memory-referencing instructions.  For example,
-  // in the X86 backend, this pattern causes the memrefs to get attached to the
-  // CVTSS2SDrr instead of the MOVSSrm:
+  // If this is the root node and multiple matched nodes in the input pattern
+  // have MemRefs in them, have the interpreter collect them and plop them onto
+  // this node. If there is just one node with MemRefs, leave them on that node
+  // even if it is not the root.
   //
-  //  def : Pat<(extloadf32 addr:$src),
-  //            (CVTSS2SDrr (MOVSSrm addr:$src))>;
-  //
-  bool NodeHasMemRefs =
-    isRoot && Pattern.getSrcPattern()->TreeHasProperty(SDNPMemOperand, CGP);
+  // FIXME3: This is actively incorrect for result patterns with multiple
+  // memory-referencing instructions.
+  bool PatternHasMemOperands =
+    Pattern.getSrcPattern()->TreeHasProperty(SDNPMemOperand, CGP);
+
+  bool NodeHasMemRefs = false;
+  if (PatternHasMemOperands) {
+    unsigned NumNodesThatLoadOrStore =
+      numNodesThatMayLoadOrStore(Pattern.getDstPattern(), CGP);
+    bool NodeIsUniqueLoadOrStore = mayInstNodeLoadOrStore(N, CGP) &&
+                                   NumNodesThatLoadOrStore == 1;
+    NodeHasMemRefs =
+      NodeIsUniqueLoadOrStore || (isRoot && (mayInstNodeLoadOrStore(N, CGP) ||
+                                             NumNodesThatLoadOrStore != 1));
+  }
 
   assert((!ResultVTs.empty() || TreeHasOutGlue || NodeHasChain) &&
          "Node has no result");
diff --git a/utils/TableGen/DAGISelMatcherOpt.cpp b/utils/TableGen/DAGISelMatcherOpt.cpp
index 3169ea1..f996422 100644
--- a/utils/TableGen/DAGISelMatcherOpt.cpp
+++ b/utils/TableGen/DAGISelMatcherOpt.cpp
@@ -18,7 +18,6 @@
 #include "llvm/ADT/StringSet.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include <vector>
 using namespace llvm;
 
 /// ContractNodes - Turn multiple matcher node patterns like 'MoveChild+Record'
diff --git a/utils/TableGen/EDEmitter.cpp b/utils/TableGen/EDEmitter.cpp
index 8415482..daf9617 100644
--- a/utils/TableGen/EDEmitter.cpp
+++ b/utils/TableGen/EDEmitter.cpp
@@ -24,7 +24,6 @@
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 
-#include <map>
 #include <string>
 #include <vector>
 
@@ -597,6 +596,7 @@ static int ARMFlagFromOpName(LiteralConstantEmitter *type,
   IMM("t_adrlabel");
   IMM("t2adrlabel");
   IMM("shift_imm");
+  IMM("ssat_imm");
   IMM("neon_vcvt_imm32");
   IMM("shr_imm8");
   IMM("shr_imm16");
@@ -636,6 +636,7 @@ static int ARMFlagFromOpName(LiteralConstantEmitter *type,
   MISC("addrmode6", "kOperandTypeARMAddrMode6");                  // R, R, I, I
   MISC("am6offset", "kOperandTypeARMAddrMode6Offset");            // R, I, I
   MISC("addrmode6dup", "kOperandTypeARMAddrMode6");               // R, R, I, I
+  MISC("addrmode6oneL32", "kOperandTypeARMAddrMode6");            // R, R, I, I
   MISC("addrmodepc", "kOperandTypeARMAddrModePC");                // R, I
   MISC("addrmode7", "kOperandTypeARMAddrMode7");                  // R
   MISC("reglist", "kOperandTypeARMRegisterList");                 // I, R, ...
diff --git a/utils/TableGen/FastISelEmitter.cpp b/utils/TableGen/FastISelEmitter.cpp
index f01de1d..6c2a767 100644
--- a/utils/TableGen/FastISelEmitter.cpp
+++ b/utils/TableGen/FastISelEmitter.cpp
@@ -19,9 +19,10 @@
 
 #include "FastISelEmitter.h"
 #include "Record.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/VectorExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
 using namespace llvm;
 
 namespace {
@@ -35,36 +36,150 @@ struct InstructionMemo {
   std::string SubRegNo;
   std::vector<std::string>* PhysRegs;
 };
+  
+/// ImmPredicateSet - This uniques predicates (represented as a string) and
+/// gives them unique (small) integer ID's that start at 0.
+class ImmPredicateSet {
+  DenseMap<TreePattern *, unsigned> ImmIDs;
+  std::vector<TreePredicateFn> PredsByName;
+public:
+  
+  unsigned getIDFor(TreePredicateFn Pred) {
+    unsigned &Entry = ImmIDs[Pred.getOrigPatFragRecord()];
+    if (Entry == 0) {
+      PredsByName.push_back(Pred);
+      Entry = PredsByName.size();
+    }
+    return Entry-1;
+  }
+  
+  const TreePredicateFn &getPredicate(unsigned i) {
+    assert(i < PredsByName.size());
+    return PredsByName[i];
+  }
+  
+  typedef std::vector<TreePredicateFn>::const_iterator iterator;
+  iterator begin() const { return PredsByName.begin(); }
+  iterator end() const { return PredsByName.end(); }
+  
+};
 
 /// OperandsSignature - This class holds a description of a list of operand
 /// types. It has utility methods for emitting text based on the operands.
 ///
 struct OperandsSignature {
-  std::vector<std::string> Operands;
+  class OpKind {
+    enum { OK_Reg, OK_FP, OK_Imm, OK_Invalid = -1 };
+    char Repr;
+  public:
+    
+    OpKind() : Repr(OK_Invalid) {}
+    
+    bool operator<(OpKind RHS) const { return Repr < RHS.Repr; }
+    bool operator==(OpKind RHS) const { return Repr == RHS.Repr; }
+
+    static OpKind getReg() { OpKind K; K.Repr = OK_Reg; return K; }
+    static OpKind getFP()  { OpKind K; K.Repr = OK_FP; return K; }
+    static OpKind getImm(unsigned V) {
+      assert((unsigned)OK_Imm+V < 128 &&
+             "Too many integer predicates for the 'Repr' char");
+      OpKind K; K.Repr = OK_Imm+V; return K;
+    }
+    
+    bool isReg() const { return Repr == OK_Reg; }
+    bool isFP() const  { return Repr == OK_FP; }
+    bool isImm() const { return Repr >= OK_Imm; }
+    
+    unsigned getImmCode() const { assert(isImm()); return Repr-OK_Imm; }
+    
+    void printManglingSuffix(raw_ostream &OS, ImmPredicateSet &ImmPredicates,
+                             bool StripImmCodes) const {
+      if (isReg())
+        OS << 'r';
+      else if (isFP())
+        OS << 'f';
+      else {
+        OS << 'i';
+        if (!StripImmCodes)
+          if (unsigned Code = getImmCode())
+            OS << "_" << ImmPredicates.getPredicate(Code-1).getFnName();
+      }
+    }
+  };
+  
+  
+  SmallVector<OpKind, 3> Operands;
 
   bool operator<(const OperandsSignature &O) const {
     return Operands < O.Operands;
   }
+  bool operator==(const OperandsSignature &O) const {
+    return Operands == O.Operands;
+  }
 
   bool empty() const { return Operands.empty(); }
 
+  bool hasAnyImmediateCodes() const {
+    for (unsigned i = 0, e = Operands.size(); i != e; ++i)
+      if (Operands[i].isImm() && Operands[i].getImmCode() != 0)
+        return true;
+    return false;
+  }
+  
+  /// getWithoutImmCodes - Return a copy of this with any immediate codes forced
+  /// to zero.
+  OperandsSignature getWithoutImmCodes() const {
+    OperandsSignature Result;
+    for (unsigned i = 0, e = Operands.size(); i != e; ++i)
+      if (!Operands[i].isImm())
+        Result.Operands.push_back(Operands[i]);
+      else
+        Result.Operands.push_back(OpKind::getImm(0));
+    return Result;
+  }
+  
+  void emitImmediatePredicate(raw_ostream &OS, ImmPredicateSet &ImmPredicates) {
+    bool EmittedAnything = false;
+    for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+      if (!Operands[i].isImm()) continue;
+      
+      unsigned Code = Operands[i].getImmCode();
+      if (Code == 0) continue;
+      
+      if (EmittedAnything)
+        OS << " &&\n        ";
+      
+      TreePredicateFn PredFn = ImmPredicates.getPredicate(Code-1);
+      
+      // Emit the type check.
+      OS << "VT == "
+         << getEnumName(PredFn.getOrigPatFragRecord()->getTree(0)->getType(0))
+         << " && ";
+      
+      
+      OS << PredFn.getFnName() << "(imm" << i <<')';
+      EmittedAnything = true;
+    }
+  }
+  
   /// initialize - Examine the given pattern and initialize the contents
   /// of the Operands array accordingly. Return true if all the operands
   /// are supported, false otherwise.
   ///
-  bool initialize(TreePatternNode *InstPatNode,
-                  const CodeGenTarget &Target,
-                  MVT::SimpleValueType VT) {
-
-    if (!InstPatNode->isLeaf()) {
-      if (InstPatNode->getOperator()->getName() == "imm") {
-        Operands.push_back("i");
-        return true;
-      }
-      if (InstPatNode->getOperator()->getName() == "fpimm") {
-        Operands.push_back("f");
-        return true;
-      }
+  bool initialize(TreePatternNode *InstPatNode, const CodeGenTarget &Target,
+                  MVT::SimpleValueType VT,
+                  ImmPredicateSet &ImmediatePredicates) {
+    if (InstPatNode->isLeaf())
+      return false;
+    
+    if (InstPatNode->getOperator()->getName() == "imm") {
+      Operands.push_back(OpKind::getImm(0));
+      return true;
+    }
+    
+    if (InstPatNode->getOperator()->getName() == "fpimm") {
+      Operands.push_back(OpKind::getFP());
+      return true;
     }
 
     const CodeGenRegisterClass *DstRC = 0;
@@ -72,40 +187,70 @@ struct OperandsSignature {
     for (unsigned i = 0, e = InstPatNode->getNumChildren(); i != e; ++i) {
       TreePatternNode *Op = InstPatNode->getChild(i);
 
+      // Handle imm operands specially.
+      if (!Op->isLeaf() && Op->getOperator()->getName() == "imm") {
+        unsigned PredNo = 0;
+        if (!Op->getPredicateFns().empty()) {
+          TreePredicateFn PredFn = Op->getPredicateFns()[0];
+          // If there is more than one predicate weighing in on this operand
+          // then we don't handle it.  This doesn't typically happen for
+          // immediates anyway.
+          if (Op->getPredicateFns().size() > 1 ||
+              !PredFn.isImmediatePattern())
+            return false;
+          // Ignore any instruction with 'FastIselShouldIgnore', these are
+          // not needed and just bloat the fast instruction selector.  For
+          // example, X86 doesn't need to generate code to match ADD16ri8 since
+          // ADD16ri will do just fine.
+          Record *Rec = PredFn.getOrigPatFragRecord()->getRecord();
+          if (Rec->getValueAsBit("FastIselShouldIgnore"))
+            return false;
+        
+          PredNo = ImmediatePredicates.getIDFor(PredFn)+1;
+        }
+        
+        // Handle unmatched immediate sizes here.
+        //if (Op->getType(0) != VT)
+        //  return false;
+        
+        Operands.push_back(OpKind::getImm(PredNo));
+        continue;
+      }
+
+      
       // For now, filter out any operand with a predicate.
       // For now, filter out any operand with multiple values.
-      if (!Op->getPredicateFns().empty() ||
-          Op->getNumTypes() != 1)
-        return false;
-
-      assert(Op->hasTypeSet(0) && "Type infererence not done?");
-      // For now, all the operands must have the same type.
-      if (Op->getType(0) != VT)
+      if (!Op->getPredicateFns().empty() || Op->getNumTypes() != 1)
         return false;
 
       if (!Op->isLeaf()) {
-        if (Op->getOperator()->getName() == "imm") {
-          Operands.push_back("i");
-          continue;
-        }
-        if (Op->getOperator()->getName() == "fpimm") {
-          Operands.push_back("f");
+         if (Op->getOperator()->getName() == "fpimm") {
+          Operands.push_back(OpKind::getFP());
           continue;
         }
         // For now, ignore other non-leaf nodes.
         return false;
       }
+      
+      assert(Op->hasTypeSet(0) && "Type infererence not done?");
+
+      // For now, all the operands must have the same type (if they aren't
+      // immediates).  Note that this causes us to reject variable sized shifts
+      // on X86.
+      if (Op->getType(0) != VT)
+        return false;
+
       DefInit *OpDI = dynamic_cast<DefInit*>(Op->getLeafValue());
       if (!OpDI)
         return false;
       Record *OpLeafRec = OpDI->getDef();
+      
       // For now, the only other thing we accept is register operands.
-
       const CodeGenRegisterClass *RC = 0;
       if (OpLeafRec->isSubClassOf("RegisterClass"))
         RC = &Target.getRegisterClass(OpLeafRec);
       else if (OpLeafRec->isSubClassOf("Register"))
-        RC = Target.getRegisterClassForRegister(OpLeafRec);
+        RC = Target.getRegBank().getRegClassForRegister(OpLeafRec);
       else
         return false;
 
@@ -120,22 +265,21 @@ struct OperandsSignature {
           return false;
       } else
         DstRC = RC;
-      Operands.push_back("r");
+      Operands.push_back(OpKind::getReg());
     }
     return true;
   }
 
   void PrintParameters(raw_ostream &OS) const {
     for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
-      if (Operands[i] == "r") {
+      if (Operands[i].isReg()) {
         OS << "unsigned Op" << i << ", bool Op" << i << "IsKill";
-      } else if (Operands[i] == "i") {
+      } else if (Operands[i].isImm()) {
         OS << "uint64_t imm" << i;
-      } else if (Operands[i] == "f") {
+      } else if (Operands[i].isFP()) {
         OS << "ConstantFP *f" << i;
       } else {
-        assert("Unknown operand kind!");
-        abort();
+        llvm_unreachable("Unknown operand kind!");
       }
       if (i + 1 != e)
         OS << ", ";
@@ -143,7 +287,7 @@ struct OperandsSignature {
   }
 
   void PrintArguments(raw_ostream &OS,
-                      const std::vector<std::string>& PR) const {
+                      const std::vector<std::string> &PR) const {
     assert(PR.size() == Operands.size());
     bool PrintedArg = false;
     for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
@@ -153,33 +297,31 @@ struct OperandsSignature {
 
       if (PrintedArg)
         OS << ", ";
-      if (Operands[i] == "r") {
+      if (Operands[i].isReg()) {
         OS << "Op" << i << ", Op" << i << "IsKill";
         PrintedArg = true;
-      } else if (Operands[i] == "i") {
+      } else if (Operands[i].isImm()) {
         OS << "imm" << i;
         PrintedArg = true;
-      } else if (Operands[i] == "f") {
+      } else if (Operands[i].isFP()) {
         OS << "f" << i;
         PrintedArg = true;
       } else {
-        assert("Unknown operand kind!");
-        abort();
+        llvm_unreachable("Unknown operand kind!");
       }
     }
   }
 
   void PrintArguments(raw_ostream &OS) const {
     for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
-      if (Operands[i] == "r") {
+      if (Operands[i].isReg()) {
         OS << "Op" << i << ", Op" << i << "IsKill";
-      } else if (Operands[i] == "i") {
+      } else if (Operands[i].isImm()) {
         OS << "imm" << i;
-      } else if (Operands[i] == "f") {
+      } else if (Operands[i].isFP()) {
         OS << "f" << i;
       } else {
-        assert("Unknown operand kind!");
-        abort();
+        llvm_unreachable("Unknown operand kind!");
       }
       if (i + 1 != e)
         OS << ", ";
@@ -187,8 +329,9 @@ struct OperandsSignature {
   }
 
 
-  void PrintManglingSuffix(raw_ostream &OS,
-                           const std::vector<std::string>& PR) const {
+  void PrintManglingSuffix(raw_ostream &OS, const std::vector<std::string> &PR,
+                           ImmPredicateSet &ImmPredicates,
+                           bool StripImmCodes = false) const {
     for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
       if (PR[i] != "")
         // Implicit physical register operand. e.g. Instruction::Mul expect to
@@ -197,14 +340,14 @@ struct OperandsSignature {
         // like a binary instruction except for the very inner FastEmitInst_*
         // call.
         continue;
-      OS << Operands[i];
+      Operands[i].printManglingSuffix(OS, ImmPredicates, StripImmCodes);
     }
   }
 
-  void PrintManglingSuffix(raw_ostream &OS) const {
-    for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
-      OS << Operands[i];
-    }
+  void PrintManglingSuffix(raw_ostream &OS, ImmPredicateSet &ImmPredicates,
+                           bool StripImmCodes = false) const {
+    for (unsigned i = 0, e = Operands.size(); i != e; ++i)
+      Operands[i].printManglingSuffix(OS, ImmPredicates, StripImmCodes);
   }
 };
 
@@ -218,13 +361,17 @@ class FastISelMap {
 
   OperandsOpcodeTypeRetPredMap SimplePatterns;
 
+  std::map<OperandsSignature, std::vector<OperandsSignature> >
+    SignaturesWithConstantForms;
+  
   std::string InstNS;
-
+  ImmPredicateSet ImmediatePredicates;
 public:
   explicit FastISelMap(std::string InstNS);
 
-  void CollectPatterns(CodeGenDAGPatterns &CGP);
-  void PrintFunctionDefinitions(raw_ostream &OS);
+  void collectPatterns(CodeGenDAGPatterns &CGP);
+  void printImmediatePredicates(raw_ostream &OS);
+  void printFunctionDefinitions(raw_ostream &OS);
 };
 
 }
@@ -244,7 +391,34 @@ FastISelMap::FastISelMap(std::string instns)
   : InstNS(instns) {
 }
 
-void FastISelMap::CollectPatterns(CodeGenDAGPatterns &CGP) {
+static std::string PhyRegForNode(TreePatternNode *Op,
+                                 const CodeGenTarget &Target) {
+  std::string PhysReg;
+
+  if (!Op->isLeaf())
+    return PhysReg;
+
+  DefInit *OpDI = dynamic_cast<DefInit*>(Op->getLeafValue());
+  Record *OpLeafRec = OpDI->getDef();
+  if (!OpLeafRec->isSubClassOf("Register"))
+    return PhysReg;
+
+  PhysReg += static_cast<StringInit*>(OpLeafRec->getValue( \
+             "Namespace")->getValue())->getValue();
+  PhysReg += "::";
+
+  std::vector<CodeGenRegister> Regs = Target.getRegisters();
+  for (unsigned i = 0; i < Regs.size(); ++i) {
+    if (Regs[i].TheDef == OpLeafRec) {
+      PhysReg += Regs[i].getName();
+      break;
+    }
+  }
+
+  return PhysReg;
+}
+
+void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
   const CodeGenTarget &Target = CGP.getTargetInfo();
 
   // Determine the target's namespace name.
@@ -264,7 +438,7 @@ void FastISelMap::CollectPatterns(CodeGenDAGPatterns &CGP) {
     if (!Op->isSubClassOf("Instruction"))
       continue;
     CodeGenInstruction &II = CGP.getTargetInfo().getInstruction(Op);
-    if (II.Operands.size() == 0)
+    if (II.Operands.empty())
       continue;
 
     // For now, ignore multi-instruction patterns.
@@ -322,54 +496,45 @@ void FastISelMap::CollectPatterns(CodeGenDAGPatterns &CGP) {
       VT = InstPatNode->getChild(0)->getType(0);
     }
 
-    // For now, filter out instructions which just set a register to
-    // an Operand or an immediate, like MOV32ri.
-    if (InstPatOp->isSubClassOf("Operand"))
-      continue;
-
     // For now, filter out any instructions with predicates.
     if (!InstPatNode->getPredicateFns().empty())
       continue;
 
     // Check all the operands.
     OperandsSignature Operands;
-    if (!Operands.initialize(InstPatNode, Target, VT))
+    if (!Operands.initialize(InstPatNode, Target, VT, ImmediatePredicates))
       continue;
 
     std::vector<std::string>* PhysRegInputs = new std::vector<std::string>();
-    if (!InstPatNode->isLeaf() &&
-        (InstPatNode->getOperator()->getName() == "imm" ||
-         InstPatNode->getOperator()->getName() == "fpimmm"))
+    if (InstPatNode->getOperator()->getName() == "imm" ||
+        InstPatNode->getOperator()->getName() == "fpimmm")
       PhysRegInputs->push_back("");
-    else if (!InstPatNode->isLeaf()) {
+    else {
+      // Compute the PhysRegs used by the given pattern, and check that
+      // the mapping from the src to dst patterns is simple.
+      bool FoundNonSimplePattern = false;
+      unsigned DstIndex = 0;
       for (unsigned i = 0, e = InstPatNode->getNumChildren(); i != e; ++i) {
-        TreePatternNode *Op = InstPatNode->getChild(i);
-        if (!Op->isLeaf()) {
-          PhysRegInputs->push_back("");
-          continue;
-        }
-
-        DefInit *OpDI = dynamic_cast<DefInit*>(Op->getLeafValue());
-        Record *OpLeafRec = OpDI->getDef();
-        std::string PhysReg;
-        if (OpLeafRec->isSubClassOf("Register")) {
-          PhysReg += static_cast<StringInit*>(OpLeafRec->getValue( \
-                     "Namespace")->getValue())->getValue();
-          PhysReg += "::";
-
-          std::vector<CodeGenRegister> Regs = Target.getRegisters();
-          for (unsigned i = 0; i < Regs.size(); ++i) {
-            if (Regs[i].TheDef == OpLeafRec) {
-              PhysReg += Regs[i].getName();
-              break;
-            }
+        std::string PhysReg = PhyRegForNode(InstPatNode->getChild(i), Target);
+        if (PhysReg.empty()) {
+          if (DstIndex >= Dst->getNumChildren() ||
+              Dst->getChild(DstIndex)->getName() !=
+              InstPatNode->getChild(i)->getName()) {
+            FoundNonSimplePattern = true;
+            break;
           }
+          ++DstIndex;
         }
 
         PhysRegInputs->push_back(PhysReg);
       }
-    } else
-      PhysRegInputs->push_back("");
+
+      if (Op->getName() != "EXTRACT_SUBREG" && DstIndex < Dst->getNumChildren())
+        FoundNonSimplePattern = true;
+
+      if (FoundNonSimplePattern)
+        continue;
+    }
 
     // Get the predicate that guards this pattern.
     std::string PredicateCheck = Pattern.getPredicateCheck();
@@ -381,15 +546,39 @@ void FastISelMap::CollectPatterns(CodeGenDAGPatterns &CGP) {
       SubRegNo,
       PhysRegInputs
     };
-    if (SimplePatterns[Operands][OpcodeName][VT][RetVT]
-            .count(PredicateCheck))
-      throw TGError(Pattern.getSrcRecord()->getLoc(), "Duplicate record!");
+    
+    if (SimplePatterns[Operands][OpcodeName][VT][RetVT].count(PredicateCheck))
+      throw TGError(Pattern.getSrcRecord()->getLoc(),
+                    "Duplicate record in FastISel table!");
 
     SimplePatterns[Operands][OpcodeName][VT][RetVT][PredicateCheck] = Memo;
+    
+    // If any of the operands were immediates with predicates on them, strip
+    // them down to a signature that doesn't have predicates so that we can
+    // associate them with the stripped predicate version.
+    if (Operands.hasAnyImmediateCodes()) {
+      SignaturesWithConstantForms[Operands.getWithoutImmCodes()]
+        .push_back(Operands);
+    }
   }
 }
 
-void FastISelMap::PrintFunctionDefinitions(raw_ostream &OS) {
+void FastISelMap::printImmediatePredicates(raw_ostream &OS) {
+  if (ImmediatePredicates.begin() == ImmediatePredicates.end())
+    return;
+  
+  OS << "\n// FastEmit Immediate Predicate functions.\n";
+  for (ImmPredicateSet::iterator I = ImmediatePredicates.begin(),
+       E = ImmediatePredicates.end(); I != E; ++I) {
+    OS << "static bool " << I->getFnName() << "(int64_t Imm) {\n";
+    OS << I->getImmediatePredicateCode() << "\n}\n";
+  }
+  
+  OS << "\n\n";
+}
+
+
+void FastISelMap::printFunctionDefinitions(raw_ostream &OS) {
   // Now emit code for all the patterns that we collected.
   for (OperandsOpcodeTypeRetPredMap::const_iterator OI = SimplePatterns.begin(),
        OE = SimplePatterns.end(); OI != OE; ++OI) {
@@ -420,7 +609,7 @@ void FastISelMap::PrintFunctionDefinitions(raw_ostream &OS) {
                << getLegalCName(Opcode)
                << "_" << getLegalCName(getName(VT))
                << "_" << getLegalCName(getName(RetVT)) << "_";
-            Operands.PrintManglingSuffix(OS);
+            Operands.PrintManglingSuffix(OS, ImmediatePredicates);
             OS << "(";
             Operands.PrintParameters(OS);
             OS << ") {\n";
@@ -451,7 +640,8 @@ void FastISelMap::PrintFunctionDefinitions(raw_ostream &OS) {
 
               OS << "  return FastEmitInst_";
               if (Memo.SubRegNo.empty()) {
-                Operands.PrintManglingSuffix(OS, *Memo.PhysRegs);
+                Operands.PrintManglingSuffix(OS, *Memo.PhysRegs,
+                                             ImmediatePredicates, true);
                 OS << "(" << InstNS << Memo.Name << ", ";
                 OS << InstNS << Memo.RC->getName() << "RegisterClass";
                 if (!Operands.empty())
@@ -460,9 +650,7 @@ void FastISelMap::PrintFunctionDefinitions(raw_ostream &OS) {
                 OS << ");\n";
               } else {
                 OS << "extractsubreg(" << getName(RetVT);
-                OS << ", Op0, Op0IsKill, ";
-                OS << Memo.SubRegNo;
-                OS << ");\n";
+                OS << ", Op0, Op0IsKill, " << Memo.SubRegNo << ");\n";
               }
 
               if (HasPred)
@@ -480,7 +668,7 @@ void FastISelMap::PrintFunctionDefinitions(raw_ostream &OS) {
           OS << "unsigned FastEmit_"
              << getLegalCName(Opcode) << "_"
              << getLegalCName(getName(VT)) << "_";
-          Operands.PrintManglingSuffix(OS);
+          Operands.PrintManglingSuffix(OS, ImmediatePredicates);
           OS << "(MVT RetVT";
           if (!Operands.empty())
             OS << ", ";
@@ -492,7 +680,7 @@ void FastISelMap::PrintFunctionDefinitions(raw_ostream &OS) {
             OS << "  case " << getName(RetVT) << ": return FastEmit_"
                << getLegalCName(Opcode) << "_" << getLegalCName(getName(VT))
                << "_" << getLegalCName(getName(RetVT)) << "_";
-            Operands.PrintManglingSuffix(OS);
+            Operands.PrintManglingSuffix(OS, ImmediatePredicates);
             OS << "(";
             Operands.PrintArguments(OS);
             OS << ");\n";
@@ -504,7 +692,7 @@ void FastISelMap::PrintFunctionDefinitions(raw_ostream &OS) {
           OS << "unsigned FastEmit_"
              << getLegalCName(Opcode) << "_"
              << getLegalCName(getName(VT)) << "_";
-          Operands.PrintManglingSuffix(OS);
+          Operands.PrintManglingSuffix(OS, ImmediatePredicates);
           OS << "(MVT RetVT";
           if (!Operands.empty())
             OS << ", ";
@@ -544,7 +732,8 @@ void FastISelMap::PrintFunctionDefinitions(raw_ostream &OS) {
             OS << "  return FastEmitInst_";
 
             if (Memo.SubRegNo.empty()) {
-              Operands.PrintManglingSuffix(OS, *Memo.PhysRegs);
+              Operands.PrintManglingSuffix(OS, *Memo.PhysRegs,
+                                           ImmediatePredicates, true);
               OS << "(" << InstNS << Memo.Name << ", ";
               OS << InstNS << Memo.RC->getName() << "RegisterClass";
               if (!Operands.empty())
@@ -572,7 +761,7 @@ void FastISelMap::PrintFunctionDefinitions(raw_ostream &OS) {
       // Emit one function for the opcode that demultiplexes based on the type.
       OS << "unsigned FastEmit_"
          << getLegalCName(Opcode) << "_";
-      Operands.PrintManglingSuffix(OS);
+      Operands.PrintManglingSuffix(OS, ImmediatePredicates);
       OS << "(MVT VT, MVT RetVT";
       if (!Operands.empty())
         OS << ", ";
@@ -585,7 +774,7 @@ void FastISelMap::PrintFunctionDefinitions(raw_ostream &OS) {
         std::string TypeName = getName(VT);
         OS << "  case " << TypeName << ": return FastEmit_"
            << getLegalCName(Opcode) << "_" << getLegalCName(TypeName) << "_";
-        Operands.PrintManglingSuffix(OS);
+        Operands.PrintManglingSuffix(OS, ImmediatePredicates);
         OS << "(RetVT";
         if (!Operands.empty())
           OS << ", ";
@@ -604,12 +793,44 @@ void FastISelMap::PrintFunctionDefinitions(raw_ostream &OS) {
     // Emit one function for the operand signature that demultiplexes based
     // on opcode and type.
     OS << "unsigned FastEmit_";
-    Operands.PrintManglingSuffix(OS);
+    Operands.PrintManglingSuffix(OS, ImmediatePredicates);
     OS << "(MVT VT, MVT RetVT, unsigned Opcode";
     if (!Operands.empty())
       OS << ", ";
     Operands.PrintParameters(OS);
     OS << ") {\n";
+    
+    // If there are any forms of this signature available that operand on
+    // constrained forms of the immediate (e.g. 32-bit sext immediate in a
+    // 64-bit operand), check them first.
+    
+    std::map<OperandsSignature, std::vector<OperandsSignature> >::iterator MI
+      = SignaturesWithConstantForms.find(Operands);
+    if (MI != SignaturesWithConstantForms.end()) {
+      // Unique any duplicates out of the list.
+      std::sort(MI->second.begin(), MI->second.end());
+      MI->second.erase(std::unique(MI->second.begin(), MI->second.end()),
+                       MI->second.end());
+      
+      // Check each in order it was seen.  It would be nice to have a good
+      // relative ordering between them, but we're not going for optimality
+      // here.
+      for (unsigned i = 0, e = MI->second.size(); i != e; ++i) {
+        OS << "  if (";
+        MI->second[i].emitImmediatePredicate(OS, ImmediatePredicates);
+        OS << ")\n    if (unsigned Reg = FastEmit_";
+        MI->second[i].PrintManglingSuffix(OS, ImmediatePredicates);
+        OS << "(VT, RetVT, Opcode";
+        if (!MI->second[i].empty())
+          OS << ", ";
+        MI->second[i].PrintArguments(OS);
+        OS << "))\n      return Reg;\n\n";
+      }
+      
+      // Done with this, remove it.
+      SignaturesWithConstantForms.erase(MI);
+    }
+    
     OS << "  switch (Opcode) {\n";
     for (OpcodeTypeRetPredMap::const_iterator I = OTM.begin(), E = OTM.end();
          I != E; ++I) {
@@ -617,7 +838,7 @@ void FastISelMap::PrintFunctionDefinitions(raw_ostream &OS) {
 
       OS << "  case " << Opcode << ": return FastEmit_"
          << getLegalCName(Opcode) << "_";
-      Operands.PrintManglingSuffix(OS);
+      Operands.PrintManglingSuffix(OS, ImmediatePredicates);
       OS << "(VT, RetVT";
       if (!Operands.empty())
         OS << ", ";
@@ -629,6 +850,8 @@ void FastISelMap::PrintFunctionDefinitions(raw_ostream &OS) {
     OS << "}\n";
     OS << "\n";
   }
+  
+  // TODO: SignaturesWithConstantForms should be empty here.
 }
 
 void FastISelEmitter::run(raw_ostream &OS) {
@@ -642,12 +865,12 @@ void FastISelEmitter::run(raw_ostream &OS) {
                        Target.getName() + " target", OS);
 
   FastISelMap F(InstNS);
-  F.CollectPatterns(CGP);
-  F.PrintFunctionDefinitions(OS);
+  F.collectPatterns(CGP);
+  F.printImmediatePredicates(OS);
+  F.printFunctionDefinitions(OS);
 }
 
 FastISelEmitter::FastISelEmitter(RecordKeeper &R)
-  : Records(R),
-    CGP(R) {
+  : Records(R), CGP(R) {
 }
 
diff --git a/utils/TableGen/FixedLenDecoderEmitter.cpp b/utils/TableGen/FixedLenDecoderEmitter.cpp
index bbcecab..9312fe8 100644
--- a/utils/TableGen/FixedLenDecoderEmitter.cpp
+++ b/utils/TableGen/FixedLenDecoderEmitter.cpp
@@ -438,7 +438,7 @@ void Filter::recurse() {
     for (bitIndex = 0; bitIndex < NumBits; bitIndex++)
       BitValueArray[StartBit + bitIndex] = BIT_UNSET;
 
-    // Delegates to an inferior filter chooser for futher processing on this
+    // Delegates to an inferior filter chooser for further processing on this
     // group of instructions whose segment values are variable.
     FilterChooserMap.insert(std::pair<unsigned, FilterChooser*>(
                               (unsigned)-1,
@@ -471,7 +471,7 @@ void Filter::recurse() {
         BitValueArray[StartBit + bitIndex] = BIT_FALSE;
     }
 
-    // Delegates to an inferior filter chooser for futher processing on this
+    // Delegates to an inferior filter chooser for further processing on this
     // category of instructions.
     FilterChooserMap.insert(std::pair<unsigned, FilterChooser*>(
                               mapIterator->first,
@@ -611,7 +611,8 @@ void FilterChooser::emitTop(raw_ostream &o, unsigned Indentation) {
   o << '\n';
 
   o.indent(Indentation) <<
-    "static bool decodeInstruction(MCInst &MI, field_t insn) {\n";
+    "static bool decodeInstruction(MCInst &MI, field_t insn, "
+    "uint64_t Address, const void *Decoder) {\n";
   o.indent(Indentation) << "  unsigned tmp = 0;\n";
 
   ++Indentation; ++Indentation;
@@ -795,7 +796,8 @@ bool FilterChooser::emitSingletonDecoder(raw_ostream &o, unsigned &Indentation,
          I = InsnOperands.begin(), E = InsnOperands.end(); I != E; ++I) {
       // If a custom instruction decoder was specified, use that.
       if (I->FieldBase == ~0U && I->FieldLength == ~0U) {
-        o.indent(Indentation) << "  " << I->Decoder << "(MI, insn);\n";
+        o.indent(Indentation) << "  " << I->Decoder
+                              << "(MI, insn, Address, Decoder);\n";
         break;
       }
 
@@ -803,7 +805,8 @@ bool FilterChooser::emitSingletonDecoder(raw_ostream &o, unsigned &Indentation,
         << "  tmp = fieldFromInstruction(insn, " << I->FieldBase
         << ", " << I->FieldLength << ");\n";
       if (I->Decoder != "") {
-        o.indent(Indentation) << "  " << I->Decoder << "(MI, tmp);\n";
+        o.indent(Indentation) << "  " << I->Decoder
+                              << "(MI, tmp, Address, Decoder);\n";
       } else {
         o.indent(Indentation)
           << "  MI.addOperand(MCOperand::CreateImm(tmp));\n";
@@ -846,7 +849,8 @@ bool FilterChooser::emitSingletonDecoder(raw_ostream &o, unsigned &Indentation,
        I = InsnOperands.begin(), E = InsnOperands.end(); I != E; ++I) {
     // If a custom instruction decoder was specified, use that.
     if (I->FieldBase == ~0U && I->FieldLength == ~0U) {
-      o.indent(Indentation) << "  " << I->Decoder << "(MI, insn);\n";
+      o.indent(Indentation) << "  " << I->Decoder
+                            << "(MI, insn, Address, Decoder);\n";
       break;
     }
 
@@ -854,7 +858,8 @@ bool FilterChooser::emitSingletonDecoder(raw_ostream &o, unsigned &Indentation,
       << "  tmp = fieldFromInstruction(insn, " << I->FieldBase
       << ", " << I->FieldLength << ");\n";
     if (I->Decoder != "") {
-      o.indent(Indentation) << "  " << I->Decoder << "(MI, tmp);\n";
+      o.indent(Indentation) << "  " << I->Decoder
+                            << "(MI, tmp, Address, Decoder);\n";
     } else {
       o.indent(Indentation)
         << "  MI.addOperand(MCOperand::CreateImm(tmp));\n";
diff --git a/utils/TableGen/InstrInfoEmitter.cpp b/utils/TableGen/InstrInfoEmitter.cpp
index 67cce0e..fc544ee 100644
--- a/utils/TableGen/InstrInfoEmitter.cpp
+++ b/utils/TableGen/InstrInfoEmitter.cpp
@@ -166,13 +166,13 @@ void InstrInfoEmitter::DetectRegisterClassBarriers(std::vector<Record*> &Defs,
 
   for (unsigned i = 0, e = RCs.size(); i != e; ++i) {
     const CodeGenRegisterClass &RC = RCs[i];
-    unsigned NumRegs = RC.Elements.size();
-    if (NumRegs > NumDefs)
+    ArrayRef<Record*> Order = RC.getOrder();
+    if (Order.size() > NumDefs)
       continue; // Can't possibly clobber this RC.
 
     bool Clobber = true;
-    for (unsigned j = 0; j < NumRegs; ++j) {
-      Record *Reg = RC.Elements[j];
+    for (unsigned j = 0; j < Order.size(); ++j) {
+      Record *Reg = Order[j];
       if (!DefSet.count(Reg)) {
         Clobber = false;
         break;
diff --git a/utils/TableGen/IntrinsicEmitter.cpp b/utils/TableGen/IntrinsicEmitter.cpp
index 08f6728..39eb3bd 100644
--- a/utils/TableGen/IntrinsicEmitter.cpp
+++ b/utils/TableGen/IntrinsicEmitter.cpp
@@ -465,6 +465,46 @@ void IntrinsicEmitter::EmitGenerator(const std::vector<CodeGenIntrinsic> &Ints,
   OS << "#endif\n\n";
 }
 
+namespace {
+  enum ModRefKind {
+    MRK_none,
+    MRK_readonly,
+    MRK_readnone
+  };
+
+  ModRefKind getModRefKind(const CodeGenIntrinsic &intrinsic) {
+    switch (intrinsic.ModRef) {
+    case CodeGenIntrinsic::NoMem:
+      return MRK_readnone;
+    case CodeGenIntrinsic::ReadArgMem:
+    case CodeGenIntrinsic::ReadMem:
+      return MRK_readonly;
+    case CodeGenIntrinsic::ReadWriteArgMem:
+    case CodeGenIntrinsic::ReadWriteMem:
+      return MRK_none;
+    }
+    assert(0 && "bad mod-ref kind");
+    return MRK_none;
+  }
+
+  struct AttributeComparator {
+    bool operator()(const CodeGenIntrinsic *L, const CodeGenIntrinsic *R) const {
+      // Sort throwing intrinsics after non-throwing intrinsics.
+      if (L->canThrow != R->canThrow)
+        return R->canThrow;
+
+      // Try to order by readonly/readnone attribute.
+      ModRefKind LK = getModRefKind(*L);
+      ModRefKind RK = getModRefKind(*R);
+      if (LK != RK) return (LK > RK);
+
+      // Order by argument attributes.
+      // This is reliable because each side is already sorted internally.
+      return (L->ArgumentAttributes < R->ArgumentAttributes);
+    }
+  };
+}
+
 /// EmitAttributes - This emits the Intrinsic::getAttributes method.
 void IntrinsicEmitter::
 EmitAttributes(const std::vector<CodeGenIntrinsic> &Ints, raw_ostream &OS) {
@@ -472,84 +512,96 @@ EmitAttributes(const std::vector<CodeGenIntrinsic> &Ints, raw_ostream &OS) {
   OS << "#ifdef GET_INTRINSIC_ATTRIBUTES\n";
   if (TargetOnly)
     OS << "static AttrListPtr getAttributes(" << TargetPrefix 
-       << "Intrinsic::ID id) {";
+       << "Intrinsic::ID id) {\n";
   else
-    OS << "AttrListPtr Intrinsic::getAttributes(ID id) {";
-  OS << "  // No intrinsic can throw exceptions.\n";
-  OS << "  Attributes Attr = Attribute::NoUnwind;\n";
-  OS << "  switch (id) {\n";
-  OS << "  default: break;\n";
-  unsigned MaxArgAttrs = 0;
+    OS << "AttrListPtr Intrinsic::getAttributes(ID id) {\n";
+
+  // Compute the maximum number of attribute arguments.
+  std::vector<const CodeGenIntrinsic*> sortedIntrinsics(Ints.size());
+  unsigned maxArgAttrs = 0;
   for (unsigned i = 0, e = Ints.size(); i != e; ++i) {
-    MaxArgAttrs =
-      std::max(MaxArgAttrs, unsigned(Ints[i].ArgumentAttributes.size()));
-    switch (Ints[i].ModRef) {
-    default: break;
-    case CodeGenIntrinsic::NoMem:
-      OS << "  case " << TargetPrefix << "Intrinsic::" << Ints[i].EnumName 
-         << ":\n";
-      break;
-    }
+    const CodeGenIntrinsic &intrinsic = Ints[i];
+    sortedIntrinsics[i] = &intrinsic;
+    maxArgAttrs =
+      std::max(maxArgAttrs, unsigned(intrinsic.ArgumentAttributes.size()));
   }
-  OS << "    Attr |= Attribute::ReadNone; // These do not access memory.\n";
-  OS << "    break;\n";
-  for (unsigned i = 0, e = Ints.size(); i != e; ++i) {
-    switch (Ints[i].ModRef) {
-    default: break;
-    case CodeGenIntrinsic::ReadArgMem:
-    case CodeGenIntrinsic::ReadMem:
-      OS << "  case " << TargetPrefix << "Intrinsic::" << Ints[i].EnumName 
-         << ":\n";
-      break;
-    }
-  }
-  OS << "    Attr |= Attribute::ReadOnly; // These do not write memory.\n";
-  OS << "    break;\n";
-  OS << "  }\n";
-  OS << "  AttributeWithIndex AWI[" << MaxArgAttrs+1 << "];\n";
+
+  // Emit an array of AttributeWithIndex.  Most intrinsics will have
+  // at least one entry, for the function itself (index ~1), which is
+  // usually nounwind.
+  OS << "  AttributeWithIndex AWI[" << maxArgAttrs+1 << "];\n";
   OS << "  unsigned NumAttrs = 0;\n";
   OS << "  switch (id) {\n";
-  OS << "  default: break;\n";
-  
-  // Add argument attributes for any intrinsics that have them.
-  for (unsigned i = 0, e = Ints.size(); i != e; ++i) {
-    if (Ints[i].ArgumentAttributes.empty()) continue;
-    
-    OS << "  case " << TargetPrefix << "Intrinsic::" << Ints[i].EnumName 
-       << ":\n";
+  OS << "    default: break;\n";
+
+  AttributeComparator precedes;
 
-    std::vector<std::pair<unsigned, CodeGenIntrinsic::ArgAttribute> > ArgAttrs =
-      Ints[i].ArgumentAttributes;
-    // Sort by argument index.
-    std::sort(ArgAttrs.begin(), ArgAttrs.end());
+  std::stable_sort(sortedIntrinsics.begin(), sortedIntrinsics.end(), precedes);
+
+  for (unsigned i = 0, e = sortedIntrinsics.size(); i != e; ++i) {
+    const CodeGenIntrinsic &intrinsic = *sortedIntrinsics[i];
+    OS << "  case " << TargetPrefix << "Intrinsic::"
+       << intrinsic.EnumName << ":\n";
+
+    // Fill out the case if this is the last case for this range of
+    // intrinsics.
+    if (i + 1 != e && !precedes(&intrinsic, sortedIntrinsics[i + 1]))
+      continue;
 
-    unsigned NumArgsWithAttrs = 0;
+    // Keep track of the number of attributes we're writing out.
+    unsigned numAttrs = 0;
 
-    while (!ArgAttrs.empty()) {
-      unsigned ArgNo = ArgAttrs[0].first;
+    // The argument attributes are alreadys sorted by argument index.
+    for (unsigned ai = 0, ae = intrinsic.ArgumentAttributes.size(); ai != ae;) {
+      unsigned argNo = intrinsic.ArgumentAttributes[ai].first;
       
-      OS << "    AWI[" << NumArgsWithAttrs++ << "] = AttributeWithIndex::get("
-         << ArgNo+1 << ", 0";
+      OS << "    AWI[" << numAttrs++ << "] = AttributeWithIndex::get("
+         << argNo+1 << ", ";
 
-      while (!ArgAttrs.empty() && ArgAttrs[0].first == ArgNo) {
-        switch (ArgAttrs[0].second) {
-        default: assert(0 && "Unknown arg attribute");
+      bool moreThanOne = false;
+
+      do {
+        if (moreThanOne) OS << '|';
+
+        switch (intrinsic.ArgumentAttributes[ai].second) {
         case CodeGenIntrinsic::NoCapture:
-          OS << "|Attribute::NoCapture";
+          OS << "Attribute::NoCapture";
           break;
         }
-        ArgAttrs.erase(ArgAttrs.begin());
+
+        ++ai;
+        moreThanOne = true;
+      } while (ai != ae && intrinsic.ArgumentAttributes[ai].first == argNo);
+
+      OS << ");\n";
+    }
+
+    ModRefKind modRef = getModRefKind(intrinsic);
+
+    if (!intrinsic.canThrow || modRef) {
+      OS << "    AWI[" << numAttrs++ << "] = AttributeWithIndex::get(~0, ";
+      if (!intrinsic.canThrow) {
+        OS << "Attribute::NoUnwind";
+        if (modRef) OS << '|';
+      }
+      switch (modRef) {
+      case MRK_none: break;
+      case MRK_readonly: OS << "Attribute::ReadOnly"; break;
+      case MRK_readnone: OS << "Attribute::ReadNone"; break;
       }
       OS << ");\n";
     }
-    
-    OS << "    NumAttrs = " << NumArgsWithAttrs << ";\n";
-    OS << "    break;\n";
+
+    if (numAttrs) {
+      OS << "    NumAttrs = " << numAttrs << ";\n";
+      OS << "    break;\n";
+    } else {
+      OS << "    return AttrListPtr();\n";
+    }
   }
   
   OS << "  }\n";
-  OS << "  AWI[NumAttrs] = AttributeWithIndex::get(~0, Attr);\n";
-  OS << "  return AttrListPtr::get(AWI, NumAttrs+1);\n";
+  OS << "  return AttrListPtr::get(AWI, NumAttrs);\n";
   OS << "}\n";
   OS << "#endif // GET_INTRINSIC_ATTRIBUTES\n\n";
 }
diff --git a/utils/TableGen/LLVMCConfigurationEmitter.cpp b/utils/TableGen/LLVMCConfigurationEmitter.cpp
index c40a39d..090faf5 100644
--- a/utils/TableGen/LLVMCConfigurationEmitter.cpp
+++ b/utils/TableGen/LLVMCConfigurationEmitter.cpp
@@ -74,6 +74,25 @@ int InitPtrToInt(const Init* ptr) {
   return val.getValue();
 }
 
+bool InitPtrToBool(const Init* ptr) {
+  bool ret = false;
+  const DefInit& val = dynamic_cast<const DefInit&>(*ptr);
+  const std::string& str = val.getAsString();
+
+  if (str == "true") {
+    ret = true;
+  }
+  else if (str == "false") {
+    ret = false;
+  }
+  else {
+    throw "Incorrect boolean value: '" + str +
+      "': must be either 'true' or 'false'";
+  }
+
+  return ret;
+}
+
 const std::string& InitPtrToString(const Init* ptr) {
   const StringInit& val = dynamic_cast<const StringInit&>(*ptr);
   return val.getValue();
@@ -95,13 +114,7 @@ const std::string GetOperatorName(const DagInit& D) {
 
 /// CheckBooleanConstant - Check that the provided value is a boolean constant.
 void CheckBooleanConstant(const Init* I) {
-  const DefInit& val = dynamic_cast<const DefInit&>(*I);
-  const std::string& str = val.getAsString();
-
-  if (str != "true" && str != "false") {
-    throw "Incorrect boolean value: '" + str +
-      "': must be either 'true' or 'false'";
-  }
+  InitPtrToBool(I);
 }
 
 // CheckNumberOfArguments - Ensure that the number of args in d is
@@ -935,8 +948,22 @@ private:
   }
 
   void onJoin (const DagInit& d) {
-    CheckNumberOfArguments(d, 0);
-    toolDesc_.setJoin();
+    bool isReallyJoin = false;
+
+    if (d.getNumArgs() == 0) {
+      isReallyJoin = true;
+    }
+    else {
+      Init* I = d.getArg(0);
+      isReallyJoin = InitPtrToBool(I);
+    }
+
+    // Is this *really* a join tool? We allow (join false) for generating two
+    // tool descriptions from a single generic one.
+    // TOFIX: come up with a cleaner solution.
+    if (isReallyJoin) {
+      toolDesc_.setJoin();
+    }
   }
 
   void onOutLanguage (const DagInit& d) {
@@ -3028,6 +3055,8 @@ void CheckDriverData(DriverData& Data) {
   FilterNotInGraph(Data.Edges, Data.ToolDescs);
 
   // Typecheck the compilation graph.
+  // TODO: use a genuine graph representation instead of a vector and check for
+  // multiple edges.
   TypecheckGraph(Data.Edges, Data.ToolDescs);
 
   // Check that there are no options without side effects (specified
diff --git a/utils/TableGen/NeonEmitter.cpp b/utils/TableGen/NeonEmitter.cpp
index d522c79..23fdbde 100644
--- a/utils/TableGen/NeonEmitter.cpp
+++ b/utils/TableGen/NeonEmitter.cpp
@@ -462,9 +462,34 @@ static std::string MangleName(const std::string &name, StringRef typestr,
   return s;
 }
 
+/// UseMacro - Examine the prototype string to determine if the intrinsic
+/// should be defined as a preprocessor macro instead of an inline function.
+static bool UseMacro(const std::string &proto) {
+  // If this builtin takes an immediate argument, we need to #define it rather
+  // than use a standard declaration, so that SemaChecking can range check
+  // the immediate passed by the user.
+  if (proto.find('i') != std::string::npos)
+    return true;
+
+  // Pointer arguments need to use macros to avoid hiding aligned attributes
+  // from the pointer type.
+  if (proto.find('p') != std::string::npos ||
+      proto.find('c') != std::string::npos)
+    return true;
+
+  return false;
+}
+
+/// MacroArgUsedDirectly - Return true if argument i for an intrinsic that is
+/// defined as a macro should be accessed directly instead of being first
+/// assigned to a local temporary.
+static bool MacroArgUsedDirectly(const std::string &proto, unsigned i) {
+  return (proto[i] == 'i' || proto[i] == 'p' || proto[i] == 'c');
+}
+
 // Generate the string "(argtype a, argtype b, ...)"
 static std::string GenArgs(const std::string &proto, StringRef typestr) {
-  bool define = proto.find('i') != std::string::npos;
+  bool define = UseMacro(proto);
   char arg = 'a';
 
   std::string s;
@@ -472,10 +497,10 @@ static std::string GenArgs(const std::string &proto, StringRef typestr) {
 
   for (unsigned i = 1, e = proto.size(); i != e; ++i, ++arg) {
     if (define) {
-      // Immediate macro arguments are used directly instead of being assigned
+      // Some macro arguments are used directly instead of being assigned
       // to local temporaries; prepend an underscore prefix to make their
       // names consistent with the local temporaries.
-      if (proto[i] == 'i')
+      if (MacroArgUsedDirectly(proto, i))
         s += "__";
     } else {
       s += TypeString(proto[i], typestr) + " __";
@@ -494,11 +519,28 @@ static std::string GenArgs(const std::string &proto, StringRef typestr) {
 static std::string GenMacroLocals(const std::string &proto, StringRef typestr) {
   char arg = 'a';
   std::string s;
+  bool generatedLocal = false;
 
   for (unsigned i = 1, e = proto.size(); i != e; ++i, ++arg) {
     // Do not create a temporary for an immediate argument.
     // That would defeat the whole point of using a macro!
-    if (proto[i] == 'i') continue;
+    if (proto[i] == 'i')
+      continue;
+    generatedLocal = true;
+
+    // For other (non-immediate) arguments that are used directly, a local
+    // temporary is still needed to get the correct type checking, even though
+    // that temporary is not used for anything.
+    if (MacroArgUsedDirectly(proto, i)) {
+      s += TypeString(proto[i], typestr) + " __";
+      s.push_back(arg);
+      s += "_ = (__";
+      s.push_back(arg);
+      s += "); (void)__";
+      s.push_back(arg);
+      s += "_; ";
+      continue;
+    }
 
     s += TypeString(proto[i], typestr) + " __";
     s.push_back(arg);
@@ -507,7 +549,8 @@ static std::string GenMacroLocals(const std::string &proto, StringRef typestr) {
     s += "); ";
   }
 
-  s += "\\\n  ";
+  if (generatedLocal)
+    s += "\\\n  ";
   return s;
 }
 
@@ -568,11 +611,7 @@ static std::string GenOpString(OpKind op, const std::string &proto,
                                StringRef typestr) {
   bool quad;
   unsigned nElts = GetNumElements(typestr, quad);
-
-  // If this builtin takes an immediate argument, we need to #define it rather
-  // than use a standard declaration, so that SemaChecking can range check
-  // the immediate passed by the user.
-  bool define = proto.find('i') != std::string::npos;
+  bool define = UseMacro(proto);
 
   std::string ts = TypeString(proto[0], typestr);
   std::string s;
@@ -858,10 +897,7 @@ static std::string GenBuiltin(const std::string &name, const std::string &proto,
   // sret-like argument.
   bool sret = (proto[0] >= '2' && proto[0] <= '4');
 
-  // If this builtin takes an immediate argument, we need to #define it rather
-  // than use a standard declaration, so that SemaChecking can range check
-  // the immediate passed by the user.
-  bool define = proto.find('i') != std::string::npos;
+  bool define = UseMacro(proto);
 
   // Check if the prototype has a scalar operand with the type of the vector
   // elements.  If not, bitcasting the args will take care of arg checking.
@@ -999,7 +1035,7 @@ static std::string GenIntrinsic(const std::string &name,
                                 StringRef outTypeStr, StringRef inTypeStr,
                                 OpKind kind, ClassKind classKind) {
   assert(!proto.empty() && "");
-  bool define = proto.find('i') != std::string::npos;
+  bool define = UseMacro(proto);
   std::string s;
 
   // static always inline + return type
@@ -1362,9 +1398,14 @@ void NeonEmitter::runHeader(raw_ostream &OS) {
     for (unsigned ti = 0, te = TypeVec.size(); ti != te; ++ti) {
       std::string namestr, shiftstr, rangestr;
 
-      // Builtins which are overloaded by type will need to have their upper
-      // bound computed at Sema time based on the type constant.
-      if (Proto.find('s') == std::string::npos) {
+      if (R->getValueAsBit("isVCVT_N")) {
+        // VCVT between floating- and fixed-point values takes an immediate
+        // in the range 1 to 32.
+        ck = ClassB;
+        rangestr = "l = 1; u = 31"; // upper bound = l + u
+      } else if (Proto.find('s') == std::string::npos) {
+        // Builtins which are overloaded by type will need to have their upper
+        // bound computed at Sema time based on the type constant.
         ck = ClassB;
         if (R->getValueAsBit("isShift")) {
           shiftstr = ", true";
diff --git a/utils/TableGen/OptParserEmitter.cpp b/utils/TableGen/OptParserEmitter.cpp
index 6892912..431026c 100644
--- a/utils/TableGen/OptParserEmitter.cpp
+++ b/utils/TableGen/OptParserEmitter.cpp
@@ -35,7 +35,7 @@ static int CompareOptionRecords(const void *Av, const void *Bv) {
   const Record *A = *(Record**) Av;
   const Record *B = *(Record**) Bv;
 
-  // Sentinel options preceed all others and are only ordered by precedence.
+  // Sentinel options precede all others and are only ordered by precedence.
   bool ASent = A->getValueAsDef("Kind")->getValueAsBit("Sentinel");
   bool BSent = B->getValueAsDef("Kind")->getValueAsBit("Sentinel");
   if (ASent != BSent)
diff --git a/utils/TableGen/Record.cpp b/utils/TableGen/Record.cpp
index abbbafe..8ac8cd9 100644
--- a/utils/TableGen/Record.cpp
+++ b/utils/TableGen/Record.cpp
@@ -68,14 +68,9 @@ Init *BitsRecTy::convertValue(BitInit *UI) {
 /// canFitInBitfield - Return true if the number of bits is large enough to hold
 /// the integer value.
 static bool canFitInBitfield(int64_t Value, unsigned NumBits) {
-  if (Value >= 0) {
-    if (Value & ~((1LL << NumBits) - 1))
-      return false;
-  } else if ((Value >> NumBits) != -1 || (Value & (1LL << (NumBits-1))) == 0) {
-    return false;
-  }
-
-  return true;
+  // For example, with NumBits == 4, we permit Values from [-7 .. 15].
+  return (NumBits >= sizeof(Value) * 8) ||
+         (Value >> NumBits == 0) || (Value >> (NumBits-1) == -1);
 }
 
 /// convertValue from Int initializer to bits type: Split the integer up into the
@@ -583,9 +578,7 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) {
         if (Record *D = (CurRec->getRecords()).getDef(Name))
           return new DefInit(D);
 
-        errs() << "Variable not defined: '" + Name + "'\n";
-        assert(0 && "Variable not found");
-        return 0;
+        throw TGError(CurRec->getLoc(), "Undefined reference:'" + Name + "'\n");
       }
     }
     break;
@@ -813,15 +806,13 @@ static Init *ForeachHelper(Init *LHS, Init *MHS, Init *RHS, RecTy *Type,
   OpInit *RHSo = dynamic_cast<OpInit*>(RHS);
 
   if (!RHSo) {
-    errs() << "!foreach requires an operator\n";
-    assert(0 && "No operator for !foreach");
+    throw TGError(CurRec->getLoc(), "!foreach requires an operator\n");
   }
 
   TypedInit *LHSt = dynamic_cast<TypedInit*>(LHS);
 
   if (!LHSt) {
-    errs() << "!foreach requires typed variable\n";
-    assert(0 && "No typed variable for !foreach");
+    throw TGError(CurRec->getLoc(), "!foreach requires typed variable\n");
   }
 
   if ((MHSd && DagType) || (MHSl && ListType)) {
diff --git a/utils/TableGen/Record.h b/utils/TableGen/Record.h
index f3a5df2..522b719 100644
--- a/utils/TableGen/Record.h
+++ b/utils/TableGen/Record.h
@@ -707,7 +707,7 @@ class CodeInit : public Init {
 public:
   explicit CodeInit(const std::string &V) : Value(V) {}
 
-  const std::string getValue() const { return Value; }
+  const std::string &getValue() const { return Value; }
 
   virtual Init *convertInitializerTo(RecTy *Ty) {
     return Ty->convertValue(this);
diff --git a/utils/TableGen/RegisterInfoEmitter.cpp b/utils/TableGen/RegisterInfoEmitter.cpp
index b3a9dea..9ffb66a 100644
--- a/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/utils/TableGen/RegisterInfoEmitter.cpp
@@ -19,6 +19,7 @@
 #include "Record.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Format.h"
 #include <algorithm>
 #include <set>
 using namespace llvm;
@@ -26,6 +27,7 @@ using namespace llvm;
 // runEnums - Print out enum values for all of the registers.
 void RegisterInfoEmitter::runEnums(raw_ostream &OS) {
   CodeGenTarget Target(Records);
+  CodeGenRegBank &Bank = Target.getRegBank();
   const std::vector<CodeGenRegister> &Registers = Target.getRegisters();
 
   std::string Namespace = Registers[0].TheDef->getValueAsString("Namespace");
@@ -47,16 +49,16 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS) {
   if (!Namespace.empty())
     OS << "}\n";
 
-  const std::vector<Record*> SubRegIndices = Target.getSubRegIndices();
+  const std::vector<Record*> &SubRegIndices = Bank.getSubRegIndices();
   if (!SubRegIndices.empty()) {
     OS << "\n// Subregister indices\n";
     Namespace = SubRegIndices[0]->getValueAsString("Namespace");
     if (!Namespace.empty())
       OS << "namespace " << Namespace << " {\n";
     OS << "enum {\n  NoSubRegister,\n";
-    for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i)
+    for (unsigned i = 0, e = Bank.getNumNamedIndices(); i != e; ++i)
       OS << "  " << SubRegIndices[i]->getName() << ",\t// " << i+1 << "\n";
-    OS << "  NUM_TARGET_SUBREGS = " << SubRegIndices.size()+1 << "\n";
+    OS << "  NUM_TARGET_NAMED_SUBREGS = " << SubRegIndices.size()+1 << "\n";
     OS << "};\n";
     if (!Namespace.empty())
       OS << "}\n";
@@ -80,6 +82,8 @@ void RegisterInfoEmitter::runHeader(raw_ostream &OS) {
      << "(int CallFrameSetupOpcode = -1, int CallFrameDestroyOpcode = -1);\n"
      << "  virtual int getDwarfRegNumFull(unsigned RegNum, "
      << "unsigned Flavour) const;\n"
+     << "  virtual int getLLVMRegNumFull(unsigned DwarfRegNum, "
+     << "unsigned Flavour) const;\n"
      << "  virtual int getDwarfRegNum(unsigned RegNum, bool isEH) const = 0;\n"
      << "  virtual bool needsStackRealignment(const MachineFunction &) const\n"
      << "     { return false; }\n"
@@ -122,256 +126,52 @@ void RegisterInfoEmitter::runHeader(raw_ostream &OS) {
   OS << "} // End llvm namespace \n";
 }
 
-static void addSuperReg(Record *R, Record *S,
-                  std::map<Record*, std::set<Record*>, LessRecord> &SubRegs,
-                  std::map<Record*, std::set<Record*>, LessRecord> &SuperRegs,
-                  std::map<Record*, std::set<Record*>, LessRecord> &Aliases) {
-  if (R == S) {
-    errs() << "Error: recursive sub-register relationship between"
-           << " register " << getQualifiedName(R)
-           << " and its sub-registers?\n";
-    abort();
-  }
-  if (!SuperRegs[R].insert(S).second)
-    return;
-  SubRegs[S].insert(R);
-  Aliases[R].insert(S);
-  Aliases[S].insert(R);
-  if (SuperRegs.count(S))
-    for (std::set<Record*>::iterator I = SuperRegs[S].begin(),
-           E = SuperRegs[S].end(); I != E; ++I)
-      addSuperReg(R, *I, SubRegs, SuperRegs, Aliases);
-}
-
-static void addSubSuperReg(Record *R, Record *S,
-                   std::map<Record*, std::set<Record*>, LessRecord> &SubRegs,
-                   std::map<Record*, std::set<Record*>, LessRecord> &SuperRegs,
-                   std::map<Record*, std::set<Record*>, LessRecord> &Aliases) {
-  if (R == S) {
-    errs() << "Error: recursive sub-register relationship between"
-           << " register " << getQualifiedName(R)
-           << " and its sub-registers?\n";
-    abort();
-  }
-
-  if (!SubRegs[R].insert(S).second)
-    return;
-  addSuperReg(S, R, SubRegs, SuperRegs, Aliases);
-  Aliases[R].insert(S);
-  Aliases[S].insert(R);
-  if (SubRegs.count(S))
-    for (std::set<Record*>::iterator I = SubRegs[S].begin(),
-           E = SubRegs[S].end(); I != E; ++I)
-      addSubSuperReg(R, *I, SubRegs, SuperRegs, Aliases);
-}
-
-struct RegisterMaps {
-  // Map SubRegIndex -> Register
-  typedef std::map<Record*, Record*, LessRecord> SubRegMap;
-  // Map Register -> SubRegMap
-  typedef std::map<Record*, SubRegMap> SubRegMaps;
-
-  SubRegMaps SubReg;
-  SubRegMap &inferSubRegIndices(Record *Reg);
-
-  // Composite SubRegIndex instances.
-  // Map (SubRegIndex,SubRegIndex) -> SubRegIndex
-  typedef DenseMap<std::pair<Record*,Record*>,Record*> CompositeMap;
-  CompositeMap Composite;
-
-  // Compute SubRegIndex compositions after inferSubRegIndices has run on all
-  // registers.
-  void computeComposites();
-};
-
-// Calculate all subregindices for Reg. Loopy subregs cause infinite recursion.
-RegisterMaps::SubRegMap &RegisterMaps::inferSubRegIndices(Record *Reg) {
-  SubRegMap &SRM = SubReg[Reg];
-  if (!SRM.empty())
-    return SRM;
-  std::vector<Record*> SubRegs = Reg->getValueAsListOfDefs("SubRegs");
-  std::vector<Record*> Indices = Reg->getValueAsListOfDefs("SubRegIndices");
-  if (SubRegs.size() != Indices.size())
-    throw "Register " + Reg->getName() + " SubRegIndices doesn't match SubRegs";
-
-  // First insert the direct subregs and make sure they are fully indexed.
-  for (unsigned i = 0, e = SubRegs.size(); i != e; ++i) {
-    if (!SRM.insert(std::make_pair(Indices[i], SubRegs[i])).second)
-      throw "SubRegIndex " + Indices[i]->getName()
-        + " appears twice in Register " + Reg->getName();
-    inferSubRegIndices(SubRegs[i]);
-  }
-
-  // Keep track of inherited subregs and how they can be reached.
-  // Register -> (SubRegIndex, SubRegIndex)
-  typedef std::map<Record*, std::pair<Record*,Record*>, LessRecord> OrphanMap;
-  OrphanMap Orphans;
-
-  // Clone inherited subregs. Here the order is important - earlier subregs take
-  // precedence.
-  for (unsigned i = 0, e = SubRegs.size(); i != e; ++i) {
-    SubRegMap &M = SubReg[SubRegs[i]];
-    for (SubRegMap::iterator si = M.begin(), se = M.end(); si != se; ++si)
-      if (!SRM.insert(*si).second)
-        Orphans[si->second] = std::make_pair(Indices[i], si->first);
-  }
-
-  // Finally process the composites.
-  ListInit *Comps = Reg->getValueAsListInit("CompositeIndices");
-  for (unsigned i = 0, e = Comps->size(); i != e; ++i) {
-    DagInit *Pat = dynamic_cast<DagInit*>(Comps->getElement(i));
-    if (!Pat)
-      throw "Invalid dag '" + Comps->getElement(i)->getAsString()
-        + "' in CompositeIndices";
-    DefInit *BaseIdxInit = dynamic_cast<DefInit*>(Pat->getOperator());
-    if (!BaseIdxInit || !BaseIdxInit->getDef()->isSubClassOf("SubRegIndex"))
-      throw "Invalid SubClassIndex in " + Pat->getAsString();
-
-    // Resolve list of subreg indices into R2.
-    Record *R2 = Reg;
-    for (DagInit::const_arg_iterator di = Pat->arg_begin(),
-         de = Pat->arg_end(); di != de; ++di) {
-      DefInit *IdxInit = dynamic_cast<DefInit*>(*di);
-      if (!IdxInit || !IdxInit->getDef()->isSubClassOf("SubRegIndex"))
-        throw "Invalid SubClassIndex in " + Pat->getAsString();
-      SubRegMap::const_iterator ni = SubReg[R2].find(IdxInit->getDef());
-      if (ni == SubReg[R2].end())
-        throw "Composite " + Pat->getAsString() + " refers to bad index in "
-          + R2->getName();
-      R2 = ni->second;
-    }
-
-    // Insert composite index. Allow overriding inherited indices etc.
-    SRM[BaseIdxInit->getDef()] = R2;
-
-    // R2 is now directly addressable, no longer an orphan.
-    Orphans.erase(R2);
-  }
-
-  // Now, Orphans contains the inherited subregisters without a direct index.
-  if (!Orphans.empty()) {
-    errs() << "Error: Register " << getQualifiedName(Reg)
-           << " inherited subregisters without an index:\n";
-    for (OrphanMap::iterator i = Orphans.begin(), e = Orphans.end(); i != e;
-         ++i) {
-      errs() << "  " << getQualifiedName(i->first)
-             << " = " << i->second.first->getName()
-             << ", " << i->second.second->getName() << "\n";
-    }
-    abort();
-  }
-  return SRM;
-}
-
-void RegisterMaps::computeComposites() {
-  for (SubRegMaps::const_iterator sri = SubReg.begin(), sre = SubReg.end();
-       sri != sre; ++sri) {
-    Record *Reg1 = sri->first;
-    const SubRegMap &SRM1 = sri->second;
-    for (SubRegMap::const_iterator i1 = SRM1.begin(), e1 = SRM1.end();
-         i1 != e1; ++i1) {
-      Record *Idx1 = i1->first;
-      Record *Reg2 = i1->second;
-      // Ignore identity compositions.
-      if (Reg1 == Reg2)
-        continue;
-      // If Reg2 has no subregs, Idx1 doesn't compose.
-      if (!SubReg.count(Reg2))
-        continue;
-      const SubRegMap &SRM2 = SubReg[Reg2];
-      // Try composing Idx1 with another SubRegIndex.
-      for (SubRegMap::const_iterator i2 = SRM2.begin(), e2 = SRM2.end();
-           i2 != e2; ++i2) {
-        std::pair<Record*,Record*> IdxPair(Idx1, i2->first);
-        Record *Reg3 = i2->second;
-        // OK Reg1:IdxPair == Reg3. Find the index with Reg:Idx == Reg3.
-        for (SubRegMap::const_iterator i1d = SRM1.begin(), e1d = SRM1.end();
-             i1d != e1d; ++i1d) {
-          // Ignore identity compositions.
-          if (Reg2 == Reg3)
-            continue;
-          if (i1d->second == Reg3) {
-            std::pair<CompositeMap::iterator,bool> Ins =
-              Composite.insert(std::make_pair(IdxPair, i1d->first));
-            // Conflicting composition?
-            if (!Ins.second && Ins.first->second != i1d->first) {
-              errs() << "Error: SubRegIndex " << getQualifiedName(Idx1)
-                     << " and " << getQualifiedName(IdxPair.second)
-                     << " compose ambiguously as "
-                     << getQualifiedName(Ins.first->second) << " or "
-                     << getQualifiedName(i1d->first) << "\n";
-              abort();
-            }
-          }
-        }
-      }
-    }
-  }
-
-  // We don't care about the difference between (Idx1, Idx2) -> Idx2 and invalid
-  // compositions, so remove any mappings of that form.
-  for (CompositeMap::iterator i = Composite.begin(), e = Composite.end();
-       i != e;) {
-    CompositeMap::iterator j = i;
-    ++i;
-    if (j->first.second == j->second)
-      Composite.erase(j);
-  }
-}
-
-class RegisterSorter {
-private:
-  std::map<Record*, std::set<Record*>, LessRecord> &RegisterSubRegs;
-
-public:
-  RegisterSorter(std::map<Record*, std::set<Record*>, LessRecord> &RS)
-    : RegisterSubRegs(RS) {}
-
-  bool operator()(Record *RegA, Record *RegB) {
-    // B is sub-register of A.
-    return RegisterSubRegs.count(RegA) && RegisterSubRegs[RegA].count(RegB);
-  }
-};
+typedef std::pair<unsigned, unsigned> UUPair;
+typedef std::vector<UUPair> UUVector;
 
+//
 // RegisterInfoEmitter::run - Main register file description emitter.
 //
 void RegisterInfoEmitter::run(raw_ostream &OS) {
   CodeGenTarget Target(Records);
+  CodeGenRegBank &RegBank = Target.getRegBank();
+  RegBank.computeDerivedInfo();
+  std::map<const CodeGenRegister*, CodeGenRegister::Set> Overlaps;
+  RegBank.computeOverlaps(Overlaps);
+
   EmitSourceFileHeader("Register Information Source Fragment", OS);
 
   OS << "namespace llvm {\n\n";
 
-  // Start out by emitting each of the register classes... to do this, we build
-  // a set of registers which belong to a register class, this is to ensure that
-  // each register is only in a single register class.
-  //
+  // Start out by emitting each of the register classes.
   const std::vector<CodeGenRegisterClass> &RegisterClasses =
     Target.getRegisterClasses();
 
+  // Collect all registers belonging to any allocatable class.
+  std::set<Record*> AllocatableRegs;
+
   // Loop over all of the register classes... emitting each one.
   OS << "namespace {     // Register classes...\n";
 
-  // RegClassesBelongedTo - Keep track of which register classes each reg
-  // belongs to.
-  std::multimap<Record*, const CodeGenRegisterClass*> RegClassesBelongedTo;
-
   // Emit the register enum value arrays for each RegisterClass
   for (unsigned rc = 0, e = RegisterClasses.size(); rc != e; ++rc) {
     const CodeGenRegisterClass &RC = RegisterClasses[rc];
+    ArrayRef<Record*> Order = RC.getOrder();
+
+    // Collect allocatable registers.
+    if (RC.Allocatable)
+      AllocatableRegs.insert(Order.begin(), Order.end());
 
     // Give the register class a legal C name if it's anonymous.
-    std::string Name = RC.TheDef->getName();
+    std::string Name = RC.getName();
 
     // Emit the register list now.
     OS << "  // " << Name << " Register Class...\n"
        << "  static const unsigned " << Name
        << "[] = {\n    ";
-    for (unsigned i = 0, e = RC.Elements.size(); i != e; ++i) {
-      Record *Reg = RC.Elements[i];
+    for (unsigned i = 0, e = Order.size(); i != e; ++i) {
+      Record *Reg = Order[i];
       OS << getQualifiedName(Reg) << ", ";
-
-      // Keep track of which regclasses this register is in.
-      RegClassesBelongedTo.insert(std::make_pair(Reg, &RC));
     }
     OS << "\n  };\n\n";
   }
@@ -381,7 +181,7 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
     const CodeGenRegisterClass &RC = RegisterClasses[rc];
 
     // Give the register class a legal C name if it's anonymous.
-    std::string Name = RC.TheDef->getName() + "VTs";
+    std::string Name = RC.getName() + "VTs";
 
     // Emit the register list now.
     OS << "  // " << Name
@@ -406,7 +206,7 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
     std::map<unsigned, std::set<unsigned> > SuperRegClassMap;
     OS << "\n";
 
-    unsigned NumSubRegIndices = Target.getSubRegIndices().size();
+    unsigned NumSubRegIndices = RegBank.getSubRegIndices().size();
 
     if (NumSubRegIndices) {
       // Emit the sub-register classes for each RegisterClass
@@ -417,7 +217,7 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
              i = RC.SubRegClasses.begin(),
              e = RC.SubRegClasses.end(); i != e; ++i) {
           // Build SRC array.
-          unsigned idx = Target.getSubRegIndexNo(i->first);
+          unsigned idx = RegBank.getSubRegIndexNo(i->first);
           SRC.at(idx-1) = i->second;
 
           // Find the register class number of i->second for SuperRegClassMap.
@@ -567,7 +367,9 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
          << RC.SpillSize/8 << ", "
          << RC.SpillAlignment/8 << ", "
          << RC.CopyCost << ", "
-         << RC.getName() << ", " << RC.getName() << " + " << RC.Elements.size()
+         << RC.Allocatable << ", "
+         << RC.getName() << ", " << RC.getName() << " + "
+         << RC.getOrder().size()
          << ") {}\n";
     }
 
@@ -581,267 +383,58 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
        << "RegClass,\n";
   OS << "  };\n";
 
-  // Emit register sub-registers / super-registers, aliases...
-  std::map<Record*, std::set<Record*>, LessRecord> RegisterSubRegs;
-  std::map<Record*, std::set<Record*>, LessRecord> RegisterSuperRegs;
-  std::map<Record*, std::set<Record*>, LessRecord> RegisterAliases;
   typedef std::map<Record*, std::vector<int64_t>, LessRecord> DwarfRegNumsMapTy;
   DwarfRegNumsMapTy DwarfRegNums;
-
   const std::vector<CodeGenRegister> &Regs = Target.getRegisters();
 
-  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
-    Record *R = Regs[i].TheDef;
-    std::vector<Record*> LI = Regs[i].TheDef->getValueAsListOfDefs("Aliases");
-    // Add information that R aliases all of the elements in the list... and
-    // that everything in the list aliases R.
-    for (unsigned j = 0, e = LI.size(); j != e; ++j) {
-      Record *Reg = LI[j];
-      if (RegisterAliases[R].count(Reg))
-        errs() << "Warning: register alias between " << getQualifiedName(R)
-               << " and " << getQualifiedName(Reg)
-               << " specified multiple times!\n";
-      RegisterAliases[R].insert(Reg);
-
-      if (RegisterAliases[Reg].count(R))
-        errs() << "Warning: register alias between " << getQualifiedName(R)
-               << " and " << getQualifiedName(Reg)
-               << " specified multiple times!\n";
-      RegisterAliases[Reg].insert(R);
-    }
-  }
-
-  // Process sub-register sets.
-  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
-    Record *R = Regs[i].TheDef;
-    std::vector<Record*> LI = Regs[i].TheDef->getValueAsListOfDefs("SubRegs");
-    // Process sub-register set and add aliases information.
-    for (unsigned j = 0, e = LI.size(); j != e; ++j) {
-      Record *SubReg = LI[j];
-      if (RegisterSubRegs[R].count(SubReg))
-        errs() << "Warning: register " << getQualifiedName(SubReg)
-               << " specified as a sub-register of " << getQualifiedName(R)
-               << " multiple times!\n";
-      addSubSuperReg(R, SubReg, RegisterSubRegs, RegisterSuperRegs,
-                     RegisterAliases);
-    }
-  }
-
-  // Print the SubregHashTable, a simple quadratically probed
-  // hash table for determining if a register is a subregister
-  // of another register.
-  unsigned NumSubRegs = 0;
-  std::map<Record*, unsigned> RegNo;
-  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
-    RegNo[Regs[i].TheDef] = i;
-    NumSubRegs += RegisterSubRegs[Regs[i].TheDef].size();
-  }
-
-  unsigned SubregHashTableSize = 2 * NextPowerOf2(2 * NumSubRegs);
-  unsigned* SubregHashTable = new unsigned[2 * SubregHashTableSize];
-  std::fill(SubregHashTable, SubregHashTable + 2 * SubregHashTableSize, ~0U);
-
-  unsigned hashMisses = 0;
-
-  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
-    Record* R = Regs[i].TheDef;
-    for (std::set<Record*>::iterator I = RegisterSubRegs[R].begin(),
-         E = RegisterSubRegs[R].end(); I != E; ++I) {
-      Record* RJ = *I;
-      // We have to increase the indices of both registers by one when
-      // computing the hash because, in the generated code, there
-      // will be an extra empty slot at register 0.
-      size_t index = ((i+1) + (RegNo[RJ]+1) * 37) & (SubregHashTableSize-1);
-      unsigned ProbeAmt = 2;
-      while (SubregHashTable[index*2] != ~0U &&
-             SubregHashTable[index*2+1] != ~0U) {
-        index = (index + ProbeAmt) & (SubregHashTableSize-1);
-        ProbeAmt += 2;
-
-        hashMisses++;
-      }
-
-      SubregHashTable[index*2] = i;
-      SubregHashTable[index*2+1] = RegNo[RJ];
-    }
-  }
-
-  OS << "\n\n  // Number of hash collisions: " << hashMisses << "\n";
-
-  if (SubregHashTableSize) {
-    std::string Namespace = Regs[0].TheDef->getValueAsString("Namespace");
-
-    OS << "  const unsigned SubregHashTable[] = { ";
-    for (unsigned i = 0; i < SubregHashTableSize - 1; ++i) {
-      if (i != 0)
-        // Insert spaces for nice formatting.
-        OS << "                                       ";
-
-      if (SubregHashTable[2*i] != ~0U) {
-        OS << getQualifiedName(Regs[SubregHashTable[2*i]].TheDef) << ", "
-           << getQualifiedName(Regs[SubregHashTable[2*i+1]].TheDef) << ", \n";
-      } else {
-        OS << Namespace << "::NoRegister, " << Namespace << "::NoRegister, \n";
-      }
-    }
-
-    unsigned Idx = SubregHashTableSize*2-2;
-    if (SubregHashTable[Idx] != ~0U) {
-      OS << "                                       "
-         << getQualifiedName(Regs[SubregHashTable[Idx]].TheDef) << ", "
-         << getQualifiedName(Regs[SubregHashTable[Idx+1]].TheDef) << " };\n";
-    } else {
-      OS << Namespace << "::NoRegister, " << Namespace << "::NoRegister };\n";
-    }
-
-    OS << "  const unsigned SubregHashTableSize = "
-       << SubregHashTableSize << ";\n";
-  } else {
-    OS << "  const unsigned SubregHashTable[] = { ~0U, ~0U };\n"
-       << "  const unsigned SubregHashTableSize = 1;\n";
-  }
-
-  delete [] SubregHashTable;
-
-
-  // Print the AliasHashTable, a simple quadratically probed
-  // hash table for determining if a register aliases another register.
-  unsigned NumAliases = 0;
-  RegNo.clear();
-  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
-    RegNo[Regs[i].TheDef] = i;
-    NumAliases += RegisterAliases[Regs[i].TheDef].size();
-  }
-
-  unsigned AliasesHashTableSize = 2 * NextPowerOf2(2 * NumAliases);
-  unsigned* AliasesHashTable = new unsigned[2 * AliasesHashTableSize];
-  std::fill(AliasesHashTable, AliasesHashTable + 2 * AliasesHashTableSize, ~0U);
-
-  hashMisses = 0;
-
-  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
-    Record* R = Regs[i].TheDef;
-    for (std::set<Record*>::iterator I = RegisterAliases[R].begin(),
-         E = RegisterAliases[R].end(); I != E; ++I) {
-      Record* RJ = *I;
-      // We have to increase the indices of both registers by one when
-      // computing the hash because, in the generated code, there
-      // will be an extra empty slot at register 0.
-      size_t index = ((i+1) + (RegNo[RJ]+1) * 37) & (AliasesHashTableSize-1);
-      unsigned ProbeAmt = 2;
-      while (AliasesHashTable[index*2] != ~0U &&
-             AliasesHashTable[index*2+1] != ~0U) {
-        index = (index + ProbeAmt) & (AliasesHashTableSize-1);
-        ProbeAmt += 2;
-
-        hashMisses++;
-      }
-
-      AliasesHashTable[index*2] = i;
-      AliasesHashTable[index*2+1] = RegNo[RJ];
-    }
-  }
-
-  OS << "\n\n  // Number of hash collisions: " << hashMisses << "\n";
-
-  if (AliasesHashTableSize) {
-    std::string Namespace = Regs[0].TheDef->getValueAsString("Namespace");
-
-    OS << "  const unsigned AliasesHashTable[] = { ";
-    for (unsigned i = 0; i < AliasesHashTableSize - 1; ++i) {
-      if (i != 0)
-        // Insert spaces for nice formatting.
-        OS << "                                       ";
-
-      if (AliasesHashTable[2*i] != ~0U) {
-        OS << getQualifiedName(Regs[AliasesHashTable[2*i]].TheDef) << ", "
-           << getQualifiedName(Regs[AliasesHashTable[2*i+1]].TheDef) << ", \n";
-      } else {
-        OS << Namespace << "::NoRegister, " << Namespace << "::NoRegister, \n";
-      }
-    }
-
-    unsigned Idx = AliasesHashTableSize*2-2;
-    if (AliasesHashTable[Idx] != ~0U) {
-      OS << "                                       "
-         << getQualifiedName(Regs[AliasesHashTable[Idx]].TheDef) << ", "
-         << getQualifiedName(Regs[AliasesHashTable[Idx+1]].TheDef) << " };\n";
-    } else {
-      OS << Namespace << "::NoRegister, " << Namespace << "::NoRegister };\n";
-    }
-
-    OS << "  const unsigned AliasesHashTableSize = "
-       << AliasesHashTableSize << ";\n";
-  } else {
-    OS << "  const unsigned AliasesHashTable[] = { ~0U, ~0U };\n"
-       << "  const unsigned AliasesHashTableSize = 1;\n";
-  }
-
-  delete [] AliasesHashTable;
-
-  if (!RegisterAliases.empty())
-    OS << "\n\n  // Register Overlap Lists...\n";
-
   // Emit an overlap list for all registers.
-  for (std::map<Record*, std::set<Record*>, LessRecord >::iterator
-         I = RegisterAliases.begin(), E = RegisterAliases.end(); I != E; ++I) {
-    OS << "  const unsigned " << I->first->getName() << "_Overlaps[] = { "
-       << getQualifiedName(I->first) << ", ";
-    for (std::set<Record*>::iterator ASI = I->second.begin(),
-           E = I->second.end(); ASI != E; ++ASI)
-      OS << getQualifiedName(*ASI) << ", ";
+  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
+    const CodeGenRegister *Reg = &Regs[i];
+    const CodeGenRegister::Set &O = Overlaps[Reg];
+    // Move Reg to the front so TRI::getAliasSet can share the list.
+    OS << "  const unsigned " << Reg->getName() << "_Overlaps[] = { "
+       << getQualifiedName(Reg->TheDef) << ", ";
+    for (CodeGenRegister::Set::const_iterator I = O.begin(), E = O.end();
+         I != E; ++I)
+      if (*I != Reg)
+        OS << getQualifiedName((*I)->TheDef) << ", ";
     OS << "0 };\n";
   }
 
-  if (!RegisterSubRegs.empty())
-    OS << "\n\n  // Register Sub-registers Sets...\n";
-
   // Emit the empty sub-registers list
   OS << "  const unsigned Empty_SubRegsSet[] = { 0 };\n";
   // Loop over all of the registers which have sub-registers, emitting the
   // sub-registers list to memory.
-  for (std::map<Record*, std::set<Record*>, LessRecord>::iterator
-         I = RegisterSubRegs.begin(), E = RegisterSubRegs.end(); I != E; ++I) {
-   if (I->second.empty())
+  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
+    const CodeGenRegister &Reg = Regs[i];
+    if (Reg.getSubRegs().empty())
      continue;
-    OS << "  const unsigned " << I->first->getName() << "_SubRegsSet[] = { ";
-    std::vector<Record*> SubRegsVector;
-    for (std::set<Record*>::iterator ASI = I->second.begin(),
-           E = I->second.end(); ASI != E; ++ASI)
-      SubRegsVector.push_back(*ASI);
-    RegisterSorter RS(RegisterSubRegs);
-    std::stable_sort(SubRegsVector.begin(), SubRegsVector.end(), RS);
-    for (unsigned i = 0, e = SubRegsVector.size(); i != e; ++i)
-      OS << getQualifiedName(SubRegsVector[i]) << ", ";
+    // getSubRegs() orders by SubRegIndex. We want a topological order.
+    SetVector<CodeGenRegister*> SR;
+    Reg.addSubRegsPreOrder(SR);
+    OS << "  const unsigned " << Reg.getName() << "_SubRegsSet[] = { ";
+    for (unsigned j = 0, je = SR.size(); j != je; ++j)
+      OS << getQualifiedName(SR[j]->TheDef) << ", ";
     OS << "0 };\n";
   }
 
-  if (!RegisterSuperRegs.empty())
-    OS << "\n\n  // Register Super-registers Sets...\n";
-
   // Emit the empty super-registers list
   OS << "  const unsigned Empty_SuperRegsSet[] = { 0 };\n";
   // Loop over all of the registers which have super-registers, emitting the
   // super-registers list to memory.
-  for (std::map<Record*, std::set<Record*>, LessRecord >::iterator
-         I = RegisterSuperRegs.begin(), E = RegisterSuperRegs.end(); I != E; ++I) {
-    if (I->second.empty())
+  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
+    const CodeGenRegister &Reg = Regs[i];
+    const CodeGenRegister::SuperRegList &SR = Reg.getSuperRegs();
+    if (SR.empty())
       continue;
-    OS << "  const unsigned " << I->first->getName() << "_SuperRegsSet[] = { ";
-
-    std::vector<Record*> SuperRegsVector;
-    for (std::set<Record*>::iterator ASI = I->second.begin(),
-           E = I->second.end(); ASI != E; ++ASI)
-      SuperRegsVector.push_back(*ASI);
-    RegisterSorter RS(RegisterSubRegs);
-    std::stable_sort(SuperRegsVector.begin(), SuperRegsVector.end(), RS);
-    for (unsigned i = 0, e = SuperRegsVector.size(); i != e; ++i)
-      OS << getQualifiedName(SuperRegsVector[i]) << ", ";
+    OS << "  const unsigned " << Reg.getName() << "_SuperRegsSet[] = { ";
+    for (unsigned j = 0, je = SR.size(); j != je; ++j)
+      OS << getQualifiedName(SR[j]->TheDef) << ", ";
     OS << "0 };\n";
   }
 
   OS<<"\n  const TargetRegisterDesc RegisterDescriptors[] = { // Descriptors\n";
-  OS << "    { \"NOREG\",\t0,\t0,\t0 },\n";
+  OS << "    { \"NOREG\",\t0,\t0,\t0,\t0,\t0 },\n";
 
   // Now that register alias and sub-registers sets have been emitted, emit the
   // register descriptors now.
@@ -849,19 +442,25 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
     const CodeGenRegister &Reg = Regs[i];
     OS << "    { \"";
     OS << Reg.getName() << "\",\t" << Reg.getName() << "_Overlaps,\t";
-    if (!RegisterSubRegs[Reg.TheDef].empty())
+    if (!Reg.getSubRegs().empty())
       OS << Reg.getName() << "_SubRegsSet,\t";
     else
       OS << "Empty_SubRegsSet,\t";
-    if (!RegisterSuperRegs[Reg.TheDef].empty())
-      OS << Reg.getName() << "_SuperRegsSet },\n";
+    if (!Reg.getSuperRegs().empty())
+      OS << Reg.getName() << "_SuperRegsSet,\t";
     else
-      OS << "Empty_SuperRegsSet },\n";
+      OS << "Empty_SuperRegsSet,\t";
+    OS << Reg.CostPerUse << ",\t"
+       << int(AllocatableRegs.count(Reg.TheDef)) << " },\n";
   }
   OS << "  };\n";      // End of register descriptors...
 
+  // Calculate the mapping of subregister+index pairs to physical registers.
+  // This will also create further anonymous indexes.
+  unsigned NamedIndices = RegBank.getNumNamedIndices();
+
   // Emit SubRegIndex names, skipping 0
-  const std::vector<Record*> SubRegIndices = Target.getSubRegIndices();
+  const std::vector<Record*> &SubRegIndices = RegBank.getSubRegIndices();
   OS << "\n  const char *const SubRegIndexTable[] = { \"";
   for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i) {
     OS << SubRegIndices[i]->getName();
@@ -869,13 +468,21 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
       OS << "\", \"";
   }
   OS << "\" };\n\n";
+
+  // Emit names of the anonymus subreg indexes.
+  if (SubRegIndices.size() > NamedIndices) {
+    OS << "  enum {";
+    for (unsigned i = NamedIndices, e = SubRegIndices.size(); i != e; ++i) {
+      OS << "\n    " << SubRegIndices[i]->getName() << " = " << i+1;
+      if (i+1 != e)
+        OS << ',';
+    }
+    OS << "\n  };\n\n";
+  }
   OS << "}\n\n";       // End of anonymous namespace...
 
   std::string ClassName = Target.getName() + "GenRegisterInfo";
 
-  // Calculate the mapping of subregister+index pairs to physical registers.
-  RegisterMaps RegMaps;
-
   // Emit the subregister + index mapping function based on the information
   // calculated above.
   OS << "unsigned " << ClassName
@@ -883,16 +490,16 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
      << "  switch (RegNo) {\n"
      << "  default:\n    return 0;\n";
   for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
-    RegisterMaps::SubRegMap &SRM = RegMaps.inferSubRegIndices(Regs[i].TheDef);
+    const CodeGenRegister::SubRegMap &SRM = Regs[i].getSubRegs();
     if (SRM.empty())
       continue;
     OS << "  case " << getQualifiedName(Regs[i].TheDef) << ":\n";
     OS << "    switch (Index) {\n";
     OS << "    default: return 0;\n";
-    for (RegisterMaps::SubRegMap::const_iterator ii = SRM.begin(),
+    for (CodeGenRegister::SubRegMap::const_iterator ii = SRM.begin(),
          ie = SRM.end(); ii != ie; ++ii)
       OS << "    case " << getQualifiedName(ii->first)
-         << ": return " << getQualifiedName(ii->second) << ";\n";
+         << ": return " << getQualifiedName(ii->second->TheDef) << ";\n";
     OS << "    };\n" << "    break;\n";
   }
   OS << "  };\n";
@@ -904,13 +511,13 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
      << "  switch (RegNo) {\n"
      << "  default:\n    return 0;\n";
    for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
-     RegisterMaps::SubRegMap &SRM = RegMaps.SubReg[Regs[i].TheDef];
+     const CodeGenRegister::SubRegMap &SRM = Regs[i].getSubRegs();
      if (SRM.empty())
        continue;
     OS << "  case " << getQualifiedName(Regs[i].TheDef) << ":\n";
-    for (RegisterMaps::SubRegMap::const_iterator ii = SRM.begin(),
+    for (CodeGenRegister::SubRegMap::const_iterator ii = SRM.begin(),
          ie = SRM.end(); ii != ie; ++ii)
-      OS << "    if (SubRegNo == " << getQualifiedName(ii->second)
+      OS << "    if (SubRegNo == " << getQualifiedName(ii->second->TheDef)
          << ")  return " << getQualifiedName(ii->first) << ";\n";
     OS << "    return 0;\n";
   }
@@ -919,7 +526,6 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
   OS << "}\n\n";
 
   // Emit composeSubRegIndices
-  RegMaps.computeComposites();
   OS << "unsigned " << ClassName
      << "::composeSubRegIndices(unsigned IdxA, unsigned IdxB) const {\n"
      << "  switch (IdxA) {\n"
@@ -927,8 +533,8 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
   for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i) {
     bool Open = false;
     for (unsigned j = 0; j != e; ++j) {
-      if (Record *Comp = RegMaps.Composite.lookup(
-                          std::make_pair(SubRegIndices[i], SubRegIndices[j]))) {
+      if (Record *Comp = RegBank.getCompositeSubRegIndex(SubRegIndices[i],
+                                                         SubRegIndices[j])) {
         if (!Open) {
           OS << "  case " << getQualifiedName(SubRegIndices[i])
              << ": switch(IdxB) {\n    default: return IdxB;\n";
@@ -949,9 +555,7 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
      << "  : TargetRegisterInfo(RegisterDescriptors, " << Regs.size()+1
      << ", RegisterClasses, RegisterClasses+" << RegisterClasses.size() <<",\n"
      << "                 SubRegIndexTable,\n"
-     << "                 CallFrameSetupOpcode, CallFrameDestroyOpcode,\n"
-     << "                 SubregHashTable, SubregHashTableSize,\n"
-     << "                 AliasesHashTable, AliasesHashTableSize) {\n"
+     << "                 CallFrameSetupOpcode, CallFrameDestroyOpcode) {\n"
      << "}\n\n";
 
   // Collect all information about dwarf register numbers
@@ -974,6 +578,44 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
     for (unsigned i = I->second.size(), e = maxLength; i != e; ++i)
       I->second.push_back(-1);
 
+  // Emit reverse information about the dwarf register numbers.
+  OS << "int " << ClassName << "::getLLVMRegNumFull(unsigned DwarfRegNum, "
+     << "unsigned Flavour) const {\n"
+     << "  switch (Flavour) {\n"
+     << "  default:\n"
+     << "    assert(0 && \"Unknown DWARF flavour\");\n"
+     << "    return -1;\n";
+
+  for (unsigned i = 0, e = maxLength; i != e; ++i) {
+    OS << "  case " << i << ":\n"
+       << "    switch (DwarfRegNum) {\n"
+       << "    default:\n"
+       << "      assert(0 && \"Invalid DwarfRegNum\");\n"
+       << "      return -1;\n";
+
+    for (DwarfRegNumsMapTy::iterator
+           I = DwarfRegNums.begin(), E = DwarfRegNums.end(); I != E; ++I) {
+      int DwarfRegNo = I->second[i];
+      if (DwarfRegNo >= 0)
+        OS << "    case " <<  DwarfRegNo << ":\n"
+           << "      return " << getQualifiedName(I->first) << ";\n";
+    }
+    OS << "    };\n";
+  }
+
+  OS << "  };\n}\n\n";
+
+  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
+    Record *Reg = Regs[i].TheDef;
+    const RecordVal *V = Reg->getValue("DwarfAlias");
+    if (!V || !V->getValue())
+      continue;
+
+    DefInit *DI = dynamic_cast<DefInit*>(V->getValue());
+    Record *Alias = DI->getDef();
+    DwarfRegNums[Reg] = DwarfRegNums[Alias];
+  }
+
   // Emit information about the dwarf register numbers.
   OS << "int " << ClassName << "::getDwarfRegNumFull(unsigned RegNum, "
      << "unsigned Flavour) const {\n"
@@ -995,13 +637,8 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
     for (DwarfRegNumsMapTy::iterator
            I = DwarfRegNums.begin(), E = DwarfRegNums.end(); I != E; ++I) {
       int RegNo = I->second[i];
-      if (RegNo != -2)
-        OS << "    case " << getQualifiedName(I->first) << ":\n"
-           << "      return " << RegNo << ";\n";
-      else
-        OS << "    case " << getQualifiedName(I->first) << ":\n"
-           << "      assert(0 && \"Invalid register for this mode\");\n"
-           << "      return -1;\n";
+      OS << "    case " << getQualifiedName(I->first) << ":\n"
+         << "      return " << RegNo << ";\n";
     }
     OS << "    };\n";
   }
diff --git a/utils/TableGen/SetTheory.cpp b/utils/TableGen/SetTheory.cpp
new file mode 100644
index 0000000..509d2f3
--- /dev/null
+++ b/utils/TableGen/SetTheory.cpp
@@ -0,0 +1,275 @@
+//===- SetTheory.cpp - Generate ordered sets from DAG expressions ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SetTheory class that computes ordered sets of
+// Records from DAG expressions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SetTheory.h"
+#include "Record.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+
+// Define the standard operators.
+namespace {
+
+typedef SetTheory::RecSet RecSet;
+typedef SetTheory::RecVec RecVec;
+
+// (add a, b, ...) Evaluate and union all arguments.
+struct AddOp : public SetTheory::Operator {
+  void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) {
+    ST.evaluate(Expr->arg_begin(), Expr->arg_end(), Elts);
+  }
+};
+
+// (sub Add, Sub, ...) Set difference.
+struct SubOp : public SetTheory::Operator {
+  void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) {
+    if (Expr->arg_size() < 2)
+      throw "Set difference needs at least two arguments: " +
+        Expr->getAsString();
+    RecSet Add, Sub;
+    ST.evaluate(*Expr->arg_begin(), Add);
+    ST.evaluate(Expr->arg_begin() + 1, Expr->arg_end(), Sub);
+    for (RecSet::iterator I = Add.begin(), E = Add.end(); I != E; ++I)
+      if (!Sub.count(*I))
+        Elts.insert(*I);
+  }
+};
+
+// (and S1, S2) Set intersection.
+struct AndOp : public SetTheory::Operator {
+  void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) {
+    if (Expr->arg_size() != 2)
+      throw "Set intersection requires two arguments: " + Expr->getAsString();
+    RecSet S1, S2;
+    ST.evaluate(Expr->arg_begin()[0], S1);
+    ST.evaluate(Expr->arg_begin()[1], S2);
+    for (RecSet::iterator I = S1.begin(), E = S1.end(); I != E; ++I)
+      if (S2.count(*I))
+        Elts.insert(*I);
+  }
+};
+
+// SetIntBinOp - Abstract base class for (Op S, N) operators.
+struct SetIntBinOp : public SetTheory::Operator {
+  virtual void apply2(SetTheory &ST, DagInit *Expr,
+                     RecSet &Set, int64_t N,
+                     RecSet &Elts) =0;
+
+  void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) {
+    if (Expr->arg_size() != 2)
+      throw "Operator requires (Op Set, Int) arguments: " + Expr->getAsString();
+    RecSet Set;
+    ST.evaluate(Expr->arg_begin()[0], Set);
+    IntInit *II = dynamic_cast<IntInit*>(Expr->arg_begin()[1]);
+    if (!II)
+      throw "Second argument must be an integer: " + Expr->getAsString();
+    apply2(ST, Expr, Set, II->getValue(), Elts);
+  }
+};
+
+// (shl S, N) Shift left, remove the first N elements.
+struct ShlOp : public SetIntBinOp {
+  void apply2(SetTheory &ST, DagInit *Expr,
+             RecSet &Set, int64_t N,
+             RecSet &Elts) {
+    if (N < 0)
+      throw "Positive shift required: " + Expr->getAsString();
+    if (unsigned(N) < Set.size())
+      Elts.insert(Set.begin() + N, Set.end());
+  }
+};
+
+// (trunc S, N) Truncate after the first N elements.
+struct TruncOp : public SetIntBinOp {
+  void apply2(SetTheory &ST, DagInit *Expr,
+             RecSet &Set, int64_t N,
+             RecSet &Elts) {
+    if (N < 0)
+      throw "Positive length required: " + Expr->getAsString();
+    if (unsigned(N) > Set.size())
+      N = Set.size();
+    Elts.insert(Set.begin(), Set.begin() + N);
+  }
+};
+
+// Left/right rotation.
+struct RotOp : public SetIntBinOp {
+  const bool Reverse;
+
+  RotOp(bool Rev) : Reverse(Rev) {}
+
+  void apply2(SetTheory &ST, DagInit *Expr,
+             RecSet &Set, int64_t N,
+             RecSet &Elts) {
+    if (Reverse)
+      N = -N;
+    // N > 0 -> rotate left, N < 0 -> rotate right.
+    if (Set.empty())
+      return;
+    if (N < 0)
+      N = Set.size() - (-N % Set.size());
+    else
+      N %= Set.size();
+    Elts.insert(Set.begin() + N, Set.end());
+    Elts.insert(Set.begin(), Set.begin() + N);
+  }
+};
+
+// (decimate S, N) Pick every N'th element of S.
+struct DecimateOp : public SetIntBinOp {
+  void apply2(SetTheory &ST, DagInit *Expr,
+             RecSet &Set, int64_t N,
+             RecSet &Elts) {
+    if (N <= 0)
+      throw "Positive stride required: " + Expr->getAsString();
+    for (unsigned I = 0; I < Set.size(); I += N)
+      Elts.insert(Set[I]);
+  }
+};
+
+// (sequence "Format", From, To) Generate a sequence of records by name.
+struct SequenceOp : public SetTheory::Operator {
+  void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) {
+    if (Expr->arg_size() != 3)
+      throw "Bad args to (sequence \"Format\", From, To): " +
+        Expr->getAsString();
+    std::string Format;
+    if (StringInit *SI = dynamic_cast<StringInit*>(Expr->arg_begin()[0]))
+      Format = SI->getValue();
+    else
+      throw "Format must be a string: " + Expr->getAsString();
+
+    int64_t From, To;
+    if (IntInit *II = dynamic_cast<IntInit*>(Expr->arg_begin()[1]))
+      From = II->getValue();
+    else
+      throw "From must be an integer: " + Expr->getAsString();
+    if (From < 0 || From >= (1 << 30))
+      throw "From out of range";
+
+    if (IntInit *II = dynamic_cast<IntInit*>(Expr->arg_begin()[2]))
+      To = II->getValue();
+    else
+      throw "From must be an integer: " + Expr->getAsString();
+    if (To < 0 || To >= (1 << 30))
+      throw "To out of range";
+
+    RecordKeeper &Records =
+      dynamic_cast<DefInit&>(*Expr->getOperator()).getDef()->getRecords();
+
+    int Step = From <= To ? 1 : -1;
+    for (To += Step; From != To; From += Step) {
+      std::string Name;
+      raw_string_ostream OS(Name);
+      OS << format(Format.c_str(), unsigned(From));
+      Record *Rec = Records.getDef(OS.str());
+      if (!Rec)
+        throw "No def named '" + Name + "': " + Expr->getAsString();
+      // Try to reevaluate Rec in case it is a set.
+      if (const RecVec *Result = ST.expand(Rec))
+        Elts.insert(Result->begin(), Result->end());
+      else
+        Elts.insert(Rec);
+    }
+  }
+};
+
+// Expand a Def into a set by evaluating one of its fields.
+struct FieldExpander : public SetTheory::Expander {
+  StringRef FieldName;
+
+  FieldExpander(StringRef fn) : FieldName(fn) {}
+
+  void expand(SetTheory &ST, Record *Def, RecSet &Elts) {
+    ST.evaluate(Def->getValueInit(FieldName), Elts);
+  }
+};
+} // end anonymous namespace
+
+SetTheory::SetTheory() {
+  addOperator("add", new AddOp);
+  addOperator("sub", new SubOp);
+  addOperator("and", new AndOp);
+  addOperator("shl", new ShlOp);
+  addOperator("trunc", new TruncOp);
+  addOperator("rotl", new RotOp(false));
+  addOperator("rotr", new RotOp(true));
+  addOperator("decimate", new DecimateOp);
+  addOperator("sequence", new SequenceOp);
+}
+
+void SetTheory::addOperator(StringRef Name, Operator *Op) {
+  Operators[Name] = Op;
+}
+
+void SetTheory::addExpander(StringRef ClassName, Expander *E) {
+  Expanders[ClassName] = E;
+}
+
+void SetTheory::addFieldExpander(StringRef ClassName, StringRef FieldName) {
+  addExpander(ClassName, new FieldExpander(FieldName));
+}
+
+void SetTheory::evaluate(Init *Expr, RecSet &Elts) {
+  // A def in a list can be a just an element, or it may expand.
+  if (DefInit *Def = dynamic_cast<DefInit*>(Expr)) {
+    if (const RecVec *Result = expand(Def->getDef()))
+      return Elts.insert(Result->begin(), Result->end());
+    Elts.insert(Def->getDef());
+    return;
+  }
+
+  // Lists simply expand.
+  if (ListInit *LI = dynamic_cast<ListInit*>(Expr))
+    return evaluate(LI->begin(), LI->end(), Elts);
+
+  // Anything else must be a DAG.
+  DagInit *DagExpr = dynamic_cast<DagInit*>(Expr);
+  if (!DagExpr)
+    throw "Invalid set element: " + Expr->getAsString();
+  DefInit *OpInit = dynamic_cast<DefInit*>(DagExpr->getOperator());
+  if (!OpInit)
+    throw "Bad set expression: " + Expr->getAsString();
+  Operator *Op = Operators.lookup(OpInit->getDef()->getName());
+  if (!Op)
+    throw "Unknown set operator: " + Expr->getAsString();
+  Op->apply(*this, DagExpr, Elts);
+}
+
+const RecVec *SetTheory::expand(Record *Set) {
+  // Check existing entries for Set and return early.
+  ExpandMap::iterator I = Expansions.find(Set);
+  if (I != Expansions.end())
+    return &I->second;
+
+  // This is the first time we see Set. Find a suitable expander.
+  try {
+    const std::vector<Record*> &SC = Set->getSuperClasses();
+    for (unsigned i = 0, e = SC.size(); i != e; ++i)
+      if (Expander *Exp = Expanders.lookup(SC[i]->getName())) {
+        // This breaks recursive definitions.
+        RecVec &EltVec = Expansions[Set];
+        RecSet Elts;
+        Exp->expand(*this, Set, Elts);
+        EltVec.assign(Elts.begin(), Elts.end());
+        return &EltVec;
+      }
+  } catch (const std::string &Error) {
+    throw TGError(Set->getLoc(), Error);
+  }
+
+  // Set is not expandable.
+  return 0;
+}
+
diff --git a/utils/TableGen/SetTheory.h b/utils/TableGen/SetTheory.h
new file mode 100644
index 0000000..e37a76e
--- /dev/null
+++ b/utils/TableGen/SetTheory.h
@@ -0,0 +1,136 @@
+//===- SetTheory.h - Generate ordered sets from DAG expressions -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SetTheory class that computes ordered sets of
+// Records from DAG expressions.  Operators for standard set operations are
+// predefined, and it is possible to add special purpose set operators as well.
+//
+// The user may define named sets as Records of predefined classes. Set
+// expanders can be added to a SetTheory instance to teach it how to find the
+// elements of such a named set.
+//
+// These are the predefined operators. The argument lists can be individual
+// elements (defs), other sets (defs of expandable classes), lists, or DAG
+// expressions that are evaluated recursively.
+//
+// - (add S1, S2 ...) Union sets. This is also how sets are created from element
+//   lists.
+//
+// - (sub S1, S2, ...) Set difference. Every element in S1 except for the
+//   elements in S2, ...
+//
+// - (and S1, S2) Set intersection. Every element in S1 that is also in S2.
+//
+// - (shl S, N) Shift left. Remove the first N elements from S.
+//
+// - (trunc S, N) Truncate. The first N elements of S.
+//
+// - (rotl S, N) Rotate left. Same as (add (shl S, N), (trunc S, N)).
+//
+// - (rotr S, N) Rotate right.
+//
+// - (decimate S, N) Decimate S by picking every N'th element, starting with
+//   the first one. For instance, (decimate S, 2) returns the even elements of
+//   S.
+//
+// - (sequence "Format", From, To) Generate a sequence of defs with printf.
+//   For instance, (sequence "R%u", 0, 3) -> [ R0, R1, R2, R3 ]
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SETTHEORY_H
+#define SETTHEORY_H
+
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/SetVector.h"
+#include <map>
+#include <vector>
+
+namespace llvm {
+
+class DagInit;
+struct Init;
+class Record;
+class RecordKeeper;
+
+class SetTheory {
+public:
+  typedef std::vector<Record*> RecVec;
+  typedef SmallSetVector<Record*, 16> RecSet;
+
+  /// Operator - A callback representing a DAG operator.
+  struct Operator {
+    virtual ~Operator() {}
+
+    /// apply - Apply this operator to Expr's arguments and insert the result
+    /// in Elts.
+    virtual void apply(SetTheory&, DagInit *Expr, RecSet &Elts) =0;
+  };
+
+  /// Expander - A callback function that can transform a Record representing a
+  /// set into a fully expanded list of elements. Expanders provide a way for
+  /// users to define named sets that can be used in DAG expressions.
+  struct Expander {
+    virtual ~Expander() {}
+
+    virtual void expand(SetTheory&, Record*, RecSet &Elts) =0;
+  };
+
+private:
+  // Map set defs to their fully expanded contents. This serves as a memoization
+  // cache and it makes it possible to return const references on queries.
+  typedef std::map<Record*, RecVec> ExpandMap;
+  ExpandMap Expansions;
+
+  // Known DAG operators by name.
+  StringMap<Operator*> Operators;
+
+  // Typed expanders by class name.
+  StringMap<Expander*> Expanders;
+
+public:
+  /// Create a SetTheory instance with only the standard operators.
+  SetTheory();
+
+  /// addExpander - Add an expander for Records with the named super class.
+  void addExpander(StringRef ClassName, Expander*);
+
+  /// addFieldExpander - Add an expander for ClassName that simply evaluates
+  /// FieldName in the Record to get the set elements.  That is all that is
+  /// needed for a class like:
+  ///
+  ///   class Set<dag d> {
+  ///     dag Elts = d;
+  ///   }
+  ///
+  void addFieldExpander(StringRef ClassName, StringRef FieldName);
+
+  /// addOperator - Add a DAG operator.
+  void addOperator(StringRef Name, Operator*);
+
+  /// evaluate - Evaluate Expr and append the resulting set to Elts.
+  void evaluate(Init *Expr, RecSet &Elts);
+
+  /// evaluate - Evaluate a sequence of Inits and append to Elts.
+  template<typename Iter>
+  void evaluate(Iter begin, Iter end, RecSet &Elts) {
+    while (begin != end)
+      evaluate(*begin++, Elts);
+  }
+
+  /// expand - Expand a record into a set of elements if possible.  Return a
+  /// pointer to the expanded elements, or NULL if Set cannot be expanded
+  /// further.
+  const RecVec *expand(Record *Set);
+};
+
+} // end namespace llvm
+
+#endif
+
diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp
index 8ca4b1c..928fa4b 100644
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp
@@ -33,7 +33,13 @@ void SubtargetEmitter::Enumeration(raw_ostream &OS,
   OS << "enum {\n";
 
   // For each record
-  for (unsigned i = 0, N = DefList.size(); i < N;) {
+  unsigned N = DefList.size();
+  if (N > 64) {
+    errs() << "Too many (> 64) subtarget features!\n";
+    exit(1);
+  }
+
+  for (unsigned i = 0; i < N;) {
     // Next record
     Record *Def = DefList[i];
 
@@ -41,7 +47,7 @@ void SubtargetEmitter::Enumeration(raw_ostream &OS,
     OS << "  " << Def->getName();
 
     // If bit flags then emit expression (1 << i)
-    if (isBits)  OS << " = " << " 1 << " << i;
+    if (isBits)  OS << " = " << " 1ULL << " << i;
 
     // Depending on 'if more in the list' emit comma
     if (++i < N) OS << ",";
@@ -88,7 +94,7 @@ void SubtargetEmitter::FeatureKeyValues(raw_ostream &OS) {
       Feature->getValueAsListOfDefs("Implies");
 
     if (ImpliesList.empty()) {
-      OS << "0";
+      OS << "0ULL";
     } else {
       for (unsigned j = 0, M = ImpliesList.size(); j < M;) {
         OS << ImpliesList[j]->getName();
@@ -142,7 +148,7 @@ void SubtargetEmitter::CPUKeyValues(raw_ostream &OS) {
        << "\"Select the " << Name << " processor\", ";
 
     if (FeatureList.empty()) {
-      OS << "0";
+      OS << "0ULL";
     } else {
       for (unsigned j = 0, M = FeatureList.size(); j < M;) {
         OS << FeatureList[j]->getName();
@@ -151,7 +157,7 @@ void SubtargetEmitter::CPUKeyValues(raw_ostream &OS) {
     }
 
     // The "0" is for the "implies" section of this data structure.
-    OS << ", 0 }";
+    OS << ", 0ULL }";
 
     // Depending on 'if more in the list' emit comma
     if (++i < N) OS << ",";
@@ -608,7 +614,7 @@ void SubtargetEmitter::ParseFeaturesFunction(raw_ostream &OS) {
      << "  DEBUG(dbgs() << \"\\nCPU:\" << CPU);\n"
      << "  SubtargetFeatures Features(FS);\n"
      << "  Features.setCPUIfNone(CPU);\n"
-     << "  uint32_t Bits =  Features.getBits(SubTypeKV, SubTypeKVSize,\n"
+     << "  uint64_t Bits =  Features.getBits(SubTypeKV, SubTypeKVSize,\n"
      << "                                    FeatureKV, FeatureKVSize);\n";
 
   for (unsigned i = 0; i < Features.size(); i++) {
diff --git a/utils/TableGen/TGLexer.cpp b/utils/TableGen/TGLexer.cpp
index 82d2b64..572c36d 100644
--- a/utils/TableGen/TGLexer.cpp
+++ b/utils/TableGen/TGLexer.cpp
@@ -267,14 +267,17 @@ bool TGLexer::LexInclude() {
 
   // Get the string.
   std::string Filename = CurStrVal;
+  std::string IncludedFile;
 
   
-  CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr));
+  CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
+                                    IncludedFile);
   if (CurBuffer == -1) {
     PrintError(getLoc(), "Could not find include file '" + Filename + "'");
     return true;
   }
   
+  Dependencies.push_back(IncludedFile);
   // Save the line number and lex buffer of the includer.
   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
   CurPtr = CurBuf->getBufferStart();
diff --git a/utils/TableGen/TGLexer.h b/utils/TableGen/TGLexer.h
index 8859479..c2a6453 100644
--- a/utils/TableGen/TGLexer.h
+++ b/utils/TableGen/TGLexer.h
@@ -15,8 +15,8 @@
 #define TGLEXER_H
 
 #include "llvm/Support/DataTypes.h"
-#include <vector>
 #include <string>
+#include <vector>
 #include <cassert>
 
 namespace llvm {
@@ -72,6 +72,8 @@ class TGLexer {
   /// CurBuffer - This is the current buffer index we're lexing from as managed
   /// by the SourceMgr object.
   int CurBuffer;
+  /// Dependencies - This is the list of all included files.
+  std::vector<std::string> Dependencies;
   
 public:
   TGLexer(SourceMgr &SrcMgr);
@@ -80,6 +82,10 @@ public:
   tgtok::TokKind Lex() {
     return CurCode = LexToken();
   }
+
+  const std::vector<std::string> &getDependencies() const {
+    return Dependencies;
+  }
   
   tgtok::TokKind getCode() const { return CurCode; }
 
diff --git a/utils/TableGen/TGParser.h b/utils/TableGen/TGParser.h
index 9cdf68f..419a99b 100644
--- a/utils/TableGen/TGParser.h
+++ b/utils/TableGen/TGParser.h
@@ -66,6 +66,9 @@ public:
   bool TokError(const Twine &Msg) const {
     return Error(Lex.getLoc(), Msg);
   }
+  const std::vector<std::string> &getDependencies() const {
+    return Lex.getDependencies();
+  }
 private:  // Semantic analysis methods.
   bool AddValue(Record *TheRec, SMLoc Loc, const RecordVal &RV);
   bool SetValue(Record *TheRec, SMLoc Loc, const std::string &ValName, 
diff --git a/utils/TableGen/TGValueTypes.cpp b/utils/TableGen/TGValueTypes.cpp
index 122d085..af0d9f4 100644
--- a/utils/TableGen/TGValueTypes.cpp
+++ b/utils/TableGen/TGValueTypes.cpp
@@ -16,7 +16,6 @@
 
 #include "llvm/CodeGen/ValueTypes.h"
 #include <map>
-#include <vector>
 using namespace llvm;
 
 namespace llvm {
diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp
index 3b7dc01..4e4da36 100644
--- a/utils/TableGen/TableGen.cpp
+++ b/utils/TableGen/TableGen.cpp
@@ -37,6 +37,7 @@
 #include "RegisterInfoEmitter.h"
 #include "ARMDecoderEmitter.h"
 #include "SubtargetEmitter.h"
+#include "SetTheory.h"
 #include "TGParser.h"
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/Support/CommandLine.h"
@@ -65,6 +66,7 @@ enum ActionType {
   GenClangAttrSpellingList,
   GenClangDiagsDefs,
   GenClangDiagGroups,
+  GenClangDiagsIndexName,
   GenClangDeclNodes,
   GenClangStmtNodes,
   GenClangSACheckers,
@@ -79,7 +81,8 @@ enum ActionType {
   GenArmNeon,
   GenArmNeonSema,
   GenArmNeonTest,
-  PrintEnums
+  PrintEnums,
+  PrintSets
 };
 
 namespace {
@@ -133,12 +136,16 @@ namespace {
                                "Generate clang PCH attribute reader"),
                     clEnumValN(GenClangAttrPCHWrite, "gen-clang-attr-pch-write",
                                "Generate clang PCH attribute writer"),
-                    clEnumValN(GenClangAttrSpellingList, "gen-clang-attr-spelling-list",
+                    clEnumValN(GenClangAttrSpellingList,
+                               "gen-clang-attr-spelling-list",
                                "Generate a clang attribute spelling list"),
                     clEnumValN(GenClangDiagsDefs, "gen-clang-diags-defs",
                                "Generate Clang diagnostics definitions"),
                     clEnumValN(GenClangDiagGroups, "gen-clang-diag-groups",
                                "Generate Clang diagnostic groups"),
+                    clEnumValN(GenClangDiagsIndexName,
+                               "gen-clang-diags-index-name",
+                               "Generate Clang diagnostic name index"),
                     clEnumValN(GenClangDeclNodes, "gen-clang-decl-nodes",
                                "Generate Clang AST declaration nodes"),
                     clEnumValN(GenClangStmtNodes, "gen-clang-stmt-nodes",
@@ -157,6 +164,8 @@ namespace {
                                "Generate ARM NEON tests for clang"),
                     clEnumValN(PrintEnums, "print-enums",
                                "Print enum values for a class"),
+                    clEnumValN(PrintSets, "print-sets",
+                               "Print expanded sets for testing DAG exprs"),
                     clEnumValEnd));
 
   cl::opt<std::string>
@@ -168,6 +177,10 @@ namespace {
                  cl::init("-"));
 
   cl::opt<std::string>
+  DependFilename("d", cl::desc("Dependency filename"), cl::value_desc("filename"),
+                 cl::init(""));
+
+  cl::opt<std::string>
   InputFilename(cl::Positional, cl::desc("<input file>"), cl::init("-"));
 
   cl::list<std::string>
@@ -187,34 +200,6 @@ void llvm::PrintError(SMLoc ErrorLoc, const Twine &Msg) {
   SrcMgr.PrintMessage(ErrorLoc, Msg, "error");
 }
 
-
-
-/// ParseFile - this function begins the parsing of the specified tablegen
-/// file.
-static bool ParseFile(const std::string &Filename,
-                      const std::vector<std::string> &IncludeDirs,
-                      SourceMgr &SrcMgr,
-                      RecordKeeper &Records) {
-  OwningPtr<MemoryBuffer> File;
-  if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename.c_str(), File)) {
-    errs() << "Could not open input file '" << Filename << "': "
-           << ec.message() <<"\n";
-    return true;
-  }
-  MemoryBuffer *F = File.take();
-
-  // Tell SrcMgr about this buffer, which is what TGParser will pick up.
-  SrcMgr.AddNewSourceBuffer(F, SMLoc());
-
-  // Record the location of the include directory so that the lexer can find
-  // it later.
-  SrcMgr.setIncludeDirs(IncludeDirs);
-
-  TGParser Parser(SrcMgr, Records);
-
-  return Parser.ParseFile();
-}
-
 int main(int argc, char **argv) {
   RecordKeeper Records;
 
@@ -223,19 +208,57 @@ int main(int argc, char **argv) {
   cl::ParseCommandLineOptions(argc, argv);
 
 
-  // Parse the input file.
-  if (ParseFile(InputFilename, IncludeDirs, SrcMgr, Records))
-    return 1;
+  try {
+    // Parse the input file.
+    OwningPtr<MemoryBuffer> File;
+    if (error_code ec = MemoryBuffer::getFileOrSTDIN(InputFilename.c_str(), File)) {
+      errs() << "Could not open input file '" << InputFilename << "': "
+             << ec.message() <<"\n";
+      return 1;
+    }
+    MemoryBuffer *F = File.take();
+
+    // Tell SrcMgr about this buffer, which is what TGParser will pick up.
+    SrcMgr.AddNewSourceBuffer(F, SMLoc());
+
+    // Record the location of the include directory so that the lexer can find
+    // it later.
+    SrcMgr.setIncludeDirs(IncludeDirs);
 
-  std::string Error;
-  tool_output_file Out(OutputFilename.c_str(), Error);
-  if (!Error.empty()) {
-    errs() << argv[0] << ": error opening " << OutputFilename
-           << ":" << Error << "\n";
-    return 1;
-  }
+    TGParser Parser(SrcMgr, Records);
+
+    if (Parser.ParseFile())
+      return 1;
+
+    std::string Error;
+    tool_output_file Out(OutputFilename.c_str(), Error);
+    if (!Error.empty()) {
+      errs() << argv[0] << ": error opening " << OutputFilename
+        << ":" << Error << "\n";
+      return 1;
+    }
+    if (!DependFilename.empty()) {
+      if (OutputFilename == "-") {
+        errs() << argv[0] << ": the option -d must be used together with -o\n";
+        return 1;
+      }
+      tool_output_file DepOut(DependFilename.c_str(), Error);
+      if (!Error.empty()) {
+        errs() << argv[0] << ": error opening " << DependFilename
+          << ":" << Error << "\n";
+        return 1;
+      }
+      DepOut.os() << DependFilename << ":";
+      const std::vector<std::string> &Dependencies = Parser.getDependencies();
+      for (std::vector<std::string>::const_iterator I = Dependencies.begin(),
+                                                          E = Dependencies.end();
+           I != E; ++I) {
+        DepOut.os() << " " << (*I);
+      }
+      DepOut.os() << "\n";
+      DepOut.keep();
+    }
 
-  try {
     switch (Action) {
     case PrintRecords:
       Out.os() << Records;           // No argument, dump all contents
@@ -295,6 +318,9 @@ int main(int argc, char **argv) {
     case GenClangDiagGroups:
       ClangDiagGroupsEmitter(Records).run(Out.os());
       break;
+    case GenClangDiagsIndexName:
+      ClangDiagsIndexNameEmitter(Records).run(Out.os());
+      break;
     case GenClangDeclNodes:
       ClangASTNodesEmitter(Records, "Decl", "Decl").run(Out.os());
       ClangDeclContextEmitter(Records).run(Out.os());
@@ -352,6 +378,21 @@ int main(int argc, char **argv) {
       Out.os() << "\n";
       break;
     }
+    case PrintSets:
+    {
+      SetTheory Sets;
+      Sets.addFieldExpander("Set", "Elements");
+      std::vector<Record*> Recs = Records.getAllDerivedDefinitions("Set");
+      for (unsigned i = 0, e = Recs.size(); i != e; ++i) {
+        Out.os() << Recs[i]->getName() << " = [";
+        const std::vector<Record*> *Elts = Sets.expand(Recs[i]);
+        assert(Elts && "Couldn't expand Set instance");
+        for (unsigned ei = 0, ee = Elts->size(); ei != ee; ++ei)
+          Out.os() << ' ' << (*Elts)[ei]->getName();
+        Out.os() << " ]\n";
+      }
+      break;
+    }
     default:
       assert(1 && "Invalid Action");
       return 1;
diff --git a/utils/TableGen/X86DisassemblerTables.h b/utils/TableGen/X86DisassemblerTables.h
index fe4ad6f..d16ebfc 100644
--- a/utils/TableGen/X86DisassemblerTables.h
+++ b/utils/TableGen/X86DisassemblerTables.h
@@ -79,7 +79,7 @@ private:
   ///   regardless of ModR/M byte, two entries - one for bytes 0x00-0xbf and one
   ///   for bytes 0xc0-0xff -, or 256 entries, one for each possible byte.  
   ///   nnnn is the number of a table for looking up these values.  The tables
-  ///   are writen separately so that tables consisting entirely of zeros will
+  ///   are written separately so that tables consisting entirely of zeros will
   ///   not be duplicated.  (These all have the name modRMEmptyTable.)  A table
   ///   is printed as:
   ///   
diff --git a/utils/buildit/GNUmakefile b/utils/buildit/GNUmakefile
index 5140e15..470ee76 100644
--- a/utils/buildit/GNUmakefile
+++ b/utils/buildit/GNUmakefile
@@ -6,7 +6,7 @@
 #
 # You can specify TARGETS=ppc (or i386) on the buildit command line to limit the
 # build to just one target. The default is for ppc and i386. The compiler
-# targetted at this host gets built anyway, but not installed unless it's listed
+# targeted at this host gets built anyway, but not installed unless it's listed
 # in TARGETS.
 
 # Include the set of standard Apple makefile definitions.
diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py
index 80d0ba1..83603cc 100644
--- a/utils/lit/lit/TestRunner.py
+++ b/utils/lit/lit/TestRunner.py
@@ -473,9 +473,11 @@ def parseIntegratedTestScript(test, normalize_slashes=False):
     if script[-1][-1] == '\\':
         return (Test.UNRESOLVED, "Test has unterminated run lines (with '\\')")
 
-    # Check that we have the required features:
+    # Check that we have the required features or build modes:
     missing_required_features = [f for f in requires
-                                 if f not in test.config.available_features]
+                                 if f not in test.config.available_features
+                                 and f not in test.config.llvm_build_modes]
+
     if missing_required_features:
         msg = ', '.join(missing_required_features)
         return (Test.UNSUPPORTED,
diff --git a/utils/lit/lit/TestingConfig.py b/utils/lit/lit/TestingConfig.py
index 25bb341..2d8d3d0 100644
--- a/utils/lit/lit/TestingConfig.py
+++ b/utils/lit/lit/TestingConfig.py
@@ -74,6 +74,7 @@ class TestingConfig:
 
     def clone(self, path):
         # FIXME: Chain implementations?
+        # See attribute chaining in finish()
         #
         # FIXME: Allow extra parameters?
         cfg = TestingConfig(self, self.name, self.suffixes, self.test_format,
@@ -101,3 +102,9 @@ class TestingConfig:
             # files. Should we distinguish them?
             self.test_source_root = str(self.test_source_root)
         self.excludes = set(self.excludes)
+
+        # chain attributes by copying them
+        if self.parent:
+            for k,v in vars(self.parent).items():
+                if not hasattr(self, k):
+                    setattr(self, k, v)
diff --git a/utils/lit/setup.py b/utils/lit/setup.py
index 738ee23..a94e6ea 100644
--- a/utils/lit/setup.py
+++ b/utils/lit/setup.py
@@ -38,7 +38,7 @@ Features
 Documentation
 =============
 
-The offical *lit* documentation is in the man page, available online at the LLVM
+The official *lit* documentation is in the man page, available online at the LLVM
 Command Guide: http://llvm.org/cmds/lit.html.
 
 
diff --git a/utils/llvm.grm b/utils/llvm.grm
index 9d6bdf7..3f33702 100644
--- a/utils/llvm.grm
+++ b/utils/llvm.grm
@@ -172,6 +172,8 @@ FuncAttr      ::= noreturn
  | optsize
  | ssp
  | sspreq
+ | hotpatch
+ | nonlazybind
  ;
 
 OptFuncAttrs  ::= + _ | OptFuncAttrs FuncAttr ;
diff --git a/utils/profile.pl b/utils/profile.pl
index 3180115..782e5dc 100755
--- a/utils/profile.pl
+++ b/utils/profile.pl
@@ -65,7 +65,7 @@ shift @ARGV;
 my $libdir = `llvm-config --libdir`;
 chomp $libdir;
 
-my $LibProfPath = $libdir . "/profile_rt.so";
+my $LibProfPath = $libdir . "/libprofile_rt.so";
 
 system "opt -q -f $ProfilePass $BytecodeFile -o $BytecodeFile.inst";
 system "lli -fake-argv0 '$BytecodeFile' -load $LibProfPath " .
diff --git a/utils/show-diagnostics b/utils/show-diagnostics
new file mode 100755
index 0000000..3a69793
--- /dev/null
+++ b/utils/show-diagnostics
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+
+import plistlib
+
+def main():
+    from optparse import OptionParser, OptionGroup
+    parser = OptionParser("""\
+usage: %prog [options] <path>
+
+Utility for dumping Clang-style logged diagnostics.\
+""")
+    (opts, args) = parser.parse_args()
+
+    if len(args) != 1:
+        parser.error("invalid number of arguments")
+
+    path, = args
+
+    # Read the diagnostics log.
+    f = open(path)
+    try:
+        data = f.read()
+    finally:
+        f.close()
+
+    # Complete the plist (the log itself is just the chunks).
+    data = """\
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" \
+                       "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<array>
+%s
+</array>
+</plist>""" % data
+
+    # Load the diagnostics.
+    diags = plistlib.readPlistFromString(data)
+
+    # Print out the diagnostics.
+    print
+    print "**** BUILD DIAGNOSTICS ****"
+    for i, file_diags in enumerate(diags):
+        file = file_diags.get('main-file')
+        print "*** %s ***" % file
+        for d in file_diags.get('diagnostics', ()):
+            print "%s:%s:%s: %s: %s" % (
+                d.get('filename'), d.get('line'), d.get('column'),
+                d.get('level'), d.get('message'))
+
+if __name__ == "__main__":
+    main()
diff --git a/utils/unittest/googletest/gtest-filepath.cc b/utils/unittest/googletest/gtest-filepath.cc
index c1ef918..8d1d67e 100644
--- a/utils/unittest/googletest/gtest-filepath.cc
+++ b/utils/unittest/googletest/gtest-filepath.cc
@@ -123,7 +123,7 @@ FilePath FilePath::RemoveExtension(const char* extension) const {
   return *this;
 }
 
-// Returns a pointer to the last occurence of a valid path separator in
+// Returns a pointer to the last occurrence of a valid path separator in
 // the FilePath. On Windows, for example, both '/' and '\' are valid path
 // separators. Returns NULL if no path separator was found.
 const char* FilePath::FindLastPathSeparator() const {
diff --git a/utils/unittest/googletest/gtest.cc b/utils/unittest/googletest/gtest.cc
index 51732af..9aa5441 100644
--- a/utils/unittest/googletest/gtest.cc
+++ b/utils/unittest/googletest/gtest.cc
@@ -1415,7 +1415,7 @@ AssertionResult IsHRESULTFailure(const char* expr, long hr) {  // NOLINT
 // Utility functions for encoding Unicode text (wide strings) in
 // UTF-8.
 
-// A Unicode code-point can have upto 21 bits, and is encoded in UTF-8
+// A Unicode code-point can have up to 21 bits, and is encoded in UTF-8
 // like this:
 //
 // Code-point length   Encoding
diff --git a/utils/unittest/googletest/include/gtest/internal/gtest-filepath.h b/utils/unittest/googletest/include/gtest/internal/gtest-filepath.h
index 4b76d79..efbc176 100644
--- a/utils/unittest/googletest/include/gtest/internal/gtest-filepath.h
+++ b/utils/unittest/googletest/include/gtest/internal/gtest-filepath.h
@@ -196,7 +196,7 @@ class GTEST_API_ FilePath {
 
   void Normalize();
 
-  // Returns a pointer to the last occurence of a valid path separator in
+  // Returns a pointer to the last occurrence of a valid path separator in
   // the FilePath. On Windows, for example, both '/' and '\' are valid path
   // separators. Returns NULL if no path separator was found.
   const char* FindLastPathSeparator() const;
diff --git a/utils/valgrind/i386-pc-linux-gnu.supp b/utils/valgrind/i386-pc-linux-gnu.supp
index 0509791..ddd0a08 100644
--- a/utils/valgrind/i386-pc-linux-gnu.supp
+++ b/utils/valgrind/i386-pc-linux-gnu.supp
@@ -39,3 +39,10 @@
    fun:malloc
    obj:/usr/bin/python*
 }
+
+{
+   We don't care about anything ld.so does.
+   Memcheck:Cond
+   obj:/lib/ld*.so
+}
+
diff --git a/utils/valgrind/x86_64-pc-linux-gnu.supp b/utils/valgrind/x86_64-pc-linux-gnu.supp
index 7b2dd45..3d15d71 100644
--- a/utils/valgrind/x86_64-pc-linux-gnu.supp
+++ b/utils/valgrind/x86_64-pc-linux-gnu.supp
@@ -44,3 +44,10 @@
    fun:malloc
    obj:/usr/bin/python*
 }
+
+{
+   We don't care about anything ld.so does.
+   Memcheck:Cond
+   obj:/lib/ld*.so
+}
+